From bd33335aa93d615cac77d991c448b986761e7a8d Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 5 Mar 2021 12:21:40 +0000 Subject: [PATCH 0001/1250] rtc: cmos: Disable irq around direct invocation of cmos_interrupt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As previously noted in commit 66e4f4a9cc38 ("rtc: cmos: Use spin_lock_irqsave() in cmos_interrupt()"): <4>[ 254.192378] WARNING: inconsistent lock state <4>[ 254.192384] 5.12.0-rc1-CI-CI_DRM_9834+ #1 Not tainted <4>[ 254.192396] -------------------------------- <4>[ 254.192400] inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. <4>[ 254.192409] rtcwake/5309 [HC0[0]:SC0[0]:HE1:SE1] takes: <4>[ 254.192429] ffffffff8263c5f8 (rtc_lock){?...}-{2:2}, at: cmos_interrupt+0x18/0x100 <4>[ 254.192481] {IN-HARDIRQ-W} state was registered at: <4>[ 254.192488] lock_acquire+0xd1/0x3d0 <4>[ 254.192504] _raw_spin_lock+0x2a/0x40 <4>[ 254.192519] cmos_interrupt+0x18/0x100 <4>[ 254.192536] rtc_handler+0x1f/0xc0 <4>[ 254.192553] acpi_ev_fixed_event_detect+0x109/0x13c <4>[ 254.192574] acpi_ev_sci_xrupt_handler+0xb/0x28 <4>[ 254.192596] acpi_irq+0x13/0x30 <4>[ 254.192620] __handle_irq_event_percpu+0x43/0x2c0 <4>[ 254.192641] handle_irq_event_percpu+0x2b/0x70 <4>[ 254.192661] handle_irq_event+0x2f/0x50 <4>[ 254.192680] handle_fasteoi_irq+0x9e/0x150 <4>[ 254.192693] __common_interrupt+0x76/0x140 <4>[ 254.192715] common_interrupt+0x96/0xc0 <4>[ 254.192732] asm_common_interrupt+0x1e/0x40 <4>[ 254.192750] _raw_spin_unlock_irqrestore+0x38/0x60 <4>[ 254.192767] resume_irqs+0xba/0xf0 <4>[ 254.192786] dpm_resume_noirq+0x245/0x3d0 <4>[ 254.192811] suspend_devices_and_enter+0x230/0xaa0 <4>[ 254.192835] pm_suspend.cold.8+0x301/0x34a <4>[ 254.192859] state_store+0x7b/0xe0 <4>[ 254.192879] kernfs_fop_write_iter+0x11d/0x1c0 <4>[ 254.192899] new_sync_write+0x11d/0x1b0 <4>[ 254.192916] vfs_write+0x265/0x390 <4>[ 254.192933] ksys_write+0x5a/0xd0 <4>[ 254.192949] do_syscall_64+0x33/0x80 <4>[ 
254.192965] entry_SYSCALL_64_after_hwframe+0x44/0xae <4>[ 254.192986] irq event stamp: 43775 <4>[ 254.192994] hardirqs last enabled at (43775): [] asm_sysvec_apic_timer_interrupt+0x12/0x20 <4>[ 254.193023] hardirqs last disabled at (43774): [] sysvec_apic_timer_interrupt+0xa/0xb0 <4>[ 254.193049] softirqs last enabled at (42548): [] __do_softirq+0x342/0x48e <4>[ 254.193074] softirqs last disabled at (42543): [] irq_exit_rcu+0xad/0xd0 <4>[ 254.193101] other info that might help us debug this: <4>[ 254.193107] Possible unsafe locking scenario: <4>[ 254.193112] CPU0 <4>[ 254.193117] ---- <4>[ 254.193121] lock(rtc_lock); <4>[ 254.193137] <4>[ 254.193142] lock(rtc_lock); <4>[ 254.193156] *** DEADLOCK *** <4>[ 254.193161] 6 locks held by rtcwake/5309: <4>[ 254.193174] #0: ffff888104861430 (sb_writers#5){.+.+}-{0:0}, at: ksys_write+0x5a/0xd0 <4>[ 254.193232] #1: ffff88810f823288 (&of->mutex){+.+.}-{3:3}, at: kernfs_fop_write_iter+0xe7/0x1c0 <4>[ 254.193282] #2: ffff888100cef3c0 (kn->active#285 <7>[ 254.192706] i915 0000:00:02.0: [drm:intel_modeset_setup_hw_state [i915]] [CRTC:51:pipe A] hw state readout: disabled <4>[ 254.193307] ){.+.+}-{0:0}, at: kernfs_fop_write_iter+0xf0/0x1c0 <4>[ 254.193333] #3: ffffffff82649fa8 (system_transition_mutex){+.+.}-{3:3}, at: pm_suspend.cold.8+0xce/0x34a <4>[ 254.193387] #4: ffffffff827a2108 (acpi_scan_lock){+.+.}-{3:3}, at: acpi_suspend_begin+0x47/0x70 <4>[ 254.193433] #5: ffff8881019ea178 (&dev->mutex){....}-{3:3}, at: device_resume+0x68/0x1e0 <4>[ 254.193485] stack backtrace: <4>[ 254.193492] CPU: 1 PID: 5309 Comm: rtcwake Not tainted 5.12.0-rc1-CI-CI_DRM_9834+ #1 <4>[ 254.193514] Hardware name: Google Soraka/Soraka, BIOS MrChromebox-4.10 08/25/2019 <4>[ 254.193524] Call Trace: <4>[ 254.193536] dump_stack+0x7f/0xad <4>[ 254.193567] mark_lock.part.47+0x8ca/0xce0 <4>[ 254.193604] __lock_acquire+0x39b/0x2590 <4>[ 254.193626] ? asm_sysvec_apic_timer_interrupt+0x12/0x20 <4>[ 254.193660] lock_acquire+0xd1/0x3d0 <4>[ 254.193677] ? 
cmos_interrupt+0x18/0x100 <4>[ 254.193716] _raw_spin_lock+0x2a/0x40 <4>[ 254.193735] ? cmos_interrupt+0x18/0x100 <4>[ 254.193758] cmos_interrupt+0x18/0x100 <4>[ 254.193785] cmos_resume+0x2ac/0x2d0 <4>[ 254.193813] ? acpi_pm_set_device_wakeup+0x1f/0x110 <4>[ 254.193842] ? pnp_bus_suspend+0x10/0x10 <4>[ 254.193864] pnp_bus_resume+0x5e/0x90 <4>[ 254.193885] dpm_run_callback+0x5f/0x240 <4>[ 254.193914] device_resume+0xb2/0x1e0 <4>[ 254.193942] ? pm_dev_err+0x25/0x25 <4>[ 254.193974] dpm_resume+0xea/0x3f0 <4>[ 254.194005] dpm_resume_end+0x8/0x10 <4>[ 254.194030] suspend_devices_and_enter+0x29b/0xaa0 <4>[ 254.194066] pm_suspend.cold.8+0x301/0x34a <4>[ 254.194094] state_store+0x7b/0xe0 <4>[ 254.194124] kernfs_fop_write_iter+0x11d/0x1c0 <4>[ 254.194151] new_sync_write+0x11d/0x1b0 <4>[ 254.194183] vfs_write+0x265/0x390 <4>[ 254.194207] ksys_write+0x5a/0xd0 <4>[ 254.194232] do_syscall_64+0x33/0x80 <4>[ 254.194251] entry_SYSCALL_64_after_hwframe+0x44/0xae <4>[ 254.194274] RIP: 0033:0x7f07d79691e7 <4>[ 254.194293] Code: 64 89 02 48 c7 c0 ff ff ff ff eb bb 0f 1f 80 00 00 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24 <4>[ 254.194312] RSP: 002b:00007ffd9cc2c768 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 <4>[ 254.194337] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f07d79691e7 <4>[ 254.194352] RDX: 0000000000000004 RSI: 0000556ebfc63590 RDI: 000000000000000b <4>[ 254.194366] RBP: 0000556ebfc63590 R08: 0000000000000000 R09: 0000000000000004 <4>[ 254.194379] R10: 0000556ebf0ec2a6 R11: 0000000000000246 R12: 0000000000000004 which breaks S3-resume on fi-kbl-soraka presumably as that's slow enough to trigger the alarm during the suspend. 
Fixes: 6950d046eb6e ("rtc: cmos: Replace spin_lock_irqsave with spin_lock in hard IRQ") References: 66e4f4a9cc38 ("rtc: cmos: Use spin_lock_irqsave() in cmos_interrupt()"): Signed-off-by: Chris Wilson Cc: Xiaofei Tan Cc: Alexandre Belloni Cc: Alessandro Zummo Cc: Ville Syrjälä Reviewed-by: Ville Syrjälä Signed-off-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210305122140.28774-1-chris@chris-wilson.co.uk --- drivers/rtc/rtc-cmos.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 670fd8a2970e3e..6545afb2f20eb0 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -1053,7 +1053,9 @@ static void cmos_check_wkalrm(struct device *dev) * ACK the rtc irq here */ if (t_now >= cmos->alarm_expires && cmos_use_acpi_alarm()) { + local_irq_disable(); cmos_interrupt(0, (void *)cmos->rtc); + local_irq_enable(); return; } From 444d018d8d3874c9c3784a5df3ad2e5f554fbdb6 Mon Sep 17 00:00:00 2001 From: Cristian Ciocaltea Date: Fri, 12 Mar 2021 11:49:27 +0200 Subject: [PATCH 0002/1250] ARM: dts: owl-s500-roseapplepi: Add ATC2603C PMIC Add device tree node for ATC2603C PMIC and remove the 'fixed-3.1V' dummy regulator used for the uSD supply. Additionally, add 'SYSPWR' fixed regulator and provide cpu0 supply. 
Signed-off-by: Cristian Ciocaltea Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/2e0a2931ae3757f016948e7c78e8e54afa325ae0.1615538629.git.cristian.ciocaltea@gmail.com Signed-off-by: Manivannan Sadhasivam --- arch/arm/boot/dts/owl-s500-roseapplepi.dts | 132 ++++++++++++++++++++- 1 file changed, 126 insertions(+), 6 deletions(-) diff --git a/arch/arm/boot/dts/owl-s500-roseapplepi.dts b/arch/arm/boot/dts/owl-s500-roseapplepi.dts index ff91561ca99c8e..b8c5db2344aa0c 100644 --- a/arch/arm/boot/dts/owl-s500-roseapplepi.dts +++ b/arch/arm/boot/dts/owl-s500-roseapplepi.dts @@ -2,7 +2,7 @@ /* * Roseapple Pi * - * Copyright (C) 2020 Cristian Ciocaltea + * Copyright (C) 2020-2021 Cristian Ciocaltea */ /dts-v1/; @@ -27,20 +27,140 @@ reg = <0x0 0x80000000>; /* 2GB */ }; - /* Fixed regulator used in the absence of PMIC */ - sd_vcc: sd-vcc { + syspwr: regulator-5v0 { compatible = "regulator-fixed"; - regulator-name = "fixed-3.1V"; - regulator-min-microvolt = <3100000>; - regulator-max-microvolt = <3100000>; + regulator-name = "SYSPWR"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; regulator-always-on; }; }; +&cpu0 { + cpu0-supply = <&vdd_cpu>; +}; + &i2c0 { status = "okay"; pinctrl-names = "default"; pinctrl-0 = <&i2c0_pins>; + + atc260x: pmic@65 { + compatible = "actions,atc2603c"; + reg = <0x65>; + interrupt-parent = <&sirq>; + interrupts = <2 IRQ_TYPE_LEVEL_HIGH>; + + reset-time-sec = <6>; + + regulators { + compatible = "actions,atc2603c-regulator"; + + dcdc1-supply = <&syspwr>; + dcdc2-supply = <&syspwr>; + dcdc3-supply = <&syspwr>; + ldo1-supply = <&syspwr>; + ldo2-supply = <&syspwr>; + ldo3-supply = <&syspwr>; + ldo5-supply = <&syspwr>; + ldo6-supply = <&syspwr>; + ldo7-supply = <&syspwr>; + ldo8-supply = <&syspwr>; + ldo11-supply = <&syspwr>; + ldo12-supply = <&syspwr>; + switchldo1-supply = <&vcc>; + + vdd_cpu: dcdc1 { + regulator-name = "VDD_CPU"; + regulator-min-microvolt = <700000>; + regulator-max-microvolt = 
<1400000>; + regulator-always-on; + }; + + vddq: dcdc2 { + regulator-name = "VDDQ"; + regulator-min-microvolt = <1300000>; + regulator-max-microvolt = <2150000>; + regulator-always-on; + regulator-boot-on; + }; + + vcc: dcdc3 { + regulator-name = "VCC"; + regulator-min-microvolt = <2600000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + vcc_3v3: ldo1 { + regulator-name = "VCC_3V3"; + regulator-min-microvolt = <2600000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + avcc: ldo2 { + regulator-name = "AVCC"; + regulator-min-microvolt = <2600000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + vdd_1v8: ldo3 { + regulator-name = "VDD_1V8"; + regulator-min-microvolt = <1500000>; + regulator-max-microvolt = <2000000>; + regulator-always-on; + }; + + vcc_3v1: ldo5 { + regulator-name = "VCC_3V1"; + regulator-min-microvolt = <2600000>; + regulator-max-microvolt = <3300000>; + }; + + avdd: ldo6 { + regulator-name = "AVDD"; + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1400000>; + regulator-always-on; + }; + + sens_1v8: ldo7 { + regulator-name = "SENS_1V8"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + + ldo8: ldo8 { + regulator-name = "LDO8"; + regulator-min-microvolt = <2300000>; + regulator-max-microvolt = <3300000>; + }; + + svcc: ldo11 { + regulator-name = "SVCC"; + regulator-min-microvolt = <2600000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + }; + + rtc_vdd: ldo12 { + regulator-name = "RTC_VDD"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-always-on; + }; + + sd_vcc: switchldo1 { + regulator-name = "SD_VCC"; + regulator-min-microvolt = <3000000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + regulator-boot-on; + }; + }; + }; }; &i2c1 { From b4a213c53eede1647b2b036f5b94f3e6e7489173 Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Sat, 27 Nov 2021 13:09:42 
+0100 Subject: [PATCH 0003/1250] ARM: dts: bcm2711-rpi-400: Fix GPIO expander labels Recently 2 labels of the RPi 400 GPIO expander have been fixed in the vendor tree. So upstream this change to be in sync. Fixes: 1c701accecf2 ("ARM: dts: Add Raspberry Pi 400 support") Signed-off-by: Phil Elwell Signed-off-by: Stefan Wahren Signed-off-by: Nicolas Saenz Julienne Link: https://lore.kernel.org/r/1638014982-4878-1-git-send-email-stefan.wahren@i2se.com --- arch/arm/boot/dts/bcm2711-rpi-400.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/bcm2711-rpi-400.dts b/arch/arm/boot/dts/bcm2711-rpi-400.dts index f4d2fc20397c70..82c5ea138e57c8 100644 --- a/arch/arm/boot/dts/bcm2711-rpi-400.dts +++ b/arch/arm/boot/dts/bcm2711-rpi-400.dts @@ -31,9 +31,9 @@ "", "GLOBAL_RESET", "VDD_SD_IO_SEL", - "CAM_GPIO", + "GLOBAL_SHUTDOWN", "SD_PWR_ON", - "SD_OC_N"; + "SHUTDOWN_REQUEST"; }; &genet_mdio { From ce94980d297047cdf3aaf74acb8a6498bc1e4728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 30 Nov 2021 17:11:47 +0100 Subject: [PATCH 0004/1250] ARM: dts: bcm2711-rpi-4-b: Add gpio offsets to line name array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This helps human readers considerably to determine the line name for a given offset or vice versa. 
Signed-off-by: Uwe Kleine-König [ nsaenz: corrected patch title ] Signed-off-by: Nicolas Saenz Julienne Link: https://lore.kernel.org/r/20211130161147.317653-1-u.kleine-koenig@pengutronix.de --- arch/arm/boot/dts/bcm2711-rpi-4-b.dts | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/arm/boot/dts/bcm2711-rpi-4-b.dts b/arch/arm/boot/dts/bcm2711-rpi-4-b.dts index 631dd5baf68da2..4432412044dec7 100644 --- a/arch/arm/boot/dts/bcm2711-rpi-4-b.dts +++ b/arch/arm/boot/dts/bcm2711-rpi-4-b.dts @@ -65,12 +65,12 @@ }; &expgpio { - gpio-line-names = "BT_ON", + gpio-line-names = "BT_ON", /* 0 */ "WL_ON", "PWR_LED_OFF", "GLOBAL_RESET", "VDD_SD_IO_SEL", - "CAM_GPIO", + "CAM_GPIO", /* 5 */ "SD_PWR_ON", ""; }; @@ -84,66 +84,66 @@ * "FOO" = GPIO line named "FOO" on the schematic * "FOO_N" = GPIO line named "FOO" on schematic, active low */ - gpio-line-names = "ID_SDA", + gpio-line-names = "ID_SDA", /* 0 */ "ID_SCL", "SDA1", "SCL1", "GPIO_GCLK", - "GPIO5", + "GPIO5", /* 5 */ "GPIO6", "SPI_CE1_N", "SPI_CE0_N", "SPI_MISO", - "SPI_MOSI", + "SPI_MOSI", /* 10 */ "SPI_SCLK", "GPIO12", "GPIO13", /* Serial port */ "TXD1", - "RXD1", + "RXD1", /* 15 */ "GPIO16", "GPIO17", "GPIO18", "GPIO19", - "GPIO20", + "GPIO20", /* 20 */ "GPIO21", "GPIO22", "GPIO23", "GPIO24", - "GPIO25", + "GPIO25", /* 25 */ "GPIO26", "GPIO27", "RGMII_MDIO", "RGMIO_MDC", /* Used by BT module */ - "CTS0", + "CTS0", /* 30 */ "RTS0", "TXD0", "RXD0", /* Used by Wifi */ "SD1_CLK", - "SD1_CMD", + "SD1_CMD", /* 35 */ "SD1_DATA0", "SD1_DATA1", "SD1_DATA2", "SD1_DATA3", /* Shared with SPI flash */ - "PWM0_MISO", + "PWM0_MISO", /* 40 */ "PWM1_MOSI", "STATUS_LED_G_CLK", "SPIFLASH_CE_N", "SDA0", - "SCL0", + "SCL0", /* 45 */ "RGMII_RXCLK", "RGMII_RXCTL", "RGMII_RXD0", "RGMII_RXD1", - "RGMII_RXD2", + "RGMII_RXD2", /* 50 */ "RGMII_RXD3", "RGMII_TXCLK", "RGMII_TXCTL", "RGMII_TXD0", - "RGMII_TXD1", + "RGMII_TXD1", /* 55 */ "RGMII_TXD2", "RGMII_TXD3"; }; From 
c5915b53d4c2021fef3ceaa3c93ccd9ba67515ca Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Wed, 15 Dec 2021 10:44:49 +0100 Subject: [PATCH 0005/1250] dt-bindings: soc: bcm: Convert brcm,bcm2835-vchiq to json-schema This converts the VCHIQ bindings to YAML format. Signed-off-by: Stefan Wahren Co-developed-by: Nicolas Saenz Julienne Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20211215094448.280796-1-nsaenz@kernel.org --- .../bindings/soc/bcm/brcm,bcm2835-vchiq.txt | 17 ------ .../bindings/soc/bcm/brcm,bcm2835-vchiq.yaml | 53 +++++++++++++++++++ 2 files changed, 53 insertions(+), 17 deletions(-) delete mode 100644 Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.txt create mode 100644 Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.yaml diff --git a/Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.txt b/Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.txt deleted file mode 100644 index f331316183f644..00000000000000 --- a/Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.txt +++ /dev/null @@ -1,17 +0,0 @@ -Broadcom VCHIQ firmware services - -Required properties: - -- compatible: Should be "brcm,bcm2835-vchiq" on BCM2835, otherwise - "brcm,bcm2836-vchiq". 
-- reg: Physical base address and length of the doorbell register pair -- interrupts: The interrupt number - See bindings/interrupt-controller/brcm,bcm2835-armctrl-ic.txt - -Example: - -mailbox@7e00b840 { - compatible = "brcm,bcm2835-vchiq"; - reg = <0x7e00b840 0xf>; - interrupts = <0 2>; -}; diff --git a/Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.yaml b/Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.yaml new file mode 100644 index 00000000000000..e04439b3355b28 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/bcm/brcm,bcm2835-vchiq.yaml @@ -0,0 +1,53 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/soc/bcm/brcm,bcm2835-vchiq.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Broadcom VCHIQ firmware services + +maintainers: + - Nicolas Saenz Julienne + +description: + The VCHIQ communication channel can be provided by BCM283x and Capri SoCs, + to communicate with the VPU-side OS services. + +properties: + compatible: + oneOf: + - description: BCM2835 based boards + items: + - enum: + - brcm,bcm2835-vchiq + + - description: BCM2836/BCM2837 based boards + items: + - enum: + - brcm,bcm2836-vchiq + - const: brcm,bcm2835-vchiq + + reg: + description: Physical base address and length of the doorbell register pair + minItems: 1 + + interrupts: + description: Interrupt number of the doorbell interrupt + minItems: 1 + +required: + - compatible + - reg + - interrupts + +additionalProperties: false + +examples: + - | + mailbox@7e00b840 { + compatible = "brcm,bcm2835-vchiq"; + reg = <0x7e00b840 0xf>; + interrupts = <0 2>; + }; + +... From ba6b652bd8633bc90390e1a02ae0c743ea00c429 Mon Sep 17 00:00:00 2001 From: Michal Orzel Date: Thu, 31 Mar 2022 19:33:58 +0200 Subject: [PATCH 0006/1250] smack: Remove redundant assignments Get rid of redundant assignments which end up in values not being read either because they are overwritten or the function ends. 
Reported by clang-tidy [deadcode.DeadStores] Signed-off-by: Michal Orzel Signed-off-by: Casey Schaufler --- security/smack/smackfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 658eab05599e6d..9e61014073cc84 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -1192,7 +1192,6 @@ static ssize_t smk_write_net4addr(struct file *file, const char __user *buf, rc = -EINVAL; goto free_out; } - m = BEBITS; masks = 32; } if (masks > BEBITS) { From 2660e71e6080a53aeaaa9d79e7ed9d8d72dd63ae Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Thu, 31 Mar 2022 05:07:38 -0700 Subject: [PATCH 0007/1250] HSI: clients: remove duplicate assignment netdev_alloc_skb() has already assigned ssi->netdev to skb->dev on success, so there is no need to repeat the assignment. Signed-off-by: Wang Qing Signed-off-by: Sebastian Reichel --- drivers/hsi/clients/ssi_protocol.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/hsi/clients/ssi_protocol.c b/drivers/hsi/clients/ssi_protocol.c index 21f11a5b965b11..7aacb19fd1ffa4 100644 --- a/drivers/hsi/clients/ssi_protocol.c +++ b/drivers/hsi/clients/ssi_protocol.c @@ -796,7 +796,6 @@ static void ssip_rx_strans(struct hsi_client *cl, u32 cmd) dev_err(&cl->device, "No memory for rx skb\n"); goto out1; } - skb->dev = ssi->netdev; skb_put(skb, len * 4); msg = ssip_alloc_data(ssi, skb, GFP_ATOMIC); if (unlikely(!msg)) { From 43c14f8d18a7ab26e8f0e960bfd8f4d0a9c57c4d Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Mon, 4 Apr 2022 08:52:32 +0000 Subject: [PATCH 0008/1250] HSI: omap_ssi: Fix refcount leak in ssi_probe When returning or breaking early from a for_each_available_child_of_node() loop, we need to explicitly call of_node_put() on the child node to possibly release the node. 
Fixes: b209e047bc74 ("HSI: Introduce OMAP SSI driver") Signed-off-by: Miaoqian Lin Signed-off-by: Sebastian Reichel --- drivers/hsi/controllers/omap_ssi_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hsi/controllers/omap_ssi_core.c b/drivers/hsi/controllers/omap_ssi_core.c index 44a3f5660c1090..eb982015831858 100644 --- a/drivers/hsi/controllers/omap_ssi_core.c +++ b/drivers/hsi/controllers/omap_ssi_core.c @@ -524,6 +524,7 @@ static int ssi_probe(struct platform_device *pd) if (!childpdev) { err = -ENODEV; dev_err(&pd->dev, "failed to create ssi controller port\n"); + of_node_put(child); goto out3; } } From c71faaffea999b94e2e8b541753520e053e1c30a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 12 Apr 2022 23:10:29 +0200 Subject: [PATCH 0009/1250] EDAC: Use kcalloc() It is syntactic sugar anyway: # drivers/edac/edac_mc.o: text data bss dec hex filename 13378 324 8 13710 358e edac_mc.o.before 13378 324 8 13710 358e edac_mc.o.after md5: 70a53ee3ac7f867730e35c2be9110748 edac_mc.o.before.asm 70a53ee3ac7f867730e35c2be9110748 edac_mc.o.after.asm # drivers/edac/edac_device.o: text data bss dec hex filename 5684 120 4 5808 16b0 edac_device.o.before 5684 120 4 5808 16b0 edac_device.o.after md5: 811325c80acb5a1d6df7b290df3e1636 edac_device.o.before.asm 811325c80acb5a1d6df7b290df3e1636 edac_device.o.after.asm No functional changes. 
Reported-by: kernel test robot Signed-off-by: Borislav Petkov Link: https://lore.kernel.org/r/20220412211957.28899-1-bp@alien8.de --- drivers/edac/edac_device.c | 9 +++------ drivers/edac/edac_mc.c | 2 +- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index b737349184e320..3d5a4944735ff4 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -70,9 +70,7 @@ edac_device_alloc_ctl_info(unsigned pvt_sz, char *dev_name, unsigned nr_instance if (!dev_ctl) return NULL; - dev_inst = kmalloc_array(nr_instances, - sizeof(struct edac_device_instance), - GFP_KERNEL | __GFP_ZERO); + dev_inst = kcalloc(nr_instances, sizeof(struct edac_device_instance), GFP_KERNEL); if (!dev_inst) goto free; @@ -87,9 +85,8 @@ edac_device_alloc_ctl_info(unsigned pvt_sz, char *dev_name, unsigned nr_instance dev_ctl->blocks = dev_blk; if (nr_attrib) { - dev_attrib = kmalloc_array(nr_attrib, - sizeof(struct edac_dev_sysfs_block_attribute), - GFP_KERNEL | __GFP_ZERO); + dev_attrib = kcalloc(nr_attrib, sizeof(struct edac_dev_sysfs_block_attribute), + GFP_KERNEL); if (!dev_attrib) goto free; diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 387b6851c97576..eb58644bb01906 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -366,7 +366,7 @@ struct mem_ctl_info *edac_mc_alloc(unsigned int mc_num, if (!mci) return NULL; - mci->layers = kmalloc_array(n_layers, sizeof(struct edac_mc_layer), GFP_KERNEL | __GFP_ZERO); + mci->layers = kcalloc(n_layers, sizeof(struct edac_mc_layer), GFP_KERNEL); if (!mci->layers) goto error; From e9c8c7c43b51b277026f94a1175c605436c7c829 Mon Sep 17 00:00:00 2001 From: Pablo Ceballos Date: Tue, 5 Apr 2022 14:39:53 -0400 Subject: [PATCH 0010/1250] HID: Driver for Google Hangouts Meet Speakermic This driver works around a problem with the HID usage sent by this device for the mute button. 
It prevents key events from being generated for that HID usage since they would be incorrect. Signed-off-by: Pablo Ceballos Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 12 ++++++++ drivers/hid/Makefile | 1 + drivers/hid/hid-google-atrus.c | 55 ++++++++++++++++++++++++++++++++++ drivers/hid/hid-ids.h | 1 + 4 files changed, 69 insertions(+) create mode 100644 drivers/hid/hid-google-atrus.c diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 7a674873d7947f..ef58b5c0378070 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -405,6 +405,18 @@ config HOLTEK_FF Say Y here if you have a Holtek On Line Grip based game controller and want to have force feedback support for it. +config HID_GOOGLE_ATRUS + tristate "Google Hangouts Meet Speakermic" + depends on USB_HID + help + This selects a driver for the Google Hangouts Meet Speakermic. + + This driver works around a problem with the HID usage sent by this + device for the mute button. It prevents key events from being generated + for that HID usage since they would be incorrect. + + Say Y here if you have a Google Hangouts Meet Speakermic. 
+ config HID_GOOGLE_HAMMER tristate "Google Hammer Keyboard" depends on USB_HID && LEDS_CLASS && CROS_EC diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index d5ce8d747b140b..495e67ec5d9eca 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_HID_FT260) += hid-ft260.o obj-$(CONFIG_HID_GEMBIRD) += hid-gembird.o obj-$(CONFIG_HID_GFRM) += hid-gfrm.o obj-$(CONFIG_HID_GLORIOUS) += hid-glorious.o +obj-$(CONFIG_HID_GOOGLE_ATRUS) += hid-google-atrus.o obj-$(CONFIG_HID_GOOGLE_HAMMER) += hid-google-hammer.o obj-$(CONFIG_HID_VIVALDI) += hid-vivaldi.o obj-$(CONFIG_HID_GT683R) += hid-gt683r.o diff --git a/drivers/hid/hid-google-atrus.c b/drivers/hid/hid-google-atrus.c new file mode 100644 index 00000000000000..e136c70e9425c7 --- /dev/null +++ b/drivers/hid/hid-google-atrus.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * HID driver for Google Hangouts Meet Speakermic + * + * Copyright 2022 Google LLC. + */ + +#include +#include + +#include "hid-ids.h" + +/* + * This driver handles the telephony phone mute HID usage by ignoring it. This + * avoids the default handling by the hid-input driver which is to map this to + * a KEY_MICMUTE event. The issue is that this device implements the phone mute + * HID usage as a toggle switch, where 1 indicates muted, and 0 indicates + * unmuted. However, for an EV_KEY event 1 indicates the key has been pressed + * and 0 indicates it has been released. + */ + +static int atrus_event(struct hid_device *hid, struct hid_field *field, + struct hid_usage *usage, __s32 value) +{ + /* + * Return 1 to indicate no further processing should be done for this + * usage. + */ + return 1; +} + +static const struct hid_device_id atrus_devices[] = { + { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, + USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_ATRUS) }, + { } +}; +MODULE_DEVICE_TABLE(hid, atrus_devices); + +static const struct hid_usage_id atrus_usages[] = { + /* Handle only the Telephony Phone Mute usage. 
*/ + { HID_UP_TELEPHONY | 0x2f, EV_KEY, HID_ANY_ID }, + { HID_TERMINATOR, HID_TERMINATOR, HID_TERMINATOR } +}; + +static struct hid_driver atrus_driver = { + .name = "atrus", + .id_table = atrus_devices, + .usage_table = atrus_usages, + .event = atrus_event, +}; +module_hid_driver(atrus_driver); + +MODULE_AUTHOR("Pablo Ceballos "); +MODULE_DESCRIPTION("Google Hangouts Meet Speakermic USB HID Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 053853a891c50b..b6f3bc66269c0d 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -514,6 +514,7 @@ #define USB_DEVICE_ID_GOOGLE_MOONBALL 0x5044 #define USB_DEVICE_ID_GOOGLE_DON 0x5050 #define USB_DEVICE_ID_GOOGLE_EEL 0x5057 +#define USB_DEVICE_ID_GOOGLE_ATRUS 0x8001 #define USB_VENDOR_ID_GOTOP 0x08f2 #define USB_DEVICE_ID_SUPER_Q2 0x007f From d4cb77112c7b654d9e95c45b1871b18fe1a62f31 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 30 Mar 2022 02:56:52 +0100 Subject: [PATCH 0011/1250] media: isl7998x: select V4L2_FWNODE to fix build error Fix build error when VIDEO_ISL7998X=y and V4L2_FWNODE=m by selecting V4L2_FWNODE. 
microblaze-linux-ld: drivers/media/i2c/isl7998x.o: in function `isl7998x_probe': (.text+0x8f4): undefined reference to `v4l2_fwnode_endpoint_parse' Fixes: 51ef2be546e2 ("media: i2c: isl7998x: Add driver for Intersil ISL7998x") Signed-off-by: Randy Dunlap Reported-by: kernel test robot Cc: Marek Vasut Cc: Pengutronix Kernel Team Reviewed-by: Michael Tretter Signed-off-by: Sakari Ailus Signed-off-by: Mauro Carvalho Chehab --- drivers/media/i2c/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/media/i2c/Kconfig b/drivers/media/i2c/Kconfig index fae2baabb77380..8f54e2a3f24a1e 100644 --- a/drivers/media/i2c/Kconfig +++ b/drivers/media/i2c/Kconfig @@ -1177,6 +1177,7 @@ config VIDEO_ISL7998X depends on OF_GPIO select MEDIA_CONTROLLER select VIDEO_V4L2_SUBDEV_API + select V4L2_FWNODE help Support for Intersil ISL7998x analog to MIPI-CSI2 or BT.656 decoder. From 108f241d7159bd8ec2702c68f96151f27d58ebc6 Mon Sep 17 00:00:00 2001 From: Sherry Sun Date: Wed, 27 Apr 2022 09:51:36 +0800 Subject: [PATCH 0012/1250] EDAC/synopsys: Use the correct register to disable the error interrupt on v3 hw v3.x Synopsys EDAC DDR doesn't have the QOS Interrupt register. Use the ECC Clear Register to disable the error interrupts instead. 
Fixes: f7824ded4149 ("EDAC/synopsys: Add support for version 3 of the Synopsys EDAC DDR") Signed-off-by: Sherry Sun Signed-off-by: Borislav Petkov Reviewed-by: Shubhrajyoti Datta Acked-by: Michal Simek Cc: Link: https://lore.kernel.org/r/20220427015137.8406-2-sherry.sun@nxp.com --- drivers/edac/synopsys_edac.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 40b1abeca8562e..88a481043d4c3c 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -865,8 +865,11 @@ static void enable_intr(struct synps_edac_priv *priv) static void disable_intr(struct synps_edac_priv *priv) { /* Disable UE/CE Interrupts */ - writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, - priv->baseaddr + DDR_QOS_IRQ_DB_OFST); + if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR) + writel(0x0, priv->baseaddr + ECC_CLR_OFST); + else + writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, + priv->baseaddr + DDR_QOS_IRQ_DB_OFST); } static int setup_irq(struct mem_ctl_info *mci, From 5075cc5ccfc79be9e4e77a8a681d052f0d7c1122 Mon Sep 17 00:00:00 2001 From: Sherry Sun Date: Wed, 27 Apr 2022 09:51:37 +0800 Subject: [PATCH 0013/1250] EDAC/synopsys: Re-enable the error interrupts on v3 hw zynqmp_get_error_info() writes 0 to the ECC_CLR_OFST register after an interrupt for a {un-,}correctable error is raised, which disables the error interrupts. Then the interrupt handler will be called only once. Therefore, re-enable the error interrupt line at the end of intr_handler() for v3.x Synopsys EDAC DDR. 
Fixes: f7824ded4149 ("EDAC/synopsys: Add support for version 3 of the Synopsys EDAC DDR") Signed-off-by: Sherry Sun Signed-off-by: Borislav Petkov Reviewed-by: Shubhrajyoti Datta Acked-by: Michal Simek Cc: Link: https://lore.kernel.org/r/20220427015137.8406-3-sherry.sun@nxp.com --- drivers/edac/synopsys_edac.c | 47 +++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 88a481043d4c3c..a14baeca640042 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -527,6 +527,28 @@ static void handle_error(struct mem_ctl_info *mci, struct synps_ecc_status *p) memset(p, 0, sizeof(*p)); } +static void enable_intr(struct synps_edac_priv *priv) +{ + /* Enable UE/CE Interrupts */ + if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR) + writel(DDR_UE_MASK | DDR_CE_MASK, + priv->baseaddr + ECC_CLR_OFST); + else + writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, + priv->baseaddr + DDR_QOS_IRQ_EN_OFST); + +} + +static void disable_intr(struct synps_edac_priv *priv) +{ + /* Disable UE/CE Interrupts */ + if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR) + writel(0x0, priv->baseaddr + ECC_CLR_OFST); + else + writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, + priv->baseaddr + DDR_QOS_IRQ_DB_OFST); +} + /** * intr_handler - Interrupt Handler for ECC interrupts. * @irq: IRQ number. 
@@ -568,6 +590,9 @@ static irqreturn_t intr_handler(int irq, void *dev_id) /* v3.0 of the controller does not have this register */ if (!(priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR)) writel(regval, priv->baseaddr + DDR_QOS_IRQ_STAT_OFST); + else + enable_intr(priv); + return IRQ_HANDLED; } @@ -850,28 +875,6 @@ static void mc_init(struct mem_ctl_info *mci, struct platform_device *pdev) init_csrows(mci); } -static void enable_intr(struct synps_edac_priv *priv) -{ - /* Enable UE/CE Interrupts */ - if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR) - writel(DDR_UE_MASK | DDR_CE_MASK, - priv->baseaddr + ECC_CLR_OFST); - else - writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, - priv->baseaddr + DDR_QOS_IRQ_EN_OFST); - -} - -static void disable_intr(struct synps_edac_priv *priv) -{ - /* Disable UE/CE Interrupts */ - if (priv->p_data->quirks & DDR_ECC_INTR_SELF_CLEAR) - writel(0x0, priv->baseaddr + ECC_CLR_OFST); - else - writel(DDR_QOSUE_MASK | DDR_QOSCE_MASK, - priv->baseaddr + DDR_QOS_IRQ_DB_OFST); -} - static int setup_irq(struct mem_ctl_info *mci, struct platform_device *pdev) { From b500d6d7243d2e0807a39a09c52fbe668b59b2c1 Mon Sep 17 00:00:00 2001 From: jianchunfu Date: Tue, 29 Mar 2022 10:49:54 +0800 Subject: [PATCH 0014/1250] unicode: Handle memory allocation failures in mkutf8data Adding and using a helper function "xmalloc()" to handle memory allocation failures. 
Signed-off-by: jianchunfu Signed-off-by: Gabriel Krisman Bertazi --- fs/unicode/mkutf8data.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/fs/unicode/mkutf8data.c b/fs/unicode/mkutf8data.c index bc1a7c8b5c8dfc..baf1d7eda0a5ca 100644 --- a/fs/unicode/mkutf8data.c +++ b/fs/unicode/mkutf8data.c @@ -486,6 +486,16 @@ static void tree_walk(struct tree *tree) nodes, leaves, singletons); } +static void *xmalloc(size_t size) +{ + void *p = malloc(size); + + if (p) + return p; + fprintf(stderr, "Out of memory.\n"); + exit(1); +} + /* * Allocate an initialize a new internal node. */ @@ -494,7 +504,7 @@ static struct node *alloc_node(struct node *parent) struct node *node; int bitnum; - node = malloc(sizeof(*node)); + node = xmalloc(sizeof(*node)); node->left = node->right = NULL; node->parent = parent; node->leftnode = NODE; @@ -2159,7 +2169,7 @@ static void nfdi_init(void) } mapping[i++] = 0; - um = malloc(i * sizeof(unsigned int)); + um = xmalloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdi = um; @@ -2215,7 +2225,7 @@ static void nfdicf_init(void) } mapping[i++] = 0; - um = malloc(i * sizeof(unsigned int)); + um = xmalloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdicf = um; @@ -2256,11 +2266,11 @@ static void ignore_init(void) line_fail(prop_name, line); for (unichar = first; unichar <= last; unichar++) { free(unicode_data[unichar].utf32nfdi); - um = malloc(sizeof(unsigned int)); + um = xmalloc(sizeof(unsigned int)); *um = 0; unicode_data[unichar].utf32nfdi = um; free(unicode_data[unichar].utf32nfdicf); - um = malloc(sizeof(unsigned int)); + um = xmalloc(sizeof(unsigned int)); *um = 0; unicode_data[unichar].utf32nfdicf = um; count++; @@ -2277,11 +2287,11 @@ static void ignore_init(void) if (!utf32valid(unichar)) line_fail(prop_name, line); free(unicode_data[unichar].utf32nfdi); - um = 
malloc(sizeof(unsigned int)); + um = xmalloc(sizeof(unsigned int)); *um = 0; unicode_data[unichar].utf32nfdi = um; free(unicode_data[unichar].utf32nfdicf); - um = malloc(sizeof(unsigned int)); + um = xmalloc(sizeof(unsigned int)); *um = 0; unicode_data[unichar].utf32nfdicf = um; if (verbose > 1) @@ -2359,7 +2369,7 @@ static void corrections_init(void) } mapping[i++] = 0; - um = malloc(i * sizeof(unsigned int)); + um = xmalloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); corrections[count].utf32nfdi = um; @@ -2459,12 +2469,12 @@ static void hangul_decompose(void) mapping[i++] = 0; assert(!unicode_data[unichar].utf32nfdi); - um = malloc(i * sizeof(unsigned int)); + um = xmalloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdi = um; assert(!unicode_data[unichar].utf32nfdicf); - um = malloc(i * sizeof(unsigned int)); + um = xmalloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdicf = um; @@ -2473,7 +2483,7 @@ static void hangul_decompose(void) * decompositions must not be stored in the generated * trie. */ - unicode_data[unichar].utf8nfdi = malloc(2); + unicode_data[unichar].utf8nfdi = xmalloc(2); unicode_data[unichar].utf8nfdi[0] = HANGUL; unicode_data[unichar].utf8nfdi[1] = '\0'; @@ -2523,13 +2533,13 @@ static void nfdi_decompose(void) if (ret) break; free(unicode_data[unichar].utf32nfdi); - um = malloc(i * sizeof(unsigned int)); + um = xmalloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdi = um; } /* Add this decomposition to nfdicf if there is no entry. 
*/ if (!unicode_data[unichar].utf32nfdicf) { - um = malloc(i * sizeof(unsigned int)); + um = xmalloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdicf = um; } @@ -2577,7 +2587,7 @@ static void nfdicf_decompose(void) if (ret) break; free(unicode_data[unichar].utf32nfdicf); - um = malloc(i * sizeof(unsigned int)); + um = xmalloc(i * sizeof(unsigned int)); memcpy(um, mapping, i * sizeof(unsigned int)); unicode_data[unichar].utf32nfdicf = um; } From 33b5a8c953030ab02d283fd050b73950d93e70cb Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Tue, 28 Sep 2021 12:08:37 +0800 Subject: [PATCH 0015/1250] kallsyms: avoid hardcoding the buffer size This makes it easier to update the size later on. Furthermore, a static assert is added to ensure both are updated when that happens. The relationship used is one that keeps the new size (512+1) close to the original buffer size (500). Reviewed-by: Kees Cook Signed-off-by: Boqun Feng Co-developed-by: Miguel Ojeda Signed-off-by: Miguel Ojeda --- scripts/kallsyms.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 8caabddf817ca0..82d6508bdf293d 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -27,8 +27,18 @@ #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) +#define _stringify_1(x) #x +#define _stringify(x) _stringify_1(x) + #define KSYM_NAME_LEN 128 +/* A substantially bigger size than the current maximum. 
*/ +#define KSYM_NAME_LEN_BUFFER 512 +_Static_assert( + KSYM_NAME_LEN_BUFFER == KSYM_NAME_LEN * 4, + "Please keep KSYM_NAME_LEN_BUFFER in sync with KSYM_NAME_LEN" +); + struct sym_entry { unsigned long long addr; unsigned int len; @@ -197,15 +207,15 @@ static void check_symbol_range(const char *sym, unsigned long long addr, static struct sym_entry *read_symbol(FILE *in) { - char name[500], type; + char name[KSYM_NAME_LEN_BUFFER+1], type; unsigned long long addr; unsigned int len; struct sym_entry *sym; int rc; - rc = fscanf(in, "%llx %c %499s\n", &addr, &type, name); + rc = fscanf(in, "%llx %c %" _stringify(KSYM_NAME_LEN_BUFFER) "s\n", &addr, &type, name); if (rc != 3) { - if (rc != EOF && fgets(name, 500, in) == NULL) + if (rc != EOF && fgets(name, sizeof(name), in) == NULL) fprintf(stderr, "Read error or end of file.\n"); return NULL; } From 2087d6ac3567b38f2cddfa8fa445e595af87a04a Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 5 Apr 2021 04:58:39 +0200 Subject: [PATCH 0016/1250] kallsyms: support "big" kernel symbols Rust symbols can become quite long due to namespacing introduced by modules, types, traits, generics, etc. Increasing to 255 is not enough in some cases, and therefore we need to introduce longer lengths to the symbol table. In order to avoid increasing all lengths to 2 bytes (since most of them are small, including many Rust ones), we use ULEB128 to keep smaller symbols in 1 byte, with the rest in 2 bytes. 
Reviewed-by: Kees Cook Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Co-developed-by: Matthew Wilcox Signed-off-by: Matthew Wilcox Signed-off-by: Miguel Ojeda --- kernel/kallsyms.c | 26 ++++++++++++++++++++++---- scripts/kallsyms.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 79f2eb617a62ae..e8d2262ef2d2a7 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -69,12 +69,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, data = &kallsyms_names[off]; len = *data; data++; + off++; + + /* If MSB is 1, it is a "big" symbol, so needs an additional byte. */ + if ((len & 0x80) != 0) { + len = (len & 0x7F) | (*data << 7); + data++; + off++; + } /* * Update the offset to return the offset for the next symbol on * the compressed stream. */ - off += len + 1; + off += len; /* * For every byte on the compressed symbol data, copy the table @@ -127,7 +135,7 @@ static char kallsyms_get_symbol_type(unsigned int off) static unsigned int get_symbol_offset(unsigned long pos) { const u8 *name; - int i; + int i, len; /* * Use the closest marker we have. We have markers every 256 positions, @@ -141,8 +149,18 @@ static unsigned int get_symbol_offset(unsigned long pos) * so we just need to add the len to the current pointer for every * symbol we wish to skip. */ - for (i = 0; i < (pos & 0xFF); i++) - name = name + (*name) + 1; + for (i = 0; i < (pos & 0xFF); i++) { + len = *name; + + /* + * If MSB is 1, it is a "big" symbol, so we need to look into + * the next byte (and skip it, too). 
+ */ + if ((len & 0x80) != 0) + len = ((len & 0x7F) | (name[1] << 7)) + 1; + + name = name + len + 1; + } return name - kallsyms_names; } diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 82d6508bdf293d..7e99799aa7b9c2 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -480,12 +480,35 @@ static void write_src(void) if ((i & 0xFF) == 0) markers[i >> 8] = off; - printf("\t.byte 0x%02x", table[i]->len); + /* There cannot be any symbol of length zero. */ + if (table[i]->len == 0) { + fprintf(stderr, "kallsyms failure: " + "unexpected zero symbol length\n"); + exit(EXIT_FAILURE); + } + + /* Only lengths that fit in up-to-two-byte ULEB128 are supported. */ + if (table[i]->len > 0x3FFF) { + fprintf(stderr, "kallsyms failure: " + "unexpected huge symbol length\n"); + exit(EXIT_FAILURE); + } + + /* Encode length with ULEB128. */ + if (table[i]->len <= 0x7F) { + /* Most symbols use a single byte for the length. */ + printf("\t.byte 0x%02x", table[i]->len); + off += table[i]->len + 1; + } else { + /* "Big" symbols use two bytes. */ + printf("\t.byte 0x%02x, 0x%02x", + (table[i]->len & 0x7F) | 0x80, + (table[i]->len >> 7) & 0x7F); + off += table[i]->len + 2; + } for (k = 0; k < table[i]->len; k++) printf(", 0x%02x", table[i]->sym[k]); printf("\n"); - - off += table[i]->len + 1; } printf("\n"); From 394dffa6680caa684673f61dd8260db273038660 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 5 Apr 2021 05:03:50 +0200 Subject: [PATCH 0017/1250] kallsyms: increase maximum kernel symbol length to 512 Rust symbols can become quite long due to namespacing introduced by modules, types, traits, generics, etc. 
For instance, the following code: pub mod my_module { pub struct MyType; pub struct MyGenericType<T>(T); pub trait MyTrait { fn my_method() -> u32; } impl MyTrait for MyGenericType<MyType> { fn my_method() -> u32 { 42 } } } generates a symbol of length 96 when using the upcoming v0 mangling scheme: _RNvXNtCshGpAVYOtgW1_7example9my_moduleINtB2_13MyGenericTypeNtB2_6MyTypeENtB2_7MyTrait9my_method At the moment, Rust symbols may reach up to 300 in length. Setting 512 as the maximum seems like a reasonable choice to keep some headroom. Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Signed-off-by: Miguel Ojeda --- include/linux/kallsyms.h | 2 +- kernel/livepatch/core.c | 4 ++-- scripts/kallsyms.c | 4 ++-- tools/include/linux/kallsyms.h | 2 +- tools/lib/perf/include/perf/event.h | 2 +- tools/lib/symbol/kallsyms.h | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index ce1bd2fbf23ef7..e5ad6e31697d33 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -15,7 +15,7 @@ #include -#define KSYM_NAME_LEN 128 +#define KSYM_NAME_LEN 512 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s %s]") + \ (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + \ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index bc475e62279d2a..ec06ce59d7283b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -213,7 +213,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, * we use the smallest/strictest upper bound possible (56, based on * the current definition of MODULE_NAME_LEN) to prevent overflows.
*/ - BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128); + BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 512); relas = (Elf_Rela *) relasec->sh_addr; /* For each rela in this klp relocation section */ @@ -227,7 +227,7 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, - ".klp.sym.%55[^.].%127[^,],%lu", + ".klp.sym.%55[^.].%511[^,],%lu", sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 7e99799aa7b9c2..275044b840dcbe 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -30,10 +30,10 @@ #define _stringify_1(x) #x #define _stringify(x) _stringify_1(x) -#define KSYM_NAME_LEN 128 +#define KSYM_NAME_LEN 512 /* A substantially bigger size than the current maximum. */ -#define KSYM_NAME_LEN_BUFFER 512 +#define KSYM_NAME_LEN_BUFFER 2048 _Static_assert( KSYM_NAME_LEN_BUFFER == KSYM_NAME_LEN * 4, "Please keep KSYM_NAME_LEN_BUFFER in sync with KSYM_NAME_LEN" diff --git a/tools/include/linux/kallsyms.h b/tools/include/linux/kallsyms.h index efb6c3f5f2a9a5..5a37ccbec54fbc 100644 --- a/tools/include/linux/kallsyms.h +++ b/tools/include/linux/kallsyms.h @@ -6,7 +6,7 @@ #include #include -#define KSYM_NAME_LEN 128 +#define KSYM_NAME_LEN 512 struct module; diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index e7758707cadd6a..116a80c31675e0 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -95,7 +95,7 @@ struct perf_record_throttle { }; #ifndef KSYM_NAME_LEN -#define KSYM_NAME_LEN 256 +#define KSYM_NAME_LEN 512 #endif struct perf_record_ksymbol { diff --git a/tools/lib/symbol/kallsyms.h b/tools/lib/symbol/kallsyms.h index 72ab9870454baf..542f9b059c3bd2 100644 --- a/tools/lib/symbol/kallsyms.h +++ b/tools/lib/symbol/kallsyms.h @@ -7,7 +7,7 @@ #include #ifndef 
KSYM_NAME_LEN -#define KSYM_NAME_LEN 256 +#define KSYM_NAME_LEN 512 #endif static inline u8 kallsyms2elf_binding(char type) From aee5392c24130d5884d8995c356acb02ab4797df Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 2 May 2022 22:55:08 +0200 Subject: [PATCH 0018/1250] kunit: take `kunit_assert` as `const` The `kunit_do_failed_assertion` function passes its `struct kunit_assert` argument to `kunit_fail`. This one, in turn, calls its `format` field passing the assert again as a `const` pointer. Therefore, the whole chain may be made `const`. Reviewed-by: Daniel Latypov Reviewed-by: Brendan Higgins Signed-off-by: Miguel Ojeda --- include/kunit/test.h | 2 +- lib/kunit/test.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/kunit/test.h b/include/kunit/test.h index 00b9ff7783ab80..2eff4f1beb429d 100644 --- a/include/kunit/test.h +++ b/include/kunit/test.h @@ -774,7 +774,7 @@ void __printf(2, 3) kunit_log_append(char *log, const char *fmt, ...); void kunit_do_failed_assertion(struct kunit *test, const struct kunit_loc *loc, enum kunit_assert_type type, - struct kunit_assert *assert, + const struct kunit_assert *assert, const char *fmt, ...); #define KUNIT_ASSERTION(test, assert_type, pass, assert_class, INITIALIZER, fmt, ...) 
do { \ diff --git a/lib/kunit/test.c b/lib/kunit/test.c index 3bca3bf5c15b19..b84aed09a009f6 100644 --- a/lib/kunit/test.c +++ b/lib/kunit/test.c @@ -241,7 +241,7 @@ static void kunit_print_string_stream(struct kunit *test, } static void kunit_fail(struct kunit *test, const struct kunit_loc *loc, - enum kunit_assert_type type, struct kunit_assert *assert, + enum kunit_assert_type type, const struct kunit_assert *assert, const struct va_format *message) { struct string_stream *stream; @@ -281,7 +281,7 @@ static void __noreturn kunit_abort(struct kunit *test) void kunit_do_failed_assertion(struct kunit *test, const struct kunit_loc *loc, enum kunit_assert_type type, - struct kunit_assert *assert, + const struct kunit_assert *assert, const char *fmt, ...) { va_list args; From 5b3e98c1e29e2166b6cbc79101b32a0c769a8f94 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 16:52:41 +0200 Subject: [PATCH 0019/1250] rust: add C helpers This source file contains forwarders to C macros and inlined functions. Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Geoffrey Thomas Signed-off-by: Geoffrey Thomas Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Co-developed-by: Maciej Falkowski Signed-off-by: Maciej Falkowski Co-developed-by: Wei Liu Signed-off-by: Wei Liu Signed-off-by: Miguel Ojeda --- rust/helpers.c | 644 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 644 insertions(+) create mode 100644 rust/helpers.c diff --git a/rust/helpers.c b/rust/helpers.c new file mode 100644 index 00000000000000..eb7a66c77cb293 --- /dev/null +++ b/rust/helpers.c @@ -0,0 +1,644 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Non-trivial C macros cannot be used in Rust. Similarly, inlined C functions + * cannot be called either. 
This file explicitly creates functions ("helpers") + * that wrap those so that they can be called from Rust. + * + * Even though Rust kernel modules should never use directly the bindings, some + * of these helpers need to be exported because Rust generics and inlined + * functions may not get their code generated in the crate where they are + * defined. Other helpers, called from non-inline functions, may not be + * exported, in principle. However, in general, the Rust compiler does not + * guarantee codegen will be performed for a non-inline function either. + * Therefore, this file exports all the helpers. In the future, this may be + * revisited to reduce the number of exports after the compiler is informed + * about the places codegen is required. + * + * All symbols are exported as GPL-only to guarantee no GPL-only feature is + * accidentally exposed. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__noreturn void rust_helper_BUG(void) +{ + BUG(); +} +EXPORT_SYMBOL_GPL(rust_helper_BUG); + +void rust_helper_clk_disable_unprepare(struct clk *clk) +{ + return clk_disable_unprepare(clk); +} +EXPORT_SYMBOL_GPL(rust_helper_clk_disable_unprepare); + +int rust_helper_clk_prepare_enable(struct clk *clk) +{ + return clk_prepare_enable(clk); +} +EXPORT_SYMBOL_GPL(rust_helper_clk_prepare_enable); + +unsigned long rust_helper_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + return copy_from_user(to, from, n); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_from_user); + +unsigned long rust_helper_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_to_user); + +unsigned long rust_helper_clear_user(void __user *to, unsigned long n) +{ + return clear_user(to, n); +} +EXPORT_SYMBOL_GPL(rust_helper_clear_user); + 
+void __iomem *rust_helper_ioremap(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +EXPORT_SYMBOL_GPL(rust_helper_ioremap); + +u8 rust_helper_readb(const volatile void __iomem *addr) +{ + return readb(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_readb); + +u16 rust_helper_readw(const volatile void __iomem *addr) +{ + return readw(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_readw); + +u32 rust_helper_readl(const volatile void __iomem *addr) +{ + return readl(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_readl); + +#ifdef CONFIG_64BIT +u64 rust_helper_readq(const volatile void __iomem *addr) +{ + return readq(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_readq); +#endif + +void rust_helper_writeb(u8 value, volatile void __iomem *addr) +{ + writeb(value, addr); +} +EXPORT_SYMBOL_GPL(rust_helper_writeb); + +void rust_helper_writew(u16 value, volatile void __iomem *addr) +{ + writew(value, addr); +} +EXPORT_SYMBOL_GPL(rust_helper_writew); + +void rust_helper_writel(u32 value, volatile void __iomem *addr) +{ + writel(value, addr); +} +EXPORT_SYMBOL_GPL(rust_helper_writel); + +#ifdef CONFIG_64BIT +void rust_helper_writeq(u64 value, volatile void __iomem *addr) +{ + writeq(value, addr); +} +EXPORT_SYMBOL_GPL(rust_helper_writeq); +#endif + +u8 rust_helper_readb_relaxed(const volatile void __iomem *addr) +{ + return readb_relaxed(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_readb_relaxed); + +u16 rust_helper_readw_relaxed(const volatile void __iomem *addr) +{ + return readw_relaxed(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_readw_relaxed); + +u32 rust_helper_readl_relaxed(const volatile void __iomem *addr) +{ + return readl_relaxed(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_readl_relaxed); + +#ifdef CONFIG_64BIT +u64 rust_helper_readq_relaxed(const volatile void __iomem *addr) +{ + return readq_relaxed(addr); +} +EXPORT_SYMBOL_GPL(rust_helper_readq_relaxed); +#endif + +void rust_helper_writeb_relaxed(u8 value, volatile void __iomem *addr) +{ + writeb_relaxed(value, 
addr); +} +EXPORT_SYMBOL_GPL(rust_helper_writeb_relaxed); + +void rust_helper_writew_relaxed(u16 value, volatile void __iomem *addr) +{ + writew_relaxed(value, addr); +} +EXPORT_SYMBOL_GPL(rust_helper_writew_relaxed); + +void rust_helper_writel_relaxed(u32 value, volatile void __iomem *addr) +{ + writel_relaxed(value, addr); +} +EXPORT_SYMBOL_GPL(rust_helper_writel_relaxed); + +#ifdef CONFIG_64BIT +void rust_helper_writeq_relaxed(u64 value, volatile void __iomem *addr) +{ + writeq_relaxed(value, addr); +} +EXPORT_SYMBOL_GPL(rust_helper_writeq_relaxed); +#endif + +void rust_helper_memcpy_fromio(void *to, const volatile void __iomem *from, long count) +{ + memcpy_fromio(to, from, count); +} +EXPORT_SYMBOL_GPL(rust_helper_memcpy_fromio); + +void rust_helper___spin_lock_init(spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + __spin_lock_init(lock, name, key); +#else + spin_lock_init(lock); +#endif +} +EXPORT_SYMBOL_GPL(rust_helper___spin_lock_init); + +void rust_helper_spin_lock(spinlock_t *lock) +{ + spin_lock(lock); +} +EXPORT_SYMBOL_GPL(rust_helper_spin_lock); + +void rust_helper_spin_unlock(spinlock_t *lock) +{ + spin_unlock(lock); +} +EXPORT_SYMBOL_GPL(rust_helper_spin_unlock); + +unsigned long rust_helper_spin_lock_irqsave(spinlock_t *lock) +{ + unsigned long flags; + + spin_lock_irqsave(lock, flags); + + return flags; +} +EXPORT_SYMBOL_GPL(rust_helper_spin_lock_irqsave); + +void rust_helper_spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +{ + spin_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL_GPL(rust_helper_spin_unlock_irqrestore); + +void rust_helper__raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_SPINLOCK + _raw_spin_lock_init(lock, name, key); +#else + raw_spin_lock_init(lock); +#endif +} +EXPORT_SYMBOL_GPL(rust_helper__raw_spin_lock_init); + +void rust_helper_raw_spin_lock(raw_spinlock_t *lock) +{ + raw_spin_lock(lock); +} 
+EXPORT_SYMBOL_GPL(rust_helper_raw_spin_lock); + +void rust_helper_raw_spin_unlock(raw_spinlock_t *lock) +{ + raw_spin_unlock(lock); +} +EXPORT_SYMBOL_GPL(rust_helper_raw_spin_unlock); + +unsigned long rust_helper_raw_spin_lock_irqsave(raw_spinlock_t *lock) +{ + unsigned long flags; + + raw_spin_lock_irqsave(lock, flags); + + return flags; +} +EXPORT_SYMBOL_GPL(rust_helper_raw_spin_lock_irqsave); + +void rust_helper_raw_spin_unlock_irqrestore(raw_spinlock_t *lock, + unsigned long flags) +{ + raw_spin_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL_GPL(rust_helper_raw_spin_unlock_irqrestore); + +void rust_helper_init_wait(struct wait_queue_entry *wq_entry) +{ + init_wait(wq_entry); +} +EXPORT_SYMBOL_GPL(rust_helper_init_wait); + +void rust_helper_init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, + wait_queue_func_t func) +{ + init_waitqueue_func_entry(wq_entry, func); +} +EXPORT_SYMBOL_GPL(rust_helper_init_waitqueue_func_entry); + +int rust_helper_signal_pending(struct task_struct *t) +{ + return signal_pending(t); +} +EXPORT_SYMBOL_GPL(rust_helper_signal_pending); + +struct page *rust_helper_alloc_pages(gfp_t gfp_mask, unsigned int order) +{ + return alloc_pages(gfp_mask, order); +} +EXPORT_SYMBOL_GPL(rust_helper_alloc_pages); + +void *rust_helper_kmap(struct page *page) +{ + return kmap(page); +} +EXPORT_SYMBOL_GPL(rust_helper_kmap); + +void rust_helper_kunmap(struct page *page) +{ + return kunmap(page); +} +EXPORT_SYMBOL_GPL(rust_helper_kunmap); + +int rust_helper_cond_resched(void) +{ + return cond_resched(); +} +EXPORT_SYMBOL_GPL(rust_helper_cond_resched); + +size_t rust_helper_copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_from_iter); + +size_t rust_helper_copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} +EXPORT_SYMBOL_GPL(rust_helper_copy_to_iter); + +bool rust_helper_IS_ERR(__force const 
void *ptr) +{ + return IS_ERR(ptr); +} +EXPORT_SYMBOL_GPL(rust_helper_IS_ERR); + +long rust_helper_PTR_ERR(__force const void *ptr) +{ + return PTR_ERR(ptr); +} +EXPORT_SYMBOL_GPL(rust_helper_PTR_ERR); + +const char *rust_helper_errname(int err) +{ + return errname(err); +} +EXPORT_SYMBOL_GPL(rust_helper_errname); + +void rust_helper_mutex_lock(struct mutex *lock) +{ + mutex_lock(lock); +} +EXPORT_SYMBOL_GPL(rust_helper_mutex_lock); + +void rust_helper_amba_set_drvdata(struct amba_device *dev, void *data) +{ + amba_set_drvdata(dev, data); +} +EXPORT_SYMBOL_GPL(rust_helper_amba_set_drvdata); + +void *rust_helper_amba_get_drvdata(struct amba_device *dev) +{ + return amba_get_drvdata(dev); +} +EXPORT_SYMBOL_GPL(rust_helper_amba_get_drvdata); + +void * +rust_helper_platform_get_drvdata(const struct platform_device *pdev) +{ + return platform_get_drvdata(pdev); +} +EXPORT_SYMBOL_GPL(rust_helper_platform_get_drvdata); + +void +rust_helper_platform_set_drvdata(struct platform_device *pdev, + void *data) +{ + return platform_set_drvdata(pdev, data); +} +EXPORT_SYMBOL_GPL(rust_helper_platform_set_drvdata); + +refcount_t rust_helper_REFCOUNT_INIT(int n) +{ + return (refcount_t)REFCOUNT_INIT(n); +} +EXPORT_SYMBOL_GPL(rust_helper_REFCOUNT_INIT); + +void rust_helper_refcount_inc(refcount_t *r) +{ + refcount_inc(r); +} +EXPORT_SYMBOL_GPL(rust_helper_refcount_inc); + +bool rust_helper_refcount_dec_and_test(refcount_t *r) +{ + return refcount_dec_and_test(r); +} +EXPORT_SYMBOL_GPL(rust_helper_refcount_dec_and_test); + +void rust_helper_rb_link_node(struct rb_node *node, struct rb_node *parent, + struct rb_node **rb_link) +{ + rb_link_node(node, parent, rb_link); +} +EXPORT_SYMBOL_GPL(rust_helper_rb_link_node); + +struct task_struct *rust_helper_get_current(void) +{ + return current; +} +EXPORT_SYMBOL_GPL(rust_helper_get_current); + +void rust_helper_get_task_struct(struct task_struct *t) +{ + get_task_struct(t); +} +EXPORT_SYMBOL_GPL(rust_helper_get_task_struct); + +void 
rust_helper_put_task_struct(struct task_struct *t) +{ + put_task_struct(t); +} +EXPORT_SYMBOL_GPL(rust_helper_put_task_struct); + +int rust_helper_security_binder_set_context_mgr(const struct cred *mgr) +{ + return security_binder_set_context_mgr(mgr); +} +EXPORT_SYMBOL_GPL(rust_helper_security_binder_set_context_mgr); + +int rust_helper_security_binder_transaction(const struct cred *from, + const struct cred *to) +{ + return security_binder_transaction(from, to); +} +EXPORT_SYMBOL_GPL(rust_helper_security_binder_transaction); + +int rust_helper_security_binder_transfer_binder(const struct cred *from, + const struct cred *to) +{ + return security_binder_transfer_binder(from, to); +} +EXPORT_SYMBOL_GPL(rust_helper_security_binder_transfer_binder); + +int rust_helper_security_binder_transfer_file(const struct cred *from, + const struct cred *to, + struct file *file) +{ + return security_binder_transfer_file(from, to, file); +} +EXPORT_SYMBOL_GPL(rust_helper_security_binder_transfer_file); + +struct file *rust_helper_get_file(struct file *f) +{ + return get_file(f); +} +EXPORT_SYMBOL_GPL(rust_helper_get_file); + +void rust_helper_rcu_read_lock(void) +{ + rcu_read_lock(); +} +EXPORT_SYMBOL_GPL(rust_helper_rcu_read_lock); + +void rust_helper_rcu_read_unlock(void) +{ + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rust_helper_rcu_read_unlock); + +void rust_helper_synchronize_rcu(void) +{ + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(rust_helper_synchronize_rcu); + +void *rust_helper_dev_get_drvdata(struct device *dev) +{ + return dev_get_drvdata(dev); +} +EXPORT_SYMBOL_GPL(rust_helper_dev_get_drvdata); + +const char *rust_helper_dev_name(const struct device *dev) +{ + return dev_name(dev); +} +EXPORT_SYMBOL_GPL(rust_helper_dev_name); + +void rust_helper___seqcount_init(seqcount_t *s, const char *name, + struct lock_class_key *key) +{ + __seqcount_init(s, name, key); +} +EXPORT_SYMBOL_GPL(rust_helper___seqcount_init); + +unsigned rust_helper_read_seqcount_begin(seqcount_t *s) +{ 
+ return read_seqcount_begin(s); +} +EXPORT_SYMBOL_GPL(rust_helper_read_seqcount_begin); + +int rust_helper_read_seqcount_retry(seqcount_t *s, unsigned start) +{ + return read_seqcount_retry(s, start); +} +EXPORT_SYMBOL_GPL(rust_helper_read_seqcount_retry); + +void rust_helper_write_seqcount_begin(seqcount_t *s) +{ + do_write_seqcount_begin(s); +} +EXPORT_SYMBOL_GPL(rust_helper_write_seqcount_begin); + +void rust_helper_write_seqcount_end(seqcount_t *s) +{ + do_write_seqcount_end(s); +} +EXPORT_SYMBOL_GPL(rust_helper_write_seqcount_end); + +void rust_helper_irq_set_handler_locked(struct irq_data *data, + irq_flow_handler_t handler) +{ + irq_set_handler_locked(data, handler); +} +EXPORT_SYMBOL_GPL(rust_helper_irq_set_handler_locked); + +void *rust_helper_irq_data_get_irq_chip_data(struct irq_data *d) +{ + return irq_data_get_irq_chip_data(d); +} +EXPORT_SYMBOL_GPL(rust_helper_irq_data_get_irq_chip_data); + +struct irq_chip *rust_helper_irq_desc_get_chip(struct irq_desc *desc) +{ + return irq_desc_get_chip(desc); +} +EXPORT_SYMBOL_GPL(rust_helper_irq_desc_get_chip); + +void *rust_helper_irq_desc_get_handler_data(struct irq_desc *desc) +{ + return irq_desc_get_handler_data(desc); +} +EXPORT_SYMBOL_GPL(rust_helper_irq_desc_get_handler_data); + +void rust_helper_chained_irq_enter(struct irq_chip *chip, + struct irq_desc *desc) +{ + chained_irq_enter(chip, desc); +} +EXPORT_SYMBOL_GPL(rust_helper_chained_irq_enter); + +void rust_helper_chained_irq_exit(struct irq_chip *chip, + struct irq_desc *desc) +{ + chained_irq_exit(chip, desc); +} +EXPORT_SYMBOL_GPL(rust_helper_chained_irq_exit); + +const struct cred *rust_helper_get_cred(const struct cred *cred) +{ + return get_cred(cred); +} +EXPORT_SYMBOL_GPL(rust_helper_get_cred); + +void rust_helper_put_cred(const struct cred *cred) +{ + put_cred(cred); +} +EXPORT_SYMBOL_GPL(rust_helper_put_cred); + +const struct of_device_id *rust_helper_of_match_device( + const struct of_device_id *matches, const struct device *dev) +{ + 
return of_match_device(matches, dev); +} +EXPORT_SYMBOL_GPL(rust_helper_of_match_device); + +void rust_helper_init_completion(struct completion *c) +{ + init_completion(c); +} +EXPORT_SYMBOL_GPL(rust_helper_init_completion); + +struct sk_buff *rust_helper_skb_get(struct sk_buff *skb) +{ + return skb_get(skb); +} +EXPORT_SYMBOL_GPL(rust_helper_skb_get); + +unsigned int rust_helper_skb_headlen(const struct sk_buff *skb) +{ + return skb_headlen(skb); +} +EXPORT_SYMBOL_GPL(rust_helper_skb_headlen); + +void rust_helper_dev_hold(struct net_device *dev) +{ + return dev_hold(dev); +} +EXPORT_SYMBOL_GPL(rust_helper_dev_hold); + +void rust_helper_dev_put(struct net_device *dev) +{ + return dev_put(dev); +} +EXPORT_SYMBOL_GPL(rust_helper_dev_put); + +struct net *rust_helper_get_net(struct net *net) +{ + return get_net(net); +} +EXPORT_SYMBOL_GPL(rust_helper_get_net); + +void rust_helper_put_net(struct net *net) +{ + return put_net(net); +} +EXPORT_SYMBOL_GPL(rust_helper_put_net); + +unsigned int rust_helper_NF_QUEUE_NR(unsigned int n) +{ + return NF_QUEUE_NR(n); +} +EXPORT_SYMBOL_GPL(rust_helper_NF_QUEUE_NR); + +/* + * We use `bindgen`'s `--size_t-is-usize` option to bind the C `size_t` type + * as the Rust `usize` type, so we can use it in contexts where Rust + * expects a `usize` like slice (array) indices. `usize` is defined to be + * the same as C's `uintptr_t` type (can hold any pointer) but not + * necessarily the same as `size_t` (can hold the size of any single + * object). Most modern platforms use the same concrete integer type for + * both of them, but in case we find ourselves on a platform where + * that's not true, fail early instead of risking ABI or + * integer-overflow issues. + * + * If your platform fails this assertion, it means that you are in + * danger of integer-overflow bugs (even if you attempt to remove + * `--size_t-is-usize`). 
It may be easiest to change the kernel ABI on + * your platform such that `size_t` matches `uintptr_t` (i.e., to increase + * `size_t`, because `uintptr_t` has to be at least as big as `size_t`). + */ +static_assert( + sizeof(size_t) == sizeof(uintptr_t) && + __alignof__(size_t) == __alignof__(uintptr_t), + "Rust code expects C `size_t` to match Rust `usize`" +); From d1fec9d9a9389aef15cff39c64d3e9e0226929aa Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 16:54:11 +0200 Subject: [PATCH 0020/1250] rust: add `compiler_builtins` crate Rust provides `compiler_builtins` as a port of LLVM's `compiler-rt`. Since we do not need the vast majority of them, we avoid the dependency by providing our own crate. Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Gary Guo Signed-off-by: Gary Guo Signed-off-by: Miguel Ojeda --- rust/compiler_builtins.rs | 57 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 rust/compiler_builtins.rs diff --git a/rust/compiler_builtins.rs b/rust/compiler_builtins.rs new file mode 100644 index 00000000000000..80ca4c0dcd24a4 --- /dev/null +++ b/rust/compiler_builtins.rs @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Our own `compiler_builtins`. +//! +//! Rust provides [`compiler_builtins`] as a port of LLVM's [`compiler-rt`]. +//! Since we do not need the vast majority of them, we avoid the dependency +//! by providing this file. +//! +//! At the moment, some builtins are required that should not be. For instance, +//! [`core`] has 128-bit integers functionality which we should not be compiling +//! in. We will work with upstream [`core`] to provide feature flags to disable +//! the parts we do not need. For the moment, we define them to [`panic!`] at +//! 
runtime for simplicity to catch mistakes, instead of performing surgery +//! on `core.o`. +//! +//! In any case, all these symbols are weakened to ensure we do not override +//! those that may be provided by the rest of the kernel. +//! +//! [`compiler_builtins`]: https://github.com/rust-lang/compiler-builtins +//! [`compiler-rt`]: https://compiler-rt.llvm.org/ + +#![feature(compiler_builtins)] +#![compiler_builtins] +#![no_builtins] +#![no_std] + +macro_rules! define_panicking_intrinsics( + ($reason: tt, { $($ident: ident, )* }) => { + $( + #[doc(hidden)] + #[no_mangle] + pub extern "C" fn $ident() { + panic!($reason); + } + )* + } +); + +define_panicking_intrinsics!("`i128` should not be used", { + __ashrti3, + __muloti4, + __multi3, +}); + +define_panicking_intrinsics!("`u128` should not be used", { + __ashlti3, + __lshrti3, + __udivmodti4, + __udivti3, + __umodti3, +}); + +#[cfg(target_arch = "arm")] +define_panicking_intrinsics!("`u64` division/modulo should not be used", { + __aeabi_uldivmod, + __mulodi4, +}); From 7a12f13e68ee531da47429f8c1612d99fc64b9c8 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 6 May 2022 17:52:44 +0200 Subject: [PATCH 0021/1250] rust: import upstream `alloc` crate This is a subset of the Rust standard library `alloc` crate, version 1.60.0, licensed under "Apache-2.0 OR MIT", from: https://github.com/rust-lang/rust/tree/1.60.0/library/alloc/src The files are copied as-is, with no modifications whatsoever (not even adding the SPDX identifiers). For copyright details, please see: https://github.com/rust-lang/rust/blob/1.60.0/COPYRIGHT The next patch modifies these files as needed for use within the kernel. This patch split allows reviewers to double-check the import and to clearly see the differences introduced. Vendoring `alloc`, at least for the moment, allows us to have fallible allocations support (i.e. the `try_*` versions of methods which return a `Result` instead of panicking) early on. 
It also gives a bit more freedom to experiment with new interfaces and to iterate quickly. Eventually, the goal is to have everything the kernel needs in upstream `alloc` and drop it from the kernel tree. For a summary of work on `alloc` happening upstream, please see: https://github.com/Rust-for-Linux/linux/issues/408 Reviewed-by: Kees Cook Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Signed-off-by: Miguel Ojeda --- rust/alloc/alloc.rs | 436 ++++ rust/alloc/borrow.rs | 496 +++++ rust/alloc/boxed.rs | 2005 +++++++++++++++++++ rust/alloc/collections/mod.rs | 154 ++ rust/alloc/fmt.rs | 599 ++++++ rust/alloc/lib.rs | 223 +++ rust/alloc/macros.rs | 125 ++ rust/alloc/raw_vec.rs | 519 +++++ rust/alloc/slice.rs | 1191 +++++++++++ rust/alloc/str.rs | 613 ++++++ rust/alloc/string.rs | 2867 +++++++++++++++++++++++++++ rust/alloc/vec/drain.rs | 184 ++ rust/alloc/vec/drain_filter.rs | 143 ++ rust/alloc/vec/into_iter.rs | 354 ++++ rust/alloc/vec/is_zero.rs | 104 + rust/alloc/vec/mod.rs | 3055 +++++++++++++++++++++++++++++ rust/alloc/vec/partial_eq.rs | 47 + rust/alloc/vec/set_len_on_drop.rs | 28 + rust/alloc/vec/spec_extend.rs | 87 + 19 files changed, 13230 insertions(+) create mode 100644 rust/alloc/alloc.rs create mode 100644 rust/alloc/borrow.rs create mode 100644 rust/alloc/boxed.rs create mode 100644 rust/alloc/collections/mod.rs create mode 100644 rust/alloc/fmt.rs create mode 100644 rust/alloc/lib.rs create mode 100644 rust/alloc/macros.rs create mode 100644 rust/alloc/raw_vec.rs create mode 100644 rust/alloc/slice.rs create mode 100644 rust/alloc/str.rs create mode 100644 rust/alloc/string.rs create mode 100644 rust/alloc/vec/drain.rs create mode 100644 rust/alloc/vec/drain_filter.rs create mode 100644 rust/alloc/vec/into_iter.rs create mode 100644 rust/alloc/vec/is_zero.rs create mode 100644 rust/alloc/vec/mod.rs create mode 100644 rust/alloc/vec/partial_eq.rs create mode 100644 
rust/alloc/vec/set_len_on_drop.rs create mode 100644 rust/alloc/vec/spec_extend.rs diff --git a/rust/alloc/alloc.rs b/rust/alloc/alloc.rs new file mode 100644 index 00000000000000..9d4f9af91a5e19 --- /dev/null +++ b/rust/alloc/alloc.rs @@ -0,0 +1,436 @@ +//! Memory allocation APIs + +#![stable(feature = "alloc_module", since = "1.28.0")] + +#[cfg(not(test))] +use core::intrinsics; +use core::intrinsics::{min_align_of_val, size_of_val}; + +use core::ptr::Unique; +#[cfg(not(test))] +use core::ptr::{self, NonNull}; + +#[stable(feature = "alloc_module", since = "1.28.0")] +#[doc(inline)] +pub use core::alloc::*; + +#[cfg(test)] +mod tests; + +extern "Rust" { + // These are the magic symbols to call the global allocator. rustc generates + // them to call `__rg_alloc` etc. if there is a `#[global_allocator]` attribute + // (the code expanding that attribute macro generates those functions), or to call + // the default implementations in libstd (`__rdl_alloc` etc. in `library/std/src/alloc.rs`) + // otherwise. + // The rustc fork of LLVM also special-cases these function names to be able to optimize them + // like `malloc`, `realloc`, and `free`, respectively. + #[rustc_allocator] + #[rustc_allocator_nounwind] + fn __rust_alloc(size: usize, align: usize) -> *mut u8; + #[rustc_allocator_nounwind] + fn __rust_dealloc(ptr: *mut u8, size: usize, align: usize); + #[rustc_allocator_nounwind] + fn __rust_realloc(ptr: *mut u8, old_size: usize, align: usize, new_size: usize) -> *mut u8; + #[rustc_allocator_nounwind] + fn __rust_alloc_zeroed(size: usize, align: usize) -> *mut u8; +} + +/// The global memory allocator. +/// +/// This type implements the [`Allocator`] trait by forwarding calls +/// to the allocator registered with the `#[global_allocator]` attribute +/// if there is one, or the `std` crate’s default. +/// +/// Note: while this type is unstable, the functionality it provides can be +/// accessed through the [free functions in `alloc`](self#functions). 
+#[unstable(feature = "allocator_api", issue = "32838")] +#[derive(Copy, Clone, Default, Debug)] +#[cfg(not(test))] +pub struct Global; + +#[cfg(test)] +pub use std::alloc::Global; + +/// Allocate memory with the global allocator. +/// +/// This function forwards calls to the [`GlobalAlloc::alloc`] method +/// of the allocator registered with the `#[global_allocator]` attribute +/// if there is one, or the `std` crate’s default. +/// +/// This function is expected to be deprecated in favor of the `alloc` method +/// of the [`Global`] type when it and the [`Allocator`] trait become stable. +/// +/// # Safety +/// +/// See [`GlobalAlloc::alloc`]. +/// +/// # Examples +/// +/// ``` +/// use std::alloc::{alloc, dealloc, Layout}; +/// +/// unsafe { +/// let layout = Layout::new::<u16>(); +/// let ptr = alloc(layout); +/// +/// *(ptr as *mut u16) = 42; +/// assert_eq!(*(ptr as *mut u16), 42); +/// +/// dealloc(ptr, layout); +/// } +/// ``` +#[stable(feature = "global_alloc", since = "1.28.0")] +#[must_use = "losing the pointer will leak memory"] +#[inline] +pub unsafe fn alloc(layout: Layout) -> *mut u8 { + unsafe { __rust_alloc(layout.size(), layout.align()) } +} + +/// Deallocate memory with the global allocator. +/// +/// This function forwards calls to the [`GlobalAlloc::dealloc`] method +/// of the allocator registered with the `#[global_allocator]` attribute +/// if there is one, or the `std` crate’s default. +/// +/// This function is expected to be deprecated in favor of the `dealloc` method +/// of the [`Global`] type when it and the [`Allocator`] trait become stable. +/// +/// # Safety +/// +/// See [`GlobalAlloc::dealloc`]. +#[stable(feature = "global_alloc", since = "1.28.0")] +#[inline] +pub unsafe fn dealloc(ptr: *mut u8, layout: Layout) { + unsafe { __rust_dealloc(ptr, layout.size(), layout.align()) } +} + +/// Reallocate memory with the global allocator. 
+/// +/// This function forwards calls to the [`GlobalAlloc::realloc`] method +/// of the allocator registered with the `#[global_allocator]` attribute +/// if there is one, or the `std` crate’s default. +/// +/// This function is expected to be deprecated in favor of the `realloc` method +/// of the [`Global`] type when it and the [`Allocator`] trait become stable. +/// +/// # Safety +/// +/// See [`GlobalAlloc::realloc`]. +#[stable(feature = "global_alloc", since = "1.28.0")] +#[must_use = "losing the pointer will leak memory"] +#[inline] +pub unsafe fn realloc(ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 { + unsafe { __rust_realloc(ptr, layout.size(), layout.align(), new_size) } +} + +/// Allocate zero-initialized memory with the global allocator. +/// +/// This function forwards calls to the [`GlobalAlloc::alloc_zeroed`] method +/// of the allocator registered with the `#[global_allocator]` attribute +/// if there is one, or the `std` crate’s default. +/// +/// This function is expected to be deprecated in favor of the `alloc_zeroed` method +/// of the [`Global`] type when it and the [`Allocator`] trait become stable. +/// +/// # Safety +/// +/// See [`GlobalAlloc::alloc_zeroed`]. 
+/// +/// # Examples +/// +/// ``` +/// use std::alloc::{alloc_zeroed, dealloc, Layout}; +/// +/// unsafe { +/// let layout = Layout::new::<u16>(); +/// let ptr = alloc_zeroed(layout); +/// +/// assert_eq!(*(ptr as *mut u16), 0); +/// +/// dealloc(ptr, layout); +/// } +/// ``` +#[stable(feature = "global_alloc", since = "1.28.0")] +#[must_use = "losing the pointer will leak memory"] +#[inline] +pub unsafe fn alloc_zeroed(layout: Layout) -> *mut u8 { + unsafe { __rust_alloc_zeroed(layout.size(), layout.align()) } +} + +#[cfg(not(test))] +impl Global { + #[inline] + fn alloc_impl(&self, layout: Layout, zeroed: bool) -> Result<NonNull<[u8]>, AllocError> { + match layout.size() { + 0 => Ok(NonNull::slice_from_raw_parts(layout.dangling(), 0)), + // SAFETY: `layout` is non-zero in size, + size => unsafe { + let raw_ptr = if zeroed { alloc_zeroed(layout) } else { alloc(layout) }; + let ptr = NonNull::new(raw_ptr).ok_or(AllocError)?; + Ok(NonNull::slice_from_raw_parts(ptr, size)) + }, + } + } + + // SAFETY: Same as `Allocator::grow` + #[inline] + unsafe fn grow_impl( + &self, + ptr: NonNull<u8>, + old_layout: Layout, + new_layout: Layout, + zeroed: bool, + ) -> Result<NonNull<[u8]>, AllocError> { + debug_assert!( + new_layout.size() >= old_layout.size(), + "`new_layout.size()` must be greater than or equal to `old_layout.size()`" + ); + + match old_layout.size() { + 0 => self.alloc_impl(new_layout, zeroed), + + // SAFETY: `new_size` is non-zero as `old_size` is greater than or equal to `new_size` + // as required by safety conditions. Other conditions must be upheld by the caller + old_size if old_layout.align() == new_layout.align() => unsafe { + let new_size = new_layout.size(); + + // `realloc` probably checks for `new_size >= old_layout.size()` or something similar. 
+ intrinsics::assume(new_size >= old_layout.size()); + + let raw_ptr = realloc(ptr.as_ptr(), old_layout, new_size); + let ptr = NonNull::new(raw_ptr).ok_or(AllocError)?; + if zeroed { + raw_ptr.add(old_size).write_bytes(0, new_size - old_size); + } + Ok(NonNull::slice_from_raw_parts(ptr, new_size)) + }, + + // SAFETY: because `new_layout.size()` must be greater than or equal to `old_size`, + // both the old and new memory allocation are valid for reads and writes for `old_size` + // bytes. Also, because the old allocation wasn't yet deallocated, it cannot overlap + // `new_ptr`. Thus, the call to `copy_nonoverlapping` is safe. The safety contract + // for `dealloc` must be upheld by the caller. + old_size => unsafe { + let new_ptr = self.alloc_impl(new_layout, zeroed)?; + ptr::copy_nonoverlapping(ptr.as_ptr(), new_ptr.as_mut_ptr(), old_size); + self.deallocate(ptr, old_layout); + Ok(new_ptr) + }, + } + } +} + +#[unstable(feature = "allocator_api", issue = "32838")] +#[cfg(not(test))] +unsafe impl Allocator for Global { + #[inline] + fn allocate(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> { + self.alloc_impl(layout, false) + } + + #[inline] + fn allocate_zeroed(&self, layout: Layout) -> Result<NonNull<[u8]>, AllocError> { + self.alloc_impl(layout, true) + } + + #[inline] + unsafe fn deallocate(&self, ptr: NonNull<u8>, layout: Layout) { + if layout.size() != 0 { + // SAFETY: `layout` is non-zero in size, + // other conditions must be upheld by the caller + unsafe { dealloc(ptr.as_ptr(), layout) } + } + } + + #[inline] + unsafe fn grow( + &self, + ptr: NonNull<u8>, + old_layout: Layout, + new_layout: Layout, + ) -> Result<NonNull<[u8]>, AllocError> { + // SAFETY: all conditions must be upheld by the caller + unsafe { self.grow_impl(ptr, old_layout, new_layout, false) } + } + + #[inline] + unsafe fn grow_zeroed( + &self, + ptr: NonNull<u8>, + old_layout: Layout, + new_layout: Layout, + ) -> Result<NonNull<[u8]>, AllocError> { + // SAFETY: all conditions must be upheld by the caller + unsafe { self.grow_impl(ptr, 
old_layout, new_layout, true) } + } + + #[inline] + unsafe fn shrink( + &self, + ptr: NonNull<u8>, + old_layout: Layout, + new_layout: Layout, + ) -> Result<NonNull<[u8]>, AllocError> { + debug_assert!( + new_layout.size() <= old_layout.size(), + "`new_layout.size()` must be smaller than or equal to `old_layout.size()`" + ); + + match new_layout.size() { + // SAFETY: conditions must be upheld by the caller + 0 => unsafe { + self.deallocate(ptr, old_layout); + Ok(NonNull::slice_from_raw_parts(new_layout.dangling(), 0)) + }, + + // SAFETY: `new_size` is non-zero. Other conditions must be upheld by the caller + new_size if old_layout.align() == new_layout.align() => unsafe { + // `realloc` probably checks for `new_size <= old_layout.size()` or something similar. + intrinsics::assume(new_size <= old_layout.size()); + + let raw_ptr = realloc(ptr.as_ptr(), old_layout, new_size); + let ptr = NonNull::new(raw_ptr).ok_or(AllocError)?; + Ok(NonNull::slice_from_raw_parts(ptr, new_size)) + }, + + // SAFETY: because `new_size` must be smaller than or equal to `old_layout.size()`, + // both the old and new memory allocation are valid for reads and writes for `new_size` + // bytes. Also, because the old allocation wasn't yet deallocated, it cannot overlap + // `new_ptr`. Thus, the call to `copy_nonoverlapping` is safe. The safety contract + // for `dealloc` must be upheld by the caller. + new_size => unsafe { + let new_ptr = self.allocate(new_layout)?; + ptr::copy_nonoverlapping(ptr.as_ptr(), new_ptr.as_mut_ptr(), new_size); + self.deallocate(ptr, old_layout); + Ok(new_ptr) + }, + } + } +} + +/// The allocator for unique pointers. 
+#[cfg(all(not(no_global_oom_handling), not(test)))] +#[lang = "exchange_malloc"] +#[inline] +unsafe fn exchange_malloc(size: usize, align: usize) -> *mut u8 { + let layout = unsafe { Layout::from_size_align_unchecked(size, align) }; + match Global.allocate(layout) { + Ok(ptr) => ptr.as_mut_ptr(), + Err(_) => handle_alloc_error(layout), + } +} + +#[cfg_attr(not(test), lang = "box_free")] +#[inline] +#[rustc_const_unstable(feature = "const_box", issue = "92521")] +// This signature has to be the same as `Box`, otherwise an ICE will happen. +// When an additional parameter to `Box` is added (like `A: Allocator`), this has to be added here as +// well. +// For example if `Box` is changed to `struct Box(Unique, A)`, +// this function has to be changed to `fn box_free(Unique, A)` as well. +pub(crate) const unsafe fn box_free( + ptr: Unique, + alloc: A, +) { + unsafe { + let size = size_of_val(ptr.as_ref()); + let align = min_align_of_val(ptr.as_ref()); + let layout = Layout::from_size_align_unchecked(size, align); + alloc.deallocate(From::from(ptr.cast()), layout) + } +} + +// # Allocation error handler + +#[cfg(not(no_global_oom_handling))] +extern "Rust" { + // This is the magic symbol to call the global alloc error handler. rustc generates + // it to call `__rg_oom` if there is a `#[alloc_error_handler]`, or to call the + // default implementations below (`__rdl_oom`) otherwise. + fn __rust_alloc_error_handler(size: usize, align: usize) -> !; +} + +/// Abort on memory allocation error or failure. +/// +/// Callers of memory allocation APIs wishing to abort computation +/// in response to an allocation error are encouraged to call this function, +/// rather than directly invoking `panic!` or similar. +/// +/// The default behavior of this function is to print a message to standard error +/// and abort the process. +/// It can be replaced with [`set_alloc_error_hook`] and [`take_alloc_error_hook`]. 
+/// +/// [`set_alloc_error_hook`]: ../../std/alloc/fn.set_alloc_error_hook.html +/// [`take_alloc_error_hook`]: ../../std/alloc/fn.take_alloc_error_hook.html +#[stable(feature = "global_alloc", since = "1.28.0")] +#[rustc_const_unstable(feature = "const_alloc_error", issue = "92523")] +#[cfg(all(not(no_global_oom_handling), not(test)))] +#[cold] +pub const fn handle_alloc_error(layout: Layout) -> ! { + const fn ct_error(_: Layout) -> ! { + panic!("allocation failed"); + } + + fn rt_error(layout: Layout) -> ! { + unsafe { + __rust_alloc_error_handler(layout.size(), layout.align()); + } + } + + unsafe { core::intrinsics::const_eval_select((layout,), ct_error, rt_error) } +} + +// For alloc test `std::alloc::handle_alloc_error` can be used directly. +#[cfg(all(not(no_global_oom_handling), test))] +pub use std::alloc::handle_alloc_error; + +#[cfg(all(not(no_global_oom_handling), not(any(target_os = "hermit", test))))] +#[doc(hidden)] +#[allow(unused_attributes)] +#[unstable(feature = "alloc_internals", issue = "none")] +pub mod __alloc_error_handler { + use crate::alloc::Layout; + + // called via generated `__rust_alloc_error_handler` + + // if there is no `#[alloc_error_handler]` + #[rustc_std_internal_symbol] + pub unsafe extern "C-unwind" fn __rdl_oom(size: usize, _align: usize) -> ! { + panic!("memory allocation of {} bytes failed", size) + } + + // if there is an `#[alloc_error_handler]` + #[rustc_std_internal_symbol] + pub unsafe extern "C-unwind" fn __rg_oom(size: usize, align: usize) -> ! { + let layout = unsafe { Layout::from_size_align_unchecked(size, align) }; + extern "Rust" { + #[lang = "oom"] + fn oom_impl(layout: Layout) -> !; + } + unsafe { oom_impl(layout) } + } +} + +/// Specialize clones into pre-allocated, uninitialized memory. +/// Used by `Box::clone` and `Rc`/`Arc::make_mut`. 
+pub(crate) trait WriteCloneIntoRaw: Sized { + unsafe fn write_clone_into_raw(&self, target: *mut Self); +} + +impl WriteCloneIntoRaw for T { + #[inline] + default unsafe fn write_clone_into_raw(&self, target: *mut Self) { + // Having allocated *first* may allow the optimizer to create + // the cloned value in-place, skipping the local and move. + unsafe { target.write(self.clone()) }; + } +} + +impl WriteCloneIntoRaw for T { + #[inline] + unsafe fn write_clone_into_raw(&self, target: *mut Self) { + // We can always copy in-place, without ever involving a local value. + unsafe { target.copy_from_nonoverlapping(self, 1) }; + } +} diff --git a/rust/alloc/borrow.rs b/rust/alloc/borrow.rs new file mode 100644 index 00000000000000..63234ee91f0910 --- /dev/null +++ b/rust/alloc/borrow.rs @@ -0,0 +1,496 @@ +//! A module for working with borrowed data. + +#![stable(feature = "rust1", since = "1.0.0")] + +use core::cmp::Ordering; +use core::hash::{Hash, Hasher}; +use core::ops::Deref; +#[cfg(not(no_global_oom_handling))] +use core::ops::{Add, AddAssign}; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::borrow::{Borrow, BorrowMut}; + +use crate::fmt; +#[cfg(not(no_global_oom_handling))] +use crate::string::String; + +use Cow::*; + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a, B: ?Sized> Borrow for Cow<'a, B> +where + B: ToOwned, + ::Owned: 'a, +{ + fn borrow(&self) -> &B { + &**self + } +} + +/// A generalization of `Clone` to borrowed data. +/// +/// Some types make it possible to go from borrowed to owned, usually by +/// implementing the `Clone` trait. But `Clone` works only for going from `&T` +/// to `T`. The `ToOwned` trait generalizes `Clone` to construct owned data +/// from any borrow of a given type. +#[cfg_attr(not(test), rustc_diagnostic_item = "ToOwned")] +#[stable(feature = "rust1", since = "1.0.0")] +pub trait ToOwned { + /// The resulting type after obtaining ownership. 
+ #[stable(feature = "rust1", since = "1.0.0")] + type Owned: Borrow; + + /// Creates owned data from borrowed data, usually by cloning. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s: &str = "a"; + /// let ss: String = s.to_owned(); + /// + /// let v: &[i32] = &[1, 2]; + /// let vv: Vec = v.to_owned(); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use = "cloning is often expensive and is not expected to have side effects"] + fn to_owned(&self) -> Self::Owned; + + /// Uses borrowed data to replace owned data, usually by cloning. + /// + /// This is borrow-generalized version of `Clone::clone_from`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// # #![feature(toowned_clone_into)] + /// let mut s: String = String::new(); + /// "hello".clone_into(&mut s); + /// + /// let mut v: Vec = Vec::new(); + /// [1, 2][..].clone_into(&mut v); + /// ``` + #[unstable(feature = "toowned_clone_into", reason = "recently added", issue = "41263")] + fn clone_into(&self, target: &mut Self::Owned) { + *target = self.to_owned(); + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ToOwned for T +where + T: Clone, +{ + type Owned = T; + fn to_owned(&self) -> T { + self.clone() + } + + fn clone_into(&self, target: &mut T) { + target.clone_from(self); + } +} + +/// A clone-on-write smart pointer. +/// +/// The type `Cow` is a smart pointer providing clone-on-write functionality: it +/// can enclose and provide immutable access to borrowed data, and clone the +/// data lazily when mutation or ownership is required. The type is designed to +/// work with general borrowed data via the `Borrow` trait. +/// +/// `Cow` implements `Deref`, which means that you can call +/// non-mutating methods directly on the data it encloses. If mutation +/// is desired, `to_mut` will obtain a mutable reference to an owned +/// value, cloning if necessary. 
+/// +/// If you need reference-counting pointers, note that +/// [`Rc::make_mut`][crate::rc::Rc::make_mut] and +/// [`Arc::make_mut`][crate::sync::Arc::make_mut] can provide clone-on-write +/// functionality as well. +/// +/// # Examples +/// +/// ``` +/// use std::borrow::Cow; +/// +/// fn abs_all(input: &mut Cow<[i32]>) { +/// for i in 0..input.len() { +/// let v = input[i]; +/// if v < 0 { +/// // Clones into a vector if not already owned. +/// input.to_mut()[i] = -v; +/// } +/// } +/// } +/// +/// // No clone occurs because `input` doesn't need to be mutated. +/// let slice = [0, 1, 2]; +/// let mut input = Cow::from(&slice[..]); +/// abs_all(&mut input); +/// +/// // Clone occurs because `input` needs to be mutated. +/// let slice = [-1, 0, 1]; +/// let mut input = Cow::from(&slice[..]); +/// abs_all(&mut input); +/// +/// // No clone occurs because `input` is already owned. +/// let mut input = Cow::from(vec![-1, 0, 1]); +/// abs_all(&mut input); +/// ``` +/// +/// Another example showing how to keep `Cow` in a struct: +/// +/// ``` +/// use std::borrow::Cow; +/// +/// struct Items<'a, X: 'a> where [X]: ToOwned> { +/// values: Cow<'a, [X]>, +/// } +/// +/// impl<'a, X: Clone + 'a> Items<'a, X> where [X]: ToOwned> { +/// fn new(v: Cow<'a, [X]>) -> Self { +/// Items { values: v } +/// } +/// } +/// +/// // Creates a container from borrowed values of a slice +/// let readonly = [1, 2]; +/// let borrowed = Items::new((&readonly[..]).into()); +/// match borrowed { +/// Items { values: Cow::Borrowed(b) } => println!("borrowed {:?}", b), +/// _ => panic!("expect borrowed value"), +/// } +/// +/// let mut clone_on_write = borrowed; +/// // Mutates the data from slice into owned vec and pushes a new value on top +/// clone_on_write.values.to_mut().push(3); +/// println!("clone_on_write = {:?}", clone_on_write.values); +/// +/// // The data was mutated. Let's check it out. 
+/// match clone_on_write { +/// Items { values: Cow::Owned(_) } => println!("clone_on_write contains owned data"), +/// _ => panic!("expect owned data"), +/// } +/// ``` +#[stable(feature = "rust1", since = "1.0.0")] +#[cfg_attr(not(test), rustc_diagnostic_item = "Cow")] +pub enum Cow<'a, B: ?Sized + 'a> +where + B: ToOwned, +{ + /// Borrowed data. + #[stable(feature = "rust1", since = "1.0.0")] + Borrowed(#[stable(feature = "rust1", since = "1.0.0")] &'a B), + + /// Owned data. + #[stable(feature = "rust1", since = "1.0.0")] + Owned(#[stable(feature = "rust1", since = "1.0.0")] ::Owned), +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Clone for Cow<'_, B> { + fn clone(&self) -> Self { + match *self { + Borrowed(b) => Borrowed(b), + Owned(ref o) => { + let b: &B = o.borrow(); + Owned(b.to_owned()) + } + } + } + + fn clone_from(&mut self, source: &Self) { + match (self, source) { + (&mut Owned(ref mut dest), &Owned(ref o)) => o.borrow().clone_into(dest), + (t, s) => *t = s.clone(), + } + } +} + +impl Cow<'_, B> { + /// Returns true if the data is borrowed, i.e. if `to_mut` would require additional work. + /// + /// # Examples + /// + /// ``` + /// #![feature(cow_is_borrowed)] + /// use std::borrow::Cow; + /// + /// let cow = Cow::Borrowed("moo"); + /// assert!(cow.is_borrowed()); + /// + /// let bull: Cow<'_, str> = Cow::Owned("...moo?".to_string()); + /// assert!(!bull.is_borrowed()); + /// ``` + #[unstable(feature = "cow_is_borrowed", issue = "65143")] + #[rustc_const_unstable(feature = "const_cow_is_borrowed", issue = "65143")] + pub const fn is_borrowed(&self) -> bool { + match *self { + Borrowed(_) => true, + Owned(_) => false, + } + } + + /// Returns true if the data is owned, i.e. if `to_mut` would be a no-op. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(cow_is_borrowed)] + /// use std::borrow::Cow; + /// + /// let cow: Cow<'_, str> = Cow::Owned("moo".to_string()); + /// assert!(cow.is_owned()); + /// + /// let bull = Cow::Borrowed("...moo?"); + /// assert!(!bull.is_owned()); + /// ``` + #[unstable(feature = "cow_is_borrowed", issue = "65143")] + #[rustc_const_unstable(feature = "const_cow_is_borrowed", issue = "65143")] + pub const fn is_owned(&self) -> bool { + !self.is_borrowed() + } + + /// Acquires a mutable reference to the owned form of the data. + /// + /// Clones the data if it is not already owned. + /// + /// # Examples + /// + /// ``` + /// use std::borrow::Cow; + /// + /// let mut cow = Cow::Borrowed("foo"); + /// cow.to_mut().make_ascii_uppercase(); + /// + /// assert_eq!( + /// cow, + /// Cow::Owned(String::from("FOO")) as Cow + /// ); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + pub fn to_mut(&mut self) -> &mut ::Owned { + match *self { + Borrowed(borrowed) => { + *self = Owned(borrowed.to_owned()); + match *self { + Borrowed(..) => unreachable!(), + Owned(ref mut owned) => owned, + } + } + Owned(ref mut owned) => owned, + } + } + + /// Extracts the owned data. + /// + /// Clones the data if it is not already owned. 
+ /// + /// # Examples + /// + /// Calling `into_owned` on a `Cow::Borrowed` clones the underlying data + /// and becomes a `Cow::Owned`: + /// + /// ``` + /// use std::borrow::Cow; + /// + /// let s = "Hello world!"; + /// let cow = Cow::Borrowed(s); + /// + /// assert_eq!( + /// cow.into_owned(), + /// String::from(s) + /// ); + /// ``` + /// + /// Calling `into_owned` on a `Cow::Owned` is a no-op: + /// + /// ``` + /// use std::borrow::Cow; + /// + /// let s = "Hello world!"; + /// let cow: Cow = Cow::Owned(String::from(s)); + /// + /// assert_eq!( + /// cow.into_owned(), + /// String::from(s) + /// ); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + pub fn into_owned(self) -> ::Owned { + match self { + Borrowed(borrowed) => borrowed.to_owned(), + Owned(owned) => owned, + } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_unstable(feature = "const_deref", issue = "88955")] +impl const Deref for Cow<'_, B> +where + B::Owned: ~const Borrow, +{ + type Target = B; + + fn deref(&self) -> &B { + match *self { + Borrowed(borrowed) => borrowed, + Owned(ref owned) => owned.borrow(), + } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Eq for Cow<'_, B> where B: Eq + ToOwned {} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Ord for Cow<'_, B> +where + B: Ord + ToOwned, +{ + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + Ord::cmp(&**self, &**other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a, 'b, B: ?Sized, C: ?Sized> PartialEq> for Cow<'a, B> +where + B: PartialEq + ToOwned, + C: ToOwned, +{ + #[inline] + fn eq(&self, other: &Cow<'b, C>) -> bool { + PartialEq::eq(&**self, &**other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a, B: ?Sized> PartialOrd for Cow<'a, B> +where + B: PartialOrd + ToOwned, +{ + #[inline] + fn partial_cmp(&self, other: &Cow<'a, B>) -> Option { + PartialOrd::partial_cmp(&**self, &**other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] 
+impl fmt::Debug for Cow<'_, B> +where + B: fmt::Debug + ToOwned, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Borrowed(ref b) => fmt::Debug::fmt(b, f), + Owned(ref o) => fmt::Debug::fmt(o, f), + } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Display for Cow<'_, B> +where + B: fmt::Display + ToOwned, +{ + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + Borrowed(ref b) => fmt::Display::fmt(b, f), + Owned(ref o) => fmt::Display::fmt(o, f), + } + } +} + +#[stable(feature = "default", since = "1.11.0")] +impl Default for Cow<'_, B> +where + B: ToOwned, +{ + /// Creates an owned Cow<'a, B> with the default value for the contained owned value. + fn default() -> Self { + Owned(::Owned::default()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Hash for Cow<'_, B> +where + B: Hash + ToOwned, +{ + #[inline] + fn hash(&self, state: &mut H) { + Hash::hash(&**self, state) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef for Cow<'_, T> { + fn as_ref(&self) -> &T { + self + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_add", since = "1.14.0")] +impl<'a> Add<&'a str> for Cow<'a, str> { + type Output = Cow<'a, str>; + + #[inline] + fn add(mut self, rhs: &'a str) -> Self::Output { + self += rhs; + self + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_add", since = "1.14.0")] +impl<'a> Add> for Cow<'a, str> { + type Output = Cow<'a, str>; + + #[inline] + fn add(mut self, rhs: Cow<'a, str>) -> Self::Output { + self += rhs; + self + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_add", since = "1.14.0")] +impl<'a> AddAssign<&'a str> for Cow<'a, str> { + fn add_assign(&mut self, rhs: &'a str) { + if self.is_empty() { + *self = Cow::Borrowed(rhs) + } else if !rhs.is_empty() { + if let Cow::Borrowed(lhs) = *self { + let mut s = String::with_capacity(lhs.len() + rhs.len()); + s.push_str(lhs); + *self = 
Cow::Owned(s); + } + self.to_mut().push_str(rhs); + } + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_add", since = "1.14.0")] +impl<'a> AddAssign> for Cow<'a, str> { + fn add_assign(&mut self, rhs: Cow<'a, str>) { + if self.is_empty() { + *self = rhs + } else if !rhs.is_empty() { + if let Cow::Borrowed(lhs) = *self { + let mut s = String::with_capacity(lhs.len() + rhs.len()); + s.push_str(lhs); + *self = Cow::Owned(s); + } + self.to_mut().push_str(&rhs); + } + } +} diff --git a/rust/alloc/boxed.rs b/rust/alloc/boxed.rs new file mode 100644 index 00000000000000..f753189c683088 --- /dev/null +++ b/rust/alloc/boxed.rs @@ -0,0 +1,2005 @@ +//! A pointer type for heap allocation. +//! +//! [`Box`], casually referred to as a 'box', provides the simplest form of +//! heap allocation in Rust. Boxes provide ownership for this allocation, and +//! drop their contents when they go out of scope. Boxes also ensure that they +//! never allocate more than `isize::MAX` bytes. +//! +//! # Examples +//! +//! Move a value from the stack to the heap by creating a [`Box`]: +//! +//! ``` +//! let val: u8 = 5; +//! let boxed: Box = Box::new(val); +//! ``` +//! +//! Move a value from a [`Box`] back to the stack by [dereferencing]: +//! +//! ``` +//! let boxed: Box = Box::new(5); +//! let val: u8 = *boxed; +//! ``` +//! +//! Creating a recursive data structure: +//! +//! ``` +//! #[derive(Debug)] +//! enum List { +//! Cons(T, Box>), +//! Nil, +//! } +//! +//! let list: List = List::Cons(1, Box::new(List::Cons(2, Box::new(List::Nil)))); +//! println!("{:?}", list); +//! ``` +//! +//! This will print `Cons(1, Cons(2, Nil))`. +//! +//! Recursive structures must be boxed, because if the definition of `Cons` +//! looked like this: +//! +//! ```compile_fail,E0072 +//! # enum List { +//! Cons(T, List), +//! # } +//! ``` +//! +//! It wouldn't work. This is because the size of a `List` depends on how many +//! 
elements are in the list, and so we don't know how much memory to allocate +//! for a `Cons`. By introducing a [`Box`], which has a defined size, we know how +//! big `Cons` needs to be. +//! +//! # Memory layout +//! +//! For non-zero-sized values, a [`Box`] will use the [`Global`] allocator for +//! its allocation. It is valid to convert both ways between a [`Box`] and a +//! raw pointer allocated with the [`Global`] allocator, given that the +//! [`Layout`] used with the allocator is correct for the type. More precisely, +//! a `value: *mut T` that has been allocated with the [`Global`] allocator +//! with `Layout::for_value(&*value)` may be converted into a box using +//! [`Box::<T>::from_raw(value)`]. Conversely, the memory backing a `value: *mut +//! T` obtained from [`Box::<T>::into_raw`] may be deallocated using the +//! [`Global`] allocator with [`Layout::for_value(&*value)`]. +//! +//! For zero-sized values, the `Box` pointer still has to be [valid] for reads +//! and writes and sufficiently aligned. In particular, casting any aligned +//! non-zero integer literal to a raw pointer produces a valid pointer, but a +//! pointer pointing into previously allocated memory that since got freed is +//! not valid. The recommended way to build a Box to a ZST if `Box::new` cannot +//! be used is to use [`ptr::NonNull::dangling`]. +//! +//! So long as `T: Sized`, a `Box<T>` is guaranteed to be represented +//! as a single pointer and is also ABI-compatible with C pointers +//! (i.e. the C type `T*`). This means that if you have extern "C" +//! Rust functions that will be called from C, you can define those +//! Rust functions using `Box<T>` types, and use `T*` as corresponding +//! type on the C side. As an example, consider this C header which +//! declares functions that create and destroy some kind of `Foo` +//! value: +//! +//! ```c +//! /* C header */ +//! +//! /* Returns ownership to the caller */ +//! struct Foo* foo_new(void); +//! +//!
/* Takes ownership from the caller; no-op when invoked with null */ +//! void foo_delete(struct Foo*); +//! ``` +//! +//! These two functions might be implemented in Rust as follows. Here, the +//! `struct Foo*` type from C is translated to `Box<Foo>`, which captures +//! the ownership constraints. Note also that the nullable argument to +//! `foo_delete` is represented in Rust as `Option<Box<Foo>>`, since `Box<Foo>` +//! cannot be null. +//! +//! ``` +//! #[repr(C)] +//! pub struct Foo; +//! +//! #[no_mangle] +//! pub extern "C" fn foo_new() -> Box<Foo> { +//! Box::new(Foo) +//! } +//! +//! #[no_mangle] +//! pub extern "C" fn foo_delete(_: Option<Box<Foo>>) {} +//! ``` +//! +//! Even though `Box<T>` has the same representation and C ABI as a C pointer, +//! this does not mean that you can convert an arbitrary `T*` into a `Box<T>` +//! and expect things to work. `Box<T>` values will always be fully aligned, +//! non-null pointers. Moreover, the destructor for `Box<T>` will attempt to +//! free the value with the global allocator. In general, the best practice +//! is to only use `Box<T>` for pointers that originated from the global +//! allocator. +//! +//! **Important.** At least at present, you should avoid using +//! `Box<T>` types for functions that are defined in C but invoked +//! from Rust. In those cases, you should directly mirror the C types +//! as closely as possible. Using types like `Box<T>` where the C +//! definition is just using `T*` can lead to undefined behavior, as +//! described in [rust-lang/unsafe-code-guidelines#198][ucg#198]. +//! +//! [ucg#198]: https://github.com/rust-lang/unsafe-code-guidelines/issues/198 +//! [dereferencing]: core::ops::Deref +//! [`Box::<T>::from_raw(value)`]: Box::from_raw +//! [`Global`]: crate::alloc::Global +//! [`Layout`]: crate::alloc::Layout +//! [`Layout::for_value(&*value)`]: crate::alloc::Layout::for_value +//!
[valid]: ptr#safety + +#![stable(feature = "rust1", since = "1.0.0")] + +use core::any::Any; +use core::async_iter::AsyncIterator; +use core::borrow; +use core::cmp::Ordering; +use core::convert::{From, TryFrom}; +use core::fmt; +use core::future::Future; +use core::hash::{Hash, Hasher}; +#[cfg(not(no_global_oom_handling))] +use core::iter::FromIterator; +use core::iter::{FusedIterator, Iterator}; +use core::marker::{Unpin, Unsize}; +use core::mem; +use core::ops::{ + CoerceUnsized, Deref, DerefMut, DispatchFromDyn, Generator, GeneratorState, Receiver, +}; +use core::pin::Pin; +use core::ptr::{self, Unique}; +use core::task::{Context, Poll}; + +#[cfg(not(no_global_oom_handling))] +use crate::alloc::{handle_alloc_error, WriteCloneIntoRaw}; +use crate::alloc::{AllocError, Allocator, Global, Layout}; +#[cfg(not(no_global_oom_handling))] +use crate::borrow::Cow; +use crate::raw_vec::RawVec; +#[cfg(not(no_global_oom_handling))] +use crate::str::from_boxed_utf8_unchecked; +#[cfg(not(no_global_oom_handling))] +use crate::vec::Vec; + +/// A pointer type for heap allocation. +/// +/// See the [module-level documentation](../../std/boxed/index.html) for more. +#[lang = "owned_box"] +#[fundamental] +#[stable(feature = "rust1", since = "1.0.0")] +// The declaration of the `Box` struct must be kept in sync with the +// `alloc::alloc::box_free` function or ICEs will happen. See the comment +// on `box_free` for more details. +pub struct Box< + T: ?Sized, + #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator = Global, +>(Unique, A); + +impl Box { + /// Allocates memory on the heap and then places `x` into it. + /// + /// This doesn't actually allocate if `T` is zero-sized. 
+ /// + /// # Examples + /// + /// ``` + /// let five = Box::new(5); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline(always)] + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use] + pub fn new(x: T) -> Self { + box x + } + + /// Constructs a new box with uninitialized contents. + /// + /// # Examples + /// + /// ``` + /// #![feature(new_uninit)] + /// + /// let mut five = Box::::new_uninit(); + /// + /// let five = unsafe { + /// // Deferred initialization: + /// five.as_mut_ptr().write(5); + /// + /// five.assume_init() + /// }; + /// + /// assert_eq!(*five, 5) + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "new_uninit", issue = "63291")] + #[must_use] + #[inline] + pub fn new_uninit() -> Box> { + Self::new_uninit_in(Global) + } + + /// Constructs a new `Box` with uninitialized contents, with the memory + /// being filled with `0` bytes. + /// + /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage + /// of this method. + /// + /// # Examples + /// + /// ``` + /// #![feature(new_uninit)] + /// + /// let zero = Box::::new_zeroed(); + /// let zero = unsafe { zero.assume_init() }; + /// + /// assert_eq!(*zero, 0) + /// ``` + /// + /// [zeroed]: mem::MaybeUninit::zeroed + #[cfg(not(no_global_oom_handling))] + #[inline] + #[unstable(feature = "new_uninit", issue = "63291")] + #[must_use] + pub fn new_zeroed() -> Box> { + Self::new_zeroed_in(Global) + } + + /// Constructs a new `Pin>`. If `T` does not implement `Unpin`, then + /// `x` will be pinned in memory and unable to be moved. + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "pin", since = "1.33.0")] + #[must_use] + #[inline(always)] + pub fn pin(x: T) -> Pin> { + (box x).into() + } + + /// Allocates memory on the heap then places `x` into it, + /// returning an error if the allocation fails + /// + /// This doesn't actually allocate if `T` is zero-sized. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api)] + /// + /// let five = Box::try_new(5)?; + /// # Ok::<(), std::alloc::AllocError>(()) + /// ``` + #[unstable(feature = "allocator_api", issue = "32838")] + #[inline] + pub fn try_new(x: T) -> Result { + Self::try_new_in(x, Global) + } + + /// Constructs a new box with uninitialized contents on the heap, + /// returning an error if the allocation fails + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// let mut five = Box::::try_new_uninit()?; + /// + /// let five = unsafe { + /// // Deferred initialization: + /// five.as_mut_ptr().write(5); + /// + /// five.assume_init() + /// }; + /// + /// assert_eq!(*five, 5); + /// # Ok::<(), std::alloc::AllocError>(()) + /// ``` + #[unstable(feature = "allocator_api", issue = "32838")] + // #[unstable(feature = "new_uninit", issue = "63291")] + #[inline] + pub fn try_new_uninit() -> Result>, AllocError> { + Box::try_new_uninit_in(Global) + } + + /// Constructs a new `Box` with uninitialized contents, with the memory + /// being filled with `0` bytes on the heap + /// + /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage + /// of this method. + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// let zero = Box::::try_new_zeroed()?; + /// let zero = unsafe { zero.assume_init() }; + /// + /// assert_eq!(*zero, 0); + /// # Ok::<(), std::alloc::AllocError>(()) + /// ``` + /// + /// [zeroed]: mem::MaybeUninit::zeroed + #[unstable(feature = "allocator_api", issue = "32838")] + // #[unstable(feature = "new_uninit", issue = "63291")] + #[inline] + pub fn try_new_zeroed() -> Result>, AllocError> { + Box::try_new_zeroed_in(Global) + } +} + +impl Box { + /// Allocates memory in the given allocator then places `x` into it. + /// + /// This doesn't actually allocate if `T` is zero-sized. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api)] + /// + /// use std::alloc::System; + /// + /// let five = Box::new_in(5, System); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "allocator_api", issue = "32838")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[must_use] + #[inline] + pub const fn new_in(x: T, alloc: A) -> Self + where + A: ~const Allocator + ~const Drop, + { + let mut boxed = Self::new_uninit_in(alloc); + unsafe { + boxed.as_mut_ptr().write(x); + boxed.assume_init() + } + } + + /// Allocates memory in the given allocator then places `x` into it, + /// returning an error if the allocation fails + /// + /// This doesn't actually allocate if `T` is zero-sized. + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api)] + /// + /// use std::alloc::System; + /// + /// let five = Box::try_new_in(5, System)?; + /// # Ok::<(), std::alloc::AllocError>(()) + /// ``` + #[unstable(feature = "allocator_api", issue = "32838")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[inline] + pub const fn try_new_in(x: T, alloc: A) -> Result + where + T: ~const Drop, + A: ~const Allocator + ~const Drop, + { + let mut boxed = Self::try_new_uninit_in(alloc)?; + unsafe { + boxed.as_mut_ptr().write(x); + Ok(boxed.assume_init()) + } + } + + /// Constructs a new box with uninitialized contents in the provided allocator. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// use std::alloc::System; + /// + /// let mut five = Box::::new_uninit_in(System); + /// + /// let five = unsafe { + /// // Deferred initialization: + /// five.as_mut_ptr().write(5); + /// + /// five.assume_init() + /// }; + /// + /// assert_eq!(*five, 5) + /// ``` + #[unstable(feature = "allocator_api", issue = "32838")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[cfg(not(no_global_oom_handling))] + #[must_use] + // #[unstable(feature = "new_uninit", issue = "63291")] + pub const fn new_uninit_in(alloc: A) -> Box, A> + where + A: ~const Allocator + ~const Drop, + { + let layout = Layout::new::>(); + // NOTE: Prefer match over unwrap_or_else since closure sometimes not inlineable. + // That would make code size bigger. + match Box::try_new_uninit_in(alloc) { + Ok(m) => m, + Err(_) => handle_alloc_error(layout), + } + } + + /// Constructs a new box with uninitialized contents in the provided allocator, + /// returning an error if the allocation fails + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// use std::alloc::System; + /// + /// let mut five = Box::::try_new_uninit_in(System)?; + /// + /// let five = unsafe { + /// // Deferred initialization: + /// five.as_mut_ptr().write(5); + /// + /// five.assume_init() + /// }; + /// + /// assert_eq!(*five, 5); + /// # Ok::<(), std::alloc::AllocError>(()) + /// ``` + #[unstable(feature = "allocator_api", issue = "32838")] + // #[unstable(feature = "new_uninit", issue = "63291")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + pub const fn try_new_uninit_in(alloc: A) -> Result, A>, AllocError> + where + A: ~const Allocator + ~const Drop, + { + let layout = Layout::new::>(); + let ptr = alloc.allocate(layout)?.cast(); + unsafe { Ok(Box::from_raw_in(ptr.as_ptr(), alloc)) } + } + + /// Constructs a new `Box` with uninitialized contents, with 
the memory + /// being filled with `0` bytes in the provided allocator. + /// + /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage + /// of this method. + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// use std::alloc::System; + /// + /// let zero = Box::::new_zeroed_in(System); + /// let zero = unsafe { zero.assume_init() }; + /// + /// assert_eq!(*zero, 0) + /// ``` + /// + /// [zeroed]: mem::MaybeUninit::zeroed + #[unstable(feature = "allocator_api", issue = "32838")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[cfg(not(no_global_oom_handling))] + // #[unstable(feature = "new_uninit", issue = "63291")] + #[must_use] + pub const fn new_zeroed_in(alloc: A) -> Box, A> + where + A: ~const Allocator + ~const Drop, + { + let layout = Layout::new::>(); + // NOTE: Prefer match over unwrap_or_else since closure sometimes not inlineable. + // That would make code size bigger. + match Box::try_new_zeroed_in(alloc) { + Ok(m) => m, + Err(_) => handle_alloc_error(layout), + } + } + + /// Constructs a new `Box` with uninitialized contents, with the memory + /// being filled with `0` bytes in the provided allocator, + /// returning an error if the allocation fails, + /// + /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage + /// of this method. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// use std::alloc::System; + /// + /// let zero = Box::::try_new_zeroed_in(System)?; + /// let zero = unsafe { zero.assume_init() }; + /// + /// assert_eq!(*zero, 0); + /// # Ok::<(), std::alloc::AllocError>(()) + /// ``` + /// + /// [zeroed]: mem::MaybeUninit::zeroed + #[unstable(feature = "allocator_api", issue = "32838")] + // #[unstable(feature = "new_uninit", issue = "63291")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + pub const fn try_new_zeroed_in(alloc: A) -> Result, A>, AllocError> + where + A: ~const Allocator + ~const Drop, + { + let layout = Layout::new::>(); + let ptr = alloc.allocate_zeroed(layout)?.cast(); + unsafe { Ok(Box::from_raw_in(ptr.as_ptr(), alloc)) } + } + + /// Constructs a new `Pin>`. If `T` does not implement `Unpin`, then + /// `x` will be pinned in memory and unable to be moved. + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "allocator_api", issue = "32838")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[must_use] + #[inline(always)] + pub const fn pin_in(x: T, alloc: A) -> Pin + where + A: 'static + ~const Allocator + ~const Drop, + { + Self::into_pin(Self::new_in(x, alloc)) + } + + /// Converts a `Box` into a `Box<[T]>` + /// + /// This conversion does not allocate on the heap and happens in place. + #[unstable(feature = "box_into_boxed_slice", issue = "71582")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + pub const fn into_boxed_slice(boxed: Self) -> Box<[T], A> { + let (raw, alloc) = Box::into_raw_with_allocator(boxed); + unsafe { Box::from_raw_in(raw as *mut [T; 1], alloc) } + } + + /// Consumes the `Box`, returning the wrapped value. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(box_into_inner)] + /// + /// let c = Box::new(5); + /// + /// assert_eq!(Box::into_inner(c), 5); + /// ``` + #[unstable(feature = "box_into_inner", issue = "80437")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[inline] + pub const fn into_inner(boxed: Self) -> T + where + Self: ~const Drop, + { + *boxed + } +} + +impl Box<[T]> { + /// Constructs a new boxed slice with uninitialized contents. + /// + /// # Examples + /// + /// ``` + /// #![feature(new_uninit)] + /// + /// let mut values = Box::<[u32]>::new_uninit_slice(3); + /// + /// let values = unsafe { + /// // Deferred initialization: + /// values[0].as_mut_ptr().write(1); + /// values[1].as_mut_ptr().write(2); + /// values[2].as_mut_ptr().write(3); + /// + /// values.assume_init() + /// }; + /// + /// assert_eq!(*values, [1, 2, 3]) + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "new_uninit", issue = "63291")] + #[must_use] + pub fn new_uninit_slice(len: usize) -> Box<[mem::MaybeUninit]> { + unsafe { RawVec::with_capacity(len).into_box(len) } + } + + /// Constructs a new boxed slice with uninitialized contents, with the memory + /// being filled with `0` bytes. + /// + /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage + /// of this method. + /// + /// # Examples + /// + /// ``` + /// #![feature(new_uninit)] + /// + /// let values = Box::<[u32]>::new_zeroed_slice(3); + /// let values = unsafe { values.assume_init() }; + /// + /// assert_eq!(*values, [0, 0, 0]) + /// ``` + /// + /// [zeroed]: mem::MaybeUninit::zeroed + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "new_uninit", issue = "63291")] + #[must_use] + pub fn new_zeroed_slice(len: usize) -> Box<[mem::MaybeUninit]> { + unsafe { RawVec::with_capacity_zeroed(len).into_box(len) } + } + + /// Constructs a new boxed slice with uninitialized contents. 
Returns an error if + /// the allocation fails + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// let mut values = Box::<[u32]>::try_new_uninit_slice(3)?; + /// let values = unsafe { + /// // Deferred initialization: + /// values[0].as_mut_ptr().write(1); + /// values[1].as_mut_ptr().write(2); + /// values[2].as_mut_ptr().write(3); + /// values.assume_init() + /// }; + /// + /// assert_eq!(*values, [1, 2, 3]); + /// # Ok::<(), std::alloc::AllocError>(()) + /// ``` + #[unstable(feature = "allocator_api", issue = "32838")] + #[inline] + pub fn try_new_uninit_slice(len: usize) -> Result]>, AllocError> { + unsafe { + let layout = match Layout::array::>(len) { + Ok(l) => l, + Err(_) => return Err(AllocError), + }; + let ptr = Global.allocate(layout)?; + Ok(RawVec::from_raw_parts_in(ptr.as_mut_ptr() as *mut _, len, Global).into_box(len)) + } + } + + /// Constructs a new boxed slice with uninitialized contents, with the memory + /// being filled with `0` bytes. Returns an error if the allocation fails + /// + /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage + /// of this method. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// let values = Box::<[u32]>::try_new_zeroed_slice(3)?; + /// let values = unsafe { values.assume_init() }; + /// + /// assert_eq!(*values, [0, 0, 0]); + /// # Ok::<(), std::alloc::AllocError>(()) + /// ``` + /// + /// [zeroed]: mem::MaybeUninit::zeroed + #[unstable(feature = "allocator_api", issue = "32838")] + #[inline] + pub fn try_new_zeroed_slice(len: usize) -> Result]>, AllocError> { + unsafe { + let layout = match Layout::array::>(len) { + Ok(l) => l, + Err(_) => return Err(AllocError), + }; + let ptr = Global.allocate_zeroed(layout)?; + Ok(RawVec::from_raw_parts_in(ptr.as_mut_ptr() as *mut _, len, Global).into_box(len)) + } + } +} + +impl Box<[T], A> { + /// Constructs a new boxed slice with uninitialized contents in the provided allocator. + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// use std::alloc::System; + /// + /// let mut values = Box::<[u32], _>::new_uninit_slice_in(3, System); + /// + /// let values = unsafe { + /// // Deferred initialization: + /// values[0].as_mut_ptr().write(1); + /// values[1].as_mut_ptr().write(2); + /// values[2].as_mut_ptr().write(3); + /// + /// values.assume_init() + /// }; + /// + /// assert_eq!(*values, [1, 2, 3]) + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "allocator_api", issue = "32838")] + // #[unstable(feature = "new_uninit", issue = "63291")] + #[must_use] + pub fn new_uninit_slice_in(len: usize, alloc: A) -> Box<[mem::MaybeUninit], A> { + unsafe { RawVec::with_capacity_in(len, alloc).into_box(len) } + } + + /// Constructs a new boxed slice with uninitialized contents in the provided allocator, + /// with the memory being filled with `0` bytes. + /// + /// See [`MaybeUninit::zeroed`][zeroed] for examples of correct and incorrect usage + /// of this method. 
+ /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api, new_uninit)] + /// + /// use std::alloc::System; + /// + /// let values = Box::<[u32], _>::new_zeroed_slice_in(3, System); + /// let values = unsafe { values.assume_init() }; + /// + /// assert_eq!(*values, [0, 0, 0]) + /// ``` + /// + /// [zeroed]: mem::MaybeUninit::zeroed + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "allocator_api", issue = "32838")] + // #[unstable(feature = "new_uninit", issue = "63291")] + #[must_use] + pub fn new_zeroed_slice_in(len: usize, alloc: A) -> Box<[mem::MaybeUninit], A> { + unsafe { RawVec::with_capacity_zeroed_in(len, alloc).into_box(len) } + } +} + +impl Box, A> { + /// Converts to `Box`. + /// + /// # Safety + /// + /// As with [`MaybeUninit::assume_init`], + /// it is up to the caller to guarantee that the value + /// really is in an initialized state. + /// Calling this when the content is not yet fully initialized + /// causes immediate undefined behavior. + /// + /// [`MaybeUninit::assume_init`]: mem::MaybeUninit::assume_init + /// + /// # Examples + /// + /// ``` + /// #![feature(new_uninit)] + /// + /// let mut five = Box::::new_uninit(); + /// + /// let five: Box = unsafe { + /// // Deferred initialization: + /// five.as_mut_ptr().write(5); + /// + /// five.assume_init() + /// }; + /// + /// assert_eq!(*five, 5) + /// ``` + #[unstable(feature = "new_uninit", issue = "63291")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[inline] + pub const unsafe fn assume_init(self) -> Box { + let (raw, alloc) = Box::into_raw_with_allocator(self); + unsafe { Box::from_raw_in(raw as *mut T, alloc) } + } + + /// Writes the value and converts to `Box`. + /// + /// This method converts the box similarly to [`Box::assume_init`] but + /// writes `value` into it before conversion thus guaranteeing safety. 
+ /// In some scenarios use of this method may improve performance because + /// the compiler may be able to optimize copying from stack. + /// + /// # Examples + /// + /// ``` + /// #![feature(new_uninit)] + /// + /// let big_box = Box::<[usize; 1024]>::new_uninit(); + /// + /// let mut array = [0; 1024]; + /// for (i, place) in array.iter_mut().enumerate() { + /// *place = i; + /// } + /// + /// // The optimizer may be able to elide this copy, so previous code writes + /// // to heap directly. + /// let big_box = Box::write(big_box, array); + /// + /// for (i, x) in big_box.iter().enumerate() { + /// assert_eq!(*x, i); + /// } + /// ``` + #[unstable(feature = "new_uninit", issue = "63291")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[inline] + pub const fn write(mut boxed: Self, value: T) -> Box { + unsafe { + (*boxed).write(value); + boxed.assume_init() + } + } +} + +impl Box<[mem::MaybeUninit], A> { + /// Converts to `Box<[T], A>`. + /// + /// # Safety + /// + /// As with [`MaybeUninit::assume_init`], + /// it is up to the caller to guarantee that the values + /// really are in an initialized state. + /// Calling this when the content is not yet fully initialized + /// causes immediate undefined behavior. 
+ /// + /// [`MaybeUninit::assume_init`]: mem::MaybeUninit::assume_init + /// + /// # Examples + /// + /// ``` + /// #![feature(new_uninit)] + /// + /// let mut values = Box::<[u32]>::new_uninit_slice(3); + /// + /// let values = unsafe { + /// // Deferred initialization: + /// values[0].as_mut_ptr().write(1); + /// values[1].as_mut_ptr().write(2); + /// values[2].as_mut_ptr().write(3); + /// + /// values.assume_init() + /// }; + /// + /// assert_eq!(*values, [1, 2, 3]) + /// ``` + #[unstable(feature = "new_uninit", issue = "63291")] + #[inline] + pub unsafe fn assume_init(self) -> Box<[T], A> { + let (raw, alloc) = Box::into_raw_with_allocator(self); + unsafe { Box::from_raw_in(raw as *mut [T], alloc) } + } +} + +impl Box { + /// Constructs a box from a raw pointer. + /// + /// After calling this function, the raw pointer is owned by the + /// resulting `Box`. Specifically, the `Box` destructor will call + /// the destructor of `T` and free the allocated memory. For this + /// to be safe, the memory must have been allocated in accordance + /// with the [memory layout] used by `Box` . + /// + /// # Safety + /// + /// This function is unsafe because improper use may lead to + /// memory problems. For example, a double-free may occur if the + /// function is called twice on the same raw pointer. + /// + /// The safety conditions are described in the [memory layout] section. 
+ /// + /// # Examples + /// + /// Recreate a `Box` which was previously converted to a raw pointer + /// using [`Box::into_raw`]: + /// ``` + /// let x = Box::new(5); + /// let ptr = Box::into_raw(x); + /// let x = unsafe { Box::from_raw(ptr) }; + /// ``` + /// Manually create a `Box` from scratch by using the global allocator: + /// ``` + /// use std::alloc::{alloc, Layout}; + /// + /// unsafe { + /// let ptr = alloc(Layout::new::()) as *mut i32; + /// // In general .write is required to avoid attempting to destruct + /// // the (uninitialized) previous contents of `ptr`, though for this + /// // simple example `*ptr = 5` would have worked as well. + /// ptr.write(5); + /// let x = Box::from_raw(ptr); + /// } + /// ``` + /// + /// [memory layout]: self#memory-layout + /// [`Layout`]: crate::Layout + #[stable(feature = "box_raw", since = "1.4.0")] + #[inline] + pub unsafe fn from_raw(raw: *mut T) -> Self { + unsafe { Self::from_raw_in(raw, Global) } + } +} + +impl Box { + /// Constructs a box from a raw pointer in the given allocator. + /// + /// After calling this function, the raw pointer is owned by the + /// resulting `Box`. Specifically, the `Box` destructor will call + /// the destructor of `T` and free the allocated memory. For this + /// to be safe, the memory must have been allocated in accordance + /// with the [memory layout] used by `Box` . + /// + /// # Safety + /// + /// This function is unsafe because improper use may lead to + /// memory problems. For example, a double-free may occur if the + /// function is called twice on the same raw pointer. 
+ /// + /// + /// # Examples + /// + /// Recreate a `Box` which was previously converted to a raw pointer + /// using [`Box::into_raw_with_allocator`]: + /// ``` + /// #![feature(allocator_api)] + /// + /// use std::alloc::System; + /// + /// let x = Box::new_in(5, System); + /// let (ptr, alloc) = Box::into_raw_with_allocator(x); + /// let x = unsafe { Box::from_raw_in(ptr, alloc) }; + /// ``` + /// Manually create a `Box` from scratch by using the system allocator: + /// ``` + /// #![feature(allocator_api, slice_ptr_get)] + /// + /// use std::alloc::{Allocator, Layout, System}; + /// + /// unsafe { + /// let ptr = System.allocate(Layout::new::())?.as_mut_ptr() as *mut i32; + /// // In general .write is required to avoid attempting to destruct + /// // the (uninitialized) previous contents of `ptr`, though for this + /// // simple example `*ptr = 5` would have worked as well. + /// ptr.write(5); + /// let x = Box::from_raw_in(ptr, System); + /// } + /// # Ok::<(), std::alloc::AllocError>(()) + /// ``` + /// + /// [memory layout]: self#memory-layout + /// [`Layout`]: crate::Layout + #[unstable(feature = "allocator_api", issue = "32838")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[inline] + pub const unsafe fn from_raw_in(raw: *mut T, alloc: A) -> Self { + Box(unsafe { Unique::new_unchecked(raw) }, alloc) + } + + /// Consumes the `Box`, returning a wrapped raw pointer. + /// + /// The pointer will be properly aligned and non-null. + /// + /// After calling this function, the caller is responsible for the + /// memory previously managed by the `Box`. In particular, the + /// caller should properly destroy `T` and release the memory, taking + /// into account the [memory layout] used by `Box`. The easiest way to + /// do this is to convert the raw pointer back into a `Box` with the + /// [`Box::from_raw`] function, allowing the `Box` destructor to perform + /// the cleanup. 
+ /// + /// Note: this is an associated function, which means that you have + /// to call it as `Box::into_raw(b)` instead of `b.into_raw()`. This + /// is so that there is no conflict with a method on the inner type. + /// + /// # Examples + /// Converting the raw pointer back into a `Box` with [`Box::from_raw`] + /// for automatic cleanup: + /// ``` + /// let x = Box::new(String::from("Hello")); + /// let ptr = Box::into_raw(x); + /// let x = unsafe { Box::from_raw(ptr) }; + /// ``` + /// Manual cleanup by explicitly running the destructor and deallocating + /// the memory: + /// ``` + /// use std::alloc::{dealloc, Layout}; + /// use std::ptr; + /// + /// let x = Box::new(String::from("Hello")); + /// let p = Box::into_raw(x); + /// unsafe { + /// ptr::drop_in_place(p); + /// dealloc(p as *mut u8, Layout::new::()); + /// } + /// ``` + /// + /// [memory layout]: self#memory-layout + #[stable(feature = "box_raw", since = "1.4.0")] + #[inline] + pub fn into_raw(b: Self) -> *mut T { + Self::into_raw_with_allocator(b).0 + } + + /// Consumes the `Box`, returning a wrapped raw pointer and the allocator. + /// + /// The pointer will be properly aligned and non-null. + /// + /// After calling this function, the caller is responsible for the + /// memory previously managed by the `Box`. In particular, the + /// caller should properly destroy `T` and release the memory, taking + /// into account the [memory layout] used by `Box`. The easiest way to + /// do this is to convert the raw pointer back into a `Box` with the + /// [`Box::from_raw_in`] function, allowing the `Box` destructor to perform + /// the cleanup. + /// + /// Note: this is an associated function, which means that you have + /// to call it as `Box::into_raw_with_allocator(b)` instead of `b.into_raw_with_allocator()`. This + /// is so that there is no conflict with a method on the inner type. 
+ /// + /// # Examples + /// Converting the raw pointer back into a `Box` with [`Box::from_raw_in`] + /// for automatic cleanup: + /// ``` + /// #![feature(allocator_api)] + /// + /// use std::alloc::System; + /// + /// let x = Box::new_in(String::from("Hello"), System); + /// let (ptr, alloc) = Box::into_raw_with_allocator(x); + /// let x = unsafe { Box::from_raw_in(ptr, alloc) }; + /// ``` + /// Manual cleanup by explicitly running the destructor and deallocating + /// the memory: + /// ``` + /// #![feature(allocator_api)] + /// + /// use std::alloc::{Allocator, Layout, System}; + /// use std::ptr::{self, NonNull}; + /// + /// let x = Box::new_in(String::from("Hello"), System); + /// let (ptr, alloc) = Box::into_raw_with_allocator(x); + /// unsafe { + /// ptr::drop_in_place(ptr); + /// let non_null = NonNull::new_unchecked(ptr); + /// alloc.deallocate(non_null.cast(), Layout::new::()); + /// } + /// ``` + /// + /// [memory layout]: self#memory-layout + #[unstable(feature = "allocator_api", issue = "32838")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[inline] + pub const fn into_raw_with_allocator(b: Self) -> (*mut T, A) { + let (leaked, alloc) = Box::into_unique(b); + (leaked.as_ptr(), alloc) + } + + #[unstable( + feature = "ptr_internals", + issue = "none", + reason = "use `Box::leak(b).into()` or `Unique::from(Box::leak(b))` instead" + )] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[inline] + #[doc(hidden)] + pub const fn into_unique(b: Self) -> (Unique, A) { + // Box is recognized as a "unique pointer" by Stacked Borrows, but internally it is a + // raw pointer for the type system. Turning it directly into a raw pointer would not be + // recognized as "releasing" the unique pointer to permit aliased raw accesses, + // so all raw pointer methods have to go through `Box::leak`. Turning *that* to a raw pointer + // behaves correctly. 
+ let alloc = unsafe { ptr::read(&b.1) }; + (Unique::from(Box::leak(b)), alloc) + } + + /// Returns a reference to the underlying allocator. + /// + /// Note: this is an associated function, which means that you have + /// to call it as `Box::allocator(&b)` instead of `b.allocator()`. This + /// is so that there is no conflict with a method on the inner type. + #[unstable(feature = "allocator_api", issue = "32838")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[inline] + pub const fn allocator(b: &Self) -> &A { + &b.1 + } + + /// Consumes and leaks the `Box`, returning a mutable reference, + /// `&'a mut T`. Note that the type `T` must outlive the chosen lifetime + /// `'a`. If the type has only static references, or none at all, then this + /// may be chosen to be `'static`. + /// + /// This function is mainly useful for data that lives for the remainder of + /// the program's life. Dropping the returned reference will cause a memory + /// leak. If this is not acceptable, the reference should first be wrapped + /// with the [`Box::from_raw`] function producing a `Box`. This `Box` can + /// then be dropped which will properly destroy `T` and release the + /// allocated memory. + /// + /// Note: this is an associated function, which means that you have + /// to call it as `Box::leak(b)` instead of `b.leak()`. This + /// is so that there is no conflict with a method on the inner type. 
+ /// + /// # Examples + /// + /// Simple usage: + /// + /// ``` + /// let x = Box::new(41); + /// let static_ref: &'static mut usize = Box::leak(x); + /// *static_ref += 1; + /// assert_eq!(*static_ref, 42); + /// ``` + /// + /// Unsized data: + /// + /// ``` + /// let x = vec![1, 2, 3].into_boxed_slice(); + /// let static_ref = Box::leak(x); + /// static_ref[0] = 4; + /// assert_eq!(*static_ref, [4, 2, 3]); + /// ``` + #[stable(feature = "box_leak", since = "1.26.0")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + #[inline] + pub const fn leak<'a>(b: Self) -> &'a mut T + where + A: 'a, + { + unsafe { &mut *mem::ManuallyDrop::new(b).0.as_ptr() } + } + + /// Converts a `Box` into a `Pin>` + /// + /// This conversion does not allocate on the heap and happens in place. + /// + /// This is also available via [`From`]. + #[unstable(feature = "box_into_pin", issue = "62370")] + #[rustc_const_unstable(feature = "const_box", issue = "92521")] + pub const fn into_pin(boxed: Self) -> Pin + where + A: 'static, + { + // It's not possible to move or replace the insides of a `Pin>` + // when `T: !Unpin`, so it's safe to pin it directly without any + // additional requirements. + unsafe { Pin::new_unchecked(boxed) } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +unsafe impl<#[may_dangle] T: ?Sized, A: Allocator> Drop for Box { + fn drop(&mut self) { + // FIXME: Do nothing, drop is currently performed by compiler. + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Default for Box { + /// Creates a `Box`, with the `Default` value for T. 
+ fn default() -> Self { + box T::default() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Default for Box<[T]> { + fn default() -> Self { + Box::<[T; 0]>::new([]) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "default_box_extra", since = "1.17.0")] +impl Default for Box { + fn default() -> Self { + unsafe { from_boxed_utf8_unchecked(Default::default()) } + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Clone for Box { + /// Returns a new box with a `clone()` of this box's contents. + /// + /// # Examples + /// + /// ``` + /// let x = Box::new(5); + /// let y = x.clone(); + /// + /// // The value is the same + /// assert_eq!(x, y); + /// + /// // But they are unique objects + /// assert_ne!(&*x as *const i32, &*y as *const i32); + /// ``` + #[inline] + fn clone(&self) -> Self { + // Pre-allocate memory to allow writing the cloned value directly. + let mut boxed = Self::new_uninit_in(self.1.clone()); + unsafe { + (**self).write_clone_into_raw(boxed.as_mut_ptr()); + boxed.assume_init() + } + } + + /// Copies `source`'s contents into `self` without creating a new allocation. 
+ /// + /// # Examples + /// + /// ``` + /// let x = Box::new(5); + /// let mut y = Box::new(10); + /// let yp: *const i32 = &*y; + /// + /// y.clone_from(&x); + /// + /// // The value is the same + /// assert_eq!(x, y); + /// + /// // And no allocation occurred + /// assert_eq!(yp, &*y); + /// ``` + #[inline] + fn clone_from(&mut self, source: &Self) { + (**self).clone_from(&(**source)); + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_slice_clone", since = "1.3.0")] +impl Clone for Box { + fn clone(&self) -> Self { + // this makes a copy of the data + let buf: Box<[u8]> = self.as_bytes().into(); + unsafe { from_boxed_utf8_unchecked(buf) } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for Box { + #[inline] + fn eq(&self, other: &Self) -> bool { + PartialEq::eq(&**self, &**other) + } + #[inline] + fn ne(&self, other: &Self) -> bool { + PartialEq::ne(&**self, &**other) + } +} +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialOrd for Box { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + PartialOrd::partial_cmp(&**self, &**other) + } + #[inline] + fn lt(&self, other: &Self) -> bool { + PartialOrd::lt(&**self, &**other) + } + #[inline] + fn le(&self, other: &Self) -> bool { + PartialOrd::le(&**self, &**other) + } + #[inline] + fn ge(&self, other: &Self) -> bool { + PartialOrd::ge(&**self, &**other) + } + #[inline] + fn gt(&self, other: &Self) -> bool { + PartialOrd::gt(&**self, &**other) + } +} +#[stable(feature = "rust1", since = "1.0.0")] +impl Ord for Box { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + Ord::cmp(&**self, &**other) + } +} +#[stable(feature = "rust1", since = "1.0.0")] +impl Eq for Box {} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Hash for Box { + fn hash(&self, state: &mut H) { + (**self).hash(state); + } +} + +#[stable(feature = "indirect_hasher_impl", since = "1.22.0")] +impl Hasher for Box { + fn finish(&self) -> u64 { + (**self).finish() + } + fn 
write(&mut self, bytes: &[u8]) { + (**self).write(bytes) + } + fn write_u8(&mut self, i: u8) { + (**self).write_u8(i) + } + fn write_u16(&mut self, i: u16) { + (**self).write_u16(i) + } + fn write_u32(&mut self, i: u32) { + (**self).write_u32(i) + } + fn write_u64(&mut self, i: u64) { + (**self).write_u64(i) + } + fn write_u128(&mut self, i: u128) { + (**self).write_u128(i) + } + fn write_usize(&mut self, i: usize) { + (**self).write_usize(i) + } + fn write_i8(&mut self, i: i8) { + (**self).write_i8(i) + } + fn write_i16(&mut self, i: i16) { + (**self).write_i16(i) + } + fn write_i32(&mut self, i: i32) { + (**self).write_i32(i) + } + fn write_i64(&mut self, i: i64) { + (**self).write_i64(i) + } + fn write_i128(&mut self, i: i128) { + (**self).write_i128(i) + } + fn write_isize(&mut self, i: isize) { + (**self).write_isize(i) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "from_for_ptrs", since = "1.6.0")] +impl From for Box { + /// Converts a `T` into a `Box` + /// + /// The conversion allocates on the heap and moves `t` + /// from the stack into it. + /// + /// # Examples + /// + /// ```rust + /// let x = 5; + /// let boxed = Box::new(5); + /// + /// assert_eq!(Box::from(x), boxed); + /// ``` + fn from(t: T) -> Self { + Box::new(t) + } +} + +#[stable(feature = "pin", since = "1.33.0")] +#[rustc_const_unstable(feature = "const_box", issue = "92521")] +impl const From> for Pin> +where + A: 'static, +{ + /// Converts a `Box` into a `Pin>` + /// + /// This conversion does not allocate on the heap and happens in place. + fn from(boxed: Box) -> Self { + Box::into_pin(boxed) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_from_slice", since = "1.17.0")] +impl From<&[T]> for Box<[T]> { + /// Converts a `&[T]` into a `Box<[T]>` + /// + /// This conversion allocates on the heap + /// and performs a copy of `slice`. 
+ /// + /// # Examples + /// ```rust + /// // create a &[u8] which will be used to create a Box<[u8]> + /// let slice: &[u8] = &[104, 101, 108, 108, 111]; + /// let boxed_slice: Box<[u8]> = Box::from(slice); + /// + /// println!("{:?}", boxed_slice); + /// ``` + fn from(slice: &[T]) -> Box<[T]> { + let len = slice.len(); + let buf = RawVec::with_capacity(len); + unsafe { + ptr::copy_nonoverlapping(slice.as_ptr(), buf.ptr(), len); + buf.into_box(slice.len()).assume_init() + } + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_from_cow", since = "1.45.0")] +impl From> for Box<[T]> { + /// Converts a `Cow<'_, [T]>` into a `Box<[T]>` + /// + /// When `cow` is the `Cow::Borrowed` variant, this + /// conversion allocates on the heap and copies the + /// underlying slice. Otherwise, it will try to reuse the owned + /// `Vec`'s allocation. + #[inline] + fn from(cow: Cow<'_, [T]>) -> Box<[T]> { + match cow { + Cow::Borrowed(slice) => Box::from(slice), + Cow::Owned(slice) => Box::from(slice), + } + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_from_slice", since = "1.17.0")] +impl From<&str> for Box { + /// Converts a `&str` into a `Box` + /// + /// This conversion allocates on the heap + /// and performs a copy of `s`. + /// + /// # Examples + /// + /// ```rust + /// let boxed: Box = Box::from("hello"); + /// println!("{}", boxed); + /// ``` + #[inline] + fn from(s: &str) -> Box { + unsafe { from_boxed_utf8_unchecked(Box::from(s.as_bytes())) } + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_from_cow", since = "1.45.0")] +impl From> for Box { + /// Converts a `Cow<'_, str>` into a `Box` + /// + /// When `cow` is the `Cow::Borrowed` variant, this + /// conversion allocates on the heap and copies the + /// underlying `str`. Otherwise, it will try to reuse the owned + /// `String`'s allocation. 
+ /// + /// # Examples + /// + /// ```rust + /// use std::borrow::Cow; + /// + /// let unboxed = Cow::Borrowed("hello"); + /// let boxed: Box = Box::from(unboxed); + /// println!("{}", boxed); + /// ``` + /// + /// ```rust + /// # use std::borrow::Cow; + /// let unboxed = Cow::Owned("hello".to_string()); + /// let boxed: Box = Box::from(unboxed); + /// println!("{}", boxed); + /// ``` + #[inline] + fn from(cow: Cow<'_, str>) -> Box { + match cow { + Cow::Borrowed(s) => Box::from(s), + Cow::Owned(s) => Box::from(s), + } + } +} + +#[stable(feature = "boxed_str_conv", since = "1.19.0")] +impl From> for Box<[u8], A> { + /// Converts a `Box` into a `Box<[u8]>` + /// + /// This conversion does not allocate on the heap and happens in place. + /// + /// # Examples + /// ```rust + /// // create a Box which will be used to create a Box<[u8]> + /// let boxed: Box = Box::from("hello"); + /// let boxed_str: Box<[u8]> = Box::from(boxed); + /// + /// // create a &[u8] which will be used to create a Box<[u8]> + /// let slice: &[u8] = &[104, 101, 108, 108, 111]; + /// let boxed_slice = Box::from(slice); + /// + /// assert_eq!(boxed_slice, boxed_str); + /// ``` + #[inline] + fn from(s: Box) -> Self { + let (raw, alloc) = Box::into_raw_with_allocator(s); + unsafe { Box::from_raw_in(raw as *mut [u8], alloc) } + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_from_array", since = "1.45.0")] +impl From<[T; N]> for Box<[T]> { + /// Converts a `[T; N]` into a `Box<[T]>` + /// + /// This conversion moves the array to newly heap-allocated memory. + /// + /// # Examples + /// + /// ```rust + /// let boxed: Box<[u8]> = Box::from([4, 2]); + /// println!("{:?}", boxed); + /// ``` + fn from(array: [T; N]) -> Box<[T]> { + box array + } +} + +#[stable(feature = "boxed_slice_try_from", since = "1.43.0")] +impl TryFrom> for Box<[T; N]> { + type Error = Box<[T]>; + + /// Attempts to convert a `Box<[T]>` into a `Box<[T; N]>`. 
+ /// + /// The conversion occurs in-place and does not require a + /// new memory allocation. + /// + /// # Errors + /// + /// Returns the old `Box<[T]>` in the `Err` variant if + /// `boxed_slice.len()` does not equal `N`. + fn try_from(boxed_slice: Box<[T]>) -> Result { + if boxed_slice.len() == N { + Ok(unsafe { Box::from_raw(Box::into_raw(boxed_slice) as *mut [T; N]) }) + } else { + Err(boxed_slice) + } + } +} + +impl Box { + /// Attempt to downcast the box to a concrete type. + /// + /// # Examples + /// + /// ``` + /// use std::any::Any; + /// + /// fn print_if_string(value: Box) { + /// if let Ok(string) = value.downcast::() { + /// println!("String ({}): {}", string.len(), string); + /// } + /// } + /// + /// let my_string = "Hello World".to_string(); + /// print_if_string(Box::new(my_string)); + /// print_if_string(Box::new(0i8)); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn downcast(self) -> Result, Self> { + if self.is::() { unsafe { Ok(self.downcast_unchecked::()) } } else { Err(self) } + } + + /// Downcasts the box to a concrete type. + /// + /// For a safe alternative see [`downcast`]. + /// + /// # Examples + /// + /// ``` + /// #![feature(downcast_unchecked)] + /// + /// use std::any::Any; + /// + /// let x: Box = Box::new(1_usize); + /// + /// unsafe { + /// assert_eq!(*x.downcast_unchecked::(), 1); + /// } + /// ``` + /// + /// # Safety + /// + /// The contained value must be of type `T`. Calling this method + /// with the incorrect type is *undefined behavior*. + /// + /// [`downcast`]: Self::downcast + #[inline] + #[unstable(feature = "downcast_unchecked", issue = "90850")] + pub unsafe fn downcast_unchecked(self) -> Box { + debug_assert!(self.is::()); + unsafe { + let (raw, alloc): (*mut dyn Any, _) = Box::into_raw_with_allocator(self); + Box::from_raw_in(raw as *mut T, alloc) + } + } +} + +impl Box { + /// Attempt to downcast the box to a concrete type. 
+ /// + /// # Examples + /// + /// ``` + /// use std::any::Any; + /// + /// fn print_if_string(value: Box) { + /// if let Ok(string) = value.downcast::() { + /// println!("String ({}): {}", string.len(), string); + /// } + /// } + /// + /// let my_string = "Hello World".to_string(); + /// print_if_string(Box::new(my_string)); + /// print_if_string(Box::new(0i8)); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn downcast(self) -> Result, Self> { + if self.is::() { unsafe { Ok(self.downcast_unchecked::()) } } else { Err(self) } + } + + /// Downcasts the box to a concrete type. + /// + /// For a safe alternative see [`downcast`]. + /// + /// # Examples + /// + /// ``` + /// #![feature(downcast_unchecked)] + /// + /// use std::any::Any; + /// + /// let x: Box = Box::new(1_usize); + /// + /// unsafe { + /// assert_eq!(*x.downcast_unchecked::(), 1); + /// } + /// ``` + /// + /// # Safety + /// + /// The contained value must be of type `T`. Calling this method + /// with the incorrect type is *undefined behavior*. + /// + /// [`downcast`]: Self::downcast + #[inline] + #[unstable(feature = "downcast_unchecked", issue = "90850")] + pub unsafe fn downcast_unchecked(self) -> Box { + debug_assert!(self.is::()); + unsafe { + let (raw, alloc): (*mut (dyn Any + Send), _) = Box::into_raw_with_allocator(self); + Box::from_raw_in(raw as *mut T, alloc) + } + } +} + +impl Box { + /// Attempt to downcast the box to a concrete type. 
+ /// + /// # Examples + /// + /// ``` + /// use std::any::Any; + /// + /// fn print_if_string(value: Box) { + /// if let Ok(string) = value.downcast::() { + /// println!("String ({}): {}", string.len(), string); + /// } + /// } + /// + /// let my_string = "Hello World".to_string(); + /// print_if_string(Box::new(my_string)); + /// print_if_string(Box::new(0i8)); + /// ``` + #[inline] + #[stable(feature = "box_send_sync_any_downcast", since = "1.51.0")] + pub fn downcast(self) -> Result, Self> { + if self.is::() { unsafe { Ok(self.downcast_unchecked::()) } } else { Err(self) } + } + + /// Downcasts the box to a concrete type. + /// + /// For a safe alternative see [`downcast`]. + /// + /// # Examples + /// + /// ``` + /// #![feature(downcast_unchecked)] + /// + /// use std::any::Any; + /// + /// let x: Box = Box::new(1_usize); + /// + /// unsafe { + /// assert_eq!(*x.downcast_unchecked::(), 1); + /// } + /// ``` + /// + /// # Safety + /// + /// The contained value must be of type `T`. Calling this method + /// with the incorrect type is *undefined behavior*. 
+ /// + /// [`downcast`]: Self::downcast + #[inline] + #[unstable(feature = "downcast_unchecked", issue = "90850")] + pub unsafe fn downcast_unchecked(self) -> Box { + debug_assert!(self.is::()); + unsafe { + let (raw, alloc): (*mut (dyn Any + Send + Sync), _) = + Box::into_raw_with_allocator(self); + Box::from_raw_in(raw as *mut T, alloc) + } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Display for Box { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&**self, f) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Debug for Box { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&**self, f) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Pointer for Box { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // It's not possible to extract the inner Uniq directly from the Box, + // instead we cast it to a *const which aliases the Unique + let ptr: *const T = &**self; + fmt::Pointer::fmt(&ptr, f) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_unstable(feature = "const_box", issue = "92521")] +impl const Deref for Box { + type Target = T; + + fn deref(&self) -> &T { + &**self + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_unstable(feature = "const_box", issue = "92521")] +impl const DerefMut for Box { + fn deref_mut(&mut self) -> &mut T { + &mut **self + } +} + +#[unstable(feature = "receiver_trait", issue = "none")] +impl Receiver for Box {} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for Box { + type Item = I::Item; + fn next(&mut self) -> Option { + (**self).next() + } + fn size_hint(&self) -> (usize, Option) { + (**self).size_hint() + } + fn nth(&mut self, n: usize) -> Option { + (**self).nth(n) + } + fn last(self) -> Option { + BoxIter::last(self) + } +} + +trait BoxIter { + type Item; + fn last(self) -> Option; +} + +impl BoxIter for Box { + type Item = I::Item; + 
default fn last(self) -> Option { + #[inline] + fn some(_: Option, x: T) -> Option { + Some(x) + } + + self.fold(None, some) + } +} + +/// Specialization for sized `I`s that uses `I`s implementation of `last()` +/// instead of the default. +#[stable(feature = "rust1", since = "1.0.0")] +impl BoxIter for Box { + fn last(self) -> Option { + (*self).last() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl DoubleEndedIterator for Box { + fn next_back(&mut self) -> Option { + (**self).next_back() + } + fn nth_back(&mut self, n: usize) -> Option { + (**self).nth_back(n) + } +} +#[stable(feature = "rust1", since = "1.0.0")] +impl ExactSizeIterator for Box { + fn len(&self) -> usize { + (**self).len() + } + fn is_empty(&self) -> bool { + (**self).is_empty() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for Box {} + +#[stable(feature = "boxed_closure_impls", since = "1.35.0")] +impl + ?Sized, A: Allocator> FnOnce for Box { + type Output = >::Output; + + extern "rust-call" fn call_once(self, args: Args) -> Self::Output { + >::call_once(*self, args) + } +} + +#[stable(feature = "boxed_closure_impls", since = "1.35.0")] +impl + ?Sized, A: Allocator> FnMut for Box { + extern "rust-call" fn call_mut(&mut self, args: Args) -> Self::Output { + >::call_mut(self, args) + } +} + +#[stable(feature = "boxed_closure_impls", since = "1.35.0")] +impl + ?Sized, A: Allocator> Fn for Box { + extern "rust-call" fn call(&self, args: Args) -> Self::Output { + >::call(self, args) + } +} + +#[unstable(feature = "coerce_unsized", issue = "27732")] +impl, U: ?Sized, A: Allocator> CoerceUnsized> for Box {} + +#[unstable(feature = "dispatch_from_dyn", issue = "none")] +impl, U: ?Sized> DispatchFromDyn> for Box {} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "boxed_slice_from_iter", since = "1.32.0")] +impl FromIterator for Box<[I]> { + fn from_iter>(iter: T) -> Self { + iter.into_iter().collect::>().into_boxed_slice() + } +} + 
+#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_slice_clone", since = "1.3.0")] +impl Clone for Box<[T], A> { + fn clone(&self) -> Self { + let alloc = Box::allocator(self).clone(); + self.to_vec_in(alloc).into_boxed_slice() + } + + fn clone_from(&mut self, other: &Self) { + if self.len() == other.len() { + self.clone_from_slice(&other); + } else { + *self = other.clone(); + } + } +} + +#[stable(feature = "box_borrow", since = "1.1.0")] +impl borrow::Borrow for Box { + fn borrow(&self) -> &T { + &**self + } +} + +#[stable(feature = "box_borrow", since = "1.1.0")] +impl borrow::BorrowMut for Box { + fn borrow_mut(&mut self) -> &mut T { + &mut **self + } +} + +#[stable(since = "1.5.0", feature = "smart_ptr_as_ref")] +impl AsRef for Box { + fn as_ref(&self) -> &T { + &**self + } +} + +#[stable(since = "1.5.0", feature = "smart_ptr_as_ref")] +impl AsMut for Box { + fn as_mut(&mut self) -> &mut T { + &mut **self + } +} + +/* Nota bene + * + * We could have chosen not to add this impl, and instead have written a + * function of Pin> to Pin. Such a function would not be sound, + * because Box implements Unpin even when T does not, as a result of + * this impl. + * + * We chose this API instead of the alternative for a few reasons: + * - Logically, it is helpful to understand pinning in regard to the + * memory region being pointed to. For this reason none of the + * standard library pointer types support projecting through a pin + * (Box is the only pointer type in std for which this would be + * safe.) + * - It is in practice very useful to have Box be unconditionally + * Unpin because of trait objects, for which the structural auto + * trait functionality does not apply (e.g., Box would + * otherwise not be Unpin). + * + * Another type with the same semantics as Box but only a conditional + * implementation of `Unpin` (where `T: Unpin`) would be valid/safe, and + * could have a method to project a Pin from it. 
+ */ +#[stable(feature = "pin", since = "1.33.0")] +#[rustc_const_unstable(feature = "const_box", issue = "92521")] +impl const Unpin for Box where A: 'static {} + +#[unstable(feature = "generator_trait", issue = "43122")] +impl + Unpin, R, A: Allocator> Generator for Box +where + A: 'static, +{ + type Yield = G::Yield; + type Return = G::Return; + + fn resume(mut self: Pin<&mut Self>, arg: R) -> GeneratorState { + G::resume(Pin::new(&mut *self), arg) + } +} + +#[unstable(feature = "generator_trait", issue = "43122")] +impl, R, A: Allocator> Generator for Pin> +where + A: 'static, +{ + type Yield = G::Yield; + type Return = G::Return; + + fn resume(mut self: Pin<&mut Self>, arg: R) -> GeneratorState { + G::resume((*self).as_mut(), arg) + } +} + +#[stable(feature = "futures_api", since = "1.36.0")] +impl Future for Box +where + A: 'static, +{ + type Output = F::Output; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + F::poll(Pin::new(&mut *self), cx) + } +} + +#[unstable(feature = "async_iterator", issue = "79024")] +impl AsyncIterator for Box { + type Item = S::Item; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut **self).poll_next(cx) + } + + fn size_hint(&self) -> (usize, Option) { + (**self).size_hint() + } +} diff --git a/rust/alloc/collections/mod.rs b/rust/alloc/collections/mod.rs new file mode 100644 index 00000000000000..628a5b155673c9 --- /dev/null +++ b/rust/alloc/collections/mod.rs @@ -0,0 +1,154 @@ +//! Collection types. + +#![stable(feature = "rust1", since = "1.0.0")] + +#[cfg(not(no_global_oom_handling))] +pub mod binary_heap; +#[cfg(not(no_global_oom_handling))] +mod btree; +#[cfg(not(no_global_oom_handling))] +pub mod linked_list; +#[cfg(not(no_global_oom_handling))] +pub mod vec_deque; + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +pub mod btree_map { + //! An ordered map based on a B-Tree. 
+ #[stable(feature = "rust1", since = "1.0.0")] + pub use super::btree::map::*; +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +pub mod btree_set { + //! An ordered set based on a B-Tree. + #[stable(feature = "rust1", since = "1.0.0")] + pub use super::btree::set::*; +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +#[doc(no_inline)] +pub use binary_heap::BinaryHeap; + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +#[doc(no_inline)] +pub use btree_map::BTreeMap; + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +#[doc(no_inline)] +pub use btree_set::BTreeSet; + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +#[doc(no_inline)] +pub use linked_list::LinkedList; + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +#[doc(no_inline)] +pub use vec_deque::VecDeque; + +use crate::alloc::{Layout, LayoutError}; +use core::fmt::Display; + +/// The error type for `try_reserve` methods. +#[derive(Clone, PartialEq, Eq, Debug)] +#[stable(feature = "try_reserve", since = "1.57.0")] +pub struct TryReserveError { + kind: TryReserveErrorKind, +} + +impl TryReserveError { + /// Details about the allocation that caused the error + #[inline] + #[must_use] + #[unstable( + feature = "try_reserve_kind", + reason = "Uncertain how much info should be exposed", + issue = "48043" + )] + pub fn kind(&self) -> TryReserveErrorKind { + self.kind.clone() + } +} + +/// Details of the allocation that caused a `TryReserveError` +#[derive(Clone, PartialEq, Eq, Debug)] +#[unstable( + feature = "try_reserve_kind", + reason = "Uncertain how much info should be exposed", + issue = "48043" +)] +pub enum TryReserveErrorKind { + /// Error due to the computed capacity exceeding the collection's maximum + /// (usually `isize::MAX` bytes). 
+ CapacityOverflow, + + /// The memory allocator returned an error + AllocError { + /// The layout of allocation request that failed + layout: Layout, + + #[doc(hidden)] + #[unstable( + feature = "container_error_extra", + issue = "none", + reason = "\ + Enable exposing the allocator’s custom error value \ + if an associated type is added in the future: \ + https://github.com/rust-lang/wg-allocators/issues/23" + )] + non_exhaustive: (), + }, +} + +#[unstable( + feature = "try_reserve_kind", + reason = "Uncertain how much info should be exposed", + issue = "48043" +)] +impl From for TryReserveError { + #[inline] + fn from(kind: TryReserveErrorKind) -> Self { + Self { kind } + } +} + +#[unstable(feature = "try_reserve_kind", reason = "new API", issue = "48043")] +impl From for TryReserveErrorKind { + /// Always evaluates to [`TryReserveErrorKind::CapacityOverflow`]. + #[inline] + fn from(_: LayoutError) -> Self { + TryReserveErrorKind::CapacityOverflow + } +} + +#[stable(feature = "try_reserve", since = "1.57.0")] +impl Display for TryReserveError { + fn fmt( + &self, + fmt: &mut core::fmt::Formatter<'_>, + ) -> core::result::Result<(), core::fmt::Error> { + fmt.write_str("memory allocation failed")?; + let reason = match self.kind { + TryReserveErrorKind::CapacityOverflow => { + " because the computed capacity exceeded the collection's maximum" + } + TryReserveErrorKind::AllocError { .. } => { + " because the memory allocator returned a error" + } + }; + fmt.write_str(reason) + } +} + +/// An intermediate trait for specialization of `Extend`. +#[doc(hidden)] +trait SpecExtend { + /// Extends `self` with the contents of the given iterator. + fn spec_extend(&mut self, iter: I); +} diff --git a/rust/alloc/fmt.rs b/rust/alloc/fmt.rs new file mode 100644 index 00000000000000..aeb7554f8e914e --- /dev/null +++ b/rust/alloc/fmt.rs @@ -0,0 +1,599 @@ +//! Utilities for formatting and printing `String`s. +//! +//! 
This module contains the runtime support for the [`format!`] syntax extension. +//! This macro is implemented in the compiler to emit calls to this module in +//! order to format arguments at runtime into strings. +//! +//! # Usage +//! +//! The [`format!`] macro is intended to be familiar to those coming from C's +//! `printf`/`fprintf` functions or Python's `str.format` function. +//! +//! Some examples of the [`format!`] extension are: +//! +//! ``` +//! format!("Hello"); // => "Hello" +//! format!("Hello, {}!", "world"); // => "Hello, world!" +//! format!("The number is {}", 1); // => "The number is 1" +//! format!("{:?}", (3, 4)); // => "(3, 4)" +//! format!("{value}", value=4); // => "4" +//! let people = "Rustaceans"; +//! format!("Hello {people}!"); // => "Hello Rustaceans!" +//! format!("{} {}", 1, 2); // => "1 2" +//! format!("{:04}", 42); // => "0042" with leading zeros +//! format!("{:#?}", (100, 200)); // => "( +//! // 100, +//! // 200, +//! // )" +//! ``` +//! +//! From these, you can see that the first argument is a format string. It is +//! required by the compiler for this to be a string literal; it cannot be a +//! variable passed in (in order to perform validity checking). The compiler +//! will then parse the format string and determine if the list of arguments +//! provided is suitable to pass to this format string. +//! +//! To convert a single value to a string, use the [`to_string`] method. This +//! will use the [`Display`] formatting trait. +//! +//! ## Positional parameters +//! +//! Each formatting argument is allowed to specify which value argument it's +//! referencing, and if omitted it is assumed to be "the next argument". For +//! example, the format string `{} {} {}` would take three parameters, and they +//! would be formatted in the same order as they're given. The format string +//! `{2} {1} {0}`, however, would format arguments in reverse order. +//! +//! 
Things can get a little tricky once you start intermingling the two types of +//! positional specifiers. The "next argument" specifier can be thought of as an +//! iterator over the argument. Each time a "next argument" specifier is seen, +//! the iterator advances. This leads to behavior like this: +//! +//! ``` +//! format!("{1} {} {0} {}", 1, 2); // => "2 1 1 2" +//! ``` +//! +//! The internal iterator over the argument has not been advanced by the time +//! the first `{}` is seen, so it prints the first argument. Then upon reaching +//! the second `{}`, the iterator has advanced forward to the second argument. +//! Essentially, parameters that explicitly name their argument do not affect +//! parameters that do not name an argument in terms of positional specifiers. +//! +//! A format string is required to use all of its arguments, otherwise it is a +//! compile-time error. You may refer to the same argument more than once in the +//! format string. +//! +//! ## Named parameters +//! +//! Rust itself does not have a Python-like equivalent of named parameters to a +//! function, but the [`format!`] macro is a syntax extension that allows it to +//! leverage named parameters. Named parameters are listed at the end of the +//! argument list and have the syntax: +//! +//! ```text +//! identifier '=' expression +//! ``` +//! +//! For example, the following [`format!`] expressions all use named arguments: +//! +//! ``` +//! format!("{argument}", argument = "test"); // => "test" +//! format!("{name} {}", 1, name = 2); // => "2 1" +//! format!("{a} {c} {b}", a="a", b='b', c=3); // => "a 3 b" +//! ``` +//! +//! If a named parameter does not appear in the argument list, `format!` will +//! reference a variable with that name in the current scope. +//! +//! ``` +//! let argument = 2 + 2; +//! format!("{argument}"); // => "4" +//! +//! fn make_string(a: u32, b: &str) -> String { +//! format!("{b} {a}") +//! } +//! make_string(927, "label"); // => "label 927" +//! ``` +//! 
+//! It is not valid to put positional parameters (those without names) after +//! arguments that have names. Like with positional parameters, it is not +//! valid to provide named parameters that are unused by the format string. +//! +//! # Formatting Parameters +//! +//! Each argument being formatted can be transformed by a number of formatting +//! parameters (corresponding to `format_spec` in [the syntax](#syntax)). These +//! parameters affect the string representation of what's being formatted. +//! +//! ## Width +//! +//! ``` +//! // All of these print "Hello x    !" +//! println!("Hello {:5}!", "x"); +//! println!("Hello {:1$}!", "x", 5); +//! println!("Hello {1:0$}!", 5, "x"); +//! println!("Hello {:width$}!", "x", width = 5); +//! let width = 5; +//! println!("Hello {:width$}!", "x"); +//! ``` +//! +//! This is a parameter for the "minimum width" that the format should take up. +//! If the value's string does not fill up this many characters, then the +//! padding specified by fill/alignment will be used to take up the required +//! space (see below). +//! +//! The value for the width can also be provided as a [`usize`] in the list of +//! parameters by adding a postfix `$`, indicating that the second argument is +//! a [`usize`] specifying the width. +//! +//! Referring to an argument with the dollar syntax does not affect the "next +//! argument" counter, so it's usually a good idea to refer to arguments by +//! position, or use named arguments. +//! +//! ## Fill/Alignment +//! +//! ``` +//! assert_eq!(format!("Hello {:<5}!", "x"), "Hello x    !"); +//! assert_eq!(format!("Hello {:-<5}!", "x"), "Hello x----!"); +//! assert_eq!(format!("Hello {:^5}!", "x"), "Hello   x  !"); +//! assert_eq!(format!("Hello {:>5}!", "x"), "Hello     x!"); +//! ``` +//! +//! The optional fill character and alignment is provided normally in conjunction with the +//! [`width`](#width) parameter. It must be defined before `width`, right after the `:`. +//! 
This indicates that if the value being formatted is smaller than +//! `width` some extra characters will be printed around it. +//! Filling comes in the following variants for different alignments: +//! +//! * `[fill]<` - the argument is left-aligned in `width` columns +//! * `[fill]^` - the argument is center-aligned in `width` columns +//! * `[fill]>` - the argument is right-aligned in `width` columns +//! +//! The default [fill/alignment](#fillalignment) for non-numerics is a space and +//! left-aligned. The +//! default for numeric formatters is also a space character but with right-alignment. If +//! the `0` flag (see below) is specified for numerics, then the implicit fill character is +//! `0`. +//! +//! Note that alignment might not be implemented by some types. In particular, it +//! is not generally implemented for the `Debug` trait. A good way to ensure +//! padding is applied is to format your input, then pad this resulting string +//! to obtain your output: +//! +//! ``` +//! println!("Hello {:^15}!", format!("{:?}", Some("hi"))); // => "Hello Some("hi") !" +//! ``` +//! +//! ## Sign/`#`/`0` +//! +//! ``` +//! assert_eq!(format!("Hello {:+}!", 5), "Hello +5!"); +//! assert_eq!(format!("{:#x}!", 27), "0x1b!"); +//! assert_eq!(format!("Hello {:05}!", 5), "Hello 00005!"); +//! assert_eq!(format!("Hello {:05}!", -5), "Hello -0005!"); +//! assert_eq!(format!("{:#010x}!", 27), "0x0000001b!"); +//! ``` +//! +//! These are all flags altering the behavior of the formatter. +//! +//! * `+` - This is intended for numeric types and indicates that the sign +//! should always be printed. Positive signs are never printed by +//! default, and the negative sign is only printed by default for signed values. +//! This flag indicates that the correct sign (`+` or `-`) should always be printed. +//! * `-` - Currently not used +//! * `#` - This flag indicates that the "alternate" form of printing should +//! be used. The alternate forms are: +//! 
* `#?` - pretty-print the [`Debug`] formatting (adds linebreaks and indentation) +//! * `#x` - precedes the argument with a `0x` +//! * `#X` - precedes the argument with a `0x` +//! * `#b` - precedes the argument with a `0b` +//! * `#o` - precedes the argument with a `0o` +//! * `0` - This is used to indicate for integer formats that the padding to `width` should +//! both be done with a `0` character as well as be sign-aware. A format +//! like `{:08}` would yield `00000001` for the integer `1`, while the +//! same format would yield `-0000001` for the integer `-1`. Notice that +//! the negative version has one fewer zero than the positive version. +//! Note that padding zeros are always placed after the sign (if any) +//! and before the digits. When used together with the `#` flag, a similar +//! rule applies: padding zeros are inserted after the prefix but before +//! the digits. The prefix is included in the total width. +//! +//! ## Precision +//! +//! For non-numeric types, this can be considered a "maximum width". If the resulting string is +//! longer than this width, then it is truncated down to this many characters and that truncated +//! value is emitted with proper `fill`, `alignment` and `width` if those parameters are set. +//! +//! For integral types, this is ignored. +//! +//! For floating-point types, this indicates how many digits after the decimal point should be +//! printed. +//! +//! There are three possible ways to specify the desired `precision`: +//! +//! 1. An integer `.N`: +//! +//! the integer `N` itself is the precision. +//! +//! 2. An integer or name followed by dollar sign `.N$`: +//! +//! use format *argument* `N` (which must be a `usize`) as the precision. +//! +//! 3. An asterisk `.*`: +//! +//! `.*` means that this `{...}` is associated with *two* format inputs rather than one: the +//! first input holds the `usize` precision, and the second holds the value to print. Note that +//! 
in this case, if one uses the format string `{<arg>:<spec>.*}`, then the `<arg>` part refers +//! to the *value* to print, and the `precision` must come in the input preceding `<arg>`. +//! +//! For example, the following calls all print the same thing `Hello x is 0.01000`: +//! +//! ``` +//! // Hello {arg 0 ("x")} is {arg 1 (0.01) with precision specified inline (5)} +//! println!("Hello {0} is {1:.5}", "x", 0.01); +//! +//! // Hello {arg 1 ("x")} is {arg 2 (0.01) with precision specified in arg 0 (5)} +//! println!("Hello {1} is {2:.0$}", 5, "x", 0.01); +//! +//! // Hello {arg 0 ("x")} is {arg 2 (0.01) with precision specified in arg 1 (5)} +//! println!("Hello {0} is {2:.1$}", "x", 5, 0.01); +//! +//! // Hello {next arg ("x")} is {second of next two args (0.01) with precision +//! // specified in first of next two args (5)} +//! println!("Hello {} is {:.*}", "x", 5, 0.01); +//! +//! // Hello {next arg ("x")} is {arg 2 (0.01) with precision +//! // specified in its predecessor (5)} +//! println!("Hello {} is {2:.*}", "x", 5, 0.01); +//! +//! // Hello {next arg ("x")} is {arg "number" (0.01) with precision specified +//! // in arg "prec" (5)} +//! println!("Hello {} is {number:.prec$}", "x", prec = 5, number = 0.01); +//! ``` +//! +//! While these: +//! +//! ``` +//! println!("{}, `{name:.*}` has 3 fractional digits", "Hello", 3, name=1234.56); +//! println!("{}, `{name:.*}` has 3 characters", "Hello", 3, name="1234.56"); +//! println!("{}, `{name:>8.*}` has 3 right-aligned characters", "Hello", 3, name="1234.56"); +//! ``` +//! +//! print three significantly different things: +//! +//! ```text +//! Hello, `1234.560` has 3 fractional digits +//! Hello, `123` has 3 characters +//! Hello, `     123` has 3 right-aligned characters +//! ``` +//! +//! ## Localization +//! +//! In some programming languages, the behavior of string formatting functions +//! depends on the operating system's locale setting. The format functions +//! 
provided by Rust's standard library do not have any concept of locale and +//! will produce the same results on all systems regardless of user +//! configuration. +//! +//! For example, the following code will always print `1.5` even if the system +//! locale uses a decimal separator other than a dot. +//! +//! ``` +//! println!("The value is {}", 1.5); +//! ``` +//! +//! # Escaping +//! +//! The literal characters `{` and `}` may be included in a string by preceding +//! them with the same character. For example, the `{` character is escaped with +//! `{{` and the `}` character is escaped with `}}`. +//! +//! ``` +//! assert_eq!(format!("Hello {{}}"), "Hello {}"); +//! assert_eq!(format!("{{ Hello"), "{ Hello"); +//! ``` +//! +//! # Syntax +//! +//! To summarize, here you can find the full grammar of format strings. +//! The syntax for the formatting language used is drawn from other languages, +//! so it should not be too alien. Arguments are formatted with Python-like +//! syntax, meaning that arguments are surrounded by `{}` instead of the C-like +//! `%`. The actual grammar for the formatting syntax is: +//! +//! ```text +//! format_string := text [ maybe_format text ] * +//! maybe_format := '{' '{' | '}' '}' | format +//! format := '{' [ argument ] [ ':' format_spec ] '}' +//! argument := integer | identifier +//! +//! format_spec := [[fill]align][sign]['#']['0'][width]['.' precision]type +//! fill := character +//! align := '<' | '^' | '>' +//! sign := '+' | '-' +//! width := count +//! precision := count | '*' +//! type := '' | '?' | 'x?' | 'X?' | identifier +//! count := parameter | integer +//! parameter := argument '$' +//! ``` +//! In the above grammar, `text` must not contain any `'{'` or `'}'` characters. +//! +//! # Formatting traits +//! +//! When requesting that an argument be formatted with a particular type, you +//! are actually requesting that an argument ascribes to a particular trait. +//! 
This allows multiple actual types to be formatted via `{:x}` (like [`i8`] as +//! well as [`isize`]). The current mapping of types to traits is: +//! +//! * *nothing* ⇒ [`Display`] +//! * `?` ⇒ [`Debug`] +//! * `x?` ⇒ [`Debug`] with lower-case hexadecimal integers +//! * `X?` ⇒ [`Debug`] with upper-case hexadecimal integers +//! * `o` ⇒ [`Octal`] +//! * `x` ⇒ [`LowerHex`] +//! * `X` ⇒ [`UpperHex`] +//! * `p` ⇒ [`Pointer`] +//! * `b` ⇒ [`Binary`] +//! * `e` ⇒ [`LowerExp`] +//! * `E` ⇒ [`UpperExp`] +//! +//! What this means is that any type of argument which implements the +//! [`fmt::Binary`][`Binary`] trait can then be formatted with `{:b}`. Implementations +//! are provided for these traits for a number of primitive types by the +//! standard library as well. If no format is specified (as in `{}` or `{:6}`), +//! then the format trait used is the [`Display`] trait. +//! +//! When implementing a format trait for your own type, you will have to +//! implement a method of the signature: +//! +//! ``` +//! # #![allow(dead_code)] +//! # use std::fmt; +//! # struct Foo; // our custom type +//! # impl fmt::Display for Foo { +//! fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +//! # write!(f, "testing, testing") +//! # } } +//! ``` +//! +//! Your type will be passed as `self` by-reference, and then the function +//! should emit output into the `f.buf` stream. It is up to each format trait +//! implementation to correctly adhere to the requested formatting parameters. +//! The values of these parameters will be listed in the fields of the +//! [`Formatter`] struct. In order to help with this, the [`Formatter`] struct also +//! provides some helper methods. +//! +//! Additionally, the return value of this function is [`fmt::Result`] which is a +//! type alias of [Result]<(), [std::fmt::Error]>. Formatting implementations +//! should ensure that they propagate errors from the [`Formatter`] (e.g., when +//! calling [`write!`]). 
However, they should never return errors spuriously. That +//! is, a formatting implementation must and may only return an error if the +//! passed-in [`Formatter`] returns an error. This is because, contrary to what +//! the function signature might suggest, string formatting is an infallible +//! operation. This function only returns a result because writing to the +//! underlying stream might fail and it must provide a way to propagate the fact +//! that an error has occurred back up the stack. +//! +//! An example of implementing the formatting traits would look +//! like: +//! +//! ``` +//! use std::fmt; +//! +//! #[derive(Debug)] +//! struct Vector2D { +//! x: isize, +//! y: isize, +//! } +//! +//! impl fmt::Display for Vector2D { +//! fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +//! // The `f` value implements the `Write` trait, which is what the +//! // write! macro is expecting. Note that this formatting ignores the +//! // various flags provided to format strings. +//! write!(f, "({}, {})", self.x, self.y) +//! } +//! } +//! +//! // Different traits allow different forms of output of a type. The meaning +//! // of this format is to print the magnitude of a vector. +//! impl fmt::Binary for Vector2D { +//! fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { +//! let magnitude = (self.x * self.x + self.y * self.y) as f64; +//! let magnitude = magnitude.sqrt(); +//! +//! // Respect the formatting flags by using the helper method +//! // `pad_integral` on the Formatter object. See the method +//! // documentation for details, and the function `pad` can be used +//! // to pad strings. +//! let decimals = f.precision().unwrap_or(3); +//! let string = format!("{:.*}", decimals, magnitude); +//! f.pad_integral(true, "", &string) +//! } +//! } +//! +//! fn main() { +//! let myvector = Vector2D { x: 3, y: 4 }; +//! +//! println!("{}", myvector); // => "(3, 4)" +//! println!("{:?}", myvector); // => "Vector2D {x: 3, y:4}" +//! 
println!("{:10.3b}", myvector); // => " 5.000" +//! } +//! ``` +//! +//! ### `fmt::Display` vs `fmt::Debug` +//! +//! These two formatting traits have distinct purposes: +//! +//! - [`fmt::Display`][`Display`] implementations assert that the type can be faithfully +//! represented as a UTF-8 string at all times. It is **not** expected that +//! all types implement the [`Display`] trait. +//! - [`fmt::Debug`][`Debug`] implementations should be implemented for **all** public types. +//! Output will typically represent the internal state as faithfully as possible. +//! The purpose of the [`Debug`] trait is to facilitate debugging Rust code. In +//! most cases, using `#[derive(Debug)]` is sufficient and recommended. +//! +//! Some examples of the output from both traits: +//! +//! ``` +//! assert_eq!(format!("{} {:?}", 3, 4), "3 4"); +//! assert_eq!(format!("{} {:?}", 'a', 'b'), "a 'b'"); +//! assert_eq!(format!("{} {:?}", "foo\n", "bar\n"), "foo\n \"bar\\n\""); +//! ``` +//! +//! # Related macros +//! +//! There are a number of related macros in the [`format!`] family. The ones that +//! are currently implemented are: +//! +//! ```ignore (only-for-syntax-highlight) +//! format! // described above +//! write! // first argument is a &mut io::Write, the destination +//! writeln! // same as write but appends a newline +//! print! // the format string is printed to the standard output +//! println! // same as print but appends a newline +//! eprint! // the format string is printed to the standard error +//! eprintln! // same as eprint but appends a newline +//! format_args! // described below. +//! ``` +//! +//! ### `write!` +//! +//! This and [`writeln!`] are two macros which are used to emit the format string +//! to a specified stream. This is used to prevent intermediate allocations of +//! format strings and instead directly write the output. Under the hood, this +//! function is actually invoking the [`write_fmt`] function defined on the +//! 
[`std::io::Write`] trait. Example usage is: +//! +//! ``` +//! # #![allow(unused_must_use)] +//! use std::io::Write; +//! let mut w = Vec::new(); +//! write!(&mut w, "Hello {}!", "world"); +//! ``` +//! +//! ### `print!` +//! +//! This and [`println!`] emit their output to stdout. Similarly to the [`write!`] +//! macro, the goal of these macros is to avoid intermediate allocations when +//! printing output. Example usage is: +//! +//! ``` +//! print!("Hello {}!", "world"); +//! println!("I have a newline {}", "character at the end"); +//! ``` +//! ### `eprint!` +//! +//! The [`eprint!`] and [`eprintln!`] macros are identical to +//! [`print!`] and [`println!`], respectively, except they emit their +//! output to stderr. +//! +//! ### `format_args!` +//! +//! This is a curious macro used to safely pass around +//! an opaque object describing the format string. This object +//! does not require any heap allocations to create, and it only +//! references information on the stack. Under the hood, all of +//! the related macros are implemented in terms of this. First +//! off, some example usage is: +//! +//! ``` +//! # #![allow(unused_must_use)] +//! use std::fmt; +//! use std::io::{self, Write}; +//! +//! let mut some_writer = io::stdout(); +//! write!(&mut some_writer, "{}", format_args!("print with a {}", "macro")); +//! +//! fn my_fmt_fn(args: fmt::Arguments) { +//! write!(&mut io::stdout(), "{}", args); +//! } +//! my_fmt_fn(format_args!(", or a {} too", "function")); +//! ``` +//! +//! The result of the [`format_args!`] macro is a value of type [`fmt::Arguments`]. +//! This structure can then be passed to the [`write`] and [`format`] functions +//! inside this module in order to process the format string. +//! The goal of this macro is to even further prevent intermediate allocations +//! when dealing with formatting strings. +//! +//! For example, a logging library could use the standard formatting syntax, but +//! 
it would internally pass around this structure until it has been determined +//! where output should go to. +//! +//! [`fmt::Result`]: Result "fmt::Result" +//! [Result]: core::result::Result "std::result::Result" +//! [std::fmt::Error]: Error "fmt::Error" +//! [`write`]: write() "fmt::write" +//! [`to_string`]: crate::string::ToString::to_string "ToString::to_string" +//! [`write_fmt`]: ../../std/io/trait.Write.html#method.write_fmt +//! [`std::io::Write`]: ../../std/io/trait.Write.html +//! [`print!`]: ../../std/macro.print.html "print!" +//! [`println!`]: ../../std/macro.println.html "println!" +//! [`eprint!`]: ../../std/macro.eprint.html "eprint!" +//! [`eprintln!`]: ../../std/macro.eprintln.html "eprintln!" +//! [`fmt::Arguments`]: Arguments "fmt::Arguments" +//! [`format`]: format() "fmt::format" + +#![stable(feature = "rust1", since = "1.0.0")] + +#[unstable(feature = "fmt_internals", issue = "none")] +pub use core::fmt::rt; +#[stable(feature = "fmt_flags_align", since = "1.28.0")] +pub use core::fmt::Alignment; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::fmt::Error; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::fmt::{write, ArgumentV1, Arguments}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::fmt::{Binary, Octal}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::fmt::{Debug, Display}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::fmt::{DebugList, DebugMap, DebugSet, DebugStruct, DebugTuple}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::fmt::{Formatter, Result, Write}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::fmt::{LowerExp, UpperExp}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::fmt::{LowerHex, Pointer, UpperHex}; + +#[cfg(not(no_global_oom_handling))] +use crate::string; + +/// The `format` function takes an [`Arguments`] struct and returns the resulting +/// formatted string. 
+/// +/// The [`Arguments`] instance can be created with the [`format_args!`] macro. +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// use std::fmt; +/// +/// let s = fmt::format(format_args!("Hello, {}!", "world")); +/// assert_eq!(s, "Hello, world!"); +/// ``` +/// +/// Please note that using [`format!`] might be preferable. +/// Example: +/// +/// ``` +/// let s = format!("Hello, {}!", "world"); +/// assert_eq!(s, "Hello, world!"); +/// ``` +/// +/// [`format_args!`]: core::format_args +/// [`format!`]: crate::format +#[cfg(not(no_global_oom_handling))] +#[must_use] +#[stable(feature = "rust1", since = "1.0.0")] +pub fn format(args: Arguments<'_>) -> string::String { + let capacity = args.estimated_capacity(); + let mut output = string::String::with_capacity(capacity); + output.write_fmt(args).expect("a formatting trait implementation returned an error"); + output +} diff --git a/rust/alloc/lib.rs b/rust/alloc/lib.rs new file mode 100644 index 00000000000000..6da32df57efb76 --- /dev/null +++ b/rust/alloc/lib.rs @@ -0,0 +1,223 @@ +//! # The Rust core allocation and collections library +//! +//! This library provides smart pointers and collections for managing +//! heap-allocated values. +//! +//! This library, like libcore, normally doesn’t need to be used directly +//! since its contents are re-exported in the [`std` crate](../std/index.html). +//! Crates that use the `#![no_std]` attribute however will typically +//! not depend on `std`, so they’d use this crate instead. +//! +//! ## Boxed values +//! +//! The [`Box`] type is a smart pointer type. There can only be one owner of a +//! [`Box`], and the owner can decide to mutate the contents, which live on the +//! heap. +//! +//! This type can be sent among threads efficiently as the size of a `Box` value +//! is the same as that of a pointer. Tree-like data structures are often built +//! with boxes because each node often has only one owner, the parent. +//! +//! 
## Reference counted pointers +//! +//! The [`Rc`] type is a non-threadsafe reference-counted pointer type intended +//! for sharing memory within a thread. An [`Rc`] pointer wraps a type, `T`, and +//! only allows access to `&T`, a shared reference. +//! +//! This type is useful when inherited mutability (such as using [`Box`]) is too +//! constraining for an application, and is often paired with the [`Cell`] or +//! [`RefCell`] types in order to allow mutation. +//! +//! ## Atomically reference counted pointers +//! +//! The [`Arc`] type is the threadsafe equivalent of the [`Rc`] type. It +//! provides all the same functionality of [`Rc`], except it requires that the +//! contained type `T` is shareable. Additionally, [`Arc<T>`][`Arc`] is itself +//! sendable while [`Rc<T>`][`Rc`] is not. +//! +//! This type allows for shared access to the contained data, and is often +//! paired with synchronization primitives such as mutexes to allow mutation of +//! shared resources. +//! +//! ## Collections +//! +//! Implementations of the most common general purpose data structures are +//! defined in this library. They are re-exported through the +//! [standard collections library](../std/collections/index.html). +//! +//! ## Heap interfaces +//! +//! The [`alloc`](alloc/index.html) module defines the low-level interface to the +//! default global allocator. It is not compatible with the libc allocator API. +//! +//! [`Arc`]: sync +//! [`Box`]: boxed +//! [`Cell`]: core::cell +//! [`Rc`]: rc +//! [`RefCell`]: core::cell + +// To run liballoc tests without x.py without ending up with two copies of liballoc, Miri needs to be +// able to "empty" this crate. See <https://github.com/rust-lang/miri-test-libstd/issues/4>. +// rustc itself never sets the feature, so this line has no effect there. 
+#![cfg(any(not(feature = "miri-test-libstd"), test, doctest))] +#![allow(unused_attributes)] +#![stable(feature = "alloc", since = "1.36.0")] +#![doc( + html_playground_url = "https://play.rust-lang.org/", + issue_tracker_base_url = "https://github.com/rust-lang/rust/issues/", + test(no_crate_inject, attr(allow(unused_variables), deny(warnings))) +)] +#![doc(cfg_hide( + not(test), + not(any(test, bootstrap)), + any(not(feature = "miri-test-libstd"), test, doctest), + no_global_oom_handling, + not(no_global_oom_handling), + target_has_atomic = "ptr" +))] +#![no_std] +#![needs_allocator] +// +// Lints: +#![deny(unsafe_op_in_unsafe_fn)] +#![warn(deprecated_in_future)] +#![warn(missing_debug_implementations)] +#![warn(missing_docs)] +#![allow(explicit_outlives_requirements)] +// +// Library features: +#![feature(alloc_layout_extra)] +#![feature(allocator_api)] +#![feature(array_chunks)] +#![feature(array_methods)] +#![feature(array_windows)] +#![feature(async_iterator)] +#![feature(coerce_unsized)] +#![cfg_attr(not(no_global_oom_handling), feature(const_alloc_error))] +#![feature(const_box)] +#![cfg_attr(not(no_global_oom_handling), feature(const_btree_new))] +#![feature(const_cow_is_borrowed)] +#![feature(const_convert)] +#![feature(const_size_of_val)] +#![feature(const_align_of_val)] +#![feature(const_ptr_read)] +#![feature(const_maybe_uninit_write)] +#![feature(const_maybe_uninit_as_mut_ptr)] +#![feature(const_refs_to_cell)] +#![feature(core_intrinsics)] +#![feature(const_eval_select)] +#![feature(const_pin)] +#![feature(dispatch_from_dyn)] +#![feature(exact_size_is_empty)] +#![feature(extend_one)] +#![feature(fmt_internals)] +#![feature(fn_traits)] +#![feature(inplace_iteration)] +#![feature(iter_advance_by)] +#![feature(layout_for_ptr)] +#![feature(maybe_uninit_slice)] +#![cfg_attr(test, feature(new_uninit))] +#![feature(nonnull_slice_from_raw_parts)] +#![feature(pattern)] +#![feature(ptr_internals)] +#![feature(receiver_trait)] +#![feature(set_ptr_value)] 
+#![feature(slice_group_by)] +#![feature(slice_ptr_get)] +#![feature(slice_ptr_len)] +#![feature(slice_range)] +#![feature(str_internals)] +#![feature(trusted_len)] +#![feature(trusted_random_access)] +#![feature(try_trait_v2)] +#![feature(unicode_internals)] +#![feature(unsize)] +// +// Language features: +#![feature(allocator_internals)] +#![feature(allow_internal_unstable)] +#![feature(associated_type_bounds)] +#![feature(box_syntax)] +#![feature(cfg_sanitize)] +#![cfg_attr(bootstrap, feature(cfg_target_has_atomic))] +#![feature(const_deref)] +#![feature(const_fn_trait_bound)] +#![feature(const_mut_refs)] +#![feature(const_ptr_write)] +#![feature(const_precise_live_drops)] +#![feature(const_trait_impl)] +#![feature(const_try)] +#![feature(dropck_eyepatch)] +#![feature(exclusive_range_pattern)] +#![feature(fundamental)] +#![cfg_attr(not(test), feature(generator_trait))] +#![feature(lang_items)] +#![feature(min_specialization)] +#![feature(negative_impls)] +#![feature(never_type)] +#![feature(nll)] // Not necessary, but here to test the `nll` feature. +#![feature(rustc_allow_const_fn_unstable)] +#![feature(rustc_attrs)] +#![feature(staged_api)] +#![cfg_attr(test, feature(test))] +#![feature(unboxed_closures)] +#![feature(unsized_fn_params)] +#![feature(c_unwind)] +// +// Rustdoc features: +#![feature(doc_cfg)] +#![feature(doc_cfg_hide)] +// Technically, this is a bug in rustdoc: rustdoc sees the documentation on `#[lang = slice_alloc]` +// blocks is for `&[T]`, which also has documentation using this feature in `core`, and gets mad +// that the feature-gate isn't enabled. Ideally, it wouldn't check for the feature gate for docs +// from other crates, but since this can only appear for lang items, it doesn't seem worth fixing. 
+#![feature(intra_doc_pointers)] + +// Allow testing this library +#[cfg(test)] +#[macro_use] +extern crate std; +#[cfg(test)] +extern crate test; + +// Module with internal macros used by other modules (needs to be included before other modules). +#[macro_use] +mod macros; + +mod raw_vec; + +// Heaps provided for low-level allocation strategies + +pub mod alloc; + +// Primitive types using the heaps above + +// Need to conditionally define the mod from `boxed.rs` to avoid +// duplicating the lang-items when building in test cfg; but also need +// to allow code to have `use boxed::Box;` declarations. +#[cfg(not(test))] +pub mod boxed; +#[cfg(test)] +mod boxed { + pub use std::boxed::Box; +} +pub mod borrow; +pub mod collections; +pub mod fmt; +pub mod rc; +pub mod slice; +pub mod str; +pub mod string; +#[cfg(target_has_atomic = "ptr")] +pub mod sync; +#[cfg(all(not(no_global_oom_handling), target_has_atomic = "ptr"))] +pub mod task; +#[cfg(test)] +mod tests; +pub mod vec; + +#[doc(hidden)] +#[unstable(feature = "liballoc_internals", issue = "none", reason = "implementation detail")] +pub mod __export { + pub use core::format_args; +} diff --git a/rust/alloc/macros.rs b/rust/alloc/macros.rs new file mode 100644 index 00000000000000..d3e9e65c3fe57b --- /dev/null +++ b/rust/alloc/macros.rs @@ -0,0 +1,125 @@ +/// Creates a [`Vec`] containing the arguments. +/// +/// `vec!` allows `Vec`s to be defined with the same syntax as array expressions. 
+/// There are two forms of this macro: +/// +/// - Create a [`Vec`] containing a given list of elements: +/// +/// ``` +/// let v = vec![1, 2, 3]; +/// assert_eq!(v[0], 1); +/// assert_eq!(v[1], 2); +/// assert_eq!(v[2], 3); +/// ``` +/// +/// - Create a [`Vec`] from a given element and size: +/// +/// ``` +/// let v = vec![1; 3]; +/// assert_eq!(v, [1, 1, 1]); +/// ``` +/// +/// Note that unlike array expressions this syntax supports all elements +/// which implement [`Clone`] and the number of elements doesn't have to be +/// a constant. +/// +/// This will use `clone` to duplicate an expression, so one should be careful +/// using this with types having a nonstandard `Clone` implementation. For +/// example, `vec![Rc::new(1); 5]` will create a vector of five references +/// to the same boxed integer value, not five references pointing to independently +/// boxed integers. +/// +/// Also, note that `vec![expr; 0]` is allowed, and produces an empty vector. +/// This will still evaluate `expr`, however, and immediately drop the resulting value, so +/// be mindful of side effects. +/// +/// [`Vec`]: crate::vec::Vec +#[cfg(not(test))] +#[macro_export] +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_diagnostic_item = "vec_macro"] +#[allow_internal_unstable(box_syntax, liballoc_internals)] +macro_rules! vec { + () => ( + $crate::__rust_force_expr!($crate::vec::Vec::new()) + ); + ($elem:expr; $n:expr) => ( + $crate::__rust_force_expr!($crate::vec::from_elem($elem, $n)) + ); + ($($x:expr),+ $(,)?) => ( + $crate::__rust_force_expr!(<[_]>::into_vec(box [$($x),+])) + ); +} + +// HACK(japaric): with cfg(test) the inherent `[T]::into_vec` method, which is +// required for this macro definition, is not available. Instead use the +// `slice::into_vec` function which is only available with cfg(test) +// NB see the slice::hack module in slice.rs for more information +#[cfg(test)] +macro_rules! 
vec { + () => ( + $crate::vec::Vec::new() + ); + ($elem:expr; $n:expr) => ( + $crate::vec::from_elem($elem, $n) + ); + ($($x:expr),*) => ( + $crate::slice::into_vec(box [$($x),*]) + ); + ($($x:expr,)*) => (vec![$($x),*]) +} + +/// Creates a `String` using interpolation of runtime expressions. +/// +/// The first argument `format!` receives is a format string. This must be a string +/// literal. The power of the formatting string is in the `{}`s contained. +/// +/// Additional parameters passed to `format!` replace the `{}`s within the +/// formatting string in the order given unless named or positional parameters +/// are used; see [`std::fmt`] for more information. +/// +/// A common use for `format!` is concatenation and interpolation of strings. +/// The same convention is used with [`print!`] and [`write!`] macros, +/// depending on the intended destination of the string. +/// +/// To convert a single value to a string, use the [`to_string`] method. This +/// will use the [`Display`] formatting trait. +/// +/// [`std::fmt`]: ../std/fmt/index.html +/// [`print!`]: ../std/macro.print.html +/// [`write!`]: core::write +/// [`to_string`]: crate::string::ToString +/// [`Display`]: core::fmt::Display +/// +/// # Panics +/// +/// `format!` panics if a formatting trait implementation returns an error. +/// This indicates an incorrect implementation +/// since `fmt::Write for String` never returns an error itself. +/// +/// # Examples +/// +/// ``` +/// format!("test"); +/// format!("hello {}", "world!"); +/// format!("x = {}, y = {y}", 10, y = 30); +/// ``` +#[macro_export] +#[stable(feature = "rust1", since = "1.0.0")] +#[cfg_attr(not(test), rustc_diagnostic_item = "format_macro")] +macro_rules! format { + ($($arg:tt)*) => {{ + let res = $crate::fmt::format($crate::__export::format_args!($($arg)*)); + res + }} +} + +/// Force AST node to an expression to improve diagnostics in pattern position. 
+#[doc(hidden)] +#[macro_export] +#[unstable(feature = "liballoc_internals", issue = "none", reason = "implementation detail")] +macro_rules! __rust_force_expr { + ($e:expr) => { + $e + }; +} diff --git a/rust/alloc/raw_vec.rs b/rust/alloc/raw_vec.rs new file mode 100644 index 00000000000000..8fa0242ca9a9f0 --- /dev/null +++ b/rust/alloc/raw_vec.rs @@ -0,0 +1,519 @@ +#![unstable(feature = "raw_vec_internals", reason = "unstable const warnings", issue = "none")] + +use core::alloc::LayoutError; +use core::cmp; +use core::intrinsics; +use core::mem::{self, ManuallyDrop, MaybeUninit}; +use core::ops::Drop; +use core::ptr::{self, NonNull, Unique}; +use core::slice; + +#[cfg(not(no_global_oom_handling))] +use crate::alloc::handle_alloc_error; +use crate::alloc::{Allocator, Global, Layout}; +use crate::boxed::Box; +use crate::collections::TryReserveError; +use crate::collections::TryReserveErrorKind::*; + +#[cfg(test)] +mod tests; + +#[cfg(not(no_global_oom_handling))] +enum AllocInit { + /// The contents of the new memory are uninitialized. + Uninitialized, + /// The new memory is guaranteed to be zeroed. + Zeroed, +} + +/// A low-level utility for more ergonomically allocating, reallocating, and deallocating +/// a buffer of memory on the heap without having to worry about all the corner cases +/// involved. This type is excellent for building your own data structures like Vec and VecDeque. +/// In particular: +/// +/// * Produces `Unique::dangling()` on zero-sized types. +/// * Produces `Unique::dangling()` on zero-length allocations. +/// * Avoids freeing `Unique::dangling()`. +/// * Catches all overflows in capacity computations (promotes them to "capacity overflow" panics). +/// * Guards against 32-bit systems allocating more than isize::MAX bytes. +/// * Guards against overflowing your length. +/// * Calls `handle_alloc_error` for fallible allocations. +/// * Contains a `ptr::Unique` and thus endows the user with all related benefits. 
+/// * Uses the excess returned from the allocator to use the largest available capacity. +/// +/// This type does not in anyway inspect the memory that it manages. When dropped it *will* +/// free its memory, but it *won't* try to drop its contents. It is up to the user of `RawVec` +/// to handle the actual things *stored* inside of a `RawVec`. +/// +/// Note that the excess of a zero-sized types is always infinite, so `capacity()` always returns +/// `usize::MAX`. This means that you need to be careful when round-tripping this type with a +/// `Box<[T]>`, since `capacity()` won't yield the length. +#[allow(missing_debug_implementations)] +pub(crate) struct RawVec { + ptr: Unique, + cap: usize, + alloc: A, +} + +impl RawVec { + /// HACK(Centril): This exists because stable `const fn` can only call stable `const fn`, so + /// they cannot call `Self::new()`. + /// + /// If you change `RawVec::new` or dependencies, please take care to not introduce anything + /// that would truly const-call something unstable. + pub const NEW: Self = Self::new(); + + /// Creates the biggest possible `RawVec` (on the system heap) + /// without allocating. If `T` has positive size, then this makes a + /// `RawVec` with capacity `0`. If `T` is zero-sized, then it makes a + /// `RawVec` with capacity `usize::MAX`. Useful for implementing + /// delayed allocation. + #[must_use] + pub const fn new() -> Self { + Self::new_in(Global) + } + + /// Creates a `RawVec` (on the system heap) with exactly the + /// capacity and alignment requirements for a `[T; capacity]`. This is + /// equivalent to calling `RawVec::new` when `capacity` is `0` or `T` is + /// zero-sized. Note that if `T` is zero-sized this means you will + /// *not* get a `RawVec` with the requested capacity. + /// + /// # Panics + /// + /// Panics if the requested capacity exceeds `isize::MAX` bytes. + /// + /// # Aborts + /// + /// Aborts on OOM. 
+ #[cfg(not(any(no_global_oom_handling, test)))] + #[must_use] + #[inline] + pub fn with_capacity(capacity: usize) -> Self { + Self::with_capacity_in(capacity, Global) + } + + /// Like `with_capacity`, but guarantees the buffer is zeroed. + #[cfg(not(any(no_global_oom_handling, test)))] + #[must_use] + #[inline] + pub fn with_capacity_zeroed(capacity: usize) -> Self { + Self::with_capacity_zeroed_in(capacity, Global) + } +} + +impl RawVec { + // Tiny Vecs are dumb. Skip to: + // - 8 if the element size is 1, because any heap allocators is likely + // to round up a request of less than 8 bytes to at least 8 bytes. + // - 4 if elements are moderate-sized (<= 1 KiB). + // - 1 otherwise, to avoid wasting too much space for very short Vecs. + pub(crate) const MIN_NON_ZERO_CAP: usize = if mem::size_of::() == 1 { + 8 + } else if mem::size_of::() <= 1024 { + 4 + } else { + 1 + }; + + /// Like `new`, but parameterized over the choice of allocator for + /// the returned `RawVec`. + #[rustc_allow_const_fn_unstable(const_fn)] + pub const fn new_in(alloc: A) -> Self { + // `cap: 0` means "unallocated". zero-sized types are ignored. + Self { ptr: Unique::dangling(), cap: 0, alloc } + } + + /// Like `with_capacity`, but parameterized over the choice of + /// allocator for the returned `RawVec`. + #[cfg(not(no_global_oom_handling))] + #[inline] + pub fn with_capacity_in(capacity: usize, alloc: A) -> Self { + Self::allocate_in(capacity, AllocInit::Uninitialized, alloc) + } + + /// Like `with_capacity_zeroed`, but parameterized over the choice + /// of allocator for the returned `RawVec`. + #[cfg(not(no_global_oom_handling))] + #[inline] + pub fn with_capacity_zeroed_in(capacity: usize, alloc: A) -> Self { + Self::allocate_in(capacity, AllocInit::Zeroed, alloc) + } + + /// Converts the entire buffer into `Box<[MaybeUninit]>` with the specified `len`. + /// + /// Note that this will correctly reconstitute any `cap` changes + /// that may have been performed. 
(See description of type for details.) + /// + /// # Safety + /// + /// * `len` must be greater than or equal to the most recently requested capacity, and + /// * `len` must be less than or equal to `self.capacity()`. + /// + /// Note, that the requested capacity and `self.capacity()` could differ, as + /// an allocator could overallocate and return a greater memory block than requested. + pub unsafe fn into_box(self, len: usize) -> Box<[MaybeUninit], A> { + // Sanity-check one half of the safety requirement (we cannot check the other half). + debug_assert!( + len <= self.capacity(), + "`len` must be smaller than or equal to `self.capacity()`" + ); + + let me = ManuallyDrop::new(self); + unsafe { + let slice = slice::from_raw_parts_mut(me.ptr() as *mut MaybeUninit, len); + Box::from_raw_in(slice, ptr::read(&me.alloc)) + } + } + + #[cfg(not(no_global_oom_handling))] + fn allocate_in(capacity: usize, init: AllocInit, alloc: A) -> Self { + if mem::size_of::() == 0 { + Self::new_in(alloc) + } else { + // We avoid `unwrap_or_else` here because it bloats the amount of + // LLVM IR generated. + let layout = match Layout::array::(capacity) { + Ok(layout) => layout, + Err(_) => capacity_overflow(), + }; + match alloc_guard(layout.size()) { + Ok(_) => {} + Err(_) => capacity_overflow(), + } + let result = match init { + AllocInit::Uninitialized => alloc.allocate(layout), + AllocInit::Zeroed => alloc.allocate_zeroed(layout), + }; + let ptr = match result { + Ok(ptr) => ptr, + Err(_) => handle_alloc_error(layout), + }; + + // Allocators currently return a `NonNull<[u8]>` whose length + // matches the size requested. If that ever changes, the capacity + // here should change to `ptr.len() / mem::size_of::()`. + Self { + ptr: unsafe { Unique::new_unchecked(ptr.cast().as_ptr()) }, + cap: capacity, + alloc, + } + } + } + + /// Reconstitutes a `RawVec` from a pointer, capacity, and allocator. 
+ /// + /// # Safety + /// + /// The `ptr` must be allocated (via the given allocator `alloc`), and with the given + /// `capacity`. + /// The `capacity` cannot exceed `isize::MAX` for sized types. (only a concern on 32-bit + /// systems). ZST vectors may have a capacity up to `usize::MAX`. + /// If the `ptr` and `capacity` come from a `RawVec` created via `alloc`, then this is + /// guaranteed. + #[inline] + pub unsafe fn from_raw_parts_in(ptr: *mut T, capacity: usize, alloc: A) -> Self { + Self { ptr: unsafe { Unique::new_unchecked(ptr) }, cap: capacity, alloc } + } + + /// Gets a raw pointer to the start of the allocation. Note that this is + /// `Unique::dangling()` if `capacity == 0` or `T` is zero-sized. In the former case, you must + /// be careful. + #[inline] + pub fn ptr(&self) -> *mut T { + self.ptr.as_ptr() + } + + /// Gets the capacity of the allocation. + /// + /// This will always be `usize::MAX` if `T` is zero-sized. + #[inline(always)] + pub fn capacity(&self) -> usize { + if mem::size_of::() == 0 { usize::MAX } else { self.cap } + } + + /// Returns a shared reference to the allocator backing this `RawVec`. + pub fn allocator(&self) -> &A { + &self.alloc + } + + fn current_memory(&self) -> Option<(NonNull, Layout)> { + if mem::size_of::() == 0 || self.cap == 0 { + None + } else { + // We have an allocated chunk of memory, so we can bypass runtime + // checks to get our current layout. + unsafe { + let align = mem::align_of::(); + let size = mem::size_of::() * self.cap; + let layout = Layout::from_size_align_unchecked(size, align); + Some((self.ptr.cast().into(), layout)) + } + } + } + + /// Ensures that the buffer contains at least enough space to hold `len + + /// additional` elements. If it doesn't already have enough capacity, will + /// reallocate enough space plus comfortable slack space to get amortized + /// *O*(1) behavior. Will limit this behavior if it would needlessly cause + /// itself to panic. 
+ /// + /// If `len` exceeds `self.capacity()`, this may fail to actually allocate + /// the requested space. This is not really unsafe, but the unsafe + /// code *you* write that relies on the behavior of this function may break. + /// + /// This is ideal for implementing a bulk-push operation like `extend`. + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` bytes. + /// + /// # Aborts + /// + /// Aborts on OOM. + #[cfg(not(no_global_oom_handling))] + #[inline] + pub fn reserve(&mut self, len: usize, additional: usize) { + // Callers expect this function to be very cheap when there is already sufficient capacity. + // Therefore, we move all the resizing and error-handling logic from grow_amortized and + // handle_reserve behind a call, while making sure that this function is likely to be + // inlined as just a comparison and a call if the comparison fails. + #[cold] + fn do_reserve_and_handle( + slf: &mut RawVec, + len: usize, + additional: usize, + ) { + handle_reserve(slf.grow_amortized(len, additional)); + } + + if self.needs_to_grow(len, additional) { + do_reserve_and_handle(self, len, additional); + } + } + + /// A specialized version of `reserve()` used only by the hot and + /// oft-instantiated `Vec::push()`, which does its own capacity check. + #[cfg(not(no_global_oom_handling))] + #[inline(never)] + pub fn reserve_for_push(&mut self, len: usize) { + handle_reserve(self.grow_amortized(len, 1)); + } + + /// The same as `reserve`, but returns on errors instead of panicking or aborting. + pub fn try_reserve(&mut self, len: usize, additional: usize) -> Result<(), TryReserveError> { + if self.needs_to_grow(len, additional) { + self.grow_amortized(len, additional) + } else { + Ok(()) + } + } + + /// Ensures that the buffer contains at least enough space to hold `len + + /// additional` elements. If it doesn't already, will reallocate the + /// minimum possible amount of memory necessary. 
Generally this will be + /// exactly the amount of memory necessary, but in principle the allocator + /// is free to give back more than we asked for. + /// + /// If `len` exceeds `self.capacity()`, this may fail to actually allocate + /// the requested space. This is not really unsafe, but the unsafe code + /// *you* write that relies on the behavior of this function may break. + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` bytes. + /// + /// # Aborts + /// + /// Aborts on OOM. + #[cfg(not(no_global_oom_handling))] + pub fn reserve_exact(&mut self, len: usize, additional: usize) { + handle_reserve(self.try_reserve_exact(len, additional)); + } + + /// The same as `reserve_exact`, but returns on errors instead of panicking or aborting. + pub fn try_reserve_exact( + &mut self, + len: usize, + additional: usize, + ) -> Result<(), TryReserveError> { + if self.needs_to_grow(len, additional) { self.grow_exact(len, additional) } else { Ok(()) } + } + + /// Shrinks the buffer down to the specified capacity. If the given amount + /// is 0, actually completely deallocates. + /// + /// # Panics + /// + /// Panics if the given amount is *larger* than the current capacity. + /// + /// # Aborts + /// + /// Aborts on OOM. + #[cfg(not(no_global_oom_handling))] + pub fn shrink_to_fit(&mut self, cap: usize) { + handle_reserve(self.shrink(cap)); + } +} + +impl RawVec { + /// Returns if the buffer needs to grow to fulfill the needed extra capacity. + /// Mainly used to make inlining reserve-calls possible without inlining `grow`. + fn needs_to_grow(&self, len: usize, additional: usize) -> bool { + additional > self.capacity().wrapping_sub(len) + } + + fn set_ptr_and_cap(&mut self, ptr: NonNull<[u8]>, cap: usize) { + // Allocators currently return a `NonNull<[u8]>` whose length matches + // the size requested. If that ever changes, the capacity here should + // change to `ptr.len() / mem::size_of::()`. 
+ self.ptr = unsafe { Unique::new_unchecked(ptr.cast().as_ptr()) }; + self.cap = cap; + } + + // This method is usually instantiated many times. So we want it to be as + // small as possible, to improve compile times. But we also want as much of + // its contents to be statically computable as possible, to make the + // generated code run faster. Therefore, this method is carefully written + // so that all of the code that depends on `T` is within it, while as much + // of the code that doesn't depend on `T` as possible is in functions that + // are non-generic over `T`. + fn grow_amortized(&mut self, len: usize, additional: usize) -> Result<(), TryReserveError> { + // This is ensured by the calling contexts. + debug_assert!(additional > 0); + + if mem::size_of::() == 0 { + // Since we return a capacity of `usize::MAX` when `elem_size` is + // 0, getting to here necessarily means the `RawVec` is overfull. + return Err(CapacityOverflow.into()); + } + + // Nothing we can really do about these checks, sadly. + let required_cap = len.checked_add(additional).ok_or(CapacityOverflow)?; + + // This guarantees exponential growth. The doubling cannot overflow + // because `cap <= isize::MAX` and the type of `cap` is `usize`. + let cap = cmp::max(self.cap * 2, required_cap); + let cap = cmp::max(Self::MIN_NON_ZERO_CAP, cap); + + let new_layout = Layout::array::(cap); + + // `finish_grow` is non-generic over `T`. + let ptr = finish_grow(new_layout, self.current_memory(), &mut self.alloc)?; + self.set_ptr_and_cap(ptr, cap); + Ok(()) + } + + // The constraints on this method are much the same as those on + // `grow_amortized`, but this method is usually instantiated less often so + // it's less critical. + fn grow_exact(&mut self, len: usize, additional: usize) -> Result<(), TryReserveError> { + if mem::size_of::() == 0 { + // Since we return a capacity of `usize::MAX` when the type size is + // 0, getting to here necessarily means the `RawVec` is overfull. 
+ return Err(CapacityOverflow.into()); + } + + let cap = len.checked_add(additional).ok_or(CapacityOverflow)?; + let new_layout = Layout::array::(cap); + + // `finish_grow` is non-generic over `T`. + let ptr = finish_grow(new_layout, self.current_memory(), &mut self.alloc)?; + self.set_ptr_and_cap(ptr, cap); + Ok(()) + } + + fn shrink(&mut self, cap: usize) -> Result<(), TryReserveError> { + assert!(cap <= self.capacity(), "Tried to shrink to a larger capacity"); + + let (ptr, layout) = if let Some(mem) = self.current_memory() { mem } else { return Ok(()) }; + let new_size = cap * mem::size_of::(); + + let ptr = unsafe { + let new_layout = Layout::from_size_align_unchecked(new_size, layout.align()); + self.alloc + .shrink(ptr, layout, new_layout) + .map_err(|_| AllocError { layout: new_layout, non_exhaustive: () })? + }; + self.set_ptr_and_cap(ptr, cap); + Ok(()) + } +} + +// This function is outside `RawVec` to minimize compile times. See the comment +// above `RawVec::grow_amortized` for details. (The `A` parameter isn't +// significant, because the number of different `A` types seen in practice is +// much smaller than the number of `T` types.) +#[inline(never)] +fn finish_grow( + new_layout: Result, + current_memory: Option<(NonNull, Layout)>, + alloc: &mut A, +) -> Result, TryReserveError> +where + A: Allocator, +{ + // Check for the error here to minimize the size of `RawVec::grow_*`. 
+ let new_layout = new_layout.map_err(|_| CapacityOverflow)?; + + alloc_guard(new_layout.size())?; + + let memory = if let Some((ptr, old_layout)) = current_memory { + debug_assert_eq!(old_layout.align(), new_layout.align()); + unsafe { + // The allocator checks for alignment equality + intrinsics::assume(old_layout.align() == new_layout.align()); + alloc.grow(ptr, old_layout, new_layout) + } + } else { + alloc.allocate(new_layout) + }; + + memory.map_err(|_| AllocError { layout: new_layout, non_exhaustive: () }.into()) +} + +unsafe impl<#[may_dangle] T, A: Allocator> Drop for RawVec { + /// Frees the memory owned by the `RawVec` *without* trying to drop its contents. + fn drop(&mut self) { + if let Some((ptr, layout)) = self.current_memory() { + unsafe { self.alloc.deallocate(ptr, layout) } + } + } +} + +// Central function for reserve error handling. +#[cfg(not(no_global_oom_handling))] +#[inline] +fn handle_reserve(result: Result<(), TryReserveError>) { + match result.map_err(|e| e.kind()) { + Err(CapacityOverflow) => capacity_overflow(), + Err(AllocError { layout, .. }) => handle_alloc_error(layout), + Ok(()) => { /* yay */ } + } +} + +// We need to guarantee the following: +// * We don't ever allocate `> isize::MAX` byte-size objects. +// * We don't overflow `usize::MAX` and actually allocate too little. +// +// On 64-bit we just need to check for overflow since trying to allocate +// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add +// an extra guard for this in case we're running on a platform which can use +// all 4GB in user-space, e.g., PAE or x32. + +#[inline] +fn alloc_guard(alloc_size: usize) -> Result<(), TryReserveError> { + if usize::BITS < 64 && alloc_size > isize::MAX as usize { + Err(CapacityOverflow.into()) + } else { + Ok(()) + } +} + +// One central function responsible for reporting capacity overflows. 
This'll +// ensure that the code generation related to these panics is minimal as there's +// only one location which panics rather than a bunch throughout the module. +#[cfg(not(no_global_oom_handling))] +fn capacity_overflow() -> ! { + panic!("capacity overflow"); +} diff --git a/rust/alloc/slice.rs b/rust/alloc/slice.rs new file mode 100644 index 00000000000000..f0397d08f95a8f --- /dev/null +++ b/rust/alloc/slice.rs @@ -0,0 +1,1191 @@ +//! A dynamically-sized view into a contiguous sequence, `[T]`. +//! +//! *[See also the slice primitive type](slice).* +//! +//! Slices are a view into a block of memory represented as a pointer and a +//! length. +//! +//! ``` +//! // slicing a Vec +//! let vec = vec![1, 2, 3]; +//! let int_slice = &vec[..]; +//! // coercing an array to a slice +//! let str_slice: &[&str] = &["one", "two", "three"]; +//! ``` +//! +//! Slices are either mutable or shared. The shared slice type is `&[T]`, +//! while the mutable slice type is `&mut [T]`, where `T` represents the element +//! type. For example, you can mutate the block of memory that a mutable slice +//! points to: +//! +//! ``` +//! let x = &mut [1, 2, 3]; +//! x[1] = 7; +//! assert_eq!(x, &[1, 7, 3]); +//! ``` +//! +//! Here are some of the things this module contains: +//! +//! ## Structs +//! +//! There are several structs that are useful for slices, such as [`Iter`], which +//! represents iteration over a slice. +//! +//! ## Trait Implementations +//! +//! There are several implementations of common traits for slices. Some examples +//! include: +//! +//! * [`Clone`] +//! * [`Eq`], [`Ord`] - for slices whose element type are [`Eq`] or [`Ord`]. +//! * [`Hash`] - for slices whose element type is [`Hash`]. +//! +//! ## Iteration +//! +//! The slices implement `IntoIterator`. The iterator yields references to the +//! slice elements. +//! +//! ``` +//! let numbers = &[0, 1, 2]; +//! for n in numbers { +//! println!("{} is a number!", n); +//! } +//! ``` +//! +//! 
The mutable slice yields mutable references to the elements: +//! +//! ``` +//! let mut scores = [7, 8, 9]; +//! for score in &mut scores[..] { +//! *score += 1; +//! } +//! ``` +//! +//! This iterator yields mutable references to the slice's elements, so while +//! the element type of the slice is `i32`, the element type of the iterator is +//! `&mut i32`. +//! +//! * [`.iter`] and [`.iter_mut`] are the explicit methods to return the default +//! iterators. +//! * Further methods that return iterators are [`.split`], [`.splitn`], +//! [`.chunks`], [`.windows`] and more. +//! +//! [`Hash`]: core::hash::Hash +//! [`.iter`]: slice::iter +//! [`.iter_mut`]: slice::iter_mut +//! [`.split`]: slice::split +//! [`.splitn`]: slice::splitn +//! [`.chunks`]: slice::chunks +//! [`.windows`]: slice::windows +#![stable(feature = "rust1", since = "1.0.0")] +// Many of the usings in this module are only used in the test configuration. +// It's cleaner to just turn off the unused_imports warning than to fix them. 
+#![cfg_attr(test, allow(unused_imports, dead_code))] + +use core::borrow::{Borrow, BorrowMut}; +#[cfg(not(no_global_oom_handling))] +use core::cmp::Ordering::{self, Less}; +#[cfg(not(no_global_oom_handling))] +use core::mem; +#[cfg(not(no_global_oom_handling))] +use core::mem::size_of; +#[cfg(not(no_global_oom_handling))] +use core::ptr; + +use crate::alloc::Allocator; +#[cfg(not(no_global_oom_handling))] +use crate::alloc::Global; +#[cfg(not(no_global_oom_handling))] +use crate::borrow::ToOwned; +use crate::boxed::Box; +use crate::vec::Vec; + +#[unstable(feature = "slice_range", issue = "76393")] +pub use core::slice::range; +#[unstable(feature = "array_chunks", issue = "74985")] +pub use core::slice::ArrayChunks; +#[unstable(feature = "array_chunks", issue = "74985")] +pub use core::slice::ArrayChunksMut; +#[unstable(feature = "array_windows", issue = "75027")] +pub use core::slice::ArrayWindows; +#[stable(feature = "inherent_ascii_escape", since = "1.60.0")] +pub use core::slice::EscapeAscii; +#[stable(feature = "slice_get_slice", since = "1.28.0")] +pub use core::slice::SliceIndex; +#[stable(feature = "from_ref", since = "1.28.0")] +pub use core::slice::{from_mut, from_ref}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::slice::{from_raw_parts, from_raw_parts_mut}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::slice::{Chunks, Windows}; +#[stable(feature = "chunks_exact", since = "1.31.0")] +pub use core::slice::{ChunksExact, ChunksExactMut}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::slice::{ChunksMut, Split, SplitMut}; +#[unstable(feature = "slice_group_by", issue = "80552")] +pub use core::slice::{GroupBy, GroupByMut}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::slice::{Iter, IterMut}; +#[stable(feature = "rchunks", since = "1.31.0")] +pub use core::slice::{RChunks, RChunksExact, RChunksExactMut, RChunksMut}; +#[stable(feature = "slice_rsplit", since = "1.27.0")] +pub use 
core::slice::{RSplit, RSplitMut}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::slice::{RSplitN, RSplitNMut, SplitN, SplitNMut}; +#[stable(feature = "split_inclusive", since = "1.51.0")] +pub use core::slice::{SplitInclusive, SplitInclusiveMut}; + +//////////////////////////////////////////////////////////////////////////////// +// Basic slice extension methods +//////////////////////////////////////////////////////////////////////////////// + +// HACK(japaric) needed for the implementation of `vec!` macro during testing +// N.B., see the `hack` module in this file for more details. +#[cfg(test)] +pub use hack::into_vec; + +// HACK(japaric) needed for the implementation of `Vec::clone` during testing +// N.B., see the `hack` module in this file for more details. +#[cfg(test)] +pub use hack::to_vec; + +// HACK(japaric): With cfg(test) `impl [T]` is not available, these three +// functions are actually methods that are in `impl [T]` but not in +// `core::slice::SliceExt` - we need to supply these functions for the +// `test_permutations` test +mod hack { + use core::alloc::Allocator; + + use crate::boxed::Box; + use crate::vec::Vec; + + // We shouldn't add inline attribute to this since this is used in + // `vec!` macro mostly and causes perf regression. See #71204 for + // discussion and perf results. 
+ pub fn into_vec(b: Box<[T], A>) -> Vec { + unsafe { + let len = b.len(); + let (b, alloc) = Box::into_raw_with_allocator(b); + Vec::from_raw_parts_in(b as *mut T, len, len, alloc) + } + } + + #[cfg(not(no_global_oom_handling))] + #[inline] + pub fn to_vec(s: &[T], alloc: A) -> Vec { + T::to_vec(s, alloc) + } + + #[cfg(not(no_global_oom_handling))] + pub trait ConvertVec { + fn to_vec(s: &[Self], alloc: A) -> Vec + where + Self: Sized; + } + + #[cfg(not(no_global_oom_handling))] + impl ConvertVec for T { + #[inline] + default fn to_vec(s: &[Self], alloc: A) -> Vec { + struct DropGuard<'a, T, A: Allocator> { + vec: &'a mut Vec, + num_init: usize, + } + impl<'a, T, A: Allocator> Drop for DropGuard<'a, T, A> { + #[inline] + fn drop(&mut self) { + // SAFETY: + // items were marked initialized in the loop below + unsafe { + self.vec.set_len(self.num_init); + } + } + } + let mut vec = Vec::with_capacity_in(s.len(), alloc); + let mut guard = DropGuard { vec: &mut vec, num_init: 0 }; + let slots = guard.vec.spare_capacity_mut(); + // .take(slots.len()) is necessary for LLVM to remove bounds checks + // and has better codegen than zip. + for (i, b) in s.iter().enumerate().take(slots.len()) { + guard.num_init = i; + slots[i].write(b.clone()); + } + core::mem::forget(guard); + // SAFETY: + // the vec was allocated and initialized above to at least this length. + unsafe { + vec.set_len(s.len()); + } + vec + } + } + + #[cfg(not(no_global_oom_handling))] + impl ConvertVec for T { + #[inline] + fn to_vec(s: &[Self], alloc: A) -> Vec { + let mut v = Vec::with_capacity_in(s.len(), alloc); + // SAFETY: + // allocated above with the capacity of `s`, and initialize to `s.len()` in + // ptr::copy_to_non_overlapping below. + unsafe { + s.as_ptr().copy_to_nonoverlapping(v.as_mut_ptr(), s.len()); + v.set_len(s.len()); + } + v + } + } +} + +#[lang = "slice_alloc"] +#[cfg(not(test))] +impl [T] { + /// Sorts the slice. 
+ /// + /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case. + /// + /// When applicable, unstable sorting is preferred because it is generally faster than stable + /// sorting and it doesn't allocate auxiliary memory. + /// See [`sort_unstable`](slice::sort_unstable). + /// + /// # Current implementation + /// + /// The current algorithm is an adaptive, iterative merge sort inspired by + /// [timsort](https://en.wikipedia.org/wiki/Timsort). + /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of + /// two or more sorted sequences concatenated one after another. + /// + /// Also, it allocates temporary storage half the size of `self`, but for short slices a + /// non-allocating insertion sort is used instead. + /// + /// # Examples + /// + /// ``` + /// let mut v = [-5, 4, 1, -3, 2]; + /// + /// v.sort(); + /// assert!(v == [-5, -3, 1, 2, 4]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn sort(&mut self) + where + T: Ord, + { + merge_sort(self, |a, b| a.lt(b)); + } + + /// Sorts the slice with a comparator function. + /// + /// This sort is stable (i.e., does not reorder equal elements) and *O*(*n* \* log(*n*)) worst-case. + /// + /// The comparator function must define a total ordering for the elements in the slice. If + /// the ordering is not total, the order of the elements is unspecified. An order is a + /// total order if it is (for all `a`, `b` and `c`): + /// + /// * total and antisymmetric: exactly one of `a < b`, `a == b` or `a > b` is true, and + /// * transitive, `a < b` and `b < c` implies `a < c`. The same must hold for both `==` and `>`. + /// + /// For example, while [`f64`] doesn't implement [`Ord`] because `NaN != NaN`, we can use + /// `partial_cmp` as our sort function when we know the slice doesn't contain a `NaN`. 
+ /// + /// ``` + /// let mut floats = [5f64, 4.0, 1.0, 3.0, 2.0]; + /// floats.sort_by(|a, b| a.partial_cmp(b).unwrap()); + /// assert_eq!(floats, [1.0, 2.0, 3.0, 4.0, 5.0]); + /// ``` + /// + /// When applicable, unstable sorting is preferred because it is generally faster than stable + /// sorting and it doesn't allocate auxiliary memory. + /// See [`sort_unstable_by`](slice::sort_unstable_by). + /// + /// # Current implementation + /// + /// The current algorithm is an adaptive, iterative merge sort inspired by + /// [timsort](https://en.wikipedia.org/wiki/Timsort). + /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of + /// two or more sorted sequences concatenated one after another. + /// + /// Also, it allocates temporary storage half the size of `self`, but for short slices a + /// non-allocating insertion sort is used instead. + /// + /// # Examples + /// + /// ``` + /// let mut v = [5, 4, 1, 3, 2]; + /// v.sort_by(|a, b| a.cmp(b)); + /// assert!(v == [1, 2, 3, 4, 5]); + /// + /// // reverse sorting + /// v.sort_by(|a, b| b.cmp(a)); + /// assert!(v == [5, 4, 3, 2, 1]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn sort_by(&mut self, mut compare: F) + where + F: FnMut(&T, &T) -> Ordering, + { + merge_sort(self, |a, b| compare(a, b) == Less); + } + + /// Sorts the slice with a key extraction function. + /// + /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* \* log(*n*)) + /// worst-case, where the key function is *O*(*m*). + /// + /// For expensive key functions (e.g. functions that are not simple property accesses or + /// basic operations), [`sort_by_cached_key`](slice::sort_by_cached_key) is likely to be + /// significantly faster, as it does not recompute element keys. 
+ /// + /// When applicable, unstable sorting is preferred because it is generally faster than stable + /// sorting and it doesn't allocate auxiliary memory. + /// See [`sort_unstable_by_key`](slice::sort_unstable_by_key). + /// + /// # Current implementation + /// + /// The current algorithm is an adaptive, iterative merge sort inspired by + /// [timsort](https://en.wikipedia.org/wiki/Timsort). + /// It is designed to be very fast in cases where the slice is nearly sorted, or consists of + /// two or more sorted sequences concatenated one after another. + /// + /// Also, it allocates temporary storage half the size of `self`, but for short slices a + /// non-allocating insertion sort is used instead. + /// + /// # Examples + /// + /// ``` + /// let mut v = [-5i32, 4, 1, -3, 2]; + /// + /// v.sort_by_key(|k| k.abs()); + /// assert!(v == [1, 2, -3, 4, -5]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "slice_sort_by_key", since = "1.7.0")] + #[inline] + pub fn sort_by_key(&mut self, mut f: F) + where + F: FnMut(&T) -> K, + K: Ord, + { + merge_sort(self, |a, b| f(a).lt(&f(b))); + } + + /// Sorts the slice with a key extraction function. + /// + /// During sorting, the key function is called at most once per element, by using + /// temporary storage to remember the results of key evaluation. + /// The order of calls to the key function is unspecified and may change in future versions + /// of the standard library. + /// + /// This sort is stable (i.e., does not reorder equal elements) and *O*(*m* \* *n* + *n* \* log(*n*)) + /// worst-case, where the key function is *O*(*m*). + /// + /// For simple key functions (e.g., functions that are property accesses or + /// basic operations), [`sort_by_key`](slice::sort_by_key) is likely to be + /// faster. 
+ /// + /// # Current implementation + /// + /// The current algorithm is based on [pattern-defeating quicksort][pdqsort] by Orson Peters, + /// which combines the fast average case of randomized quicksort with the fast worst case of + /// heapsort, while achieving linear time on slices with certain patterns. It uses some + /// randomization to avoid degenerate cases, but with a fixed seed to always provide + /// deterministic behavior. + /// + /// In the worst case, the algorithm allocates temporary storage in a `Vec<(K, usize)>` the + /// length of the slice. + /// + /// # Examples + /// + /// ``` + /// let mut v = [-5i32, 4, 32, -3, 2]; + /// + /// v.sort_by_cached_key(|k| k.to_string()); + /// assert!(v == [-3, -5, 2, 32, 4]); + /// ``` + /// + /// [pdqsort]: https://github.com/orlp/pdqsort + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "slice_sort_by_cached_key", since = "1.34.0")] + #[inline] + pub fn sort_by_cached_key(&mut self, f: F) + where + F: FnMut(&T) -> K, + K: Ord, + { + // Helper macro for indexing our vector by the smallest possible type, to reduce allocation. + macro_rules! sort_by_key { + ($t:ty, $slice:ident, $f:ident) => {{ + let mut indices: Vec<_> = + $slice.iter().map($f).enumerate().map(|(i, k)| (k, i as $t)).collect(); + // The elements of `indices` are unique, as they are indexed, so any sort will be + // stable with respect to the original slice. We use `sort_unstable` here because + // it requires less memory allocation. 
+ indices.sort_unstable(); + for i in 0..$slice.len() { + let mut index = indices[i].1; + while (index as usize) < i { + index = indices[index as usize].1; + } + indices[i].1 = index; + $slice.swap(i, index as usize); + } + }}; + } + + let sz_u8 = mem::size_of::<(K, u8)>(); + let sz_u16 = mem::size_of::<(K, u16)>(); + let sz_u32 = mem::size_of::<(K, u32)>(); + let sz_usize = mem::size_of::<(K, usize)>(); + + let len = self.len(); + if len < 2 { + return; + } + if sz_u8 < sz_u16 && len <= (u8::MAX as usize) { + return sort_by_key!(u8, self, f); + } + if sz_u16 < sz_u32 && len <= (u16::MAX as usize) { + return sort_by_key!(u16, self, f); + } + if sz_u32 < sz_usize && len <= (u32::MAX as usize) { + return sort_by_key!(u32, self, f); + } + sort_by_key!(usize, self, f) + } + + /// Copies `self` into a new `Vec`. + /// + /// # Examples + /// + /// ``` + /// let s = [10, 40, 30]; + /// let x = s.to_vec(); + /// // Here, `s` and `x` can be modified independently. + /// ``` + #[cfg(not(no_global_oom_handling))] + #[rustc_conversion_suggestion] + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn to_vec(&self) -> Vec + where + T: Clone, + { + self.to_vec_in(Global) + } + + /// Copies `self` into a new `Vec` with an allocator. + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api)] + /// + /// use std::alloc::System; + /// + /// let s = [10, 40, 30]; + /// let x = s.to_vec_in(System); + /// // Here, `s` and `x` can be modified independently. + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[unstable(feature = "allocator_api", issue = "32838")] + pub fn to_vec_in(&self, alloc: A) -> Vec + where + T: Clone, + { + // N.B., see the `hack` module in this file for more details. + hack::to_vec(self, alloc) + } + + /// Converts `self` into a vector without clones or allocation. + /// + /// The resulting vector can be converted back into a box via + /// `Vec`'s `into_boxed_slice` method. 
+ /// + /// # Examples + /// + /// ``` + /// let s: Box<[i32]> = Box::new([10, 40, 30]); + /// let x = s.into_vec(); + /// // `s` cannot be used anymore because it has been converted into `x`. + /// + /// assert_eq!(x, vec![10, 40, 30]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn into_vec(self: Box) -> Vec { + // N.B., see the `hack` module in this file for more details. + hack::into_vec(self) + } + + /// Creates a vector by repeating a slice `n` times. + /// + /// # Panics + /// + /// This function will panic if the capacity would overflow. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// assert_eq!([1, 2].repeat(3), vec![1, 2, 1, 2, 1, 2]); + /// ``` + /// + /// A panic upon overflow: + /// + /// ```should_panic + /// // this will panic at runtime + /// b"0123456789abcdef".repeat(usize::MAX); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "repeat_generic_slice", since = "1.40.0")] + pub fn repeat(&self, n: usize) -> Vec + where + T: Copy, + { + if n == 0 { + return Vec::new(); + } + + // If `n` is larger than zero, it can be split as + // `n = 2^expn + rem (2^expn > rem, expn >= 0, rem >= 0)`. + // `2^expn` is the number represented by the leftmost '1' bit of `n`, + // and `rem` is the remaining part of `n`. + + // Using `Vec` to access `set_len()`. + let capacity = self.len().checked_mul(n).expect("capacity overflow"); + let mut buf = Vec::with_capacity(capacity); + + // `2^expn` repetition is done by doubling `buf` `expn`-times. + buf.extend(self); + { + let mut m = n >> 1; + // If `m > 0`, there are remaining bits up to the leftmost '1'. + while m > 0 { + // `buf.extend(buf)`: + unsafe { + ptr::copy_nonoverlapping( + buf.as_ptr(), + (buf.as_mut_ptr() as *mut T).add(buf.len()), + buf.len(), + ); + // `buf` has capacity of `self.len() * n`. 
+ let buf_len = buf.len(); + buf.set_len(buf_len * 2); + } + + m >>= 1; + } + } + + // `rem` (`= n - 2^expn`) repetition is done by copying + // first `rem` repetitions from `buf` itself. + let rem_len = capacity - buf.len(); // `self.len() * rem` + if rem_len > 0 { + // `buf.extend(buf[0 .. rem_len])`: + unsafe { + // This is non-overlapping since `2^expn > rem`. + ptr::copy_nonoverlapping( + buf.as_ptr(), + (buf.as_mut_ptr() as *mut T).add(buf.len()), + rem_len, + ); + // `buf.len() + rem_len` equals to `buf.capacity()` (`= self.len() * n`). + buf.set_len(capacity); + } + } + buf + } + + /// Flattens a slice of `T` into a single value `Self::Output`. + /// + /// # Examples + /// + /// ``` + /// assert_eq!(["hello", "world"].concat(), "helloworld"); + /// assert_eq!([[1, 2], [3, 4]].concat(), [1, 2, 3, 4]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + pub fn concat(&self) -> >::Output + where + Self: Concat, + { + Concat::concat(self) + } + + /// Flattens a slice of `T` into a single value `Self::Output`, placing a + /// given separator between each. + /// + /// # Examples + /// + /// ``` + /// assert_eq!(["hello", "world"].join(" "), "hello world"); + /// assert_eq!([[1, 2], [3, 4]].join(&0), [1, 2, 0, 3, 4]); + /// assert_eq!([[1, 2], [3, 4]].join(&[0, 0][..]), [1, 2, 0, 0, 3, 4]); + /// ``` + #[stable(feature = "rename_connect_to_join", since = "1.3.0")] + pub fn join(&self, sep: Separator) -> >::Output + where + Self: Join, + { + Join::join(self, sep) + } + + /// Flattens a slice of `T` into a single value `Self::Output`, placing a + /// given separator between each. 
+ /// + /// # Examples + /// + /// ``` + /// # #![allow(deprecated)] + /// assert_eq!(["hello", "world"].connect(" "), "hello world"); + /// assert_eq!([[1, 2], [3, 4]].connect(&0), [1, 2, 0, 3, 4]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[rustc_deprecated(since = "1.3.0", reason = "renamed to join")] + pub fn connect(&self, sep: Separator) -> >::Output + where + Self: Join, + { + Join::join(self, sep) + } +} + +#[lang = "slice_u8_alloc"] +#[cfg(not(test))] +impl [u8] { + /// Returns a vector containing a copy of this slice where each byte + /// is mapped to its ASCII upper case equivalent. + /// + /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', + /// but non-ASCII letters are unchanged. + /// + /// To uppercase the value in-place, use [`make_ascii_uppercase`]. + /// + /// [`make_ascii_uppercase`]: slice::make_ascii_uppercase + #[cfg(not(no_global_oom_handling))] + #[must_use = "this returns the uppercase bytes as a new Vec, \ + without modifying the original"] + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[inline] + pub fn to_ascii_uppercase(&self) -> Vec { + let mut me = self.to_vec(); + me.make_ascii_uppercase(); + me + } + + /// Returns a vector containing a copy of this slice where each byte + /// is mapped to its ASCII lower case equivalent. + /// + /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', + /// but non-ASCII letters are unchanged. + /// + /// To lowercase the value in-place, use [`make_ascii_lowercase`]. 
+ /// + /// [`make_ascii_lowercase`]: slice::make_ascii_lowercase + #[cfg(not(no_global_oom_handling))] + #[must_use = "this returns the lowercase bytes as a new Vec, \ + without modifying the original"] + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[inline] + pub fn to_ascii_lowercase(&self) -> Vec { + let mut me = self.to_vec(); + me.make_ascii_lowercase(); + me + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Extension traits for slices over specific kinds of data +//////////////////////////////////////////////////////////////////////////////// + +/// Helper trait for [`[T]::concat`](slice::concat). +/// +/// Note: the `Item` type parameter is not used in this trait, +/// but it allows impls to be more generic. +/// Without it, we get this error: +/// +/// ```error +/// error[E0207]: the type parameter `T` is not constrained by the impl trait, self type, or predica +/// --> src/liballoc/slice.rs:608:6 +/// | +/// 608 | impl> Concat for [V] { +/// | ^ unconstrained type parameter +/// ``` +/// +/// This is because there could exist `V` types with multiple `Borrow<[_]>` impls, +/// such that multiple `T` types would apply: +/// +/// ``` +/// # #[allow(dead_code)] +/// pub struct Foo(Vec, Vec); +/// +/// impl std::borrow::Borrow<[u32]> for Foo { +/// fn borrow(&self) -> &[u32] { &self.0 } +/// } +/// +/// impl std::borrow::Borrow<[String]> for Foo { +/// fn borrow(&self) -> &[String] { &self.1 } +/// } +/// ``` +#[unstable(feature = "slice_concat_trait", issue = "27747")] +pub trait Concat { + #[unstable(feature = "slice_concat_trait", issue = "27747")] + /// The resulting type after concatenation + type Output; + + /// Implementation of [`[T]::concat`](slice::concat) + #[unstable(feature = "slice_concat_trait", issue = "27747")] + fn concat(slice: &Self) -> Self::Output; +} + +/// Helper trait for [`[T]::join`](slice::join) +#[unstable(feature = "slice_concat_trait", issue = "27747")] +pub 
trait Join { + #[unstable(feature = "slice_concat_trait", issue = "27747")] + /// The resulting type after concatenation + type Output; + + /// Implementation of [`[T]::join`](slice::join) + #[unstable(feature = "slice_concat_trait", issue = "27747")] + fn join(slice: &Self, sep: Separator) -> Self::Output; +} + +#[cfg(not(no_global_oom_handling))] +#[unstable(feature = "slice_concat_ext", issue = "27747")] +impl> Concat for [V] { + type Output = Vec; + + fn concat(slice: &Self) -> Vec { + let size = slice.iter().map(|slice| slice.borrow().len()).sum(); + let mut result = Vec::with_capacity(size); + for v in slice { + result.extend_from_slice(v.borrow()) + } + result + } +} + +#[cfg(not(no_global_oom_handling))] +#[unstable(feature = "slice_concat_ext", issue = "27747")] +impl> Join<&T> for [V] { + type Output = Vec; + + fn join(slice: &Self, sep: &T) -> Vec { + let mut iter = slice.iter(); + let first = match iter.next() { + Some(first) => first, + None => return vec![], + }; + let size = slice.iter().map(|v| v.borrow().len()).sum::() + slice.len() - 1; + let mut result = Vec::with_capacity(size); + result.extend_from_slice(first.borrow()); + + for v in iter { + result.push(sep.clone()); + result.extend_from_slice(v.borrow()) + } + result + } +} + +#[cfg(not(no_global_oom_handling))] +#[unstable(feature = "slice_concat_ext", issue = "27747")] +impl> Join<&[T]> for [V] { + type Output = Vec; + + fn join(slice: &Self, sep: &[T]) -> Vec { + let mut iter = slice.iter(); + let first = match iter.next() { + Some(first) => first, + None => return vec![], + }; + let size = + slice.iter().map(|v| v.borrow().len()).sum::() + sep.len() * (slice.len() - 1); + let mut result = Vec::with_capacity(size); + result.extend_from_slice(first.borrow()); + + for v in iter { + result.extend_from_slice(sep); + result.extend_from_slice(v.borrow()) + } + result + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Standard trait implementations for 
slices +//////////////////////////////////////////////////////////////////////////////// + +#[stable(feature = "rust1", since = "1.0.0")] +impl Borrow<[T]> for Vec { + fn borrow(&self) -> &[T] { + &self[..] + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl BorrowMut<[T]> for Vec { + fn borrow_mut(&mut self) -> &mut [T] { + &mut self[..] + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl ToOwned for [T] { + type Owned = Vec; + #[cfg(not(test))] + fn to_owned(&self) -> Vec { + self.to_vec() + } + + #[cfg(test)] + fn to_owned(&self) -> Vec { + hack::to_vec(self, Global) + } + + fn clone_into(&self, target: &mut Vec) { + // drop anything in target that will not be overwritten + target.truncate(self.len()); + + // target.len <= self.len due to the truncate above, so the + // slices here are always in-bounds. + let (init, tail) = self.split_at(target.len()); + + // reuse the contained values' allocations/resources. + target.clone_from_slice(init); + target.extend_from_slice(tail); + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Sorting +//////////////////////////////////////////////////////////////////////////////// + +/// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted. +/// +/// This is the integral subroutine of insertion sort. +#[cfg(not(no_global_oom_handling))] +fn insert_head(v: &mut [T], is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + if v.len() >= 2 && is_less(&v[1], &v[0]) { + unsafe { + // There are three ways to implement insertion here: + // + // 1. Swap adjacent elements until the first one gets to its final destination. + // However, this way we copy data around more than is necessary. If elements are big + // structures (costly to copy), this method will be slow. + // + // 2. Iterate until the right place for the first element is found. 
Then shift the + // elements succeeding it to make room for it and finally place it into the + // remaining hole. This is a good method. + // + // 3. Copy the first element into a temporary variable. Iterate until the right place + // for it is found. As we go along, copy every traversed element into the slot + // preceding it. Finally, copy data from the temporary variable into the remaining + // hole. This method is very good. Benchmarks demonstrated slightly better + // performance than with the 2nd method. + // + // All methods were benchmarked, and the 3rd showed best results. So we chose that one. + let tmp = mem::ManuallyDrop::new(ptr::read(&v[0])); + + // Intermediate state of the insertion process is always tracked by `hole`, which + // serves two purposes: + // 1. Protects integrity of `v` from panics in `is_less`. + // 2. Fills the remaining hole in `v` in the end. + // + // Panic safety: + // + // If `is_less` panics at any point during the process, `hole` will get dropped and + // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it + // initially held exactly once. + let mut hole = InsertionHole { src: &*tmp, dest: &mut v[1] }; + ptr::copy_nonoverlapping(&v[1], &mut v[0], 1); + + for i in 2..v.len() { + if !is_less(&v[i], &*tmp) { + break; + } + ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1); + hole.dest = &mut v[i]; + } + // `hole` gets dropped and thus copies `tmp` into the remaining hole in `v`. + } + } + + // When dropped, copies from `src` into `dest`. + struct InsertionHole { + src: *const T, + dest: *mut T, + } + + impl Drop for InsertionHole { + fn drop(&mut self) { + unsafe { + ptr::copy_nonoverlapping(self.src, self.dest, 1); + } + } + } +} + +/// Merges non-decreasing runs `v[..mid]` and `v[mid..]` using `buf` as temporary storage, and +/// stores the result into `v[..]`. +/// +/// # Safety +/// +/// The two slices must be non-empty and `mid` must be in bounds. 
Buffer `buf` must be long enough +/// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type. +#[cfg(not(no_global_oom_handling))] +unsafe fn merge(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F) +where + F: FnMut(&T, &T) -> bool, +{ + let len = v.len(); + let v = v.as_mut_ptr(); + let (v_mid, v_end) = unsafe { (v.add(mid), v.add(len)) }; + + // The merge process first copies the shorter run into `buf`. Then it traces the newly copied + // run and the longer run forwards (or backwards), comparing their next unconsumed elements and + // copying the lesser (or greater) one into `v`. + // + // As soon as the shorter run is fully consumed, the process is done. If the longer run gets + // consumed first, then we must copy whatever is left of the shorter run into the remaining + // hole in `v`. + // + // Intermediate state of the process is always tracked by `hole`, which serves two purposes: + // 1. Protects integrity of `v` from panics in `is_less`. + // 2. Fills the remaining hole in `v` if the longer run gets consumed first. + // + // Panic safety: + // + // If `is_less` panics at any point during the process, `hole` will get dropped and fill the + // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every + // object it initially held exactly once. + let mut hole; + + if mid <= len - mid { + // The left run is shorter. + unsafe { + ptr::copy_nonoverlapping(v, buf, mid); + hole = MergeHole { start: buf, end: buf.add(mid), dest: v }; + } + + // Initially, these pointers point to the beginnings of their arrays. + let left = &mut hole.start; + let mut right = v_mid; + let out = &mut hole.dest; + + while *left < hole.end && right < v_end { + // Consume the lesser side. + // If equal, prefer the left run to maintain stability. 
+ unsafe { + let to_copy = if is_less(&*right, &**left) { + get_and_increment(&mut right) + } else { + get_and_increment(left) + }; + ptr::copy_nonoverlapping(to_copy, get_and_increment(out), 1); + } + } + } else { + // The right run is shorter. + unsafe { + ptr::copy_nonoverlapping(v_mid, buf, len - mid); + hole = MergeHole { start: buf, end: buf.add(len - mid), dest: v_mid }; + } + + // Initially, these pointers point past the ends of their arrays. + let left = &mut hole.dest; + let right = &mut hole.end; + let mut out = v_end; + + while v < *left && buf < *right { + // Consume the greater side. + // If equal, prefer the right run to maintain stability. + unsafe { + let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) { + decrement_and_get(left) + } else { + decrement_and_get(right) + }; + ptr::copy_nonoverlapping(to_copy, decrement_and_get(&mut out), 1); + } + } + } + // Finally, `hole` gets dropped. If the shorter run was not fully consumed, whatever remains of + // it will now be copied into the hole in `v`. + + unsafe fn get_and_increment(ptr: &mut *mut T) -> *mut T { + let old = *ptr; + *ptr = unsafe { ptr.offset(1) }; + old + } + + unsafe fn decrement_and_get(ptr: &mut *mut T) -> *mut T { + *ptr = unsafe { ptr.offset(-1) }; + *ptr + } + + // When dropped, copies the range `start..end` into `dest..`. + struct MergeHole { + start: *mut T, + end: *mut T, + dest: *mut T, + } + + impl Drop for MergeHole { + fn drop(&mut self) { + // `T` is not a zero-sized type, so it's okay to divide by its size. + let len = (self.end as usize - self.start as usize) / mem::size_of::(); + unsafe { + ptr::copy_nonoverlapping(self.start, self.dest, len); + } + } + } +} + +/// This merge sort borrows some (but not all) ideas from TimSort, which is described in detail +/// [here](https://github.com/python/cpython/blob/main/Objects/listsort.txt). +/// +/// The algorithm identifies strictly descending and non-descending subsequences, which are called +/// natural runs. 
There is a stack of pending runs yet to be merged. Each newly found run is pushed +/// onto the stack, and then some pairs of adjacent runs are merged until these two invariants are +/// satisfied: +/// +/// 1. for every `i` in `1..runs.len()`: `runs[i - 1].len > runs[i].len` +/// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len` +/// +/// The invariants ensure that the total running time is *O*(*n* \* log(*n*)) worst-case. +#[cfg(not(no_global_oom_handling))] +fn merge_sort(v: &mut [T], mut is_less: F) +where + F: FnMut(&T, &T) -> bool, +{ + // Slices of up to this length get sorted using insertion sort. + const MAX_INSERTION: usize = 20; + // Very short runs are extended using insertion sort to span at least this many elements. + const MIN_RUN: usize = 10; + + // Sorting has no meaningful behavior on zero-sized types. + if size_of::() == 0 { + return; + } + + let len = v.len(); + + // Short arrays get sorted in-place via insertion sort to avoid allocations. + if len <= MAX_INSERTION { + if len >= 2 { + for i in (0..len - 1).rev() { + insert_head(&mut v[i..], &mut is_less); + } + } + return; + } + + // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it + // shallow copies of the contents of `v` without risking the dtors running on copies if + // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run, + // which will always have length at most `len / 2`. + let mut buf = Vec::with_capacity(len / 2); + + // In order to identify natural runs in `v`, we traverse it backwards. That might seem like a + // strange decision, but consider the fact that merges more often go in the opposite direction + // (forwards). According to benchmarks, merging forwards is slightly faster than merging + // backwards. To conclude, identifying runs by traversing backwards improves performance. 
+ let mut runs = vec![]; + let mut end = len; + while end > 0 { + // Find the next natural run, and reverse it if it's strictly descending. + let mut start = end - 1; + if start > 0 { + start -= 1; + unsafe { + if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) { + while start > 0 && is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) { + start -= 1; + } + v[start..end].reverse(); + } else { + while start > 0 && !is_less(v.get_unchecked(start), v.get_unchecked(start - 1)) + { + start -= 1; + } + } + } + } + + // Insert some more elements into the run if it's too short. Insertion sort is faster than + // merge sort on short sequences, so this significantly improves performance. + while start > 0 && end - start < MIN_RUN { + start -= 1; + insert_head(&mut v[start..end], &mut is_less); + } + + // Push this run onto the stack. + runs.push(Run { start, len: end - start }); + end = start; + + // Merge some pairs of adjacent runs to satisfy the invariants. + while let Some(r) = collapse(&runs) { + let left = runs[r + 1]; + let right = runs[r]; + unsafe { + merge( + &mut v[left.start..right.start + right.len], + left.len, + buf.as_mut_ptr(), + &mut is_less, + ); + } + runs[r] = Run { start: left.start, len: left.len + right.len }; + runs.remove(r + 1); + } + } + + // Finally, exactly one run must remain in the stack. + debug_assert!(runs.len() == 1 && runs[0].start == 0 && runs[0].len == len); + + // Examines the stack of runs and identifies the next pair of runs to merge. More specifically, + // if `Some(r)` is returned, that means `runs[r]` and `runs[r + 1]` must be merged next. If the + // algorithm should continue building a new run instead, `None` is returned. + // + // TimSort is infamous for its buggy implementations, as described here: + // http://envisage-project.eu/timsort-specification-and-verification/ + // + // The gist of the story is: we must enforce the invariants on the top four runs on the stack. 
+ // Enforcing them on just top three is not sufficient to ensure that the invariants will still + // hold for *all* runs in the stack. + // + // This function correctly checks invariants for the top four runs. Additionally, if the top + // run starts at index 0, it will always demand a merge operation until the stack is fully + // collapsed, in order to complete the sort. + #[inline] + fn collapse(runs: &[Run]) -> Option { + let n = runs.len(); + if n >= 2 + && (runs[n - 1].start == 0 + || runs[n - 2].len <= runs[n - 1].len + || (n >= 3 && runs[n - 3].len <= runs[n - 2].len + runs[n - 1].len) + || (n >= 4 && runs[n - 4].len <= runs[n - 3].len + runs[n - 2].len)) + { + if n >= 3 && runs[n - 3].len < runs[n - 1].len { Some(n - 3) } else { Some(n - 2) } + } else { + None + } + } + + #[derive(Clone, Copy)] + struct Run { + start: usize, + len: usize, + } +} diff --git a/rust/alloc/str.rs b/rust/alloc/str.rs new file mode 100644 index 00000000000000..69495f31c32ca4 --- /dev/null +++ b/rust/alloc/str.rs @@ -0,0 +1,613 @@ +//! Unicode string slices. +//! +//! *[See also the `str` primitive type](str).* +//! +//! The `&str` type is one of the two main string types, the other being `String`. +//! Unlike its `String` counterpart, its contents are borrowed. +//! +//! # Basic Usage +//! +//! A basic string declaration of `&str` type: +//! +//! ``` +//! let hello_world = "Hello, World!"; +//! ``` +//! +//! Here we have declared a string literal, also known as a string slice. +//! String literals have a static lifetime, which means the string `hello_world` +//! is guaranteed to be valid for the duration of the entire program. +//! We can explicitly specify `hello_world`'s lifetime as well: +//! +//! ``` +//! let hello_world: &'static str = "Hello, world!"; +//! ``` + +#![stable(feature = "rust1", since = "1.0.0")] +// Many of the usings in this module are only used in the test configuration. +// It's cleaner to just turn off the unused_imports warning than to fix them. 
+#![allow(unused_imports)] + +use core::borrow::{Borrow, BorrowMut}; +use core::iter::FusedIterator; +use core::mem; +use core::ptr; +use core::str::pattern::{DoubleEndedSearcher, Pattern, ReverseSearcher, Searcher}; +use core::unicode::conversions; + +use crate::borrow::ToOwned; +use crate::boxed::Box; +use crate::slice::{Concat, Join, SliceIndex}; +use crate::string::String; +use crate::vec::Vec; + +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::pattern; +#[stable(feature = "encode_utf16", since = "1.8.0")] +pub use core::str::EncodeUtf16; +#[stable(feature = "split_ascii_whitespace", since = "1.34.0")] +pub use core::str::SplitAsciiWhitespace; +#[stable(feature = "split_inclusive", since = "1.51.0")] +pub use core::str::SplitInclusive; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::SplitWhitespace; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::{from_utf8, from_utf8_mut, Bytes, CharIndices, Chars}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::{from_utf8_unchecked, from_utf8_unchecked_mut, ParseBoolError}; +#[stable(feature = "str_escape", since = "1.34.0")] +pub use core::str::{EscapeDebug, EscapeDefault, EscapeUnicode}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::{FromStr, Utf8Error}; +#[allow(deprecated)] +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::{Lines, LinesAny}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::{MatchIndices, RMatchIndices}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::{Matches, RMatches}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::{RSplit, Split}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::{RSplitN, SplitN}; +#[stable(feature = "rust1", since = "1.0.0")] +pub use core::str::{RSplitTerminator, SplitTerminator}; + +/// Note: `str` in `Concat` is not meaningful here. 
+/// This type parameter of the trait only exists to enable another impl. +#[cfg(not(no_global_oom_handling))] +#[unstable(feature = "slice_concat_ext", issue = "27747")] +impl> Concat for [S] { + type Output = String; + + fn concat(slice: &Self) -> String { + Join::join(slice, "") + } +} + +#[cfg(not(no_global_oom_handling))] +#[unstable(feature = "slice_concat_ext", issue = "27747")] +impl> Join<&str> for [S] { + type Output = String; + + fn join(slice: &Self, sep: &str) -> String { + unsafe { String::from_utf8_unchecked(join_generic_copy(slice, sep.as_bytes())) } + } +} + +#[cfg(not(no_global_oom_handling))] +macro_rules! specialize_for_lengths { + ($separator:expr, $target:expr, $iter:expr; $($num:expr),*) => {{ + let mut target = $target; + let iter = $iter; + let sep_bytes = $separator; + match $separator.len() { + $( + // loops with hardcoded sizes run much faster + // specialize the cases with small separator lengths + $num => { + for s in iter { + copy_slice_and_advance!(target, sep_bytes); + let content_bytes = s.borrow().as_ref(); + copy_slice_and_advance!(target, content_bytes); + } + }, + )* + _ => { + // arbitrary non-zero size fallback + for s in iter { + copy_slice_and_advance!(target, sep_bytes); + let content_bytes = s.borrow().as_ref(); + copy_slice_and_advance!(target, content_bytes); + } + } + } + target + }} +} + +#[cfg(not(no_global_oom_handling))] +macro_rules! copy_slice_and_advance { + ($target:expr, $bytes:expr) => { + let len = $bytes.len(); + let (head, tail) = { $target }.split_at_mut(len); + head.copy_from_slice($bytes); + $target = tail; + }; +} + +// Optimized join implementation that works for both Vec (T: Copy) and String's inner vec +// Currently (2018-05-13) there is a bug with type inference and specialization (see issue #36262) +// For this reason SliceConcat is not specialized for T: Copy and SliceConcat is the +// only user of this function. It is left in place for the time when that is fixed. 
+// +// the bounds for String-join are S: Borrow and for Vec-join Borrow<[T]> +// [T] and str both impl AsRef<[T]> for some T +// => s.borrow().as_ref() and we always have slices +#[cfg(not(no_global_oom_handling))] +fn join_generic_copy(slice: &[S], sep: &[T]) -> Vec +where + T: Copy, + B: AsRef<[T]> + ?Sized, + S: Borrow, +{ + let sep_len = sep.len(); + let mut iter = slice.iter(); + + // the first slice is the only one without a separator preceding it + let first = match iter.next() { + Some(first) => first, + None => return vec![], + }; + + // compute the exact total length of the joined Vec + // if the `len` calculation overflows, we'll panic + // we would have run out of memory anyway and the rest of the function requires + // the entire Vec pre-allocated for safety + let reserved_len = sep_len + .checked_mul(iter.len()) + .and_then(|n| { + slice.iter().map(|s| s.borrow().as_ref().len()).try_fold(n, usize::checked_add) + }) + .expect("attempt to join into collection with len > usize::MAX"); + + // prepare an uninitialized buffer + let mut result = Vec::with_capacity(reserved_len); + debug_assert!(result.capacity() >= reserved_len); + + result.extend_from_slice(first.borrow().as_ref()); + + unsafe { + let pos = result.len(); + let target = result.spare_capacity_mut().get_unchecked_mut(..reserved_len - pos); + + // Convert the separator and slices to slices of MaybeUninit + // to simplify implementation in specialize_for_lengths + let sep_uninit = core::slice::from_raw_parts(sep.as_ptr().cast(), sep.len()); + let iter_uninit = iter.map(|it| { + let it = it.borrow().as_ref(); + core::slice::from_raw_parts(it.as_ptr().cast(), it.len()) + }); + + // copy separator and slices over without bounds checks + // generate loops with hardcoded offsets for small separators + // massive improvements possible (~ x2) + let remain = specialize_for_lengths!(sep_uninit, target, iter_uninit; 0, 1, 2, 3, 4); + + // A weird borrow implementation may return different + // slices for 
the length calculation and the actual copy. + // Make sure we don't expose uninitialized bytes to the caller. + let result_len = reserved_len - remain.len(); + result.set_len(result_len); + } + result +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Borrow for String { + #[inline] + fn borrow(&self) -> &str { + &self[..] + } +} + +#[stable(feature = "string_borrow_mut", since = "1.36.0")] +impl BorrowMut for String { + #[inline] + fn borrow_mut(&mut self) -> &mut str { + &mut self[..] + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl ToOwned for str { + type Owned = String; + #[inline] + fn to_owned(&self) -> String { + unsafe { String::from_utf8_unchecked(self.as_bytes().to_owned()) } + } + + fn clone_into(&self, target: &mut String) { + let mut b = mem::take(target).into_bytes(); + self.as_bytes().clone_into(&mut b); + *target = unsafe { String::from_utf8_unchecked(b) } + } +} + +/// Methods for string slices. +#[lang = "str_alloc"] +#[cfg(not(test))] +impl str { + /// Converts a `Box` into a `Box<[u8]>` without copying or allocating. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "this is a string"; + /// let boxed_str = s.to_owned().into_boxed_str(); + /// let boxed_bytes = boxed_str.into_boxed_bytes(); + /// assert_eq!(*boxed_bytes, *s.as_bytes()); + /// ``` + #[stable(feature = "str_box_extras", since = "1.20.0")] + #[must_use = "`self` will be dropped if the result is not used"] + #[inline] + pub fn into_boxed_bytes(self: Box) -> Box<[u8]> { + self.into() + } + + /// Replaces all matches of a pattern with another string. + /// + /// `replace` creates a new [`String`], and copies the data from this string slice into it. + /// While doing so, it attempts to find matches of a pattern. If it finds any, it + /// replaces them with the replacement string slice. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "this is old"; + /// + /// assert_eq!("this is new", s.replace("old", "new")); + /// ``` + /// + /// When the pattern doesn't match: + /// + /// ``` + /// let s = "this is old"; + /// assert_eq!(s, s.replace("cookie monster", "little lamb")); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[must_use = "this returns the replaced string as a new allocation, \ + without modifying the original"] + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn replace<'a, P: Pattern<'a>>(&'a self, from: P, to: &str) -> String { + let mut result = String::new(); + let mut last_end = 0; + for (start, part) in self.match_indices(from) { + result.push_str(unsafe { self.get_unchecked(last_end..start) }); + result.push_str(to); + last_end = start + part.len(); + } + result.push_str(unsafe { self.get_unchecked(last_end..self.len()) }); + result + } + + /// Replaces first N matches of a pattern with another string. + /// + /// `replacen` creates a new [`String`], and copies the data from this string slice into it. + /// While doing so, it attempts to find matches of a pattern. If it finds any, it + /// replaces them with the replacement string slice at most `count` times. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "foo foo 123 foo"; + /// assert_eq!("new new 123 foo", s.replacen("foo", "new", 2)); + /// assert_eq!("faa fao 123 foo", s.replacen('o', "a", 3)); + /// assert_eq!("foo foo new23 foo", s.replacen(char::is_numeric, "new", 1)); + /// ``` + /// + /// When the pattern doesn't match: + /// + /// ``` + /// let s = "this is old"; + /// assert_eq!(s, s.replacen("cookie monster", "little lamb", 10)); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[must_use = "this returns the replaced string as a new allocation, \ + without modifying the original"] + #[stable(feature = "str_replacen", since = "1.16.0")] + pub fn replacen<'a, P: Pattern<'a>>(&'a self, pat: P, to: &str, count: usize) -> String { + // Hope to reduce the times of re-allocation + let mut result = String::with_capacity(32); + let mut last_end = 0; + for (start, part) in self.match_indices(pat).take(count) { + result.push_str(unsafe { self.get_unchecked(last_end..start) }); + result.push_str(to); + last_end = start + part.len(); + } + result.push_str(unsafe { self.get_unchecked(last_end..self.len()) }); + result + } + + /// Returns the lowercase equivalent of this string slice, as a new [`String`]. + /// + /// 'Lowercase' is defined according to the terms of the Unicode Derived Core Property + /// `Lowercase`. + /// + /// Since some characters can expand into multiple characters when changing + /// the case, this function returns a [`String`] instead of modifying the + /// parameter in-place. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "HELLO"; + /// + /// assert_eq!("hello", s.to_lowercase()); + /// ``` + /// + /// A tricky example, with sigma: + /// + /// ``` + /// let sigma = "Σ"; + /// + /// assert_eq!("σ", sigma.to_lowercase()); + /// + /// // but at the end of a word, it's ς, not σ: + /// let odysseus = "ὈΔΥΣΣΕΎΣ"; + /// + /// assert_eq!("ὀδυσσεύς", odysseus.to_lowercase()); + /// ``` + /// + /// Languages without case are not changed: + /// + /// ``` + /// let new_year = "农历新年"; + /// + /// assert_eq!(new_year, new_year.to_lowercase()); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[must_use = "this returns the lowercase string as a new String, \ + without modifying the original"] + #[stable(feature = "unicode_case_mapping", since = "1.2.0")] + pub fn to_lowercase(&self) -> String { + let mut s = String::with_capacity(self.len()); + for (i, c) in self[..].char_indices() { + if c == 'Σ' { + // Σ maps to σ, except at the end of a word where it maps to ς. + // This is the only conditional (contextual) but language-independent mapping + // in `SpecialCasing.txt`, + // so hard-code it rather than have a generic "condition" mechanism. + // See https://github.com/rust-lang/rust/issues/26035 + map_uppercase_sigma(self, i, &mut s) + } else { + match conversions::to_lower(c) { + [a, '\0', _] => s.push(a), + [a, b, '\0'] => { + s.push(a); + s.push(b); + } + [a, b, c] => { + s.push(a); + s.push(b); + s.push(c); + } + } + } + } + return s; + + fn map_uppercase_sigma(from: &str, i: usize, to: &mut String) { + // See https://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992 + // for the definition of `Final_Sigma`. 
+ debug_assert!('Σ'.len_utf8() == 2); + let is_word_final = case_ignoreable_then_cased(from[..i].chars().rev()) + && !case_ignoreable_then_cased(from[i + 2..].chars()); + to.push_str(if is_word_final { "ς" } else { "σ" }); + } + + fn case_ignoreable_then_cased>(iter: I) -> bool { + use core::unicode::{Case_Ignorable, Cased}; + match iter.skip_while(|&c| Case_Ignorable(c)).next() { + Some(c) => Cased(c), + None => false, + } + } + } + + /// Returns the uppercase equivalent of this string slice, as a new [`String`]. + /// + /// 'Uppercase' is defined according to the terms of the Unicode Derived Core Property + /// `Uppercase`. + /// + /// Since some characters can expand into multiple characters when changing + /// the case, this function returns a [`String`] instead of modifying the + /// parameter in-place. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = "hello"; + /// + /// assert_eq!("HELLO", s.to_uppercase()); + /// ``` + /// + /// Scripts without case are not changed: + /// + /// ``` + /// let new_year = "农历新年"; + /// + /// assert_eq!(new_year, new_year.to_uppercase()); + /// ``` + /// + /// One character can become multiple: + /// ``` + /// let s = "tschüß"; + /// + /// assert_eq!("TSCHÜSS", s.to_uppercase()); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[must_use = "this returns the uppercase string as a new String, \ + without modifying the original"] + #[stable(feature = "unicode_case_mapping", since = "1.2.0")] + pub fn to_uppercase(&self) -> String { + let mut s = String::with_capacity(self.len()); + for c in self[..].chars() { + match conversions::to_upper(c) { + [a, '\0', _] => s.push(a), + [a, b, '\0'] => { + s.push(a); + s.push(b); + } + [a, b, c] => { + s.push(a); + s.push(b); + s.push(c); + } + } + } + s + } + + /// Converts a [`Box`] into a [`String`] without copying or allocating. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let string = String::from("birthday gift"); + /// let boxed_str = string.clone().into_boxed_str(); + /// + /// assert_eq!(boxed_str.into_string(), string); + /// ``` + #[stable(feature = "box_str", since = "1.4.0")] + #[must_use = "`self` will be dropped if the result is not used"] + #[inline] + pub fn into_string(self: Box) -> String { + let slice = Box::<[u8]>::from(self); + unsafe { String::from_utf8_unchecked(slice.into_vec()) } + } + + /// Creates a new [`String`] by repeating a string `n` times. + /// + /// # Panics + /// + /// This function will panic if the capacity would overflow. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// assert_eq!("abc".repeat(4), String::from("abcabcabcabc")); + /// ``` + /// + /// A panic upon overflow: + /// + /// ```should_panic + /// // this will panic at runtime + /// let huge = "0123456789abcdef".repeat(usize::MAX); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[must_use] + #[stable(feature = "repeat_str", since = "1.16.0")] + pub fn repeat(&self, n: usize) -> String { + unsafe { String::from_utf8_unchecked(self.as_bytes().repeat(n)) } + } + + /// Returns a copy of this string where each character is mapped to its + /// ASCII upper case equivalent. + /// + /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', + /// but non-ASCII letters are unchanged. + /// + /// To uppercase the value in-place, use [`make_ascii_uppercase`]. + /// + /// To uppercase ASCII characters in addition to non-ASCII characters, use + /// [`to_uppercase`]. 
+ /// + /// # Examples + /// + /// ``` + /// let s = "Grüße, Jürgen ❤"; + /// + /// assert_eq!("GRüßE, JüRGEN ❤", s.to_ascii_uppercase()); + /// ``` + /// + /// [`make_ascii_uppercase`]: str::make_ascii_uppercase + /// [`to_uppercase`]: #method.to_uppercase + #[cfg(not(no_global_oom_handling))] + #[must_use = "to uppercase the value in-place, use `make_ascii_uppercase()`"] + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[inline] + pub fn to_ascii_uppercase(&self) -> String { + let mut bytes = self.as_bytes().to_vec(); + bytes.make_ascii_uppercase(); + // make_ascii_uppercase() preserves the UTF-8 invariant. + unsafe { String::from_utf8_unchecked(bytes) } + } + + /// Returns a copy of this string where each character is mapped to its + /// ASCII lower case equivalent. + /// + /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', + /// but non-ASCII letters are unchanged. + /// + /// To lowercase the value in-place, use [`make_ascii_lowercase`]. + /// + /// To lowercase ASCII characters in addition to non-ASCII characters, use + /// [`to_lowercase`]. + /// + /// # Examples + /// + /// ``` + /// let s = "Grüße, Jürgen ❤"; + /// + /// assert_eq!("grüße, jürgen ❤", s.to_ascii_lowercase()); + /// ``` + /// + /// [`make_ascii_lowercase`]: str::make_ascii_lowercase + /// [`to_lowercase`]: #method.to_lowercase + #[cfg(not(no_global_oom_handling))] + #[must_use = "to lowercase the value in-place, use `make_ascii_lowercase()`"] + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] + #[inline] + pub fn to_ascii_lowercase(&self) -> String { + let mut bytes = self.as_bytes().to_vec(); + bytes.make_ascii_lowercase(); + // make_ascii_lowercase() preserves the UTF-8 invariant. + unsafe { String::from_utf8_unchecked(bytes) } + } +} + +/// Converts a boxed slice of bytes to a boxed string slice without checking +/// that the string contains valid UTF-8. 
+/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// let smile_utf8 = Box::new([226, 152, 186]); +/// let smile = unsafe { std::str::from_boxed_utf8_unchecked(smile_utf8) }; +/// +/// assert_eq!("☺", &*smile); +/// ``` +#[stable(feature = "str_box_extras", since = "1.20.0")] +#[must_use] +#[inline] +pub unsafe fn from_boxed_utf8_unchecked(v: Box<[u8]>) -> Box { + unsafe { Box::from_raw(Box::into_raw(v) as *mut str) } +} diff --git a/rust/alloc/string.rs b/rust/alloc/string.rs new file mode 100644 index 00000000000000..716bb4983a651f --- /dev/null +++ b/rust/alloc/string.rs @@ -0,0 +1,2867 @@ +//! A UTF-8–encoded, growable string. +//! +//! This module contains the [`String`] type, the [`ToString`] trait for +//! converting to strings, and several error types that may result from +//! working with [`String`]s. +//! +//! # Examples +//! +//! There are multiple ways to create a new [`String`] from a string literal: +//! +//! ``` +//! let s = "Hello".to_string(); +//! +//! let s = String::from("world"); +//! let s: String = "also this".into(); +//! ``` +//! +//! You can create a new [`String`] from an existing one by concatenating with +//! `+`: +//! +//! ``` +//! let s = "Hello".to_string(); +//! +//! let message = s + " world!"; +//! ``` +//! +//! If you have a vector of valid UTF-8 bytes, you can make a [`String`] out of +//! it. You can do the reverse too. +//! +//! ``` +//! let sparkle_heart = vec![240, 159, 146, 150]; +//! +//! // We know these bytes are valid, so we'll use `unwrap()`. +//! let sparkle_heart = String::from_utf8(sparkle_heart).unwrap(); +//! +//! assert_eq!("💖", sparkle_heart); +//! +//! let bytes = sparkle_heart.into_bytes(); +//! +//! assert_eq!(bytes, [240, 159, 146, 150]); +//! 
``` + +#![stable(feature = "rust1", since = "1.0.0")] + +#[cfg(not(no_global_oom_handling))] +use core::char::{decode_utf16, REPLACEMENT_CHARACTER}; +use core::fmt; +use core::hash; +#[cfg(not(no_global_oom_handling))] +use core::iter::FromIterator; +use core::iter::{from_fn, FusedIterator}; +#[cfg(not(no_global_oom_handling))] +use core::ops::Add; +#[cfg(not(no_global_oom_handling))] +use core::ops::AddAssign; +#[cfg(not(no_global_oom_handling))] +use core::ops::Bound::{Excluded, Included, Unbounded}; +use core::ops::{self, Index, IndexMut, Range, RangeBounds}; +use core::ptr; +use core::slice; +#[cfg(not(no_global_oom_handling))] +use core::str::lossy; +use core::str::pattern::Pattern; + +#[cfg(not(no_global_oom_handling))] +use crate::borrow::{Cow, ToOwned}; +use crate::boxed::Box; +use crate::collections::TryReserveError; +use crate::str::{self, Chars, Utf8Error}; +#[cfg(not(no_global_oom_handling))] +use crate::str::{from_boxed_utf8_unchecked, FromStr}; +use crate::vec::Vec; + +/// A UTF-8–encoded, growable string. +/// +/// The `String` type is the most common string type that has ownership over the +/// contents of the string. It has a close relationship with its borrowed +/// counterpart, the primitive [`str`]. 
+/// +/// # Examples +/// +/// You can create a `String` from [a literal string][`&str`] with [`String::from`]: +/// +/// [`String::from`]: From::from +/// +/// ``` +/// let hello = String::from("Hello, world!"); +/// ``` +/// +/// You can append a [`char`] to a `String` with the [`push`] method, and +/// append a [`&str`] with the [`push_str`] method: +/// +/// ``` +/// let mut hello = String::from("Hello, "); +/// +/// hello.push('w'); +/// hello.push_str("orld!"); +/// ``` +/// +/// [`push`]: String::push +/// [`push_str`]: String::push_str +/// +/// If you have a vector of UTF-8 bytes, you can create a `String` from it with +/// the [`from_utf8`] method: +/// +/// ``` +/// // some bytes, in a vector +/// let sparkle_heart = vec![240, 159, 146, 150]; +/// +/// // We know these bytes are valid, so we'll use `unwrap()`. +/// let sparkle_heart = String::from_utf8(sparkle_heart).unwrap(); +/// +/// assert_eq!("💖", sparkle_heart); +/// ``` +/// +/// [`from_utf8`]: String::from_utf8 +/// +/// # UTF-8 +/// +/// `String`s are always valid UTF-8. This has a few implications, the first of +/// which is that if you need a non-UTF-8 string, consider [`OsString`]. It is +/// similar, but without the UTF-8 constraint. The second implication is that +/// you cannot index into a `String`: +/// +/// ```compile_fail,E0277 +/// let s = "hello"; +/// +/// println!("The first letter of s is {}", s[0]); // ERROR!!! +/// ``` +/// +/// [`OsString`]: ../../std/ffi/struct.OsString.html "ffi::OsString" +/// +/// Indexing is intended to be a constant-time operation, but UTF-8 encoding +/// does not allow us to do this. Furthermore, it's not clear what sort of +/// thing the index should return: a byte, a codepoint, or a grapheme cluster. +/// The [`bytes`] and [`chars`] methods return iterators over the first +/// two, respectively. 
+/// +/// [`bytes`]: str::bytes +/// [`chars`]: str::chars +/// +/// # Deref +/// +/// `String` implements [Deref], and so inherits all of [`str`]'s +/// methods. In addition, this means that you can pass a `String` to a +/// function which takes a [`&str`] by using an ampersand (`&`): +/// +/// ``` +/// fn takes_str(s: &str) { } +/// +/// let s = String::from("Hello"); +/// +/// takes_str(&s); +/// ``` +/// +/// This will create a [`&str`] from the `String` and pass it in. This +/// conversion is very inexpensive, and so generally, functions will accept +/// [`&str`]s as arguments unless they need a `String` for some specific +/// reason. +/// +/// In certain cases Rust doesn't have enough information to make this +/// conversion, known as [`Deref`] coercion. In the following example a string +/// slice [`&'a str`][`&str`] implements the trait `TraitExample`, and the function +/// `example_func` takes anything that implements the trait. In this case Rust +/// would need to make two implicit conversions, which Rust doesn't have the +/// means to do. For that reason, the following example will not compile. +/// +/// ```compile_fail,E0277 +/// trait TraitExample {} +/// +/// impl<'a> TraitExample for &'a str {} +/// +/// fn example_func(example_arg: A) {} +/// +/// let example_string = String::from("example_string"); +/// example_func(&example_string); +/// ``` +/// +/// There are two options that would work instead. The first would be to +/// change the line `example_func(&example_string);` to +/// `example_func(example_string.as_str());`, using the method [`as_str()`] +/// to explicitly extract the string slice containing the string. The second +/// way changes `example_func(&example_string);` to +/// `example_func(&*example_string);`. In this case we are dereferencing a +/// `String` to a [`str`], then referencing the [`str`] back to +/// [`&str`]. 
The second way is more idiomatic, however both work to do the +/// conversion explicitly rather than relying on the implicit conversion. +/// +/// # Representation +/// +/// A `String` is made up of three components: a pointer to some bytes, a +/// length, and a capacity. The pointer points to an internal buffer `String` +/// uses to store its data. The length is the number of bytes currently stored +/// in the buffer, and the capacity is the size of the buffer in bytes. As such, +/// the length will always be less than or equal to the capacity. +/// +/// This buffer is always stored on the heap. +/// +/// You can look at these with the [`as_ptr`], [`len`], and [`capacity`] +/// methods: +/// +/// ``` +/// use std::mem; +/// +/// let story = String::from("Once upon a time..."); +/// +// FIXME Update this when vec_into_raw_parts is stabilized +/// // Prevent automatically dropping the String's data +/// let mut story = mem::ManuallyDrop::new(story); +/// +/// let ptr = story.as_mut_ptr(); +/// let len = story.len(); +/// let capacity = story.capacity(); +/// +/// // story has nineteen bytes +/// assert_eq!(19, len); +/// +/// // We can re-build a String out of ptr, len, and capacity. This is all +/// // unsafe because we are responsible for making sure the components are +/// // valid: +/// let s = unsafe { String::from_raw_parts(ptr, len, capacity) } ; +/// +/// assert_eq!(String::from("Once upon a time..."), s); +/// ``` +/// +/// [`as_ptr`]: str::as_ptr +/// [`len`]: String::len +/// [`capacity`]: String::capacity +/// +/// If a `String` has enough capacity, adding elements to it will not +/// re-allocate. 
For example, consider this program: +/// +/// ``` +/// let mut s = String::new(); +/// +/// println!("{}", s.capacity()); +/// +/// for _ in 0..5 { +/// s.push_str("hello"); +/// println!("{}", s.capacity()); +/// } +/// ``` +/// +/// This will output the following: +/// +/// ```text +/// 0 +/// 5 +/// 10 +/// 20 +/// 20 +/// 40 +/// ``` +/// +/// At first, we have no memory allocated at all, but as we append to the +/// string, it increases its capacity appropriately. If we instead use the +/// [`with_capacity`] method to allocate the correct capacity initially: +/// +/// ``` +/// let mut s = String::with_capacity(25); +/// +/// println!("{}", s.capacity()); +/// +/// for _ in 0..5 { +/// s.push_str("hello"); +/// println!("{}", s.capacity()); +/// } +/// ``` +/// +/// [`with_capacity`]: String::with_capacity +/// +/// We end up with a different output: +/// +/// ```text +/// 25 +/// 25 +/// 25 +/// 25 +/// 25 +/// 25 +/// ``` +/// +/// Here, there's no need to allocate more memory inside the loop. +/// +/// [str]: prim@str "str" +/// [`str`]: prim@str "str" +/// [`&str`]: prim@str "&str" +/// [Deref]: core::ops::Deref "ops::Deref" +/// [`Deref`]: core::ops::Deref "ops::Deref" +/// [`as_str()`]: String::as_str +#[derive(PartialOrd, Eq, Ord)] +#[cfg_attr(not(test), rustc_diagnostic_item = "String")] +#[stable(feature = "rust1", since = "1.0.0")] +pub struct String { + vec: Vec, +} + +/// A possible error value when converting a `String` from a UTF-8 byte vector. +/// +/// This type is the error type for the [`from_utf8`] method on [`String`]. It +/// is designed in such a way to carefully avoid reallocations: the +/// [`into_bytes`] method will give back the byte vector that was used in the +/// conversion attempt. +/// +/// [`from_utf8`]: String::from_utf8 +/// [`into_bytes`]: FromUtf8Error::into_bytes +/// +/// The [`Utf8Error`] type provided by [`std::str`] represents an error that may +/// occur when converting a slice of [`u8`]s to a [`&str`]. 
In this sense, it's +/// an analogue to `FromUtf8Error`, and you can get one from a `FromUtf8Error` +/// through the [`utf8_error`] method. +/// +/// [`Utf8Error`]: str::Utf8Error "std::str::Utf8Error" +/// [`std::str`]: core::str "std::str" +/// [`&str`]: prim@str "&str" +/// [`utf8_error`]: FromUtf8Error::utf8_error +/// +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// // some invalid bytes, in a vector +/// let bytes = vec![0, 159]; +/// +/// let value = String::from_utf8(bytes); +/// +/// assert!(value.is_err()); +/// assert_eq!(vec![0, 159], value.unwrap_err().into_bytes()); +/// ``` +#[stable(feature = "rust1", since = "1.0.0")] +#[cfg_attr(not(no_global_oom_handling), derive(Clone))] +#[derive(Debug, PartialEq, Eq)] +pub struct FromUtf8Error { + bytes: Vec, + error: Utf8Error, +} + +/// A possible error value when converting a `String` from a UTF-16 byte slice. +/// +/// This type is the error type for the [`from_utf16`] method on [`String`]. +/// +/// [`from_utf16`]: String::from_utf16 +/// # Examples +/// +/// Basic usage: +/// +/// ``` +/// // 𝄞muic +/// let v = &[0xD834, 0xDD1E, 0x006d, 0x0075, +/// 0xD800, 0x0069, 0x0063]; +/// +/// assert!(String::from_utf16(v).is_err()); +/// ``` +#[stable(feature = "rust1", since = "1.0.0")] +#[derive(Debug)] +pub struct FromUtf16Error(()); + +impl String { + /// Creates a new empty `String`. + /// + /// Given that the `String` is empty, this will not allocate any initial + /// buffer. While that means that this initial operation is very + /// inexpensive, it may cause excessive allocation later when you add + /// data. If you have an idea of how much data the `String` will hold, + /// consider the [`with_capacity`] method to prevent excessive + /// re-allocation. 
+ /// + /// [`with_capacity`]: String::with_capacity + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = String::new(); + /// ``` + #[inline] + #[rustc_const_stable(feature = "const_string_new", since = "1.39.0")] + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use] + pub const fn new() -> String { + String { vec: Vec::new() } + } + + /// Creates a new empty `String` with a particular capacity. + /// + /// `String`s have an internal buffer to hold their data. The capacity is + /// the length of that buffer, and can be queried with the [`capacity`] + /// method. This method creates an empty `String`, but one with an initial + /// buffer that can hold `capacity` bytes. This is useful when you may be + /// appending a bunch of data to the `String`, reducing the number of + /// reallocations it needs to do. + /// + /// [`capacity`]: String::capacity + /// + /// If the given capacity is `0`, no allocation will occur, and this method + /// is identical to the [`new`] method. + /// + /// [`new`]: String::new + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::with_capacity(10); + /// + /// // The String contains no chars, even though it has capacity for more + /// assert_eq!(s.len(), 0); + /// + /// // These are all done without reallocating... + /// let cap = s.capacity(); + /// for _ in 0..10 { + /// s.push('a'); + /// } + /// + /// assert_eq!(s.capacity(), cap); + /// + /// // ...but this may make the string reallocate + /// s.push('a'); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use] + pub fn with_capacity(capacity: usize) -> String { + String { vec: Vec::with_capacity(capacity) } + } + + // HACK(japaric): with cfg(test) the inherent `[T]::to_vec` method, which is + // required for this method definition, is not available. 
Since we don't + // require this method for testing purposes, I'll just stub it + // NB see the slice::hack module in slice.rs for more information + #[inline] + #[cfg(test)] + pub fn from_str(_: &str) -> String { + panic!("not available with cfg(test)"); + } + + /// Converts a vector of bytes to a `String`. + /// + /// A string ([`String`]) is made of bytes ([`u8`]), and a vector of bytes + /// ([`Vec`]) is made of bytes, so this function converts between the + /// two. Not all byte slices are valid `String`s, however: `String` + /// requires that it is valid UTF-8. `from_utf8()` checks to ensure that + /// the bytes are valid UTF-8, and then does the conversion. + /// + /// If you are sure that the byte slice is valid UTF-8, and you don't want + /// to incur the overhead of the validity check, there is an unsafe version + /// of this function, [`from_utf8_unchecked`], which has the same behavior + /// but skips the check. + /// + /// This method will take care to not copy the vector, for efficiency's + /// sake. + /// + /// If you need a [`&str`] instead of a `String`, consider + /// [`str::from_utf8`]. + /// + /// The inverse of this method is [`into_bytes`]. + /// + /// # Errors + /// + /// Returns [`Err`] if the slice is not UTF-8 with a description as to why the + /// provided bytes are not UTF-8. The vector you moved in is also included. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// // some bytes, in a vector + /// let sparkle_heart = vec![240, 159, 146, 150]; + /// + /// // We know these bytes are valid, so we'll use `unwrap()`. 
+ /// let sparkle_heart = String::from_utf8(sparkle_heart).unwrap(); + /// + /// assert_eq!("💖", sparkle_heart); + /// ``` + /// + /// Incorrect bytes: + /// + /// ``` + /// // some invalid bytes, in a vector + /// let sparkle_heart = vec![0, 159, 146, 150]; + /// + /// assert!(String::from_utf8(sparkle_heart).is_err()); + /// ``` + /// + /// See the docs for [`FromUtf8Error`] for more details on what you can do + /// with this error. + /// + /// [`from_utf8_unchecked`]: String::from_utf8_unchecked + /// [`Vec`]: crate::vec::Vec "Vec" + /// [`&str`]: prim@str "&str" + /// [`into_bytes`]: String::into_bytes + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn from_utf8(vec: Vec) -> Result { + match str::from_utf8(&vec) { + Ok(..) => Ok(String { vec }), + Err(e) => Err(FromUtf8Error { bytes: vec, error: e }), + } + } + + /// Converts a slice of bytes to a string, including invalid characters. + /// + /// Strings are made of bytes ([`u8`]), and a slice of bytes + /// ([`&[u8]`][byteslice]) is made of bytes, so this function converts + /// between the two. Not all byte slices are valid strings, however: strings + /// are required to be valid UTF-8. During this conversion, + /// `from_utf8_lossy()` will replace any invalid UTF-8 sequences with + /// [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD], which looks like this: � + /// + /// [byteslice]: prim@slice + /// [U+FFFD]: core::char::REPLACEMENT_CHARACTER + /// + /// If you are sure that the byte slice is valid UTF-8, and you don't want + /// to incur the overhead of the conversion, there is an unsafe version + /// of this function, [`from_utf8_unchecked`], which has the same behavior + /// but skips the checks. + /// + /// [`from_utf8_unchecked`]: String::from_utf8_unchecked + /// + /// This function returns a [`Cow<'a, str>`]. If our byte slice is invalid + /// UTF-8, then we need to insert the replacement characters, which will + /// change the size of the string, and hence, require a `String`. 
But if + /// it's already valid UTF-8, we don't need a new allocation. This return + /// type allows us to handle both cases. + /// + /// [`Cow<'a, str>`]: crate::borrow::Cow "borrow::Cow" + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// // some bytes, in a vector + /// let sparkle_heart = vec![240, 159, 146, 150]; + /// + /// let sparkle_heart = String::from_utf8_lossy(&sparkle_heart); + /// + /// assert_eq!("💖", sparkle_heart); + /// ``` + /// + /// Incorrect bytes: + /// + /// ``` + /// // some invalid bytes + /// let input = b"Hello \xF0\x90\x80World"; + /// let output = String::from_utf8_lossy(input); + /// + /// assert_eq!("Hello �World", output); + /// ``` + #[must_use] + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn from_utf8_lossy(v: &[u8]) -> Cow<'_, str> { + let mut iter = lossy::Utf8Lossy::from_bytes(v).chunks(); + + let first_valid = if let Some(chunk) = iter.next() { + let lossy::Utf8LossyChunk { valid, broken } = chunk; + if broken.is_empty() { + debug_assert_eq!(valid.len(), v.len()); + return Cow::Borrowed(valid); + } + valid + } else { + return Cow::Borrowed(""); + }; + + const REPLACEMENT: &str = "\u{FFFD}"; + + let mut res = String::with_capacity(v.len()); + res.push_str(first_valid); + res.push_str(REPLACEMENT); + + for lossy::Utf8LossyChunk { valid, broken } in iter { + res.push_str(valid); + if !broken.is_empty() { + res.push_str(REPLACEMENT); + } + } + + Cow::Owned(res) + } + + /// Decode a UTF-16–encoded vector `v` into a `String`, returning [`Err`] + /// if `v` contains any invalid data. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// // 𝄞music + /// let v = &[0xD834, 0xDD1E, 0x006d, 0x0075, + /// 0x0073, 0x0069, 0x0063]; + /// assert_eq!(String::from("𝄞music"), + /// String::from_utf16(v).unwrap()); + /// + /// // 𝄞mu<invalid>ic + /// let v = &[0xD834, 0xDD1E, 0x006d, 0x0075, + /// 0xD800, 0x0069, 0x0063]; + /// assert!(String::from_utf16(v).is_err()); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn from_utf16(v: &[u16]) -> Result { + // This isn't done via collect::<Result<_, _>>() for performance reasons. + // FIXME: the function can be simplified again when #48994 is closed. + let mut ret = String::with_capacity(v.len()); + for c in decode_utf16(v.iter().cloned()) { + if let Ok(c) = c { + ret.push(c); + } else { + return Err(FromUtf16Error(())); + } + } + Ok(ret) + } + + /// Decode a UTF-16–encoded slice `v` into a `String`, replacing + /// invalid data with [the replacement character (`U+FFFD`)][U+FFFD]. + /// + /// Unlike [`from_utf8_lossy`] which returns a [`Cow<'a, str>`], + /// `from_utf16_lossy` returns a `String` since the UTF-16 to UTF-8 + /// conversion requires a memory allocation. + /// + /// [`from_utf8_lossy`]: String::from_utf8_lossy + /// [`Cow<'a, str>`]: crate::borrow::Cow "borrow::Cow" + /// [U+FFFD]: core::char::REPLACEMENT_CHARACTER + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// // 𝄞mus<invalid>ic<invalid> + /// let v = &[0xD834, 0xDD1E, 0x006d, 0x0075, + /// 0x0073, 0xDD1E, 0x0069, 0x0063, + /// 0xD834]; + /// + /// assert_eq!(String::from("𝄞mus\u{FFFD}ic\u{FFFD}"), + /// String::from_utf16_lossy(v)); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[must_use] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn from_utf16_lossy(v: &[u16]) -> String { + decode_utf16(v.iter().cloned()).map(|r| r.unwrap_or(REPLACEMENT_CHARACTER)).collect() + } + + /// Decomposes a `String` into its raw components.
+ /// + /// Returns the raw pointer to the underlying data, the length of + /// the string (in bytes), and the allocated capacity of the data + /// (in bytes). These are the same arguments in the same order as + /// the arguments to [`from_raw_parts`]. + /// + /// After calling this function, the caller is responsible for the + /// memory previously managed by the `String`. The only way to do + /// this is to convert the raw pointer, length, and capacity back + /// into a `String` with the [`from_raw_parts`] function, allowing + /// the destructor to perform the cleanup. + /// + /// [`from_raw_parts`]: String::from_raw_parts + /// + /// # Examples + /// + /// ``` + /// #![feature(vec_into_raw_parts)] + /// let s = String::from("hello"); + /// + /// let (ptr, len, cap) = s.into_raw_parts(); + /// + /// let rebuilt = unsafe { String::from_raw_parts(ptr, len, cap) }; + /// assert_eq!(rebuilt, "hello"); + /// ``` + #[must_use = "`self` will be dropped if the result is not used"] + #[unstable(feature = "vec_into_raw_parts", reason = "new API", issue = "65816")] + pub fn into_raw_parts(self) -> (*mut u8, usize, usize) { + self.vec.into_raw_parts() + } + + /// Creates a new `String` from a length, capacity, and pointer. + /// + /// # Safety + /// + /// This is highly unsafe, due to the number of invariants that aren't + /// checked: + /// + /// * The memory at `buf` needs to have been previously allocated by the + /// same allocator the standard library uses, with a required alignment of exactly 1. + /// * `length` needs to be less than or equal to `capacity`. + /// * `capacity` needs to be the correct value. + /// * The first `length` bytes at `buf` need to be valid UTF-8. + /// + /// Violating these may cause problems like corrupting the allocator's + /// internal data structures. 
+ /// + /// The ownership of `buf` is effectively transferred to the + /// `String` which may then deallocate, reallocate or change the + /// contents of memory pointed to by the pointer at will. Ensure + /// that nothing else uses the pointer after calling this + /// function. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// use std::mem; + /// + /// unsafe { + /// let s = String::from("hello"); + /// + // FIXME Update this when vec_into_raw_parts is stabilized + /// // Prevent automatically dropping the String's data + /// let mut s = mem::ManuallyDrop::new(s); + /// + /// let ptr = s.as_mut_ptr(); + /// let len = s.len(); + /// let capacity = s.capacity(); + /// + /// let s = String::from_raw_parts(ptr, len, capacity); + /// + /// assert_eq!(String::from("hello"), s); + /// } + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub unsafe fn from_raw_parts(buf: *mut u8, length: usize, capacity: usize) -> String { + unsafe { String { vec: Vec::from_raw_parts(buf, length, capacity) } } + } + + /// Converts a vector of bytes to a `String` without checking that the + /// string contains valid UTF-8. + /// + /// See the safe version, [`from_utf8`], for more details. + /// + /// [`from_utf8`]: String::from_utf8 + /// + /// # Safety + /// + /// This function is unsafe because it does not check that the bytes passed + /// to it are valid UTF-8. If this constraint is violated, it may cause + /// memory unsafety issues with future users of the `String`, as the rest of + /// the standard library assumes that `String`s are valid UTF-8. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// // some bytes, in a vector + /// let sparkle_heart = vec![240, 159, 146, 150]; + /// + /// let sparkle_heart = unsafe { + /// String::from_utf8_unchecked(sparkle_heart) + /// }; + /// + /// assert_eq!("💖", sparkle_heart); + /// ``` + #[inline] + #[must_use] + #[stable(feature = "rust1", since = "1.0.0")] + pub unsafe fn from_utf8_unchecked(bytes: Vec) -> String { + String { vec: bytes } + } + + /// Converts a `String` into a byte vector. + /// + /// This consumes the `String`, so we do not need to copy its contents. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = String::from("hello"); + /// let bytes = s.into_bytes(); + /// + /// assert_eq!(&[104, 101, 108, 108, 111][..], &bytes[..]); + /// ``` + #[inline] + #[must_use = "`self` will be dropped if the result is not used"] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn into_bytes(self) -> Vec { + self.vec + } + + /// Extracts a string slice containing the entire `String`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = String::from("foo"); + /// + /// assert_eq!("foo", s.as_str()); + /// ``` + #[inline] + #[must_use] + #[stable(feature = "string_as_str", since = "1.7.0")] + pub fn as_str(&self) -> &str { + self + } + + /// Converts a `String` into a mutable string slice. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("foobar"); + /// let s_mut_str = s.as_mut_str(); + /// + /// s_mut_str.make_ascii_uppercase(); + /// + /// assert_eq!("FOOBAR", s_mut_str); + /// ``` + #[inline] + #[must_use] + #[stable(feature = "string_as_str", since = "1.7.0")] + pub fn as_mut_str(&mut self) -> &mut str { + self + } + + /// Appends a given string slice onto the end of this `String`. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("foo"); + /// + /// s.push_str("bar"); + /// + /// assert_eq!("foobar", s); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn push_str(&mut self, string: &str) { + self.vec.extend_from_slice(string.as_bytes()) + } + + /// Copies elements from `src` range to the end of the string. + /// + /// ## Panics + /// + /// Panics if the starting point or end point do not lie on a [`char`] + /// boundary, or if they're out of bounds. + /// + /// ## Examples + /// + /// ``` + /// #![feature(string_extend_from_within)] + /// let mut string = String::from("abcde"); + /// + /// string.extend_from_within(2..); + /// assert_eq!(string, "abcdecde"); + /// + /// string.extend_from_within(..2); + /// assert_eq!(string, "abcdecdeab"); + /// + /// string.extend_from_within(4..8); + /// assert_eq!(string, "abcdecdeabecde"); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "string_extend_from_within", issue = "none")] + pub fn extend_from_within(&mut self, src: R) + where + R: RangeBounds, + { + let src @ Range { start, end } = slice::range(src, ..self.len()); + + assert!(self.is_char_boundary(start)); + assert!(self.is_char_boundary(end)); + + self.vec.extend_from_within(src); + } + + /// Returns this `String`'s capacity, in bytes. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = String::with_capacity(10); + /// + /// assert!(s.capacity() >= 10); + /// ``` + #[inline] + #[must_use] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn capacity(&self) -> usize { + self.vec.capacity() + } + + /// Ensures that this `String`'s capacity is at least `additional` bytes + /// larger than its length. + /// + /// The capacity may be increased by more than `additional` bytes if it + /// chooses, to prevent frequent reallocations. 
+ /// + /// If you do not want this "at least" behavior, see the [`reserve_exact`] + /// method. + /// + /// # Panics + /// + /// Panics if the new capacity overflows [`usize`]. + /// + /// [`reserve_exact`]: String::reserve_exact + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::new(); + /// + /// s.reserve(10); + /// + /// assert!(s.capacity() >= 10); + /// ``` + /// + /// This might not actually increase the capacity: + /// + /// ``` + /// let mut s = String::with_capacity(10); + /// s.push('a'); + /// s.push('b'); + /// + /// // s now has a length of 2 and a capacity of 10 + /// assert_eq!(2, s.len()); + /// assert_eq!(10, s.capacity()); + /// + /// // Since we already have an extra 8 capacity, calling this... + /// s.reserve(8); + /// + /// // ... doesn't actually increase. + /// assert_eq!(10, s.capacity()); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn reserve(&mut self, additional: usize) { + self.vec.reserve(additional) + } + + /// Ensures that this `String`'s capacity is `additional` bytes + /// larger than its length. + /// + /// Consider using the [`reserve`] method unless you absolutely know + /// better than the allocator. + /// + /// [`reserve`]: String::reserve + /// + /// # Panics + /// + /// Panics if the new capacity overflows `usize`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::new(); + /// + /// s.reserve_exact(10); + /// + /// assert!(s.capacity() >= 10); + /// ``` + /// + /// This might not actually increase the capacity: + /// + /// ``` + /// let mut s = String::with_capacity(10); + /// s.push('a'); + /// s.push('b'); + /// + /// // s now has a length of 2 and a capacity of 10 + /// assert_eq!(2, s.len()); + /// assert_eq!(10, s.capacity()); + /// + /// // Since we already have an extra 8 capacity, calling this... + /// s.reserve_exact(8); + /// + /// // ... doesn't actually increase. 
+ /// assert_eq!(10, s.capacity()); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn reserve_exact(&mut self, additional: usize) { + self.vec.reserve_exact(additional) + } + + /// Tries to reserve capacity for at least `additional` more bytes to be inserted + /// in the given `String`. The collection may reserve more space to avoid + /// frequent reallocations. After calling `try_reserve`, capacity will be + /// greater than or equal to `self.len() + additional`. Does nothing if + /// capacity is already sufficient. + /// + /// # Errors + /// + /// If the capacity overflows, or the allocator reports a failure, then an error + /// is returned. + /// + /// # Examples + /// + /// ``` + /// use std::collections::TryReserveError; + /// + /// fn process_data(data: &str) -> Result<String, TryReserveError> { + /// let mut output = String::new(); + /// + /// // Pre-reserve the memory, exiting if we can't + /// output.try_reserve(data.len())?; + /// + /// // Now we know this can't OOM in the middle of our complex work + /// output.push_str(data); + /// + /// Ok(output) + /// } + /// # process_data("rust").expect("why is the test harness OOMing on 4 bytes?"); + /// ``` + #[stable(feature = "try_reserve", since = "1.57.0")] + pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.vec.try_reserve(additional) + } + + /// Tries to reserve the minimum capacity for exactly `additional` more bytes to + /// be inserted in the given `String`. After calling `try_reserve_exact`, + /// capacity will be greater than or equal to `self.len() + additional`. + /// Does nothing if the capacity is already sufficient. + /// + /// Note that the allocator may give the collection more space than it + /// requests. Therefore, capacity can not be relied upon to be precisely + /// minimal. Prefer [`try_reserve`] if future insertions are expected.
+ /// + /// [`try_reserve`]: String::try_reserve + /// + /// # Errors + /// + /// If the capacity overflows, or the allocator reports a failure, then an error + /// is returned. + /// + /// # Examples + /// + /// ``` + /// use std::collections::TryReserveError; + /// + /// fn process_data(data: &str) -> Result<String, TryReserveError> { + /// let mut output = String::new(); + /// + /// // Pre-reserve the memory, exiting if we can't + /// output.try_reserve_exact(data.len())?; + /// + /// // Now we know this can't OOM in the middle of our complex work + /// output.push_str(data); + /// + /// Ok(output) + /// } + /// # process_data("rust").expect("why is the test harness OOMing on 4 bytes?"); + /// ``` + #[stable(feature = "try_reserve", since = "1.57.0")] + pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> { + self.vec.try_reserve_exact(additional) + } + + /// Shrinks the capacity of this `String` to match its length. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("foo"); + /// + /// s.reserve(100); + /// assert!(s.capacity() >= 100); + /// + /// s.shrink_to_fit(); + /// assert_eq!(3, s.capacity()); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn shrink_to_fit(&mut self) { + self.vec.shrink_to_fit() + } + + /// Shrinks the capacity of this `String` with a lower bound. + /// + /// The capacity will remain at least as large as both the length + /// and the supplied value. + /// + /// If the current capacity is less than the lower limit, this is a no-op.
+ /// + /// # Examples + /// + /// ``` + /// let mut s = String::from("foo"); + /// + /// s.reserve(100); + /// assert!(s.capacity() >= 100); + /// + /// s.shrink_to(10); + /// assert!(s.capacity() >= 10); + /// s.shrink_to(0); + /// assert!(s.capacity() >= 3); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "shrink_to", since = "1.56.0")] + pub fn shrink_to(&mut self, min_capacity: usize) { + self.vec.shrink_to(min_capacity) + } + + /// Appends the given [`char`] to the end of this `String`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("abc"); + /// + /// s.push('1'); + /// s.push('2'); + /// s.push('3'); + /// + /// assert_eq!("abc123", s); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn push(&mut self, ch: char) { + match ch.len_utf8() { + 1 => self.vec.push(ch as u8), + _ => self.vec.extend_from_slice(ch.encode_utf8(&mut [0; 4]).as_bytes()), + } + } + + /// Returns a byte slice of this `String`'s contents. + /// + /// The inverse of this method is [`from_utf8`]. + /// + /// [`from_utf8`]: String::from_utf8 + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = String::from("hello"); + /// + /// assert_eq!(&[104, 101, 108, 108, 111], s.as_bytes()); + /// ``` + #[inline] + #[must_use] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn as_bytes(&self) -> &[u8] { + &self.vec + } + + /// Shortens this `String` to the specified length. + /// + /// If `new_len` is greater than the string's current length, this has no + /// effect. + /// + /// Note that this method has no effect on the allocated capacity + /// of the string + /// + /// # Panics + /// + /// Panics if `new_len` does not lie on a [`char`] boundary. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("hello"); + /// + /// s.truncate(2); + /// + /// assert_eq!("he", s); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn truncate(&mut self, new_len: usize) { + if new_len <= self.len() { + assert!(self.is_char_boundary(new_len)); + self.vec.truncate(new_len) + } + } + + /// Removes the last character from the string buffer and returns it. + /// + /// Returns [`None`] if this `String` is empty. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("foo"); + /// + /// assert_eq!(s.pop(), Some('o')); + /// assert_eq!(s.pop(), Some('o')); + /// assert_eq!(s.pop(), Some('f')); + /// + /// assert_eq!(s.pop(), None); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn pop(&mut self) -> Option { + let ch = self.chars().rev().next()?; + let newlen = self.len() - ch.len_utf8(); + unsafe { + self.vec.set_len(newlen); + } + Some(ch) + } + + /// Removes a [`char`] from this `String` at a byte position and returns it. + /// + /// This is an *O*(*n*) operation, as it requires copying every element in the + /// buffer. + /// + /// # Panics + /// + /// Panics if `idx` is larger than or equal to the `String`'s length, + /// or if it does not lie on a [`char`] boundary. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("foo"); + /// + /// assert_eq!(s.remove(0), 'f'); + /// assert_eq!(s.remove(1), 'o'); + /// assert_eq!(s.remove(0), 'o'); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn remove(&mut self, idx: usize) -> char { + let ch = match self[idx..].chars().next() { + Some(ch) => ch, + None => panic!("cannot remove a char from the end of a string"), + }; + + let next = idx + ch.len_utf8(); + let len = self.len(); + unsafe { + ptr::copy(self.vec.as_ptr().add(next), self.vec.as_mut_ptr().add(idx), len - next); + self.vec.set_len(len - (next - idx)); + } + ch + } + + /// Remove all matches of pattern `pat` in the `String`. + /// + /// # Examples + /// + /// ``` + /// #![feature(string_remove_matches)] + /// let mut s = String::from("Trees are not green, the sky is not blue."); + /// s.remove_matches("not "); + /// assert_eq!("Trees are green, the sky is blue.", s); + /// ``` + /// + /// Matches will be detected and removed iteratively, so in cases where + /// patterns overlap, only the first pattern will be removed: + /// + /// ``` + /// #![feature(string_remove_matches)] + /// let mut s = String::from("banana"); + /// s.remove_matches("ana"); + /// assert_eq!("bna", s); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[unstable(feature = "string_remove_matches", reason = "new API", issue = "72826")] + pub fn remove_matches<'a, P>(&'a mut self, pat: P) + where + P: for<'x> Pattern<'x>, + { + use core::str::pattern::Searcher; + + let rejections = { + let mut searcher = pat.into_searcher(self); + // Per Searcher::next: + // + // A Match result needs to contain the whole matched pattern, + // however Reject results may be split up into arbitrary many + // adjacent fragments. Both ranges may have zero length. 
+ // + // In practice the implementation of Searcher::next_match tends to + // be more efficient, so we use it here and do some work to invert + // matches into rejections since that's what we want to copy below. + let mut front = 0; + let rejections: Vec<_> = from_fn(|| { + let (start, end) = searcher.next_match()?; + let prev_front = front; + front = end; + Some((prev_front, start)) + }) + .collect(); + rejections.into_iter().chain(core::iter::once((front, self.len()))) + }; + + let mut len = 0; + let ptr = self.vec.as_mut_ptr(); + + for (start, end) in rejections { + let count = end - start; + if start != len { + // SAFETY: per Searcher::next: + // + // The stream of Match and Reject values up to a Done will + // contain index ranges that are adjacent, non-overlapping, + // covering the whole haystack, and laying on utf8 + // boundaries. + unsafe { + ptr::copy(ptr.add(start), ptr.add(len), count); + } + } + len += count; + } + + unsafe { + self.vec.set_len(len); + } + } + + /// Retains only the characters specified by the predicate. + /// + /// In other words, remove all characters `c` such that `f(c)` returns `false`. + /// This method operates in place, visiting each character exactly once in the + /// original order, and preserves the order of the retained characters. + /// + /// # Examples + /// + /// ``` + /// let mut s = String::from("f_o_ob_ar"); + /// + /// s.retain(|c| c != '_'); + /// + /// assert_eq!(s, "foobar"); + /// ``` + /// + /// Because the elements are visited exactly once in the original order, + /// external state may be used to decide which elements to keep. 
+ /// + /// ``` + /// let mut s = String::from("abcde"); + /// let keep = [false, true, true, false, true]; + /// let mut iter = keep.iter(); + /// s.retain(|_| *iter.next().unwrap()); + /// assert_eq!(s, "bce"); + /// ``` + #[inline] + #[stable(feature = "string_retain", since = "1.26.0")] + pub fn retain(&mut self, mut f: F) + where + F: FnMut(char) -> bool, + { + struct SetLenOnDrop<'a> { + s: &'a mut String, + idx: usize, + del_bytes: usize, + } + + impl<'a> Drop for SetLenOnDrop<'a> { + fn drop(&mut self) { + let new_len = self.idx - self.del_bytes; + debug_assert!(new_len <= self.s.len()); + unsafe { self.s.vec.set_len(new_len) }; + } + } + + let len = self.len(); + let mut guard = SetLenOnDrop { s: self, idx: 0, del_bytes: 0 }; + + while guard.idx < len { + let ch = unsafe { guard.s.get_unchecked(guard.idx..len).chars().next().unwrap() }; + let ch_len = ch.len_utf8(); + + if !f(ch) { + guard.del_bytes += ch_len; + } else if guard.del_bytes > 0 { + unsafe { + ptr::copy( + guard.s.vec.as_ptr().add(guard.idx), + guard.s.vec.as_mut_ptr().add(guard.idx - guard.del_bytes), + ch_len, + ); + } + } + + // Point idx to the next char + guard.idx += ch_len; + } + + drop(guard); + } + + /// Inserts a character into this `String` at a byte position. + /// + /// This is an *O*(*n*) operation as it requires copying every element in the + /// buffer. + /// + /// # Panics + /// + /// Panics if `idx` is larger than the `String`'s length, or if it does not + /// lie on a [`char`] boundary. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::with_capacity(3); + /// + /// s.insert(0, 'f'); + /// s.insert(1, 'o'); + /// s.insert(2, 'o'); + /// + /// assert_eq!("foo", s); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn insert(&mut self, idx: usize, ch: char) { + assert!(self.is_char_boundary(idx)); + let mut bits = [0; 4]; + let bits = ch.encode_utf8(&mut bits).as_bytes(); + + unsafe { + self.insert_bytes(idx, bits); + } + } + + #[cfg(not(no_global_oom_handling))] + unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) { + let len = self.len(); + let amt = bytes.len(); + self.vec.reserve(amt); + + unsafe { + ptr::copy(self.vec.as_ptr().add(idx), self.vec.as_mut_ptr().add(idx + amt), len - idx); + ptr::copy_nonoverlapping(bytes.as_ptr(), self.vec.as_mut_ptr().add(idx), amt); + self.vec.set_len(len + amt); + } + } + + /// Inserts a string slice into this `String` at a byte position. + /// + /// This is an *O*(*n*) operation as it requires copying every element in the + /// buffer. + /// + /// # Panics + /// + /// Panics if `idx` is larger than the `String`'s length, or if it does not + /// lie on a [`char`] boundary. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("bar"); + /// + /// s.insert_str(0, "foo"); + /// + /// assert_eq!("foobar", s); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "insert_str", since = "1.16.0")] + pub fn insert_str(&mut self, idx: usize, string: &str) { + assert!(self.is_char_boundary(idx)); + + unsafe { + self.insert_bytes(idx, string.as_bytes()); + } + } + + /// Returns a mutable reference to the contents of this `String`. + /// + /// # Safety + /// + /// This function is unsafe because the returned `&mut Vec` allows writing + /// bytes which are not valid UTF-8. 
If this constraint is violated, using + /// the original `String` after dropping the `&mut Vec` may violate memory + /// safety, as the rest of the standard library assumes that `String`s are + /// valid UTF-8. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("hello"); + /// + /// unsafe { + /// let vec = s.as_mut_vec(); + /// assert_eq!(&[104, 101, 108, 108, 111][..], &vec[..]); + /// + /// vec.reverse(); + /// } + /// assert_eq!(s, "olleh"); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub unsafe fn as_mut_vec(&mut self) -> &mut Vec { + &mut self.vec + } + + /// Returns the length of this `String`, in bytes, not [`char`]s or + /// graphemes. In other words, it might not be what a human considers the + /// length of the string. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let a = String::from("foo"); + /// assert_eq!(a.len(), 3); + /// + /// let fancy_f = String::from("ƒoo"); + /// assert_eq!(fancy_f.len(), 4); + /// assert_eq!(fancy_f.chars().count(), 3); + /// ``` + #[inline] + #[must_use] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn len(&self) -> usize { + self.vec.len() + } + + /// Returns `true` if this `String` has a length of zero, and `false` otherwise. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut v = String::new(); + /// assert!(v.is_empty()); + /// + /// v.push('a'); + /// assert!(!v.is_empty()); + /// ``` + #[inline] + #[must_use] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Splits the string into two at the given byte index. + /// + /// Returns a newly allocated `String`. `self` contains bytes `[0, at)`, and + /// the returned `String` contains bytes `[at, len)`. `at` must be on the + /// boundary of a UTF-8 code point. + /// + /// Note that the capacity of `self` does not change. 
+ /// + /// # Panics + /// + /// Panics if `at` is not on a `UTF-8` code point boundary, or if it is beyond the last + /// code point of the string. + /// + /// # Examples + /// + /// ``` + /// # fn main() { + /// let mut hello = String::from("Hello, World!"); + /// let world = hello.split_off(7); + /// assert_eq!(hello, "Hello, "); + /// assert_eq!(world, "World!"); + /// # } + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "string_split_off", since = "1.16.0")] + #[must_use = "use `.truncate()` if you don't need the other half"] + pub fn split_off(&mut self, at: usize) -> String { + assert!(self.is_char_boundary(at)); + let other = self.vec.split_off(at); + unsafe { String::from_utf8_unchecked(other) } + } + + /// Truncates this `String`, removing all contents. + /// + /// While this means the `String` will have a length of zero, it does not + /// touch its capacity. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("foo"); + /// + /// s.clear(); + /// + /// assert!(s.is_empty()); + /// assert_eq!(0, s.len()); + /// assert_eq!(3, s.capacity()); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn clear(&mut self) { + self.vec.clear() + } + + /// Removes the specified range from the string in bulk, returning all + /// removed characters as an iterator. + /// + /// The returned iterator keeps a mutable borrow on the string to optimize + /// its implementation. + /// + /// # Panics + /// + /// Panics if the starting point or end point do not lie on a [`char`] + /// boundary, or if they're out of bounds. + /// + /// # Leaking + /// + /// If the returned iterator goes out of scope without being dropped (due to + /// [`core::mem::forget`], for example), the string may still contain a copy + /// of any drained characters, or may have lost characters arbitrarily, + /// including characters outside the range. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("α is alpha, β is beta"); + /// let beta_offset = s.find('β').unwrap_or(s.len()); + /// + /// // Remove the range up until the β from the string + /// let t: String = s.drain(..beta_offset).collect(); + /// assert_eq!(t, "α is alpha, "); + /// assert_eq!(s, "β is beta"); + /// + /// // A full range clears the string, like `clear()` does + /// s.drain(..); + /// assert_eq!(s, ""); + /// ``` + #[stable(feature = "drain", since = "1.6.0")] + pub fn drain(&mut self, range: R) -> Drain<'_> + where + R: RangeBounds, + { + // Memory safety + // + // The String version of Drain does not have the memory safety issues + // of the vector version. The data is just plain bytes. + // Because the range removal happens in Drop, if the Drain iterator is leaked, + // the removal will not happen. + let Range { start, end } = slice::range(range, ..self.len()); + assert!(self.is_char_boundary(start)); + assert!(self.is_char_boundary(end)); + + // Take out two simultaneous borrows. The &mut String won't be accessed + // until iteration is over, in Drop. + let self_ptr = self as *mut _; + // SAFETY: `slice::range` and `is_char_boundary` do the appropriate bounds checks. + let chars_iter = unsafe { self.get_unchecked(start..end) }.chars(); + + Drain { start, end, iter: chars_iter, string: self_ptr } + } + + /// Removes the specified range in the string, + /// and replaces it with the given string. + /// The given string doesn't need to be the same length as the range. + /// + /// # Panics + /// + /// Panics if the starting point or end point do not lie on a [`char`] + /// boundary, or if they're out of bounds. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let mut s = String::from("α is alpha, β is beta"); + /// let beta_offset = s.find('β').unwrap_or(s.len()); + /// + /// // Replace the range up until the β from the string + /// s.replace_range(..beta_offset, "Α is capital alpha; "); + /// assert_eq!(s, "Α is capital alpha; β is beta"); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "splice", since = "1.27.0")] + pub fn replace_range(&mut self, range: R, replace_with: &str) + where + R: RangeBounds, + { + // Memory safety + // + // `replace_range` does not have the memory safety issues of a vector `Splice`: + // the data is just plain bytes. + + // WARNING: Inlining this variable would be unsound (#81138) + let start = range.start_bound(); + match start { + Included(&n) => assert!(self.is_char_boundary(n)), + Excluded(&n) => assert!(self.is_char_boundary(n + 1)), + Unbounded => {} + }; + // WARNING: Inlining this variable would be unsound (#81138) + let end = range.end_bound(); + match end { + Included(&n) => assert!(self.is_char_boundary(n + 1)), + Excluded(&n) => assert!(self.is_char_boundary(n)), + Unbounded => {} + }; + + // Using `range` again would be unsound (#81138) + // We assume the bounds reported by `range` remain the same, but + // an adversarial implementation could change between calls + unsafe { self.as_mut_vec() }.splice((start, end), replace_with.bytes()); + } + + /// Converts this `String` into a [Box]<[str]>. + /// + /// This will drop any excess capacity. 
+ /// + /// [str]: prim@str "str" + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s = String::from("hello"); + /// + /// let b = s.into_boxed_str(); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "box_str", since = "1.4.0")] + #[must_use = "`self` will be dropped if the result is not used"] + #[inline] + pub fn into_boxed_str(self) -> Box { + let slice = self.vec.into_boxed_slice(); + unsafe { from_boxed_utf8_unchecked(slice) } + } +} + +impl FromUtf8Error { + /// Returns a slice of the [`u8`] bytes whose conversion to a `String` was attempted. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// // some invalid bytes, in a vector + /// let bytes = vec![0, 159]; + /// + /// let value = String::from_utf8(bytes); + /// + /// assert_eq!(&[0, 159], value.unwrap_err().as_bytes()); + /// ``` + #[must_use] + #[stable(feature = "from_utf8_error_as_bytes", since = "1.26.0")] + pub fn as_bytes(&self) -> &[u8] { + &self.bytes[..] + } + + /// Returns the bytes whose conversion to a `String` was attempted. + /// + /// This method is carefully constructed to avoid allocation. It will + /// consume the error, moving out the bytes, so that a copy of the bytes + /// does not need to be made. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// // some invalid bytes, in a vector + /// let bytes = vec![0, 159]; + /// + /// let value = String::from_utf8(bytes); + /// + /// assert_eq!(vec![0, 159], value.unwrap_err().into_bytes()); + /// ``` + #[must_use = "`self` will be dropped if the result is not used"] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn into_bytes(self) -> Vec { + self.bytes + } + + /// Fetch a `Utf8Error` to get more details about the conversion failure. + /// + /// The [`Utf8Error`] type provided by [`std::str`] represents an error that may + /// occur when converting a slice of [`u8`]s to a [`&str`]. In this sense, it's + /// an analogue to `FromUtf8Error`. 
See its documentation for more details + /// on using it. + /// + /// [`std::str`]: core::str "std::str" + /// [`&str`]: prim@str "&str" + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// // some invalid bytes, in a vector + /// let bytes = vec![0, 159]; + /// + /// let error = String::from_utf8(bytes).unwrap_err().utf8_error(); + /// + /// // the first byte is valid, so the error starts at index 1 + /// assert_eq!(1, error.valid_up_to()); + /// ``` + #[must_use] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn utf8_error(&self) -> Utf8Error { + self.error + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Display for FromUtf8Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.error, f) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Display for FromUtf16Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt("invalid utf-16: lone surrogate found", f) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Clone for String { + fn clone(&self) -> Self { + String { vec: self.vec.clone() } + } + + fn clone_from(&mut self, source: &Self) { + self.vec.clone_from(&source.vec); + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl FromIterator for String { + fn from_iter>(iter: I) -> String { + let mut buf = String::new(); + buf.extend(iter); + buf + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "string_from_iter_by_ref", since = "1.17.0")] +impl<'a> FromIterator<&'a char> for String { + fn from_iter>(iter: I) -> String { + let mut buf = String::new(); + buf.extend(iter); + buf + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> FromIterator<&'a str> for String { + fn from_iter>(iter: I) -> String { + let mut buf = String::new(); + buf.extend(iter); + buf + } +} + 
+#[cfg(not(no_global_oom_handling))] 
+#[stable(feature = "extend_string", since = "1.4.0")] +impl FromIterator for String { + fn from_iter>(iter: I) -> String { + let mut iterator = iter.into_iter(); + + // Because we're iterating over `String`s, we can avoid at least + // one allocation by getting the first string from the iterator + // and appending to it all the subsequent strings. + match iterator.next() { + None => String::new(), + Some(mut buf) => { + buf.extend(iterator); + buf + } + } + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_str2", since = "1.45.0")] +impl FromIterator> for String { + fn from_iter>>(iter: I) -> String { + let mut buf = String::new(); + buf.extend(iter); + buf + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "herd_cows", since = "1.19.0")] +impl<'a> FromIterator> for String { + fn from_iter>>(iter: I) -> String { + let mut iterator = iter.into_iter(); + + // Because we're iterating over CoWs, we can (potentially) avoid at least + // one allocation by getting the first item and appending to it all the + // subsequent items. 
+ match iterator.next() { + None => String::new(), + Some(cow) => { + let mut buf = cow.into_owned(); + buf.extend(iterator); + buf + } + } + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Extend for String { + fn extend>(&mut self, iter: I) { + let iterator = iter.into_iter(); + let (lower_bound, _) = iterator.size_hint(); + self.reserve(lower_bound); + iterator.for_each(move |c| self.push(c)); + } + + #[inline] + fn extend_one(&mut self, c: char) { + self.push(c); + } + + #[inline] + fn extend_reserve(&mut self, additional: usize) { + self.reserve(additional); + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "extend_ref", since = "1.2.0")] +impl<'a> Extend<&'a char> for String { + fn extend>(&mut self, iter: I) { + self.extend(iter.into_iter().cloned()); + } + + #[inline] + fn extend_one(&mut self, &c: &'a char) { + self.push(c); + } + + #[inline] + fn extend_reserve(&mut self, additional: usize) { + self.reserve(additional); + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> Extend<&'a str> for String { + fn extend>(&mut self, iter: I) { + iter.into_iter().for_each(move |s| self.push_str(s)); + } + + #[inline] + fn extend_one(&mut self, s: &'a str) { + self.push_str(s); + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_str2", since = "1.45.0")] +impl Extend> for String { + fn extend>>(&mut self, iter: I) { + iter.into_iter().for_each(move |s| self.push_str(&s)); + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "extend_string", since = "1.4.0")] +impl Extend for String { + fn extend>(&mut self, iter: I) { + iter.into_iter().for_each(move |s| self.push_str(&s)); + } + + #[inline] + fn extend_one(&mut self, s: String) { + self.push_str(&s); + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "herd_cows", since = "1.19.0")] +impl<'a> Extend> for String { + fn extend>>(&mut self, iter: I) { + 
iter.into_iter().for_each(move |s| self.push_str(&s)); + } + + #[inline] + fn extend_one(&mut self, s: Cow<'a, str>) { + self.push_str(&s); + } +} + +/// A convenience impl that delegates to the impl for `&str`. +/// +/// # Examples +/// +/// ``` +/// assert_eq!(String::from("Hello world").find("world"), Some(6)); +/// ``` +#[unstable( + feature = "pattern", + reason = "API not fully fleshed out and ready to be stabilized", + issue = "27721" +)] +impl<'a, 'b> Pattern<'a> for &'b String { + type Searcher = <&'b str as Pattern<'a>>::Searcher; + + fn into_searcher(self, haystack: &'a str) -> <&'b str as Pattern<'a>>::Searcher { + self[..].into_searcher(haystack) + } + + #[inline] + fn is_contained_in(self, haystack: &'a str) -> bool { + self[..].is_contained_in(haystack) + } + + #[inline] + fn is_prefix_of(self, haystack: &'a str) -> bool { + self[..].is_prefix_of(haystack) + } + + #[inline] + fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> { + self[..].strip_prefix_of(haystack) + } + + #[inline] + fn is_suffix_of(self, haystack: &'a str) -> bool { + self[..].is_suffix_of(haystack) + } + + #[inline] + fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> { + self[..].strip_suffix_of(haystack) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl PartialEq for String { + #[inline] + fn eq(&self, other: &String) -> bool { + PartialEq::eq(&self[..], &other[..]) + } + #[inline] + fn ne(&self, other: &String) -> bool { + PartialEq::ne(&self[..], &other[..]) + } +} + +macro_rules! 
impl_eq { + ($lhs:ty, $rhs: ty) => { + #[stable(feature = "rust1", since = "1.0.0")] + #[allow(unused_lifetimes)] + impl<'a, 'b> PartialEq<$rhs> for $lhs { + #[inline] + fn eq(&self, other: &$rhs) -> bool { + PartialEq::eq(&self[..], &other[..]) + } + #[inline] + fn ne(&self, other: &$rhs) -> bool { + PartialEq::ne(&self[..], &other[..]) + } + } + + #[stable(feature = "rust1", since = "1.0.0")] + #[allow(unused_lifetimes)] + impl<'a, 'b> PartialEq<$lhs> for $rhs { + #[inline] + fn eq(&self, other: &$lhs) -> bool { + PartialEq::eq(&self[..], &other[..]) + } + #[inline] + fn ne(&self, other: &$lhs) -> bool { + PartialEq::ne(&self[..], &other[..]) + } + } + }; +} + +impl_eq! { String, str } +impl_eq! { String, &'a str } +#[cfg(not(no_global_oom_handling))] +impl_eq! { Cow<'a, str>, str } +#[cfg(not(no_global_oom_handling))] +impl_eq! { Cow<'a, str>, &'b str } +#[cfg(not(no_global_oom_handling))] +impl_eq! { Cow<'a, str>, String } + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_unstable(feature = "const_default_impls", issue = "87864")] +impl const Default for String { + /// Creates an empty `String`. + #[inline] + fn default() -> String { + String::new() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Display for String { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&**self, f) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Debug for String { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&**self, f) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl hash::Hash for String { + #[inline] + fn hash(&self, hasher: &mut H) { + (**self).hash(hasher) + } +} + +/// Implements the `+` operator for concatenating two strings. +/// +/// This consumes the `String` on the left-hand side and re-uses its buffer (growing it if +/// necessary). 
This is done to avoid allocating a new `String` and copying the entire contents on +/// every operation, which would lead to *O*(*n*^2) running time when building an *n*-byte string by +/// repeated concatenation. +/// +/// The string on the right-hand side is only borrowed; its contents are copied into the returned +/// `String`. +/// +/// # Examples +/// +/// Concatenating two `String`s takes the first by value and borrows the second: +/// +/// ``` +/// let a = String::from("hello"); +/// let b = String::from(" world"); +/// let c = a + &b; +/// // `a` is moved and can no longer be used here. +/// ``` +/// +/// If you want to keep using the first `String`, you can clone it and append to the clone instead: +/// +/// ``` +/// let a = String::from("hello"); +/// let b = String::from(" world"); +/// let c = a.clone() + &b; +/// // `a` is still valid here. +/// ``` +/// +/// Concatenating `&str` slices can be done by converting the first to a `String`: +/// +/// ``` +/// let a = "hello"; +/// let b = " world"; +/// let c = a.to_string() + b; +/// ``` +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Add<&str> for String { + type Output = String; + + #[inline] + fn add(mut self, other: &str) -> String { + self.push_str(other); + self + } +} + +/// Implements the `+=` operator for appending to a `String`. +/// +/// This has the same behavior as the [`push_str`][String::push_str] method. 
+#[cfg(not(no_global_oom_handling))] +#[stable(feature = "stringaddassign", since = "1.12.0")] +impl AddAssign<&str> for String { + #[inline] + fn add_assign(&mut self, other: &str) { + self.push_str(other); + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::Index> for String { + type Output = str; + + #[inline] + fn index(&self, index: ops::Range) -> &str { + &self[..][index] + } +} +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::Index> for String { + type Output = str; + + #[inline] + fn index(&self, index: ops::RangeTo) -> &str { + &self[..][index] + } +} +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::Index> for String { + type Output = str; + + #[inline] + fn index(&self, index: ops::RangeFrom) -> &str { + &self[..][index] + } +} +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::Index for String { + type Output = str; + + #[inline] + fn index(&self, _index: ops::RangeFull) -> &str { + unsafe { str::from_utf8_unchecked(&self.vec) } + } +} +#[stable(feature = "inclusive_range", since = "1.26.0")] +impl ops::Index> for String { + type Output = str; + + #[inline] + fn index(&self, index: ops::RangeInclusive) -> &str { + Index::index(&**self, index) + } +} +#[stable(feature = "inclusive_range", since = "1.26.0")] +impl ops::Index> for String { + type Output = str; + + #[inline] + fn index(&self, index: ops::RangeToInclusive) -> &str { + Index::index(&**self, index) + } +} + +#[stable(feature = "derefmut_for_string", since = "1.3.0")] +impl ops::IndexMut> for String { + #[inline] + fn index_mut(&mut self, index: ops::Range) -> &mut str { + &mut self[..][index] + } +} +#[stable(feature = "derefmut_for_string", since = "1.3.0")] +impl ops::IndexMut> for String { + #[inline] + fn index_mut(&mut self, index: ops::RangeTo) -> &mut str { + &mut self[..][index] + } +} +#[stable(feature = "derefmut_for_string", since = "1.3.0")] +impl ops::IndexMut> for String { + #[inline] + fn index_mut(&mut self, index: ops::RangeFrom) -> &mut 
str { + &mut self[..][index] + } +} +#[stable(feature = "derefmut_for_string", since = "1.3.0")] +impl ops::IndexMut for String { + #[inline] + fn index_mut(&mut self, _index: ops::RangeFull) -> &mut str { + unsafe { str::from_utf8_unchecked_mut(&mut *self.vec) } + } +} +#[stable(feature = "inclusive_range", since = "1.26.0")] +impl ops::IndexMut> for String { + #[inline] + fn index_mut(&mut self, index: ops::RangeInclusive) -> &mut str { + IndexMut::index_mut(&mut **self, index) + } +} +#[stable(feature = "inclusive_range", since = "1.26.0")] +impl ops::IndexMut> for String { + #[inline] + fn index_mut(&mut self, index: ops::RangeToInclusive) -> &mut str { + IndexMut::index_mut(&mut **self, index) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::Deref for String { + type Target = str; + + #[inline] + fn deref(&self) -> &str { + unsafe { str::from_utf8_unchecked(&self.vec) } + } +} + +#[stable(feature = "derefmut_for_string", since = "1.3.0")] +impl ops::DerefMut for String { + #[inline] + fn deref_mut(&mut self) -> &mut str { + unsafe { str::from_utf8_unchecked_mut(&mut *self.vec) } + } +} + +/// A type alias for [`Infallible`]. +/// +/// This alias exists for backwards compatibility, and may be eventually deprecated. +/// +/// [`Infallible`]: core::convert::Infallible "convert::Infallible" +#[stable(feature = "str_parse_error", since = "1.5.0")] +pub type ParseError = core::convert::Infallible; + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl FromStr for String { + type Err = core::convert::Infallible; + #[inline] + fn from_str(s: &str) -> Result { + Ok(String::from(s)) + } +} + +/// A trait for converting a value to a `String`. +/// +/// This trait is automatically implemented for any type which implements the +/// [`Display`] trait. As such, `ToString` shouldn't be implemented directly: +/// [`Display`] should be implemented instead, and you get the `ToString` +/// implementation for free. 
+/// +/// [`Display`]: fmt::Display +#[cfg_attr(not(test), rustc_diagnostic_item = "ToString")] +#[stable(feature = "rust1", since = "1.0.0")] +pub trait ToString { + /// Converts the given value to a `String`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let i = 5; + /// let five = String::from("5"); + /// + /// assert_eq!(five, i.to_string()); + /// ``` + #[rustc_conversion_suggestion] + #[stable(feature = "rust1", since = "1.0.0")] + fn to_string(&self) -> String; +} + +/// # Panics +/// +/// In this implementation, the `to_string` method panics +/// if the `Display` implementation returns an error. +/// This indicates an incorrect `Display` implementation +/// since `fmt::Write for String` never returns an error itself. +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl ToString for T { + // A common guideline is to not inline generic functions. However, + // removing `#[inline]` from this method causes non-negligible regressions. + // See , the last attempt + // to try to remove it. 
+ #[inline] + default fn to_string(&self) -> String { + let mut buf = String::new(); + let mut formatter = core::fmt::Formatter::new(&mut buf); + // Bypass format_args!() to avoid write_str with zero-length strs + fmt::Display::fmt(self, &mut formatter) + .expect("a Display implementation returned an error unexpectedly"); + buf + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "char_to_string_specialization", since = "1.46.0")] +impl ToString for char { + #[inline] + fn to_string(&self) -> String { + String::from(self.encode_utf8(&mut [0; 4])) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "u8_to_string_specialization", since = "1.54.0")] +impl ToString for u8 { + #[inline] + fn to_string(&self) -> String { + let mut buf = String::with_capacity(3); + let mut n = *self; + if n >= 10 { + if n >= 100 { + buf.push((b'0' + n / 100) as char); + n %= 100; + } + buf.push((b'0' + n / 10) as char); + n %= 10; + } + buf.push((b'0' + n) as char); + buf + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "i8_to_string_specialization", since = "1.54.0")] +impl ToString for i8 { + #[inline] + fn to_string(&self) -> String { + let mut buf = String::with_capacity(4); + if self.is_negative() { + buf.push('-'); + } + let mut n = self.unsigned_abs(); + if n >= 10 { + if n >= 100 { + buf.push('1'); + n -= 100; + } + buf.push((b'0' + n / 10) as char); + n %= 10; + } + buf.push((b'0' + n) as char); + buf + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "str_to_string_specialization", since = "1.9.0")] +impl ToString for str { + #[inline] + fn to_string(&self) -> String { + String::from(self) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_str_to_string_specialization", since = "1.17.0")] +impl ToString for Cow<'_, str> { + #[inline] + fn to_string(&self) -> String { + self[..].to_owned() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "string_to_string_specialization", since = 
"1.17.0")] +impl ToString for String { + #[inline] + fn to_string(&self) -> String { + self.to_owned() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef for String { + #[inline] + fn as_ref(&self) -> &str { + self + } +} + +#[stable(feature = "string_as_mut", since = "1.43.0")] +impl AsMut for String { + #[inline] + fn as_mut(&mut self) -> &mut str { + self + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef<[u8]> for String { + #[inline] + fn as_ref(&self) -> &[u8] { + self.as_bytes() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl From<&str> for String { + /// Converts a `&str` into a [`String`]. + /// + /// The result is allocated on the heap. + #[inline] + fn from(s: &str) -> String { + s.to_owned() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "from_mut_str_for_string", since = "1.44.0")] +impl From<&mut str> for String { + /// Converts a `&mut str` into a [`String`]. + /// + /// The result is allocated on the heap. + #[inline] + fn from(s: &mut str) -> String { + s.to_owned() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "from_ref_string", since = "1.35.0")] +impl From<&String> for String { + /// Converts a `&String` into a [`String`]. + /// + /// This clones `s` and returns the clone. + #[inline] + fn from(s: &String) -> String { + s.clone() + } +} + +// note: test pulls in libstd, which causes errors here +#[cfg(not(test))] +#[stable(feature = "string_from_box", since = "1.18.0")] +impl From> for String { + /// Converts the given boxed `str` slice to a [`String`]. + /// It is notable that the `str` slice is owned. 
+ /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s1: String = String::from("hello world"); + /// let s2: Box = s1.into_boxed_str(); + /// let s3: String = String::from(s2); + /// + /// assert_eq!("hello world", s3) + /// ``` + fn from(s: Box) -> String { + s.into_string() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "box_from_str", since = "1.20.0")] +impl From for Box { + /// Converts the given [`String`] to a boxed `str` slice that is owned. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s1: String = String::from("hello world"); + /// let s2: Box = Box::from(s1); + /// let s3: String = String::from(s2); + /// + /// assert_eq!("hello world", s3) + /// ``` + fn from(s: String) -> Box { + s.into_boxed_str() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "string_from_cow_str", since = "1.14.0")] +impl<'a> From> for String { + /// Converts a clone-on-write string to an owned + /// instance of [`String`]. + /// + /// This extracts the owned string, + /// clones the string if it is not already owned. + /// + /// # Example + /// + /// ``` + /// # use std::borrow::Cow; + /// // If the string is not owned... + /// let cow: Cow = Cow::Borrowed("eggplant"); + /// // It will allocate on the heap and copy the string. + /// let owned: String = String::from(cow); + /// assert_eq!(&owned[..], "eggplant"); + /// ``` + fn from(s: Cow<'a, str>) -> String { + s.into_owned() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> From<&'a str> for Cow<'a, str> { + /// Converts a string slice into a [`Borrowed`] variant. + /// No heap allocation is performed, and the string + /// is not copied. 
+ /// + /// # Example + /// + /// ``` + /// # use std::borrow::Cow; + /// assert_eq!(Cow::from("eggplant"), Cow::Borrowed("eggplant")); + /// ``` + /// + /// [`Borrowed`]: crate::borrow::Cow::Borrowed "borrow::Cow::Borrowed" + #[inline] + fn from(s: &'a str) -> Cow<'a, str> { + Cow::Borrowed(s) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a> From for Cow<'a, str> { + /// Converts a [`String`] into an [`Owned`] variant. + /// No heap allocation is performed, and the string + /// is not copied. + /// + /// # Example + /// + /// ``` + /// # use std::borrow::Cow; + /// let s = "eggplant".to_string(); + /// let s2 = "eggplant".to_string(); + /// assert_eq!(Cow::from(s), Cow::<'static, str>::Owned(s2)); + /// ``` + /// + /// [`Owned`]: crate::borrow::Cow::Owned "borrow::Cow::Owned" + #[inline] + fn from(s: String) -> Cow<'a, str> { + Cow::Owned(s) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_from_string_ref", since = "1.28.0")] +impl<'a> From<&'a String> for Cow<'a, str> { + /// Converts a [`String`] reference into a [`Borrowed`] variant. + /// No heap allocation is performed, and the string + /// is not copied. 
+ /// + /// # Example + /// + /// ``` + /// # use std::borrow::Cow; + /// let s = "eggplant".to_string(); + /// assert_eq!(Cow::from(&s), Cow::Borrowed("eggplant")); + /// ``` + /// + /// [`Borrowed`]: crate::borrow::Cow::Borrowed "borrow::Cow::Borrowed" + #[inline] + fn from(s: &'a String) -> Cow<'a, str> { + Cow::Borrowed(s.as_str()) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_str_from_iter", since = "1.12.0")] +impl<'a> FromIterator for Cow<'a, str> { + fn from_iter>(it: I) -> Cow<'a, str> { + Cow::Owned(FromIterator::from_iter(it)) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_str_from_iter", since = "1.12.0")] +impl<'a, 'b> FromIterator<&'b str> for Cow<'a, str> { + fn from_iter>(it: I) -> Cow<'a, str> { + Cow::Owned(FromIterator::from_iter(it)) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "cow_str_from_iter", since = "1.12.0")] +impl<'a> FromIterator for Cow<'a, str> { + fn from_iter>(it: I) -> Cow<'a, str> { + Cow::Owned(FromIterator::from_iter(it)) + } +} + +#[stable(feature = "from_string_for_vec_u8", since = "1.14.0")] +impl From for Vec { + /// Converts the given [`String`] to a vector [`Vec`] that holds values of type [`u8`]. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s1 = String::from("hello world"); + /// let v1 = Vec::from(s1); + /// + /// for b in v1 { + /// println!("{}", b); + /// } + /// ``` + fn from(string: String) -> Vec { + string.into_bytes() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Write for String { + #[inline] + fn write_str(&mut self, s: &str) -> fmt::Result { + self.push_str(s); + Ok(()) + } + + #[inline] + fn write_char(&mut self, c: char) -> fmt::Result { + self.push(c); + Ok(()) + } +} + +/// A draining iterator for `String`. +/// +/// This struct is created by the [`drain`] method on [`String`]. See its +/// documentation for more. 
+/// +/// [`drain`]: String::drain +#[stable(feature = "drain", since = "1.6.0")] +pub struct Drain<'a> { + /// Will be used as &'a mut String in the destructor + string: *mut String, + /// Start of part to remove + start: usize, + /// End of part to remove + end: usize, + /// Current remaining range to remove + iter: Chars<'a>, +} + +#[stable(feature = "collection_debug", since = "1.17.0")] +impl fmt::Debug for Drain<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Drain").field(&self.as_str()).finish() + } +} + +#[stable(feature = "drain", since = "1.6.0")] +unsafe impl Sync for Drain<'_> {} +#[stable(feature = "drain", since = "1.6.0")] +unsafe impl Send for Drain<'_> {} + +#[stable(feature = "drain", since = "1.6.0")] +impl Drop for Drain<'_> { + fn drop(&mut self) { + unsafe { + // Use Vec::drain. "Reaffirm" the bounds checks to avoid + // panic code being inserted again. + let self_vec = (*self.string).as_mut_vec(); + if self.start <= self.end && self.end <= self_vec.len() { + self_vec.drain(self.start..self.end); + } + } + } +} + +impl<'a> Drain<'a> { + /// Returns the remaining (sub)string of this iterator as a slice. 
+ /// + /// # Examples + /// + /// ``` + /// let mut s = String::from("abc"); + /// let mut drain = s.drain(..); + /// assert_eq!(drain.as_str(), "abc"); + /// let _ = drain.next().unwrap(); + /// assert_eq!(drain.as_str(), "bc"); + /// ``` + #[must_use] + #[stable(feature = "string_drain_as_str", since = "1.55.0")] + pub fn as_str(&self) -> &str { + self.iter.as_str() + } +} + +#[stable(feature = "string_drain_as_str", since = "1.55.0")] +impl<'a> AsRef for Drain<'a> { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +#[stable(feature = "string_drain_as_str", since = "1.55.0")] +impl<'a> AsRef<[u8]> for Drain<'a> { + fn as_ref(&self) -> &[u8] { + self.as_str().as_bytes() + } +} + +#[stable(feature = "drain", since = "1.6.0")] +impl Iterator for Drain<'_> { + type Item = char; + + #[inline] + fn next(&mut self) -> Option { + self.iter.next() + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } + + #[inline] + fn last(mut self) -> Option { + self.next_back() + } +} + +#[stable(feature = "drain", since = "1.6.0")] +impl DoubleEndedIterator for Drain<'_> { + #[inline] + fn next_back(&mut self) -> Option { + self.iter.next_back() + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for Drain<'_> {} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "from_char_for_string", since = "1.46.0")] +impl From for String { + /// Allocates an owned [`String`] from a single character. 
+ /// + /// # Example + /// ```rust + /// let c: char = 'a'; + /// let s: String = String::from(c); + /// assert_eq!("a", &s[..]); + /// ``` + #[inline] + fn from(c: char) -> Self { + c.to_string() + } +} diff --git a/rust/alloc/vec/drain.rs b/rust/alloc/vec/drain.rs new file mode 100644 index 00000000000000..1bff19d05c10d3 --- /dev/null +++ b/rust/alloc/vec/drain.rs @@ -0,0 +1,184 @@ +use crate::alloc::{Allocator, Global}; +use core::fmt; +use core::iter::{FusedIterator, TrustedLen}; +use core::mem; +use core::ptr::{self, NonNull}; +use core::slice::{self}; + +use super::Vec; + +/// A draining iterator for `Vec`. +/// +/// This `struct` is created by [`Vec::drain`]. +/// See its documentation for more. +/// +/// # Example +/// +/// ``` +/// let mut v = vec![0, 1, 2]; +/// let iter: std::vec::Drain<_> = v.drain(..); +/// ``` +#[stable(feature = "drain", since = "1.6.0")] +pub struct Drain< + 'a, + T: 'a, + #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator + 'a = Global, +> { + /// Index of tail to preserve + pub(super) tail_start: usize, + /// Length of tail + pub(super) tail_len: usize, + /// Current remaining range to remove + pub(super) iter: slice::Iter<'a, T>, + pub(super) vec: NonNull>, +} + +#[stable(feature = "collection_debug", since = "1.17.0")] +impl fmt::Debug for Drain<'_, T, A> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("Drain").field(&self.iter.as_slice()).finish() + } +} + +impl<'a, T, A: Allocator> Drain<'a, T, A> { + /// Returns the remaining items of this iterator as a slice. 
+ /// + /// # Examples + /// + /// ``` + /// let mut vec = vec!['a', 'b', 'c']; + /// let mut drain = vec.drain(..); + /// assert_eq!(drain.as_slice(), &['a', 'b', 'c']); + /// let _ = drain.next().unwrap(); + /// assert_eq!(drain.as_slice(), &['b', 'c']); + /// ``` + #[must_use] + #[stable(feature = "vec_drain_as_slice", since = "1.46.0")] + pub fn as_slice(&self) -> &[T] { + self.iter.as_slice() + } + + /// Returns a reference to the underlying allocator. + #[unstable(feature = "allocator_api", issue = "32838")] + #[must_use] + #[inline] + pub fn allocator(&self) -> &A { + unsafe { self.vec.as_ref().allocator() } + } +} + +#[stable(feature = "vec_drain_as_slice", since = "1.46.0")] +impl<'a, T, A: Allocator> AsRef<[T]> for Drain<'a, T, A> { + fn as_ref(&self) -> &[T] { + self.as_slice() + } +} + +#[stable(feature = "drain", since = "1.6.0")] +unsafe impl Sync for Drain<'_, T, A> {} +#[stable(feature = "drain", since = "1.6.0")] +unsafe impl Send for Drain<'_, T, A> {} + +#[stable(feature = "drain", since = "1.6.0")] +impl Iterator for Drain<'_, T, A> { + type Item = T; + + #[inline] + fn next(&mut self) -> Option { + self.iter.next().map(|elt| unsafe { ptr::read(elt as *const _) }) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +#[stable(feature = "drain", since = "1.6.0")] +impl DoubleEndedIterator for Drain<'_, T, A> { + #[inline] + fn next_back(&mut self) -> Option { + self.iter.next_back().map(|elt| unsafe { ptr::read(elt as *const _) }) + } +} + +#[stable(feature = "drain", since = "1.6.0")] +impl Drop for Drain<'_, T, A> { + fn drop(&mut self) { + /// Moves back the un-`Drain`ed elements to restore the original `Vec`. 
+ struct DropGuard<'r, 'a, T, A: Allocator>(&'r mut Drain<'a, T, A>); + + impl<'r, 'a, T, A: Allocator> Drop for DropGuard<'r, 'a, T, A> { + fn drop(&mut self) { + if self.0.tail_len > 0 { + unsafe { + let source_vec = self.0.vec.as_mut(); + // memmove back untouched tail, update to new length + let start = source_vec.len(); + let tail = self.0.tail_start; + if tail != start { + let src = source_vec.as_ptr().add(tail); + let dst = source_vec.as_mut_ptr().add(start); + ptr::copy(src, dst, self.0.tail_len); + } + source_vec.set_len(start + self.0.tail_len); + } + } + } + } + + let iter = mem::replace(&mut self.iter, (&mut []).iter()); + let drop_len = iter.len(); + + let mut vec = self.vec; + + if mem::size_of::() == 0 { + // ZSTs have no identity, so we don't need to move them around, we only need to drop the correct amount. + // this can be achieved by manipulating the Vec length instead of moving values out from `iter`. + unsafe { + let vec = vec.as_mut(); + let old_len = vec.len(); + vec.set_len(old_len + drop_len + self.tail_len); + vec.truncate(old_len + self.tail_len); + } + + return; + } + + // ensure elements are moved back into their appropriate places, even when drop_in_place panics + let _guard = DropGuard(self); + + if drop_len == 0 { + return; + } + + // as_slice() must only be called when iter.len() is > 0 because + // vec::Splice modifies vec::Drain fields and may grow the vec which would invalidate + // the iterator's internal pointers. Creating a reference to deallocated memory + // is invalid even when it is zero-length + let drop_ptr = iter.as_slice().as_ptr(); + + unsafe { + // drop_ptr comes from a slice::Iter which only gives us a &[T] but for drop_in_place + // a pointer with mutable provenance is necessary. Therefore we must reconstruct + // it from the original vec but also avoid creating a &mut to the front since that could + // invalidate raw pointers to it which some unsafe code might rely on. 
+ let vec_ptr = vec.as_mut().as_mut_ptr(); + let drop_offset = drop_ptr.offset_from(vec_ptr) as usize; + let to_drop = ptr::slice_from_raw_parts_mut(vec_ptr.add(drop_offset), drop_len); + ptr::drop_in_place(to_drop); + } + } +} + +#[stable(feature = "drain", since = "1.6.0")] +impl ExactSizeIterator for Drain<'_, T, A> { + fn is_empty(&self) -> bool { + self.iter.is_empty() + } +} + +#[unstable(feature = "trusted_len", issue = "37572")] +unsafe impl TrustedLen for Drain<'_, T, A> {} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for Drain<'_, T, A> {} diff --git a/rust/alloc/vec/drain_filter.rs b/rust/alloc/vec/drain_filter.rs new file mode 100644 index 00000000000000..3c37c92ae44b0c --- /dev/null +++ b/rust/alloc/vec/drain_filter.rs @@ -0,0 +1,143 @@ +use crate::alloc::{Allocator, Global}; +use core::ptr::{self}; +use core::slice::{self}; + +use super::Vec; + +/// An iterator which uses a closure to determine if an element should be removed. +/// +/// This struct is created by [`Vec::drain_filter`]. +/// See its documentation for more. +/// +/// # Example +/// +/// ``` +/// #![feature(drain_filter)] +/// +/// let mut v = vec![0, 1, 2]; +/// let iter: std::vec::DrainFilter<_, _> = v.drain_filter(|x| *x % 2 == 0); +/// ``` +#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")] +#[derive(Debug)] +pub struct DrainFilter< + 'a, + T, + F, + #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator = Global, +> where + F: FnMut(&mut T) -> bool, +{ + pub(super) vec: &'a mut Vec, + /// The index of the item that will be inspected by the next call to `next`. + pub(super) idx: usize, + /// The number of items that have been drained (removed) thus far. + pub(super) del: usize, + /// The original length of `vec` prior to draining. + pub(super) old_len: usize, + /// The filter test predicate. + pub(super) pred: F, + /// A flag that indicates a panic has occurred in the filter test predicate. 
+ /// This is used as a hint in the drop implementation to prevent consumption + /// of the remainder of the `DrainFilter`. Any unprocessed items will be + /// backshifted in the `vec`, but no further items will be dropped or + /// tested by the filter predicate. + pub(super) panic_flag: bool, +} + +impl DrainFilter<'_, T, F, A> +where + F: FnMut(&mut T) -> bool, +{ + /// Returns a reference to the underlying allocator. + #[unstable(feature = "allocator_api", issue = "32838")] + #[inline] + pub fn allocator(&self) -> &A { + self.vec.allocator() + } +} + +#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")] +impl Iterator for DrainFilter<'_, T, F, A> +where + F: FnMut(&mut T) -> bool, +{ + type Item = T; + + fn next(&mut self) -> Option { + unsafe { + while self.idx < self.old_len { + let i = self.idx; + let v = slice::from_raw_parts_mut(self.vec.as_mut_ptr(), self.old_len); + self.panic_flag = true; + let drained = (self.pred)(&mut v[i]); + self.panic_flag = false; + // Update the index *after* the predicate is called. If the index + // is updated prior and the predicate panics, the element at this + // index would be leaked. 
+ self.idx += 1; + if drained { + self.del += 1; + return Some(ptr::read(&v[i])); + } else if self.del > 0 { + let del = self.del; + let src: *const T = &v[i]; + let dst: *mut T = &mut v[i - del]; + ptr::copy_nonoverlapping(src, dst, 1); + } + } + None + } + } + + fn size_hint(&self) -> (usize, Option) { + (0, Some(self.old_len - self.idx)) + } +} + +#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")] +impl Drop for DrainFilter<'_, T, F, A> +where + F: FnMut(&mut T) -> bool, +{ + fn drop(&mut self) { + struct BackshiftOnDrop<'a, 'b, T, F, A: Allocator> + where + F: FnMut(&mut T) -> bool, + { + drain: &'b mut DrainFilter<'a, T, F, A>, + } + + impl<'a, 'b, T, F, A: Allocator> Drop for BackshiftOnDrop<'a, 'b, T, F, A> + where + F: FnMut(&mut T) -> bool, + { + fn drop(&mut self) { + unsafe { + if self.drain.idx < self.drain.old_len && self.drain.del > 0 { + // This is a pretty messed up state, and there isn't really an + // obviously right thing to do. We don't want to keep trying + // to execute `pred`, so we just backshift all the unprocessed + // elements and tell the vec that they still exist. The backshift + // is required to prevent a double-drop of the last successfully + // drained item prior to a panic in the predicate. + let ptr = self.drain.vec.as_mut_ptr(); + let src = ptr.add(self.drain.idx); + let dst = src.sub(self.drain.del); + let tail_len = self.drain.old_len - self.drain.idx; + src.copy_to(dst, tail_len); + } + self.drain.vec.set_len(self.drain.old_len - self.drain.del); + } + } + } + + let backshift = BackshiftOnDrop { drain: self }; + + // Attempt to consume any remaining elements if the filter predicate + // has not yet panicked. We'll backshift any remaining elements + // whether we've already panicked or if the consumption here panics. 
+ if !backshift.drain.panic_flag { + backshift.drain.for_each(drop); + } + } +} diff --git a/rust/alloc/vec/into_iter.rs b/rust/alloc/vec/into_iter.rs new file mode 100644 index 00000000000000..f985fb78465b9a --- /dev/null +++ b/rust/alloc/vec/into_iter.rs @@ -0,0 +1,354 @@ +use crate::alloc::{Allocator, Global}; +use crate::raw_vec::RawVec; +use core::fmt; +use core::intrinsics::arith_offset; +use core::iter::{ + FusedIterator, InPlaceIterable, SourceIter, TrustedLen, TrustedRandomAccessNoCoerce, +}; +use core::marker::PhantomData; +use core::mem::{self}; +use core::ptr::{self, NonNull}; +use core::slice::{self}; + +/// An iterator that moves out of a vector. +/// +/// This `struct` is created by the `into_iter` method on [`Vec`](super::Vec) +/// (provided by the [`IntoIterator`] trait). +/// +/// # Example +/// +/// ``` +/// let v = vec![0, 1, 2]; +/// let iter: std::vec::IntoIter<_> = v.into_iter(); +/// ``` +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_insignificant_dtor] +pub struct IntoIter< + T, + #[unstable(feature = "allocator_api", issue = "32838")] A: Allocator = Global, +> { + pub(super) buf: NonNull, + pub(super) phantom: PhantomData, + pub(super) cap: usize, + pub(super) alloc: A, + pub(super) ptr: *const T, + pub(super) end: *const T, +} + +#[stable(feature = "vec_intoiter_debug", since = "1.13.0")] +impl fmt::Debug for IntoIter { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_tuple("IntoIter").field(&self.as_slice()).finish() + } +} + +impl IntoIter { + /// Returns the remaining items of this iterator as a slice. 
+ /// + /// # Examples + /// + /// ``` + /// let vec = vec!['a', 'b', 'c']; + /// let mut into_iter = vec.into_iter(); + /// assert_eq!(into_iter.as_slice(), &['a', 'b', 'c']); + /// let _ = into_iter.next().unwrap(); + /// assert_eq!(into_iter.as_slice(), &['b', 'c']); + /// ``` + #[stable(feature = "vec_into_iter_as_slice", since = "1.15.0")] + pub fn as_slice(&self) -> &[T] { + unsafe { slice::from_raw_parts(self.ptr, self.len()) } + } + + /// Returns the remaining items of this iterator as a mutable slice. + /// + /// # Examples + /// + /// ``` + /// let vec = vec!['a', 'b', 'c']; + /// let mut into_iter = vec.into_iter(); + /// assert_eq!(into_iter.as_slice(), &['a', 'b', 'c']); + /// into_iter.as_mut_slice()[2] = 'z'; + /// assert_eq!(into_iter.next().unwrap(), 'a'); + /// assert_eq!(into_iter.next().unwrap(), 'b'); + /// assert_eq!(into_iter.next().unwrap(), 'z'); + /// ``` + #[stable(feature = "vec_into_iter_as_slice", since = "1.15.0")] + pub fn as_mut_slice(&mut self) -> &mut [T] { + unsafe { &mut *self.as_raw_mut_slice() } + } + + /// Returns a reference to the underlying allocator. + #[unstable(feature = "allocator_api", issue = "32838")] + #[inline] + pub fn allocator(&self) -> &A { + &self.alloc + } + + fn as_raw_mut_slice(&mut self) -> *mut [T] { + ptr::slice_from_raw_parts_mut(self.ptr as *mut T, self.len()) + } + + /// Drops remaining elements and relinquishes the backing allocation. + /// + /// This is roughly equivalent to the following, but more efficient + /// + /// ``` + /// # let mut into_iter = Vec::::with_capacity(10).into_iter(); + /// (&mut into_iter).for_each(core::mem::drop); + /// unsafe { core::ptr::write(&mut into_iter, Vec::new().into_iter()); } + /// ``` + #[cfg(not(no_global_oom_handling))] + pub(super) fn forget_allocation_drop_remaining(&mut self) { + let remaining = self.as_raw_mut_slice(); + + // overwrite the individual fields instead of creating a new + // struct and then overwriting &mut self. 
+ // this creates less assembly + self.cap = 0; + self.buf = unsafe { NonNull::new_unchecked(RawVec::NEW.ptr()) }; + self.ptr = self.buf.as_ptr(); + self.end = self.buf.as_ptr(); + + unsafe { + ptr::drop_in_place(remaining); + } + } +} + +#[stable(feature = "vec_intoiter_as_ref", since = "1.46.0")] +impl AsRef<[T]> for IntoIter { + fn as_ref(&self) -> &[T] { + self.as_slice() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +unsafe impl Send for IntoIter {} +#[stable(feature = "rust1", since = "1.0.0")] +unsafe impl Sync for IntoIter {} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Iterator for IntoIter { + type Item = T; + + #[inline] + fn next(&mut self) -> Option { + if self.ptr as *const _ == self.end { + None + } else if mem::size_of::() == 0 { + // purposefully don't use 'ptr.offset' because for + // vectors with 0-size elements this would return the + // same pointer. + self.ptr = unsafe { arith_offset(self.ptr as *const i8, 1) as *mut T }; + + // Make up a value of this ZST. + Some(unsafe { mem::zeroed() }) + } else { + let old = self.ptr; + self.ptr = unsafe { self.ptr.offset(1) }; + + Some(unsafe { ptr::read(old) }) + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let exact = if mem::size_of::() == 0 { + (self.end as usize).wrapping_sub(self.ptr as usize) + } else { + unsafe { self.end.offset_from(self.ptr) as usize } + }; + (exact, Some(exact)) + } + + #[inline] + fn advance_by(&mut self, n: usize) -> Result<(), usize> { + let step_size = self.len().min(n); + let to_drop = ptr::slice_from_raw_parts_mut(self.ptr as *mut T, step_size); + if mem::size_of::() == 0 { + // SAFETY: due to unchecked casts of unsigned amounts to signed offsets the wraparound + // effectively results in unsigned pointers representing positions 0..usize::MAX, + // which is valid for ZSTs. 
+ self.ptr = unsafe { arith_offset(self.ptr as *const i8, step_size as isize) as *mut T } + } else { + // SAFETY: the min() above ensures that step_size is in bounds + self.ptr = unsafe { self.ptr.add(step_size) }; + } + // SAFETY: the min() above ensures that step_size is in bounds + unsafe { + ptr::drop_in_place(to_drop); + } + if step_size < n { + return Err(step_size); + } + Ok(()) + } + + #[inline] + fn count(self) -> usize { + self.len() + } + + #[doc(hidden)] + unsafe fn __iterator_get_unchecked(&mut self, i: usize) -> Self::Item + where + Self: TrustedRandomAccessNoCoerce, + { + // SAFETY: the caller must guarantee that `i` is in bounds of the + // `Vec`, so `i` cannot overflow an `isize`, and the `self.ptr.add(i)` + // is guaranteed to point to an element of the `Vec` and + // thus guaranteed to be valid to dereference. + // + // Also note the implementation of `Self: TrustedRandomAccess` requires + // that `T: Copy` so reading elements from the buffer doesn't invalidate + // them for `Drop`. + unsafe { + if mem::size_of::() == 0 { mem::zeroed() } else { ptr::read(self.ptr.add(i)) } + } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl DoubleEndedIterator for IntoIter { + #[inline] + fn next_back(&mut self) -> Option { + if self.end == self.ptr { + None + } else if mem::size_of::() == 0 { + // See above for why 'ptr.offset' isn't used + self.end = unsafe { arith_offset(self.end as *const i8, -1) as *mut T }; + + // Make up a value of this ZST. 
+ Some(unsafe { mem::zeroed() }) + } else { + self.end = unsafe { self.end.offset(-1) }; + + Some(unsafe { ptr::read(self.end) }) + } + } + + #[inline] + fn advance_back_by(&mut self, n: usize) -> Result<(), usize> { + let step_size = self.len().min(n); + if mem::size_of::() == 0 { + // SAFETY: same as for advance_by() + self.end = unsafe { + arith_offset(self.end as *const i8, step_size.wrapping_neg() as isize) as *mut T + } + } else { + // SAFETY: same as for advance_by() + self.end = unsafe { self.end.offset(step_size.wrapping_neg() as isize) }; + } + let to_drop = ptr::slice_from_raw_parts_mut(self.end as *mut T, step_size); + // SAFETY: same as for advance_by() + unsafe { + ptr::drop_in_place(to_drop); + } + if step_size < n { + return Err(step_size); + } + Ok(()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ExactSizeIterator for IntoIter { + fn is_empty(&self) -> bool { + self.ptr == self.end + } +} + +#[stable(feature = "fused", since = "1.26.0")] +impl FusedIterator for IntoIter {} + +#[unstable(feature = "trusted_len", issue = "37572")] +unsafe impl TrustedLen for IntoIter {} + +#[doc(hidden)] +#[unstable(issue = "none", feature = "std_internals")] +#[rustc_unsafe_specialization_marker] +pub trait NonDrop {} + +// T: Copy as approximation for !Drop since get_unchecked does not advance self.ptr +// and thus we can't implement drop-handling +#[unstable(issue = "none", feature = "std_internals")] +impl NonDrop for T {} + +#[doc(hidden)] +#[unstable(issue = "none", feature = "std_internals")] +// TrustedRandomAccess (without NoCoerce) must not be implemented because +// subtypes/supertypes of `T` might not be `NonDrop` +unsafe impl TrustedRandomAccessNoCoerce for IntoIter +where + T: NonDrop, +{ + const MAY_HAVE_SIDE_EFFECT: bool = false; +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "vec_into_iter_clone", since = "1.8.0")] +impl Clone for IntoIter { + #[cfg(not(test))] + fn clone(&self) -> Self { + 
self.as_slice().to_vec_in(self.alloc.clone()).into_iter() + } + #[cfg(test)] + fn clone(&self) -> Self { + crate::slice::to_vec(self.as_slice(), self.alloc.clone()).into_iter() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +unsafe impl<#[may_dangle] T, A: Allocator> Drop for IntoIter { + fn drop(&mut self) { + struct DropGuard<'a, T, A: Allocator>(&'a mut IntoIter); + + impl Drop for DropGuard<'_, T, A> { + fn drop(&mut self) { + unsafe { + // `IntoIter::alloc` is not used anymore after this + let alloc = ptr::read(&self.0.alloc); + // RawVec handles deallocation + let _ = RawVec::from_raw_parts_in(self.0.buf.as_ptr(), self.0.cap, alloc); + } + } + } + + let guard = DropGuard(self); + // destroy the remaining elements + unsafe { + ptr::drop_in_place(guard.0.as_raw_mut_slice()); + } + // now `guard` will be dropped and do the rest + } +} + +#[unstable(issue = "none", feature = "inplace_iteration")] +#[doc(hidden)] +unsafe impl InPlaceIterable for IntoIter {} + +#[unstable(issue = "none", feature = "inplace_iteration")] +#[doc(hidden)] +unsafe impl SourceIter for IntoIter { + type Source = Self; + + #[inline] + unsafe fn as_inner(&mut self) -> &mut Self::Source { + self + } +} + +// internal helper trait for in-place iteration specialization. +#[rustc_specialization_trait] +pub(crate) trait AsIntoIter { + type Item; + fn as_into_iter(&mut self) -> &mut IntoIter; +} + +impl AsIntoIter for IntoIter { + type Item = T; + + fn as_into_iter(&mut self) -> &mut IntoIter { + self + } +} diff --git a/rust/alloc/vec/is_zero.rs b/rust/alloc/vec/is_zero.rs new file mode 100644 index 00000000000000..0efc4893c3c428 --- /dev/null +++ b/rust/alloc/vec/is_zero.rs @@ -0,0 +1,104 @@ +use crate::boxed::Box; + +#[rustc_specialization_trait] +pub(super) unsafe trait IsZero { + /// Whether this value is zero + fn is_zero(&self) -> bool; +} + +macro_rules! 
impl_is_zero { + ($t:ty, $is_zero:expr) => { + unsafe impl IsZero for $t { + #[inline] + fn is_zero(&self) -> bool { + $is_zero(*self) + } + } + }; +} + +impl_is_zero!(i16, |x| x == 0); +impl_is_zero!(i32, |x| x == 0); +impl_is_zero!(i64, |x| x == 0); +impl_is_zero!(i128, |x| x == 0); +impl_is_zero!(isize, |x| x == 0); + +impl_is_zero!(u16, |x| x == 0); +impl_is_zero!(u32, |x| x == 0); +impl_is_zero!(u64, |x| x == 0); +impl_is_zero!(u128, |x| x == 0); +impl_is_zero!(usize, |x| x == 0); + +impl_is_zero!(bool, |x| x == false); +impl_is_zero!(char, |x| x == '\0'); + +impl_is_zero!(f32, |x: f32| x.to_bits() == 0); +impl_is_zero!(f64, |x: f64| x.to_bits() == 0); + +unsafe impl IsZero for *const T { + #[inline] + fn is_zero(&self) -> bool { + (*self).is_null() + } +} + +unsafe impl IsZero for *mut T { + #[inline] + fn is_zero(&self) -> bool { + (*self).is_null() + } +} + +// `Option<&T>` and `Option>` are guaranteed to represent `None` as null. +// For fat pointers, the bytes that would be the pointer metadata in the `Some` +// variant are padding in the `None` variant, so ignoring them and +// zero-initializing instead is ok. +// `Option<&mut T>` never implements `Clone`, so there's no need for an impl of +// `SpecFromElem`. + +unsafe impl IsZero for Option<&T> { + #[inline] + fn is_zero(&self) -> bool { + self.is_none() + } +} + +unsafe impl IsZero for Option> { + #[inline] + fn is_zero(&self) -> bool { + self.is_none() + } +} + +// `Option` and similar have a representation guarantee that +// they're the same size as the corresponding `u32` type, as well as a guarantee +// that transmuting between `NonZeroU32` and `Option` works. +// While the documentation officially makes it UB to transmute from `None`, +// we're the standard library so we can make extra inferences, and we know that +// the only niche available to represent `None` is the one that's all zeros. + +macro_rules! 
impl_is_zero_option_of_nonzero { + ($($t:ident,)+) => {$( + unsafe impl IsZero for Option { + #[inline] + fn is_zero(&self) -> bool { + self.is_none() + } + } + )+}; +} + +impl_is_zero_option_of_nonzero!( + NonZeroU8, + NonZeroU16, + NonZeroU32, + NonZeroU64, + NonZeroU128, + NonZeroI8, + NonZeroI16, + NonZeroI32, + NonZeroI64, + NonZeroI128, + NonZeroUsize, + NonZeroIsize, +); diff --git a/rust/alloc/vec/mod.rs b/rust/alloc/vec/mod.rs new file mode 100644 index 00000000000000..c29aa0fec5b87f --- /dev/null +++ b/rust/alloc/vec/mod.rs @@ -0,0 +1,3055 @@ +//! A contiguous growable array type with heap-allocated contents, written +//! `Vec`. +//! +//! Vectors have *O*(1) indexing, amortized *O*(1) push (to the end) and +//! *O*(1) pop (from the end). +//! +//! Vectors ensure they never allocate more than `isize::MAX` bytes. +//! +//! # Examples +//! +//! You can explicitly create a [`Vec`] with [`Vec::new`]: +//! +//! ``` +//! let v: Vec = Vec::new(); +//! ``` +//! +//! ...or by using the [`vec!`] macro: +//! +//! ``` +//! let v: Vec = vec![]; +//! +//! let v = vec![1, 2, 3, 4, 5]; +//! +//! let v = vec![0; 10]; // ten zeroes +//! ``` +//! +//! You can [`push`] values onto the end of a vector (which will grow the vector +//! as needed): +//! +//! ``` +//! let mut v = vec![1, 2]; +//! +//! v.push(3); +//! ``` +//! +//! Popping values works in much the same way: +//! +//! ``` +//! let mut v = vec![1, 2]; +//! +//! let two = v.pop(); +//! ``` +//! +//! Vectors also support indexing (through the [`Index`] and [`IndexMut`] traits): +//! +//! ``` +//! let mut v = vec![1, 2, 3]; +//! let three = v[2]; +//! v[1] = v[1] + 5; +//! ``` +//! +//! 
[`push`]: Vec::push + +#![stable(feature = "rust1", since = "1.0.0")] + +#[cfg(not(no_global_oom_handling))] +use core::cmp; +use core::cmp::Ordering; +use core::convert::TryFrom; +use core::fmt; +use core::hash::{Hash, Hasher}; +use core::intrinsics::{arith_offset, assume}; +use core::iter; +#[cfg(not(no_global_oom_handling))] +use core::iter::FromIterator; +use core::marker::PhantomData; +use core::mem::{self, ManuallyDrop, MaybeUninit}; +use core::ops::{self, Index, IndexMut, Range, RangeBounds}; +use core::ptr::{self, NonNull}; +use core::slice::{self, SliceIndex}; + +use crate::alloc::{Allocator, Global}; +use crate::borrow::{Cow, ToOwned}; +use crate::boxed::Box; +use crate::collections::TryReserveError; +use crate::raw_vec::RawVec; + +#[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")] +pub use self::drain_filter::DrainFilter; + +mod drain_filter; + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "vec_splice", since = "1.21.0")] +pub use self::splice::Splice; + +#[cfg(not(no_global_oom_handling))] +mod splice; + +#[stable(feature = "drain", since = "1.6.0")] +pub use self::drain::Drain; + +mod drain; + +#[cfg(not(no_global_oom_handling))] +mod cow; + +#[cfg(not(no_global_oom_handling))] +pub(crate) use self::into_iter::AsIntoIter; +#[stable(feature = "rust1", since = "1.0.0")] +pub use self::into_iter::IntoIter; + +mod into_iter; + +#[cfg(not(no_global_oom_handling))] +use self::is_zero::IsZero; + +mod is_zero; + +#[cfg(not(no_global_oom_handling))] +mod source_iter_marker; + +mod partial_eq; + +#[cfg(not(no_global_oom_handling))] +use self::spec_from_elem::SpecFromElem; + +#[cfg(not(no_global_oom_handling))] +mod spec_from_elem; + +#[cfg(not(no_global_oom_handling))] +use self::set_len_on_drop::SetLenOnDrop; + +#[cfg(not(no_global_oom_handling))] +mod set_len_on_drop; + +#[cfg(not(no_global_oom_handling))] +use self::in_place_drop::InPlaceDrop; + +#[cfg(not(no_global_oom_handling))] +mod in_place_drop; + 
+#[cfg(not(no_global_oom_handling))] +use self::spec_from_iter_nested::SpecFromIterNested; + +#[cfg(not(no_global_oom_handling))] +mod spec_from_iter_nested; + +#[cfg(not(no_global_oom_handling))] +use self::spec_from_iter::SpecFromIter; + +#[cfg(not(no_global_oom_handling))] +mod spec_from_iter; + +#[cfg(not(no_global_oom_handling))] +use self::spec_extend::SpecExtend; + +#[cfg(not(no_global_oom_handling))] +mod spec_extend; + +/// A contiguous growable array type, written as `Vec`, short for 'vector'. +/// +/// # Examples +/// +/// ``` +/// let mut vec = Vec::new(); +/// vec.push(1); +/// vec.push(2); +/// +/// assert_eq!(vec.len(), 2); +/// assert_eq!(vec[0], 1); +/// +/// assert_eq!(vec.pop(), Some(2)); +/// assert_eq!(vec.len(), 1); +/// +/// vec[0] = 7; +/// assert_eq!(vec[0], 7); +/// +/// vec.extend([1, 2, 3].iter().copied()); +/// +/// for x in &vec { +/// println!("{}", x); +/// } +/// assert_eq!(vec, [7, 1, 2, 3]); +/// ``` +/// +/// The [`vec!`] macro is provided for convenient initialization: +/// +/// ``` +/// let mut vec1 = vec![1, 2, 3]; +/// vec1.push(4); +/// let vec2 = Vec::from([1, 2, 3, 4]); +/// assert_eq!(vec1, vec2); +/// ``` +/// +/// It can also initialize each element of a `Vec` with a given value. +/// This may be more efficient than performing allocation and initialization +/// in separate steps, especially when initializing a vector of zeros: +/// +/// ``` +/// let vec = vec![0; 5]; +/// assert_eq!(vec, [0, 0, 0, 0, 0]); +/// +/// // The following is equivalent, but potentially slower: +/// let mut vec = Vec::with_capacity(5); +/// vec.resize(5, 0); +/// assert_eq!(vec, [0, 0, 0, 0, 0]); +/// ``` +/// +/// For more information, see +/// [Capacity and Reallocation](#capacity-and-reallocation). 
+/// +/// Use a `Vec` as an efficient stack: +/// +/// ``` +/// let mut stack = Vec::new(); +/// +/// stack.push(1); +/// stack.push(2); +/// stack.push(3); +/// +/// while let Some(top) = stack.pop() { +/// // Prints 3, 2, 1 +/// println!("{}", top); +/// } +/// ``` +/// +/// # Indexing +/// +/// The `Vec` type allows access to values by index, because it implements the +/// [`Index`] trait. An example will be more explicit: +/// +/// ``` +/// let v = vec![0, 2, 4, 6]; +/// println!("{}", v[1]); // it will display '2' +/// ``` +/// +/// However be careful: if you try to access an index which isn't in the `Vec`, +/// your software will panic! You cannot do this: +/// +/// ```should_panic +/// let v = vec![0, 2, 4, 6]; +/// println!("{}", v[6]); // it will panic! +/// ``` +/// +/// Use [`get`] and [`get_mut`] if you want to check whether the index is in +/// the `Vec`. +/// +/// # Slicing +/// +/// A `Vec` can be mutable. On the other hand, slices are read-only objects. +/// To get a [slice][prim@slice], use [`&`]. Example: +/// +/// ``` +/// fn read_slice(slice: &[usize]) { +/// // ... +/// } +/// +/// let v = vec![0, 1]; +/// read_slice(&v); +/// +/// // ... and that's all! +/// // you can also do it like this: +/// let u: &[usize] = &v; +/// // or like this: +/// let u: &[_] = &v; +/// ``` +/// +/// In Rust, it's more common to pass slices as arguments rather than vectors +/// when you just want to provide read access. The same goes for [`String`] and +/// [`&str`]. +/// +/// # Capacity and reallocation +/// +/// The capacity of a vector is the amount of space allocated for any future +/// elements that will be added onto the vector. This is not to be confused with +/// the *length* of a vector, which specifies the number of actual elements +/// within the vector. If a vector's length exceeds its capacity, its capacity +/// will automatically be increased, but its elements will have to be +/// reallocated. 
+/// +/// For example, a vector with capacity 10 and length 0 would be an empty vector +/// with space for 10 more elements. Pushing 10 or fewer elements onto the +/// vector will not change its capacity or cause reallocation to occur. However, +/// if the vector's length is increased to 11, it will have to reallocate, which +/// can be slow. For this reason, it is recommended to use [`Vec::with_capacity`] +/// whenever possible to specify how big the vector is expected to get. +/// +/// # Guarantees +/// +/// Due to its incredibly fundamental nature, `Vec` makes a lot of guarantees +/// about its design. This ensures that it's as low-overhead as possible in +/// the general case, and can be correctly manipulated in primitive ways +/// by unsafe code. Note that these guarantees refer to an unqualified `Vec`. +/// If additional type parameters are added (e.g., to support custom allocators), +/// overriding their defaults may change the behavior. +/// +/// Most fundamentally, `Vec` is and always will be a (pointer, capacity, length) +/// triplet. No more, no less. The order of these fields is completely +/// unspecified, and you should use the appropriate methods to modify these. +/// The pointer will never be null, so this type is null-pointer-optimized. +/// +/// However, the pointer might not actually point to allocated memory. In particular, +/// if you construct a `Vec` with capacity 0 via [`Vec::new`], [`vec![]`][`vec!`], +/// [`Vec::with_capacity(0)`][`Vec::with_capacity`], or by calling [`shrink_to_fit`] +/// on an empty Vec, it will not allocate memory. Similarly, if you store zero-sized +/// types inside a `Vec`, it will not allocate space for them. *Note that in this case +/// the `Vec` might not report a [`capacity`] of 0*. `Vec` will allocate if and only +/// if [mem::size_of::\]\() * [capacity]\() > 0. 
In general, `Vec`'s allocation +/// details are very subtle --- if you intend to allocate memory using a `Vec` +/// and use it for something else (either to pass to unsafe code, or to build your +/// own memory-backed collection), be sure to deallocate this memory by using +/// `from_raw_parts` to recover the `Vec` and then dropping it. +/// +/// If a `Vec` *has* allocated memory, then the memory it points to is on the heap +/// (as defined by the allocator Rust is configured to use by default), and its +/// pointer points to [`len`] initialized, contiguous elements in order (what +/// you would see if you coerced it to a slice), followed by [capacity] - [len] +/// logically uninitialized, contiguous elements. +/// +/// A vector containing the elements `'a'` and `'b'` with capacity 4 can be +/// visualized as below. The top part is the `Vec` struct, it contains a +/// pointer to the head of the allocation in the heap, length and capacity. +/// The bottom part is the allocation on the heap, a contiguous memory block. +/// +/// ```text +/// ptr len capacity +/// +--------+--------+--------+ +/// | 0x0123 | 2 | 4 | +/// +--------+--------+--------+ +/// | +/// v +/// Heap +--------+--------+--------+--------+ +/// | 'a' | 'b' | uninit | uninit | +/// +--------+--------+--------+--------+ +/// ``` +/// +/// - **uninit** represents memory that is not initialized, see [`MaybeUninit`]. +/// - Note: the ABI is not stable and `Vec` makes no guarantees about its memory +/// layout (including the order of fields). +/// +/// `Vec` will never perform a "small optimization" where elements are actually +/// stored on the stack for two reasons: +/// +/// * It would make it more difficult for unsafe code to correctly manipulate +/// a `Vec`. The contents of a `Vec` wouldn't have a stable address if it were +/// only moved, and it would be more difficult to determine if a `Vec` had +/// actually allocated memory. 
+/// +/// * It would penalize the general case, incurring an additional branch +/// on every access. +/// +/// `Vec` will never automatically shrink itself, even if completely empty. This +/// ensures no unnecessary allocations or deallocations occur. Emptying a `Vec` +/// and then filling it back up to the same [`len`] should incur no calls to +/// the allocator. If you wish to free up unused memory, use +/// [`shrink_to_fit`] or [`shrink_to`]. +/// +/// [`push`] and [`insert`] will never (re)allocate if the reported capacity is +/// sufficient. [`push`] and [`insert`] *will* (re)allocate if +/// [len] == [capacity]. That is, the reported capacity is completely +/// accurate, and can be relied on. It can even be used to manually free the memory +/// allocated by a `Vec` if desired. Bulk insertion methods *may* reallocate, even +/// when not necessary. +/// +/// `Vec` does not guarantee any particular growth strategy when reallocating +/// when full, nor when [`reserve`] is called. The current strategy is basic +/// and it may prove desirable to use a non-constant growth factor. Whatever +/// strategy is used will of course guarantee *O*(1) amortized [`push`]. +/// +/// `vec![x; n]`, `vec![a, b, c, d]`, and +/// [`Vec::with_capacity(n)`][`Vec::with_capacity`], will all produce a `Vec` +/// with exactly the requested capacity. If [len] == [capacity], +/// (as is the case for the [`vec!`] macro), then a `Vec` can be converted to +/// and from a [`Box<[T]>`][owned slice] without reallocating or moving the elements. +/// +/// `Vec` will not specifically overwrite any data that is removed from it, +/// but also won't specifically preserve it. Its uninitialized memory is +/// scratch space that it may use however it wants. It will generally just do +/// whatever is most efficient or otherwise easy to implement. Do not rely on +/// removed data to be erased for security purposes. Even if you drop a `Vec`, its +/// buffer may simply be reused by another allocation. 
Even if you zero a `Vec`'s memory +/// first, that might not actually happen because the optimizer does not consider +/// this a side-effect that must be preserved. There is one case which we will +/// not break, however: using `unsafe` code to write to the excess capacity, +/// and then increasing the length to match, is always valid. +/// +/// Currently, `Vec` does not guarantee the order in which elements are dropped. +/// The order has changed in the past and may change again. +/// +/// [`get`]: ../../std/vec/struct.Vec.html#method.get +/// [`get_mut`]: ../../std/vec/struct.Vec.html#method.get_mut +/// [`String`]: crate::string::String +/// [`&str`]: type@str +/// [`shrink_to_fit`]: Vec::shrink_to_fit +/// [`shrink_to`]: Vec::shrink_to +/// [capacity]: Vec::capacity +/// [`capacity`]: Vec::capacity +/// [mem::size_of::\]: core::mem::size_of +/// [len]: Vec::len +/// [`len`]: Vec::len +/// [`push`]: Vec::push +/// [`insert`]: Vec::insert +/// [`reserve`]: Vec::reserve +/// [`MaybeUninit`]: core::mem::MaybeUninit +/// [owned slice]: Box +#[stable(feature = "rust1", since = "1.0.0")] +#[cfg_attr(not(test), rustc_diagnostic_item = "Vec")] +#[rustc_insignificant_dtor] +pub struct Vec { + buf: RawVec, + len: usize, +} + +//////////////////////////////////////////////////////////////////////////////// +// Inherent methods +//////////////////////////////////////////////////////////////////////////////// + +impl Vec { + /// Constructs a new, empty `Vec`. + /// + /// The vector will not allocate until elements are pushed onto it. + /// + /// # Examples + /// + /// ``` + /// # #![allow(unused_mut)] + /// let mut vec: Vec = Vec::new(); + /// ``` + #[inline] + #[rustc_const_stable(feature = "const_vec_new", since = "1.39.0")] + #[stable(feature = "rust1", since = "1.0.0")] + #[must_use] + pub const fn new() -> Self { + Vec { buf: RawVec::NEW, len: 0 } + } + + /// Constructs a new, empty `Vec` with the specified capacity. 
+    ///
+    /// The vector will be able to hold exactly `capacity` elements without
+    /// reallocating. If `capacity` is 0, the vector will not allocate.
+    ///
+    /// It is important to note that although the returned vector has the
+    /// *capacity* specified, the vector will have a zero *length*. For an
+    /// explanation of the difference between length and capacity, see
+    /// *[Capacity and reallocation]*.
+    ///
+    /// [Capacity and reallocation]: #capacity-and-reallocation
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = Vec::with_capacity(10);
+    ///
+    /// // The vector contains no items, even though it has capacity for more
+    /// assert_eq!(vec.len(), 0);
+    /// assert_eq!(vec.capacity(), 10);
+    ///
+    /// // These are all done without reallocating...
+    /// for i in 0..10 {
+    ///     vec.push(i);
+    /// }
+    /// assert_eq!(vec.len(), 10);
+    /// assert_eq!(vec.capacity(), 10);
+    ///
+    /// // ...but this may make the vector reallocate
+    /// vec.push(11);
+    /// assert_eq!(vec.len(), 11);
+    /// assert!(vec.capacity() >= 11);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[must_use]
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self::with_capacity_in(capacity, Global)
+    }
+
+    /// Creates a `Vec` directly from the raw components of another vector.
+    ///
+    /// # Safety
+    ///
+    /// This is highly unsafe, due to the number of invariants that aren't
+    /// checked:
+    ///
+    /// * `ptr` needs to have been previously allocated via [`String`]/`Vec`
+    ///   (at least, it's highly likely to be incorrect if it wasn't).
+    /// * `T` needs to have the same size and alignment as what `ptr` was allocated with.
+    ///   (`T` having a less strict alignment is not sufficient, the alignment really
+    ///   needs to be equal to satisfy the [`dealloc`] requirement that memory must be
+    ///   allocated and deallocated with the same layout.)
+    /// * `length` needs to be less than or equal to `capacity`.
+    /// * `capacity` needs to be the capacity that the pointer was allocated with.
+    ///
+    /// Violating these may cause problems like corrupting the allocator's
+    /// internal data structures. For example it is **not** safe
+    /// to build a `Vec<u8>` from a pointer to a C `char` array with length `size_t`.
+    /// It's also not safe to build one from a `Vec<u16>` and its length, because
+    /// the allocator cares about the alignment, and these two types have different
+    /// alignments. The buffer was allocated with alignment 2 (for `u16`), but after
+    /// turning it into a `Vec<u8>` it'll be deallocated with alignment 1.
+    ///
+    /// The ownership of `ptr` is effectively transferred to the
+    /// `Vec` which may then deallocate, reallocate or change the
+    /// contents of memory pointed to by the pointer at will. Ensure
+    /// that nothing else uses the pointer after calling this
+    /// function.
+    ///
+    /// [`String`]: crate::string::String
+    /// [`dealloc`]: crate::alloc::GlobalAlloc::dealloc
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::ptr;
+    /// use std::mem;
+    ///
+    /// let v = vec![1, 2, 3];
+    ///
+    // FIXME Update this when vec_into_raw_parts is stabilized
+    /// // Prevent running `v`'s destructor so we are in complete control
+    /// // of the allocation.
+ /// let mut v = mem::ManuallyDrop::new(v); + /// + /// // Pull out the various important pieces of information about `v` + /// let p = v.as_mut_ptr(); + /// let len = v.len(); + /// let cap = v.capacity(); + /// + /// unsafe { + /// // Overwrite memory with 4, 5, 6 + /// for i in 0..len as isize { + /// ptr::write(p.offset(i), 4 + i); + /// } + /// + /// // Put everything back together into a Vec + /// let rebuilt = Vec::from_raw_parts(p, len, cap); + /// assert_eq!(rebuilt, [4, 5, 6]); + /// } + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub unsafe fn from_raw_parts(ptr: *mut T, length: usize, capacity: usize) -> Self { + unsafe { Self::from_raw_parts_in(ptr, length, capacity, Global) } + } +} + +impl Vec { + /// Constructs a new, empty `Vec`. + /// + /// The vector will not allocate until elements are pushed onto it. + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api)] + /// + /// use std::alloc::System; + /// + /// # #[allow(unused_mut)] + /// let mut vec: Vec = Vec::new_in(System); + /// ``` + #[inline] + #[unstable(feature = "allocator_api", issue = "32838")] + pub const fn new_in(alloc: A) -> Self { + Vec { buf: RawVec::new_in(alloc), len: 0 } + } + + /// Constructs a new, empty `Vec` with the specified capacity with the provided + /// allocator. + /// + /// The vector will be able to hold exactly `capacity` elements without + /// reallocating. If `capacity` is 0, the vector will not allocate. + /// + /// It is important to note that although the returned vector has the + /// *capacity* specified, the vector will have a zero *length*. For an + /// explanation of the difference between length and capacity, see + /// *[Capacity and reallocation]*. + /// + /// [Capacity and reallocation]: #capacity-and-reallocation + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` bytes. 
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let mut vec = Vec::with_capacity_in(10, System);
+    ///
+    /// // The vector contains no items, even though it has capacity for more
+    /// assert_eq!(vec.len(), 0);
+    /// assert_eq!(vec.capacity(), 10);
+    ///
+    /// // These are all done without reallocating...
+    /// for i in 0..10 {
+    ///     vec.push(i);
+    /// }
+    /// assert_eq!(vec.len(), 10);
+    /// assert_eq!(vec.capacity(), 10);
+    ///
+    /// // ...but this may make the vector reallocate
+    /// vec.push(11);
+    /// assert_eq!(vec.len(), 11);
+    /// assert!(vec.capacity() >= 11);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[inline]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    pub fn with_capacity_in(capacity: usize, alloc: A) -> Self {
+        Vec { buf: RawVec::with_capacity_in(capacity, alloc), len: 0 }
+    }
+
+    /// Creates a `Vec` directly from the raw components of another vector.
+    ///
+    /// # Safety
+    ///
+    /// This is highly unsafe, due to the number of invariants that aren't
+    /// checked:
+    ///
+    /// * `ptr` needs to have been previously allocated via [`String`]/`Vec`
+    ///   (at least, it's highly likely to be incorrect if it wasn't).
+    /// * `T` needs to have the same size and alignment as what `ptr` was allocated with.
+    ///   (`T` having a less strict alignment is not sufficient, the alignment really
+    ///   needs to be equal to satisfy the [`dealloc`] requirement that memory must be
+    ///   allocated and deallocated with the same layout.)
+    /// * `length` needs to be less than or equal to `capacity`.
+    /// * `capacity` needs to be the capacity that the pointer was allocated with.
+    ///
+    /// Violating these may cause problems like corrupting the allocator's
+    /// internal data structures. For example it is **not** safe
+    /// to build a `Vec<u8>` from a pointer to a C `char` array with length `size_t`.
+    /// It's also not safe to build one from a `Vec<u16>` and its length, because
+    /// the allocator cares about the alignment, and these two types have different
+    /// alignments. The buffer was allocated with alignment 2 (for `u16`), but after
+    /// turning it into a `Vec<u8>` it'll be deallocated with alignment 1.
+    ///
+    /// The ownership of `ptr` is effectively transferred to the
+    /// `Vec` which may then deallocate, reallocate or change the
+    /// contents of memory pointed to by the pointer at will. Ensure
+    /// that nothing else uses the pointer after calling this
+    /// function.
+    ///
+    /// [`String`]: crate::string::String
+    /// [`dealloc`]: crate::alloc::GlobalAlloc::dealloc
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// use std::ptr;
+    /// use std::mem;
+    ///
+    /// let mut v = Vec::with_capacity_in(3, System);
+    /// v.push(1);
+    /// v.push(2);
+    /// v.push(3);
+    ///
+    // FIXME Update this when vec_into_raw_parts is stabilized
+    /// // Prevent running `v`'s destructor so we are in complete control
+    /// // of the allocation.
+    /// let mut v = mem::ManuallyDrop::new(v);
+    ///
+    /// // Pull out the various important pieces of information about `v`
+    /// let p = v.as_mut_ptr();
+    /// let len = v.len();
+    /// let cap = v.capacity();
+    /// let alloc = v.allocator();
+    ///
+    /// unsafe {
+    ///     // Overwrite memory with 4, 5, 6
+    ///     for i in 0..len as isize {
+    ///         ptr::write(p.offset(i), 4 + i);
+    ///     }
+    ///
+    ///     // Put everything back together into a Vec
+    ///     let rebuilt = Vec::from_raw_parts_in(p, len, cap, alloc.clone());
+    ///     assert_eq!(rebuilt, [4, 5, 6]);
+    /// }
+    /// ```
+    #[inline]
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    pub unsafe fn from_raw_parts_in(ptr: *mut T, length: usize, capacity: usize, alloc: A) -> Self {
+        unsafe { Vec { buf: RawVec::from_raw_parts_in(ptr, capacity, alloc), len: length } }
+    }
+
+    /// Decomposes a `Vec` into its raw components.
+    ///
+    /// Returns the raw pointer to the underlying data, the length of
+    /// the vector (in elements), and the allocated capacity of the
+    /// data (in elements). These are the same arguments in the same
+    /// order as the arguments to [`from_raw_parts`].
+    ///
+    /// After calling this function, the caller is responsible for the
+    /// memory previously managed by the `Vec`. The only way to do
+    /// this is to convert the raw pointer, length, and capacity back
+    /// into a `Vec` with the [`from_raw_parts`] function, allowing
+    /// the destructor to perform the cleanup.
+    ///
+    /// [`from_raw_parts`]: Vec::from_raw_parts
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(vec_into_raw_parts)]
+    /// let v: Vec<i32> = vec![-1, 0, 1];
+    ///
+    /// let (ptr, len, cap) = v.into_raw_parts();
+    ///
+    /// let rebuilt = unsafe {
+    ///     // We can now make changes to the components, such as
+    ///     // transmuting the raw pointer to a compatible type.
+    ///     let ptr = ptr as *mut u32;
+    ///
+    ///     Vec::from_raw_parts(ptr, len, cap)
+    /// };
+    /// assert_eq!(rebuilt, [4294967295, 0, 1]);
+    /// ```
+    #[unstable(feature = "vec_into_raw_parts", reason = "new API", issue = "65816")]
+    pub fn into_raw_parts(self) -> (*mut T, usize, usize) {
+        let mut me = ManuallyDrop::new(self);
+        (me.as_mut_ptr(), me.len(), me.capacity())
+    }
+
+    /// Decomposes a `Vec` into its raw components.
+    ///
+    /// Returns the raw pointer to the underlying data, the length of the vector (in elements),
+    /// the allocated capacity of the data (in elements), and the allocator. These are the same
+    /// arguments in the same order as the arguments to [`from_raw_parts_in`].
+    ///
+    /// After calling this function, the caller is responsible for the
+    /// memory previously managed by the `Vec`. The only way to do
+    /// this is to convert the raw pointer, length, and capacity back
+    /// into a `Vec` with the [`from_raw_parts_in`] function, allowing
+    /// the destructor to perform the cleanup.
+    ///
+    /// [`from_raw_parts_in`]: Vec::from_raw_parts_in
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// #![feature(allocator_api, vec_into_raw_parts)]
+    ///
+    /// use std::alloc::System;
+    ///
+    /// let mut v: Vec<i32, System> = Vec::new_in(System);
+    /// v.push(-1);
+    /// v.push(0);
+    /// v.push(1);
+    ///
+    /// let (ptr, len, cap, alloc) = v.into_raw_parts_with_alloc();
+    ///
+    /// let rebuilt = unsafe {
+    ///     // We can now make changes to the components, such as
+    ///     // transmuting the raw pointer to a compatible type.
+    ///     let ptr = ptr as *mut u32;
+    ///
+    ///     Vec::from_raw_parts_in(ptr, len, cap, alloc)
+    /// };
+    /// assert_eq!(rebuilt, [4294967295, 0, 1]);
+    /// ```
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    // #[unstable(feature = "vec_into_raw_parts", reason = "new API", issue = "65816")]
+    pub fn into_raw_parts_with_alloc(self) -> (*mut T, usize, usize, A) {
+        let mut me = ManuallyDrop::new(self);
+        let len = me.len();
+        let capacity = me.capacity();
+        let ptr = me.as_mut_ptr();
+        let alloc = unsafe { ptr::read(me.allocator()) };
+        (ptr, len, capacity, alloc)
+    }
+
+    /// Returns the number of elements the vector can hold without
+    /// reallocating.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let vec: Vec<i32> = Vec::with_capacity(10);
+    /// assert_eq!(vec.capacity(), 10);
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn capacity(&self) -> usize {
+        self.buf.capacity()
+    }
+
+    /// Reserves capacity for at least `additional` more elements to be inserted
+    /// in the given `Vec`. The collection may reserve more space to avoid
+    /// frequent reallocations. After calling `reserve`, capacity will be
+    /// greater than or equal to `self.len() + additional`. Does nothing if
+    /// capacity is already sufficient.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1];
+    /// vec.reserve(10);
+    /// assert!(vec.capacity() >= 11);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn reserve(&mut self, additional: usize) {
+        self.buf.reserve(self.len, additional);
+    }
+
+    /// Reserves the minimum capacity for exactly `additional` more elements to
+    /// be inserted in the given `Vec`. After calling `reserve_exact`,
+    /// capacity will be greater than or equal to `self.len() + additional`.
+    /// Does nothing if the capacity is already sufficient.
+    ///
+    /// Note that the allocator may give the collection more space than it
+    /// requests. Therefore, capacity can not be relied upon to be precisely
+    /// minimal. Prefer [`reserve`] if future insertions are expected.
+    ///
+    /// [`reserve`]: Vec::reserve
+    ///
+    /// # Panics
+    ///
+    /// Panics if the new capacity exceeds `isize::MAX` bytes.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1];
+    /// vec.reserve_exact(10);
+    /// assert!(vec.capacity() >= 11);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn reserve_exact(&mut self, additional: usize) {
+        self.buf.reserve_exact(self.len, additional);
+    }
+
+    /// Tries to reserve capacity for at least `additional` more elements to be inserted
+    /// in the given `Vec`. The collection may reserve more space to avoid
+    /// frequent reallocations. After calling `try_reserve`, capacity will be
+    /// greater than or equal to `self.len() + additional`. Does nothing if
+    /// capacity is already sufficient.
+    ///
+    /// # Errors
+    ///
+    /// If the capacity overflows, or the allocator reports a failure, then an error
+    /// is returned.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::collections::TryReserveError;
+    ///
+    /// fn process_data(data: &[u32]) -> Result<Vec<u32>, TryReserveError> {
+    ///     let mut output = Vec::new();
+    ///
+    ///     // Pre-reserve the memory, exiting if we can't
+    ///     output.try_reserve(data.len())?;
+    ///
+    ///     // Now we know this can't OOM in the middle of our complex work
+    ///     output.extend(data.iter().map(|&val| {
+    ///         val * 2 + 5 // very complicated
+    ///     }));
+    ///
+    ///     Ok(output)
+    /// }
+    /// # process_data(&[1, 2, 3]).expect("why is the test harness OOMing on 12 bytes?");
+    /// ```
+    #[stable(feature = "try_reserve", since = "1.57.0")]
+    pub fn try_reserve(&mut self, additional: usize) -> Result<(), TryReserveError> {
+        self.buf.try_reserve(self.len, additional)
+    }
+
+    /// Tries to reserve the minimum capacity for exactly `additional`
+    /// elements to be inserted in the given `Vec`. After calling
+    /// `try_reserve_exact`, capacity will be greater than or equal to
+    /// `self.len() + additional` if it returns `Ok(())`.
+    /// Does nothing if the capacity is already sufficient.
+    ///
+    /// Note that the allocator may give the collection more space than it
+    /// requests. Therefore, capacity can not be relied upon to be precisely
+    /// minimal. Prefer [`try_reserve`] if future insertions are expected.
+    ///
+    /// [`try_reserve`]: Vec::try_reserve
+    ///
+    /// # Errors
+    ///
+    /// If the capacity overflows, or the allocator reports a failure, then an error
+    /// is returned.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::collections::TryReserveError;
+    ///
+    /// fn process_data(data: &[u32]) -> Result<Vec<u32>, TryReserveError> {
+    ///     let mut output = Vec::new();
+    ///
+    ///     // Pre-reserve the memory, exiting if we can't
+    ///     output.try_reserve_exact(data.len())?;
+    ///
+    ///     // Now we know this can't OOM in the middle of our complex work
+    ///     output.extend(data.iter().map(|&val| {
+    ///         val * 2 + 5 // very complicated
+    ///     }));
+    ///
+    ///     Ok(output)
+    /// }
+    /// # process_data(&[1, 2, 3]).expect("why is the test harness OOMing on 12 bytes?");
+    /// ```
+    #[stable(feature = "try_reserve", since = "1.57.0")]
+    pub fn try_reserve_exact(&mut self, additional: usize) -> Result<(), TryReserveError> {
+        self.buf.try_reserve_exact(self.len, additional)
+    }
+
+    /// Shrinks the capacity of the vector as much as possible.
+    ///
+    /// It will drop down as close as possible to the length but the allocator
+    /// may still inform the vector that there is space for a few more elements.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = Vec::with_capacity(10);
+    /// vec.extend([1, 2, 3]);
+    /// assert_eq!(vec.capacity(), 10);
+    /// vec.shrink_to_fit();
+    /// assert!(vec.capacity() >= 3);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn shrink_to_fit(&mut self) {
+        // The capacity is never less than the length, and there's nothing to do when
+        // they are equal, so we can avoid the panic case in `RawVec::shrink_to_fit`
+        // by only calling it with a greater capacity.
+        if self.capacity() > self.len {
+            self.buf.shrink_to_fit(self.len);
+        }
+    }
+
+    /// Shrinks the capacity of the vector with a lower bound.
+    ///
+    /// The capacity will remain at least as large as both the length
+    /// and the supplied value.
+    ///
+    /// If the current capacity is less than the lower limit, this is a no-op.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = Vec::with_capacity(10);
+    /// vec.extend([1, 2, 3]);
+    /// assert_eq!(vec.capacity(), 10);
+    /// vec.shrink_to(4);
+    /// assert!(vec.capacity() >= 4);
+    /// vec.shrink_to(0);
+    /// assert!(vec.capacity() >= 3);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "shrink_to", since = "1.56.0")]
+    pub fn shrink_to(&mut self, min_capacity: usize) {
+        if self.capacity() > min_capacity {
+            self.buf.shrink_to_fit(cmp::max(self.len, min_capacity));
+        }
+    }
+
+    /// Converts the vector into [`Box<[T]>`][owned slice].
+    ///
+    /// Note that this will drop any excess capacity.
+    ///
+    /// [owned slice]: Box
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let v = vec![1, 2, 3];
+    ///
+    /// let slice = v.into_boxed_slice();
+    /// ```
+    ///
+    /// Any excess capacity is removed:
+    ///
+    /// ```
+    /// let mut vec = Vec::with_capacity(10);
+    /// vec.extend([1, 2, 3]);
+    ///
+    /// assert_eq!(vec.capacity(), 10);
+    /// let slice = vec.into_boxed_slice();
+    /// assert_eq!(slice.into_vec().capacity(), 3);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn into_boxed_slice(mut self) -> Box<[T], A> {
+        unsafe {
+            self.shrink_to_fit();
+            let me = ManuallyDrop::new(self);
+            let buf = ptr::read(&me.buf);
+            let len = me.len();
+            buf.into_box(len).assume_init()
+        }
+    }
+
+    /// Shortens the vector, keeping the first `len` elements and dropping
+    /// the rest.
+    ///
+    /// If `len` is greater than the vector's current length, this has no
+    /// effect.
+    ///
+    /// The [`drain`] method can emulate `truncate`, but causes the excess
+    /// elements to be returned instead of dropped.
+    ///
+    /// Note that this method has no effect on the allocated capacity
+    /// of the vector.
+    ///
+    /// # Examples
+    ///
+    /// Truncating a five element vector to two elements:
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3, 4, 5];
+    /// vec.truncate(2);
+    /// assert_eq!(vec, [1, 2]);
+    /// ```
+    ///
+    /// No truncation occurs when `len` is greater than the vector's current
+    /// length:
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// vec.truncate(8);
+    /// assert_eq!(vec, [1, 2, 3]);
+    /// ```
+    ///
+    /// Truncating when `len == 0` is equivalent to calling the [`clear`]
+    /// method.
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// vec.truncate(0);
+    /// assert_eq!(vec, []);
+    /// ```
+    ///
+    /// [`clear`]: Vec::clear
+    /// [`drain`]: Vec::drain
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn truncate(&mut self, len: usize) {
+        // This is safe because:
+        //
+        // * the slice passed to `drop_in_place` is valid; the `len > self.len`
+        //   case avoids creating an invalid slice, and
+        // * the `len` of the vector is shrunk before calling `drop_in_place`,
+        //   such that no value will be dropped twice in case `drop_in_place`
+        //   were to panic once (if it panics twice, the program aborts).
+        unsafe {
+            // Note: It's intentional that this is `>` and not `>=`.
+            //       Changing it to `>=` has negative performance
+            //       implications in some cases. See #78884 for more.
+            if len > self.len {
+                return;
+            }
+            let remaining_len = self.len - len;
+            let s = ptr::slice_from_raw_parts_mut(self.as_mut_ptr().add(len), remaining_len);
+            self.len = len;
+            ptr::drop_in_place(s);
+        }
+    }
+
+    /// Extracts a slice containing the entire vector.
+    ///
+    /// Equivalent to `&s[..]`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::io::{self, Write};
+    /// let buffer = vec![1, 2, 3, 5, 8];
+    /// io::sink().write(buffer.as_slice()).unwrap();
+    /// ```
+    #[inline]
+    #[stable(feature = "vec_as_slice", since = "1.7.0")]
+    pub fn as_slice(&self) -> &[T] {
+        self
+    }
+
+    /// Extracts a mutable slice of the entire vector.
+    ///
+    /// Equivalent to `&mut s[..]`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use std::io::{self, Read};
+    /// let mut buffer = vec![0; 3];
+    /// io::repeat(0b101).read_exact(buffer.as_mut_slice()).unwrap();
+    /// ```
+    #[inline]
+    #[stable(feature = "vec_as_slice", since = "1.7.0")]
+    pub fn as_mut_slice(&mut self) -> &mut [T] {
+        self
+    }
+
+    /// Returns a raw pointer to the vector's buffer.
+    ///
+    /// The caller must ensure that the vector outlives the pointer this
+    /// function returns, or else it will end up pointing to garbage.
+    /// Modifying the vector may cause its buffer to be reallocated,
+    /// which would also make any pointers to it invalid.
+    ///
+    /// The caller must also ensure that the memory the pointer (non-transitively) points to
+    /// is never written to (except inside an `UnsafeCell`) using this pointer or any pointer
+    /// derived from it. If you need to mutate the contents of the slice, use [`as_mut_ptr`].
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let x = vec![1, 2, 4];
+    /// let x_ptr = x.as_ptr();
+    ///
+    /// unsafe {
+    ///     for i in 0..x.len() {
+    ///         assert_eq!(*x_ptr.add(i), 1 << i);
+    ///     }
+    /// }
+    /// ```
+    ///
+    /// [`as_mut_ptr`]: Vec::as_mut_ptr
+    #[stable(feature = "vec_as_ptr", since = "1.37.0")]
+    #[inline]
+    pub fn as_ptr(&self) -> *const T {
+        // We shadow the slice method of the same name to avoid going through
+        // `deref`, which creates an intermediate reference.
+        let ptr = self.buf.ptr();
+        unsafe {
+            assume(!ptr.is_null());
+        }
+        ptr
+    }
+
+    /// Returns an unsafe mutable pointer to the vector's buffer.
+    ///
+    /// The caller must ensure that the vector outlives the pointer this
+    /// function returns, or else it will end up pointing to garbage.
+    /// Modifying the vector may cause its buffer to be reallocated,
+    /// which would also make any pointers to it invalid.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// // Allocate vector big enough for 4 elements.
+    /// let size = 4;
+    /// let mut x: Vec<i32> = Vec::with_capacity(size);
+    /// let x_ptr = x.as_mut_ptr();
+    ///
+    /// // Initialize elements via raw pointer writes, then set length.
+    /// unsafe {
+    ///     for i in 0..size {
+    ///         *x_ptr.add(i) = i as i32;
+    ///     }
+    ///     x.set_len(size);
+    /// }
+    /// assert_eq!(&*x, &[0, 1, 2, 3]);
+    /// ```
+    #[stable(feature = "vec_as_ptr", since = "1.37.0")]
+    #[inline]
+    pub fn as_mut_ptr(&mut self) -> *mut T {
+        // We shadow the slice method of the same name to avoid going through
+        // `deref_mut`, which creates an intermediate reference.
+        let ptr = self.buf.ptr();
+        unsafe {
+            assume(!ptr.is_null());
+        }
+        ptr
+    }
+
+    /// Returns a reference to the underlying allocator.
+    #[unstable(feature = "allocator_api", issue = "32838")]
+    #[inline]
+    pub fn allocator(&self) -> &A {
+        self.buf.allocator()
+    }
+
+    /// Forces the length of the vector to `new_len`.
+    ///
+    /// This is a low-level operation that maintains none of the normal
+    /// invariants of the type. Normally changing the length of a vector
+    /// is done using one of the safe operations instead, such as
+    /// [`truncate`], [`resize`], [`extend`], or [`clear`].
+    ///
+    /// [`truncate`]: Vec::truncate
+    /// [`resize`]: Vec::resize
+    /// [`extend`]: Extend::extend
+    /// [`clear`]: Vec::clear
+    ///
+    /// # Safety
+    ///
+    /// - `new_len` must be less than or equal to [`capacity()`].
+    /// - The elements at `old_len..new_len` must be initialized.
+    ///
+    /// [`capacity()`]: Vec::capacity
+    ///
+    /// # Examples
+    ///
+    /// This method can be useful for situations in which the vector
+    /// is serving as a buffer for other code, particularly over FFI:
+    ///
+    /// ```no_run
+    /// # #![allow(dead_code)]
+    /// # // This is just a minimal skeleton for the doc example;
+    /// # // don't use this as a starting point for a real library.
+    /// # pub struct StreamWrapper { strm: *mut std::ffi::c_void }
+    /// # const Z_OK: i32 = 0;
+    /// # extern "C" {
+    /// #     fn deflateGetDictionary(
+    /// #         strm: *mut std::ffi::c_void,
+    /// #         dictionary: *mut u8,
+    /// #         dictLength: *mut usize,
+    /// #     ) -> i32;
+    /// # }
+    /// # impl StreamWrapper {
+    /// pub fn get_dictionary(&self) -> Option<Vec<u8>> {
+    ///     // Per the FFI method's docs, "32768 bytes is always enough".
+    ///     let mut dict = Vec::with_capacity(32_768);
+    ///     let mut dict_length = 0;
+    ///     // SAFETY: When `deflateGetDictionary` returns `Z_OK`, it holds that:
+    ///     // 1. `dict_length` elements were initialized.
+    ///     // 2. `dict_length` <= the capacity (32_768)
+    ///     // which makes `set_len` safe to call.
+    ///     unsafe {
+    ///         // Make the FFI call...
+    ///         let r = deflateGetDictionary(self.strm, dict.as_mut_ptr(), &mut dict_length);
+    ///         if r == Z_OK {
+    ///             // ...and update the length to what was initialized.
+    ///             dict.set_len(dict_length);
+    ///             Some(dict)
+    ///         } else {
+    ///             None
+    ///         }
+    ///     }
+    /// }
+    /// # }
+    /// ```
+    ///
+    /// While the following example is sound, there is a memory leak since
+    /// the inner vectors were not freed prior to the `set_len` call:
+    ///
+    /// ```
+    /// let mut vec = vec![vec![1, 0, 0],
+    ///                    vec![0, 1, 0],
+    ///                    vec![0, 0, 1]];
+    /// // SAFETY:
+    /// // 1. `old_len..0` is empty so no elements need to be initialized.
+    /// // 2. `0 <= capacity` always holds whatever `capacity` is.
+    /// unsafe {
+    ///     vec.set_len(0);
+    /// }
+    /// ```
+    ///
+    /// Normally, here, one would use [`clear`] instead to correctly drop
+    /// the contents and thus not leak memory.
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub unsafe fn set_len(&mut self, new_len: usize) {
+        debug_assert!(new_len <= self.capacity());
+
+        self.len = new_len;
+    }
+
+    /// Removes an element from the vector and returns it.
+    ///
+    /// The removed element is replaced by the last element of the vector.
+    ///
+    /// This does not preserve ordering, but is *O*(1).
+    /// If you need to preserve the element order, use [`remove`] instead.
+    ///
+    /// [`remove`]: Vec::remove
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is out of bounds.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = vec!["foo", "bar", "baz", "qux"];
+    ///
+    /// assert_eq!(v.swap_remove(1), "bar");
+    /// assert_eq!(v, ["foo", "qux", "baz"]);
+    ///
+    /// assert_eq!(v.swap_remove(0), "foo");
+    /// assert_eq!(v, ["baz", "qux"]);
+    /// ```
+    #[inline]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn swap_remove(&mut self, index: usize) -> T {
+        #[cold]
+        #[inline(never)]
+        fn assert_failed(index: usize, len: usize) -> ! {
+            panic!("swap_remove index (is {}) should be < len (is {})", index, len);
+        }
+
+        let len = self.len();
+        if index >= len {
+            assert_failed(index, len);
+        }
+        unsafe {
+            // We replace self[index] with the last element. Note that if the
+            // bounds check above succeeds there must be a last element (which
+            // can be self[index] itself).
+            let value = ptr::read(self.as_ptr().add(index));
+            let base_ptr = self.as_mut_ptr();
+            ptr::copy(base_ptr.add(len - 1), base_ptr.add(index), 1);
+            self.set_len(len - 1);
+            value
+        }
+    }
+
+    /// Inserts an element at position `index` within the vector, shifting all
+    /// elements after it to the right.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index > len`.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3];
+    /// vec.insert(1, 4);
+    /// assert_eq!(vec, [1, 4, 2, 3]);
+    /// vec.insert(4, 5);
+    /// assert_eq!(vec, [1, 4, 2, 3, 5]);
+    /// ```
+    #[cfg(not(no_global_oom_handling))]
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn insert(&mut self, index: usize, element: T) {
+        #[cold]
+        #[inline(never)]
+        fn assert_failed(index: usize, len: usize) -> ! {
+            panic!("insertion index (is {}) should be <= len (is {})", index, len);
+        }
+
+        let len = self.len();
+        if index > len {
+            assert_failed(index, len);
+        }
+
+        // space for the new element
+        if len == self.buf.capacity() {
+            self.reserve(1);
+        }
+
+        unsafe {
+            // infallible
+            // The spot to put the new value
+            {
+                let p = self.as_mut_ptr().add(index);
+                // Shift everything over to make space. (Duplicating the
+                // `index`th element into two consecutive places.)
+                ptr::copy(p, p.offset(1), len - index);
+                // Write it in, overwriting the first copy of the `index`th
+                // element.
+                ptr::write(p, element);
+            }
+            self.set_len(len + 1);
+        }
+    }
+
+    /// Removes and returns the element at position `index` within the vector,
+    /// shifting all elements after it to the left.
+    ///
+    /// Note: Because this shifts over the remaining elements, it has a
+    /// worst-case performance of *O*(*n*). If you don't need the order of elements
+    /// to be preserved, use [`swap_remove`] instead. If you'd like to remove
+    /// elements from the beginning of the `Vec`, consider using
+    /// [`VecDeque::pop_front`] instead.
+    ///
+    /// [`swap_remove`]: Vec::swap_remove
+    /// [`VecDeque::pop_front`]: crate::collections::VecDeque::pop_front
+    ///
+    /// # Panics
+    ///
+    /// Panics if `index` is out of bounds.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut v = vec![1, 2, 3];
+    /// assert_eq!(v.remove(1), 2);
+    /// assert_eq!(v, [1, 3]);
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
+    #[track_caller]
+    pub fn remove(&mut self, index: usize) -> T {
+        #[cold]
+        #[inline(never)]
+        #[track_caller]
+        fn assert_failed(index: usize, len: usize) -> ! {
+            panic!("removal index (is {}) should be < len (is {})", index, len);
+        }
+
+        let len = self.len();
+        if index >= len {
+            assert_failed(index, len);
+        }
+        unsafe {
+            // infallible
+            let ret;
+            {
+                // the place we are taking from.
+                let ptr = self.as_mut_ptr().add(index);
+                // copy it out, unsafely having a copy of the value on
+                // the stack and in the vector at the same time.
+                ret = ptr::read(ptr);
+
+                // Shift everything down to fill in that spot.
+                ptr::copy(ptr.offset(1), ptr, len - index - 1);
+            }
+            self.set_len(len - 1);
+            ret
+        }
+    }
+
+    /// Retains only the elements specified by the predicate.
+    ///
+    /// In other words, remove all elements `e` such that `f(&e)` returns `false`.
+    /// This method operates in place, visiting each element exactly once in the
+    /// original order, and preserves the order of the retained elements.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3, 4];
+    /// vec.retain(|&x| x % 2 == 0);
+    /// assert_eq!(vec, [2, 4]);
+    /// ```
+    ///
+    /// Because the elements are visited exactly once in the original order,
+    /// external state may be used to decide which elements to keep.
+    ///
+    /// ```
+    /// let mut vec = vec![1, 2, 3, 4, 5];
+    /// let keep = [false, true, true, false, true];
+    /// let mut iter = keep.iter();
+    /// vec.retain(|_| *iter.next().unwrap());
+    /// assert_eq!(vec, [2, 3, 5]);
+    /// ```
+    #[stable(feature = "rust1", since = "1.0.0")]
+    pub fn retain<F>(&mut self, mut f: F)
+    where
+        F: FnMut(&T) -> bool,
+    {
+        self.retain_mut(|elem| f(elem));
+    }
+
+    /// Retains only the elements specified by the predicate, passing a mutable reference to it.
+    ///
+    /// In other words, remove all elements `e` such that `f(&mut e)` returns `false`.
+    /// This method operates in place, visiting each element exactly once in the
+    /// original order, and preserves the order of the retained elements.
+ /// + /// # Examples + /// + /// ``` + /// #![feature(vec_retain_mut)] + /// + /// let mut vec = vec![1, 2, 3, 4]; + /// vec.retain_mut(|x| if *x > 3 { + /// false + /// } else { + /// *x += 1; + /// true + /// }); + /// assert_eq!(vec, [2, 3, 4]); + /// ``` + #[unstable(feature = "vec_retain_mut", issue = "90829")] + pub fn retain_mut(&mut self, mut f: F) + where + F: FnMut(&mut T) -> bool, + { + let original_len = self.len(); + // Avoid double drop if the drop guard is not executed, + // since we may make some holes during the process. + unsafe { self.set_len(0) }; + + // Vec: [Kept, Kept, Hole, Hole, Hole, Hole, Unchecked, Unchecked] + // |<- processed len ->| ^- next to check + // |<- deleted cnt ->| + // |<- original_len ->| + // Kept: Elements which predicate returns true on. + // Hole: Moved or dropped element slot. + // Unchecked: Unchecked valid elements. + // + // This drop guard will be invoked when predicate or `drop` of element panicked. + // It shifts unchecked elements to cover holes and `set_len` to the correct length. + // In cases when predicate and `drop` never panic, it will be optimized out. + struct BackshiftOnDrop<'a, T, A: Allocator> { + v: &'a mut Vec, + processed_len: usize, + deleted_cnt: usize, + original_len: usize, + } + + impl Drop for BackshiftOnDrop<'_, T, A> { + fn drop(&mut self) { + if self.deleted_cnt > 0 { + // SAFETY: Trailing unchecked items must be valid since we never touch them. + unsafe { + ptr::copy( + self.v.as_ptr().add(self.processed_len), + self.v.as_mut_ptr().add(self.processed_len - self.deleted_cnt), + self.original_len - self.processed_len, + ); + } + } + // SAFETY: After filling holes, all items are in contiguous memory.
+ unsafe { + self.v.set_len(self.original_len - self.deleted_cnt); + } + } + } + + let mut g = BackshiftOnDrop { v: self, processed_len: 0, deleted_cnt: 0, original_len }; + + fn process_loop( + original_len: usize, + f: &mut F, + g: &mut BackshiftOnDrop<'_, T, A>, + ) where + F: FnMut(&mut T) -> bool, + { + while g.processed_len != original_len { + // SAFETY: Unchecked element must be valid. + let cur = unsafe { &mut *g.v.as_mut_ptr().add(g.processed_len) }; + if !f(cur) { + // Advance early to avoid double drop if `drop_in_place` panicked. + g.processed_len += 1; + g.deleted_cnt += 1; + // SAFETY: We never touch this element again after it is dropped. + unsafe { ptr::drop_in_place(cur) }; + // We already advanced the counter. + if DELETED { + continue; + } else { + break; + } + } + if DELETED { + // SAFETY: `deleted_cnt` > 0, so the hole slot must not overlap with current element. + // We use copy for move, and never touch this element again. + unsafe { + let hole_slot = g.v.as_mut_ptr().add(g.processed_len - g.deleted_cnt); + ptr::copy_nonoverlapping(cur, hole_slot, 1); + } + } + g.processed_len += 1; + } + } + + // Stage 1: Nothing was deleted. + process_loop::(original_len, &mut f, &mut g); + + // Stage 2: Some elements were deleted. + process_loop::(original_len, &mut f, &mut g); + + // All items are processed. This can be optimized to `set_len` by LLVM. + drop(g); + } + + /// Removes all but the first of consecutive elements in the vector that resolve to the same + /// key. + /// + /// If the vector is sorted, this removes all duplicates.
+ /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![10, 20, 21, 30, 20]; + /// + /// vec.dedup_by_key(|i| *i / 10); + /// + /// assert_eq!(vec, [10, 20, 30, 20]); + /// ``` + #[stable(feature = "dedup_by", since = "1.16.0")] + #[inline] + pub fn dedup_by_key(&mut self, mut key: F) + where + F: FnMut(&mut T) -> K, + K: PartialEq, + { + self.dedup_by(|a, b| key(a) == key(b)) + } + + /// Removes all but the first of consecutive elements in the vector satisfying a given equality + /// relation. + /// + /// The `same_bucket` function is passed references to two elements from the vector and + /// must determine if the elements compare equal. The elements are passed in opposite order + /// from their order in the slice, so if `same_bucket(a, b)` returns `true`, `a` is removed. + /// + /// If the vector is sorted, this removes all duplicates. + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec!["foo", "bar", "Bar", "baz", "bar"]; + /// + /// vec.dedup_by(|a, b| a.eq_ignore_ascii_case(b)); + /// + /// assert_eq!(vec, ["foo", "bar", "baz", "bar"]); + /// ``` + #[stable(feature = "dedup_by", since = "1.16.0")] + pub fn dedup_by(&mut self, mut same_bucket: F) + where + F: FnMut(&mut T, &mut T) -> bool, + { + let len = self.len(); + if len <= 1 { + return; + } + + /* INVARIANT: vec.len() > read >= write > write-1 >= 0 */ + struct FillGapOnDrop<'a, T, A: core::alloc::Allocator> { + /* Offset of the element we want to check if it is duplicate */ + read: usize, + + /* Offset of the place where we want to place the non-duplicate + * when we find it. */ + write: usize, + + /* The Vec that would need correction if `same_bucket` panicked */ + vec: &'a mut Vec, + } + + impl<'a, T, A: core::alloc::Allocator> Drop for FillGapOnDrop<'a, T, A> { + fn drop(&mut self) { + /* This code gets executed when `same_bucket` panics */ + + /* SAFETY: invariant guarantees that `read - write` + * and `len - read` never overflow and that the copy is always + * in-bounds. 
*/ + unsafe { + let ptr = self.vec.as_mut_ptr(); + let len = self.vec.len(); + + /* How many items were left when `same_bucket` panicked. + * Basically vec[read..].len() */ + let items_left = len.wrapping_sub(self.read); + + /* Pointer to first item in vec[write..write+items_left] slice */ + let dropped_ptr = ptr.add(self.write); + /* Pointer to first item in vec[read..] slice */ + let valid_ptr = ptr.add(self.read); + + /* Copy `vec[read..]` to `vec[write..write+items_left]`. + * The slices can overlap, so `copy_nonoverlapping` cannot be used */ + ptr::copy(valid_ptr, dropped_ptr, items_left); + + /* How many items have been already dropped + * Basically vec[read..write].len() */ + let dropped = self.read.wrapping_sub(self.write); + + self.vec.set_len(len - dropped); + } + } + } + + let mut gap = FillGapOnDrop { read: 1, write: 1, vec: self }; + let ptr = gap.vec.as_mut_ptr(); + + /* Drop items while going through Vec, it should be more efficient than + * doing slice partition_dedup + truncate */ + + /* SAFETY: Because of the invariant, read_ptr, prev_ptr and write_ptr + * are always in-bounds and read_ptr never aliases prev_ptr */ + unsafe { + while gap.read < len { + let read_ptr = ptr.add(gap.read); + let prev_ptr = ptr.add(gap.write.wrapping_sub(1)); + + if same_bucket(&mut *read_ptr, &mut *prev_ptr) { + // Increase `gap.read` now since the drop may panic. + gap.read += 1; + /* We have found duplicate, drop it in-place */ + ptr::drop_in_place(read_ptr); + } else { + let write_ptr = ptr.add(gap.write); + + /* Because `read_ptr` can be equal to `write_ptr`, we either + * have to use `copy` or conditional `copy_nonoverlapping`. + * Looks like the first option is faster. 
*/ + ptr::copy(read_ptr, write_ptr, 1); + + /* We have filled that place, so go further */ + gap.write += 1; + gap.read += 1; + } + } + + /* Technically we could let `gap` clean up with its Drop, but + * when `same_bucket` is guaranteed to not panic, this bloats a little + * the codegen, so we just do it manually */ + gap.vec.set_len(gap.write); + mem::forget(gap); + } + } + + /// Appends an element to the back of a collection. + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` bytes. + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![1, 2]; + /// vec.push(3); + /// assert_eq!(vec, [1, 2, 3]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn push(&mut self, value: T) { + // This will panic or abort if we would allocate > isize::MAX bytes + // or if the length increment would overflow for zero-sized types. + if self.len == self.buf.capacity() { + self.buf.reserve_for_push(self.len); + } + unsafe { + let end = self.as_mut_ptr().add(self.len); + ptr::write(end, value); + self.len += 1; + } + } + + /// Removes the last element from a vector and returns it, or [`None`] if it + /// is empty. + /// + /// If you'd like to pop the first element, consider using + /// [`VecDeque::pop_front`] instead. + /// + /// [`VecDeque::pop_front`]: crate::collections::VecDeque::pop_front + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![1, 2, 3]; + /// assert_eq!(vec.pop(), Some(3)); + /// assert_eq!(vec, [1, 2]); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn pop(&mut self) -> Option { + if self.len == 0 { + None + } else { + unsafe { + self.len -= 1; + Some(ptr::read(self.as_ptr().add(self.len()))) + } + } + } + + /// Moves all the elements of `other` into `Self`, leaving `other` empty. + /// + /// # Panics + /// + /// Panics if the number of elements in the vector overflows a `usize`. 
+ /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![1, 2, 3]; + /// let mut vec2 = vec![4, 5, 6]; + /// vec.append(&mut vec2); + /// assert_eq!(vec, [1, 2, 3, 4, 5, 6]); + /// assert_eq!(vec2, []); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "append", since = "1.4.0")] + pub fn append(&mut self, other: &mut Self) { + unsafe { + self.append_elements(other.as_slice() as _); + other.set_len(0); + } + } + + /// Appends elements to `Self` from other buffer. + #[cfg(not(no_global_oom_handling))] + #[inline] + unsafe fn append_elements(&mut self, other: *const [T]) { + let count = unsafe { (*other).len() }; + self.reserve(count); + let len = self.len(); + unsafe { ptr::copy_nonoverlapping(other as *const T, self.as_mut_ptr().add(len), count) }; + self.len += count; + } + + /// Removes the specified range from the vector in bulk, returning all + /// removed elements as an iterator. If the iterator is dropped before + /// being fully consumed, it drops the remaining removed elements. + /// + /// The returned iterator keeps a mutable borrow on the vector to optimize + /// its implementation. + /// + /// # Panics + /// + /// Panics if the starting point is greater than the end point or if + /// the end point is greater than the length of the vector. + /// + /// # Leaking + /// + /// If the returned iterator goes out of scope without being dropped (due to + /// [`mem::forget`], for example), the vector may have lost and leaked + /// elements arbitrarily, including elements outside the range. 
+ /// + /// # Examples + /// + /// ``` + /// let mut v = vec![1, 2, 3]; + /// let u: Vec<_> = v.drain(1..).collect(); + /// assert_eq!(v, &[1]); + /// assert_eq!(u, &[2, 3]); + /// + /// // A full range clears the vector, like `clear()` does + /// v.drain(..); + /// assert_eq!(v, &[]); + /// ``` + #[stable(feature = "drain", since = "1.6.0")] + pub fn drain(&mut self, range: R) -> Drain<'_, T, A> + where + R: RangeBounds, + { + // Memory safety + // + // When the Drain is first created, it shortens the length of + // the source vector to make sure no uninitialized or moved-from elements + // are accessible at all if the Drain's destructor never gets to run. + // + // Drain will ptr::read out the values to remove. + // When finished, remaining tail of the vec is copied back to cover + // the hole, and the vector length is restored to the new length. + // + let len = self.len(); + let Range { start, end } = slice::range(range, ..len); + + unsafe { + // set self.vec length's to start, to be safe in case Drain is leaked + self.set_len(start); + // Use the borrow in the IterMut to indicate borrowing behavior of the + // whole Drain iterator (like &mut T). + let range_slice = slice::from_raw_parts_mut(self.as_mut_ptr().add(start), end - start); + Drain { + tail_start: end, + tail_len: len - end, + iter: range_slice.iter(), + vec: NonNull::from(self), + } + } + } + + /// Clears the vector, removing all values. + /// + /// Note that this method has no effect on the allocated capacity + /// of the vector. + /// + /// # Examples + /// + /// ``` + /// let mut v = vec![1, 2, 3]; + /// + /// v.clear(); + /// + /// assert!(v.is_empty()); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn clear(&mut self) { + self.truncate(0) + } + + /// Returns the number of elements in the vector, also referred to + /// as its 'length'. 
+ /// + /// # Examples + /// + /// ``` + /// let a = vec![1, 2, 3]; + /// assert_eq!(a.len(), 3); + /// ``` + #[inline] + #[stable(feature = "rust1", since = "1.0.0")] + pub fn len(&self) -> usize { + self.len + } + + /// Returns `true` if the vector contains no elements. + /// + /// # Examples + /// + /// ``` + /// let mut v = Vec::new(); + /// assert!(v.is_empty()); + /// + /// v.push(1); + /// assert!(!v.is_empty()); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Splits the collection into two at the given index. + /// + /// Returns a newly allocated vector containing the elements in the range + /// `[at, len)`. After the call, the original vector will be left containing + /// the elements `[0, at)` with its previous capacity unchanged. + /// + /// # Panics + /// + /// Panics if `at > len`. + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![1, 2, 3]; + /// let vec2 = vec.split_off(1); + /// assert_eq!(vec, [1]); + /// assert_eq!(vec2, [2, 3]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[must_use = "use `.truncate()` if you don't need the other half"] + #[stable(feature = "split_off", since = "1.4.0")] + pub fn split_off(&mut self, at: usize) -> Self + where + A: Clone, + { + #[cold] + #[inline(never)] + fn assert_failed(at: usize, len: usize) -> ! { + panic!("`at` split index (is {}) should be <= len (is {})", at, len); + } + + if at > self.len() { + assert_failed(at, self.len()); + } + + if at == 0 { + // the new vector can take over the original buffer and avoid the copy + return mem::replace( + self, + Vec::with_capacity_in(self.capacity(), self.allocator().clone()), + ); + } + + let other_len = self.len - at; + let mut other = Vec::with_capacity_in(other_len, self.allocator().clone()); + + // Unsafely `set_len` and copy items to `other`. 
+ unsafe { + self.set_len(at); + other.set_len(other_len); + + ptr::copy_nonoverlapping(self.as_ptr().add(at), other.as_mut_ptr(), other.len()); + } + other + } + + /// Resizes the `Vec` in-place so that `len` is equal to `new_len`. + /// + /// If `new_len` is greater than `len`, the `Vec` is extended by the + /// difference, with each additional slot filled with the result of + /// calling the closure `f`. The return values from `f` will end up + /// in the `Vec` in the order they have been generated. + /// + /// If `new_len` is less than `len`, the `Vec` is simply truncated. + /// + /// This method uses a closure to create new values on every push. If + /// you'd rather [`Clone`] a given value, use [`Vec::resize`]. If you + /// want to use the [`Default`] trait to generate values, you can + /// pass [`Default::default`] as the second argument. + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![1, 2, 3]; + /// vec.resize_with(5, Default::default); + /// assert_eq!(vec, [1, 2, 3, 0, 0]); + /// + /// let mut vec = vec![]; + /// let mut p = 1; + /// vec.resize_with(4, || { p *= 2; p }); + /// assert_eq!(vec, [2, 4, 8, 16]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "vec_resize_with", since = "1.33.0")] + pub fn resize_with(&mut self, new_len: usize, f: F) + where + F: FnMut() -> T, + { + let len = self.len(); + if new_len > len { + self.extend_with(new_len - len, ExtendFunc(f)); + } else { + self.truncate(new_len); + } + } + + /// Consumes and leaks the `Vec`, returning a mutable reference to the contents, + /// `&'a mut [T]`. Note that the type `T` must outlive the chosen lifetime + /// `'a`. If the type has only static references, or none at all, then this + /// may be chosen to be `'static`. + /// + /// As of Rust 1.57, this method does not reallocate or shrink the `Vec`, + /// so the leaked allocation may include unused capacity that is not part + /// of the returned slice. 
+ /// + /// This function is mainly useful for data that lives for the remainder of + /// the program's life. Dropping the returned reference will cause a memory + /// leak. + /// + /// # Examples + /// + /// Simple usage: + /// + /// ``` + /// let x = vec![1, 2, 3]; + /// let static_ref: &'static mut [usize] = x.leak(); + /// static_ref[0] += 1; + /// assert_eq!(static_ref, &[2, 2, 3]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "vec_leak", since = "1.47.0")] + #[inline] + pub fn leak<'a>(self) -> &'a mut [T] + where + A: 'a, + { + let mut me = ManuallyDrop::new(self); + unsafe { slice::from_raw_parts_mut(me.as_mut_ptr(), me.len) } + } + + /// Returns the remaining spare capacity of the vector as a slice of + /// `MaybeUninit`. + /// + /// The returned slice can be used to fill the vector with data (e.g. by + /// reading from a file) before marking the data as initialized using the + /// [`set_len`] method. + /// + /// [`set_len`]: Vec::set_len + /// + /// # Examples + /// + /// ``` + /// // Allocate vector big enough for 10 elements. + /// let mut v = Vec::with_capacity(10); + /// + /// // Fill in the first 3 elements. + /// let uninit = v.spare_capacity_mut(); + /// uninit[0].write(0); + /// uninit[1].write(1); + /// uninit[2].write(2); + /// + /// // Mark the first 3 elements of the vector as being initialized. + /// unsafe { + /// v.set_len(3); + /// } + /// + /// assert_eq!(&v, &[0, 1, 2]); + /// ``` + #[stable(feature = "vec_spare_capacity", since = "1.60.0")] + #[inline] + pub fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { + // Note: + // This method is not implemented in terms of `split_at_spare_mut`, + // to prevent invalidation of pointers to the buffer. 
+ unsafe { + slice::from_raw_parts_mut( + self.as_mut_ptr().add(self.len) as *mut MaybeUninit, + self.buf.capacity() - self.len, + ) + } + } + + /// Returns vector content as a slice of `T`, along with the remaining spare + /// capacity of the vector as a slice of `MaybeUninit`. + /// + /// The returned spare capacity slice can be used to fill the vector with data + /// (e.g. by reading from a file) before marking the data as initialized using + /// the [`set_len`] method. + /// + /// [`set_len`]: Vec::set_len + /// + /// Note that this is a low-level API, which should be used with care for + /// optimization purposes. If you need to append data to a `Vec` + /// you can use [`push`], [`extend`], [`extend_from_slice`], + /// [`extend_from_within`], [`insert`], [`append`], [`resize`] or + /// [`resize_with`], depending on your exact needs. + /// + /// [`push`]: Vec::push + /// [`extend`]: Vec::extend + /// [`extend_from_slice`]: Vec::extend_from_slice + /// [`extend_from_within`]: Vec::extend_from_within + /// [`insert`]: Vec::insert + /// [`append`]: Vec::append + /// [`resize`]: Vec::resize + /// [`resize_with`]: Vec::resize_with + /// + /// # Examples + /// + /// ``` + /// #![feature(vec_split_at_spare)] + /// + /// let mut v = vec![1, 1, 2]; + /// + /// // Reserve additional space big enough for 10 elements. + /// v.reserve(10); + /// + /// let (init, uninit) = v.split_at_spare_mut(); + /// let sum = init.iter().copied().sum::(); + /// + /// // Fill in the next 4 elements. + /// uninit[0].write(sum); + /// uninit[1].write(sum * 2); + /// uninit[2].write(sum * 3); + /// uninit[3].write(sum * 4); + /// + /// // Mark the 4 elements of the vector as being initialized. 
+ /// unsafe { + /// let len = v.len(); + /// v.set_len(len + 4); + /// } + /// + /// assert_eq!(&v, &[1, 1, 2, 4, 8, 12, 16]); + /// ``` + #[unstable(feature = "vec_split_at_spare", issue = "81944")] + #[inline] + pub fn split_at_spare_mut(&mut self) -> (&mut [T], &mut [MaybeUninit]) { + // SAFETY: + // - len is ignored and so never changed + let (init, spare, _) = unsafe { self.split_at_spare_mut_with_len() }; + (init, spare) + } + + /// Safety: changing returned .2 (&mut usize) is considered the same as calling `.set_len(_)`. + /// + /// This method provides unique access to all vec parts at once in `extend_from_within`. + unsafe fn split_at_spare_mut_with_len( + &mut self, + ) -> (&mut [T], &mut [MaybeUninit], &mut usize) { + let ptr = self.as_mut_ptr(); + // SAFETY: + // - `ptr` is guaranteed to be valid for `self.len` elements + // - but the allocation extends out to `self.buf.capacity()` elements, possibly + // uninitialized + let spare_ptr = unsafe { ptr.add(self.len) }; + let spare_ptr = spare_ptr.cast::>(); + let spare_len = self.buf.capacity() - self.len; + + // SAFETY: + // - `ptr` is guaranteed to be valid for `self.len` elements + // - `spare_ptr` is pointing one element past the buffer, so it doesn't overlap with `initialized` + unsafe { + let initialized = slice::from_raw_parts_mut(ptr, self.len); + let spare = slice::from_raw_parts_mut(spare_ptr, spare_len); + + (initialized, spare, &mut self.len) + } + } +} + +impl Vec { + /// Resizes the `Vec` in-place so that `len` is equal to `new_len`. + /// + /// If `new_len` is greater than `len`, the `Vec` is extended by the + /// difference, with each additional slot filled with `value`. + /// If `new_len` is less than `len`, the `Vec` is simply truncated. + /// + /// This method requires `T` to implement [`Clone`], + /// in order to be able to clone the passed value. + /// If you need more flexibility (or want to rely on [`Default`] instead of + /// [`Clone`]), use [`Vec::resize_with`]. 
+ /// If you only need to resize to a smaller size, use [`Vec::truncate`]. + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec!["hello"]; + /// vec.resize(3, "world"); + /// assert_eq!(vec, ["hello", "world", "world"]); + /// + /// let mut vec = vec![1, 2, 3, 4]; + /// vec.resize(2, 0); + /// assert_eq!(vec, [1, 2]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "vec_resize", since = "1.5.0")] + pub fn resize(&mut self, new_len: usize, value: T) { + let len = self.len(); + + if new_len > len { + self.extend_with(new_len - len, ExtendElement(value)) + } else { + self.truncate(new_len); + } + } + + /// Clones and appends all elements in a slice to the `Vec`. + /// + /// Iterates over the slice `other`, clones each element, and then appends + /// it to this `Vec`. The `other` slice is traversed in-order. + /// + /// Note that this function is same as [`extend`] except that it is + /// specialized to work with slices instead. If and when Rust gets + /// specialization this function will likely be deprecated (but still + /// available). + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![1]; + /// vec.extend_from_slice(&[2, 3, 4]); + /// assert_eq!(vec, [1, 2, 3, 4]); + /// ``` + /// + /// [`extend`]: Vec::extend + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "vec_extend_from_slice", since = "1.6.0")] + pub fn extend_from_slice(&mut self, other: &[T]) { + self.spec_extend(other.iter()) + } + + /// Copies elements from `src` range to the end of the vector. + /// + /// # Panics + /// + /// Panics if the starting point is greater than the end point or if + /// the end point is greater than the length of the vector. 
+ /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![0, 1, 2, 3, 4]; + /// + /// vec.extend_from_within(2..); + /// assert_eq!(vec, [0, 1, 2, 3, 4, 2, 3, 4]); + /// + /// vec.extend_from_within(..2); + /// assert_eq!(vec, [0, 1, 2, 3, 4, 2, 3, 4, 0, 1]); + /// + /// vec.extend_from_within(4..8); + /// assert_eq!(vec, [0, 1, 2, 3, 4, 2, 3, 4, 0, 1, 4, 2, 3, 4]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[stable(feature = "vec_extend_from_within", since = "1.53.0")] + pub fn extend_from_within(&mut self, src: R) + where + R: RangeBounds, + { + let range = slice::range(src, ..self.len()); + self.reserve(range.len()); + + // SAFETY: + // - `slice::range` guarantees that the given range is valid for indexing self + unsafe { + self.spec_extend_from_within(range); + } + } +} + +// This code generalizes `extend_with_{element,default}`. +trait ExtendWith { + fn next(&mut self) -> T; + fn last(self) -> T; +} + +struct ExtendElement(T); +impl ExtendWith for ExtendElement { + fn next(&mut self) -> T { + self.0.clone() + } + fn last(self) -> T { + self.0 + } +} + +struct ExtendFunc(F); +impl T> ExtendWith for ExtendFunc { + fn next(&mut self) -> T { + (self.0)() + } + fn last(mut self) -> T { + (self.0)() + } +} + +impl Vec { + #[cfg(not(no_global_oom_handling))] + /// Extend the vector by `n` values, using the given generator. + fn extend_with>(&mut self, n: usize, mut value: E) { + self.reserve(n); + + unsafe { + let mut ptr = self.as_mut_ptr().add(self.len()); + // Use SetLenOnDrop to work around bug where compiler + // might not realize the stores through `ptr` and through self.set_len() + // don't alias.
+ let mut local_len = SetLenOnDrop::new(&mut self.len); + + // Write all elements except the last one + for _ in 1..n { + ptr::write(ptr, value.next()); + ptr = ptr.offset(1); + // Increment the length in every step in case next() panics + local_len.increment_len(1); + } + + if n > 0 { + // We can write the last element directly without cloning needlessly + ptr::write(ptr, value.last()); + local_len.increment_len(1); + } + + // len set by scope guard + } + } +} + +impl Vec { + /// Removes consecutive repeated elements in the vector according to the + /// [`PartialEq`] trait implementation. + /// + /// If the vector is sorted, this removes all duplicates. + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![1, 2, 2, 3, 2]; + /// + /// vec.dedup(); + /// + /// assert_eq!(vec, [1, 2, 3, 2]); + /// ``` + #[stable(feature = "rust1", since = "1.0.0")] + #[inline] + pub fn dedup(&mut self) { + self.dedup_by(|a, b| a == b) + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Internal methods and functions +//////////////////////////////////////////////////////////////////////////////// + +#[doc(hidden)] +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +pub fn from_elem(elem: T, n: usize) -> Vec { + ::from_elem(elem, n, Global) +} + +#[doc(hidden)] +#[cfg(not(no_global_oom_handling))] +#[unstable(feature = "allocator_api", issue = "32838")] +pub fn from_elem_in(elem: T, n: usize, alloc: A) -> Vec { + ::from_elem(elem, n, alloc) +} + +trait ExtendFromWithinSpec { + /// # Safety + /// + /// - `src` needs to be valid index + /// - `self.capacity() - self.len()` must be `>= src.len()` + unsafe fn spec_extend_from_within(&mut self, src: Range); +} + +impl ExtendFromWithinSpec for Vec { + default unsafe fn spec_extend_from_within(&mut self, src: Range) { + // SAFETY: + // - len is increased only after initializing elements + let (this, spare, len) = unsafe { self.split_at_spare_mut_with_len() 
}; + + // SAFETY: + // - caller guarantees that src is a valid index + let to_clone = unsafe { this.get_unchecked(src) }; + + iter::zip(to_clone, spare) + .map(|(src, dst)| dst.write(src.clone())) + // Note: + // - Element was just initialized with `MaybeUninit::write`, so it's ok to increase len + // - len is increased after each element to prevent leaks (see issue #82533) + .for_each(|_| *len += 1); + } +} + +impl ExtendFromWithinSpec for Vec { + unsafe fn spec_extend_from_within(&mut self, src: Range) { + let count = src.len(); + { + let (init, spare) = self.split_at_spare_mut(); + + // SAFETY: + // - caller guarantees that `src` is a valid index + let source = unsafe { init.get_unchecked(src) }; + + // SAFETY: + // - Both pointers are created from unique slice references (`&mut [_]`) + // so they are valid and do not overlap. + // - Elements are :Copy so it's OK to copy them, without doing + // anything with the original values + // - `count` is equal to the len of `source`, so source is valid for + // `count` reads + // - `.reserve(count)` guarantees that `spare.len() >= count` so spare + // is valid for `count` writes + unsafe { ptr::copy_nonoverlapping(source.as_ptr(), spare.as_mut_ptr() as _, count) }; + } + + // SAFETY: + // - The elements were just initialized by `copy_nonoverlapping` + self.len += count; + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Common trait implementations for Vec +//////////////////////////////////////////////////////////////////////////////// + +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::Deref for Vec { + type Target = [T]; + + fn deref(&self) -> &[T] { + unsafe { slice::from_raw_parts(self.as_ptr(), self.len) } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl ops::DerefMut for Vec { + fn deref_mut(&mut self) -> &mut [T] { + unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) } + } +} + +#[cfg(not(no_global_oom_handling))] +trait SpecCloneFrom
{ + fn clone_from(this: &mut Self, other: &Self); +} + +#[cfg(not(no_global_oom_handling))] +impl SpecCloneFrom for Vec { + default fn clone_from(this: &mut Self, other: &Self) { + // drop anything that will not be overwritten + this.truncate(other.len()); + + // self.len <= other.len due to the truncate above, so the + // slices here are always in-bounds. + let (init, tail) = other.split_at(this.len()); + + // reuse the contained values' allocations/resources. + this.clone_from_slice(init); + this.extend_from_slice(tail); + } +} + +#[cfg(not(no_global_oom_handling))] +impl SpecCloneFrom for Vec { + fn clone_from(this: &mut Self, other: &Self) { + this.clear(); + this.extend_from_slice(other); + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Clone for Vec { + #[cfg(not(test))] + fn clone(&self) -> Self { + let alloc = self.allocator().clone(); + <[T]>::to_vec_in(&**self, alloc) + } + + // HACK(japaric): with cfg(test) the inherent `[T]::to_vec` method, which is + // required for this method definition, is not available. Instead use the + // `slice::to_vec` function which is only available with cfg(test) + // NB see the slice::hack module in slice.rs for more information + #[cfg(test)] + fn clone(&self) -> Self { + let alloc = self.allocator().clone(); + crate::slice::to_vec(&**self, alloc) + } + + fn clone_from(&mut self, other: &Self) { + SpecCloneFrom::clone_from(self, other) + } +} + +/// The hash of a vector is the same as that of the corresponding slice, +/// as required by the `core::borrow::Borrow` implementation. 
+/// +/// ``` +/// #![feature(build_hasher_simple_hash_one)] +/// use std::hash::BuildHasher; +/// +/// let b = std::collections::hash_map::RandomState::new(); +/// let v: Vec = vec![0xa8, 0x3c, 0x09]; +/// let s: &[u8] = &[0xa8, 0x3c, 0x09]; +/// assert_eq!(b.hash_one(v), b.hash_one(s)); +/// ``` +#[stable(feature = "rust1", since = "1.0.0")] +impl Hash for Vec { + #[inline] + fn hash(&self, state: &mut H) { + Hash::hash(&**self, state) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_on_unimplemented( + message = "vector indices are of type `usize` or ranges of `usize`", + label = "vector indices are of type `usize` or ranges of `usize`" +)] +impl, A: Allocator> Index for Vec { + type Output = I::Output; + + #[inline] + fn index(&self, index: I) -> &Self::Output { + Index::index(&**self, index) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_on_unimplemented( + message = "vector indices are of type `usize` or ranges of `usize`", + label = "vector indices are of type `usize` or ranges of `usize`" +)] +impl, A: Allocator> IndexMut for Vec { + #[inline] + fn index_mut(&mut self, index: I) -> &mut Self::Output { + IndexMut::index_mut(&mut **self, index) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl FromIterator for Vec { + #[inline] + fn from_iter>(iter: I) -> Vec { + >::from_iter(iter.into_iter()) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl IntoIterator for Vec { + type Item = T; + type IntoIter = IntoIter; + + /// Creates a consuming iterator, that is, one that moves each value out of + /// the vector (from start to end). The vector cannot be used after calling + /// this. 
+ /// + /// # Examples + /// + /// ``` + /// let v = vec!["a".to_string(), "b".to_string()]; + /// for s in v.into_iter() { + /// // s has type String, not &String + /// println!("{}", s); + /// } + /// ``` + #[inline] + fn into_iter(self) -> IntoIter { + unsafe { + let mut me = ManuallyDrop::new(self); + let alloc = ptr::read(me.allocator()); + let begin = me.as_mut_ptr(); + let end = if mem::size_of::() == 0 { + arith_offset(begin as *const i8, me.len() as isize) as *const T + } else { + begin.add(me.len()) as *const T + }; + let cap = me.buf.capacity(); + IntoIter { + buf: NonNull::new_unchecked(begin), + phantom: PhantomData, + cap, + alloc, + ptr: begin, + end, + } + } + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a, T, A: Allocator> IntoIterator for &'a Vec { + type Item = &'a T; + type IntoIter = slice::Iter<'a, T>; + + fn into_iter(self) -> slice::Iter<'a, T> { + self.iter() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl<'a, T, A: Allocator> IntoIterator for &'a mut Vec { + type Item = &'a mut T; + type IntoIter = slice::IterMut<'a, T>; + + fn into_iter(self) -> slice::IterMut<'a, T> { + self.iter_mut() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl Extend for Vec { + #[inline] + fn extend>(&mut self, iter: I) { + >::spec_extend(self, iter.into_iter()) + } + + #[inline] + fn extend_one(&mut self, item: T) { + self.push(item); + } + + #[inline] + fn extend_reserve(&mut self, additional: usize) { + self.reserve(additional); + } +} + +impl Vec { + // leaf method to which various SpecFrom/SpecExtend implementations delegate when + // they have no further optimizations to apply + #[cfg(not(no_global_oom_handling))] + fn extend_desugared>(&mut self, mut iterator: I) { + // This is the case for a general iterator. 
+ // + // This function should be the moral equivalent of: + // + // for item in iterator { + // self.push(item); + // } + while let Some(element) = iterator.next() { + let len = self.len(); + if len == self.capacity() { + let (lower, _) = iterator.size_hint(); + self.reserve(lower.saturating_add(1)); + } + unsafe { + ptr::write(self.as_mut_ptr().add(len), element); + // Since next() executes user code which can panic we have to bump the length + // after each step. + // NB can't overflow since we would have had to alloc the address space + self.set_len(len + 1); + } + } + } + + /// Creates a splicing iterator that replaces the specified range in the vector + /// with the given `replace_with` iterator and yields the removed items. + /// `replace_with` does not need to be the same length as `range`. + /// + /// `range` is removed even if the iterator is not consumed until the end. + /// + /// It is unspecified how many elements are removed from the vector + /// if the `Splice` value is leaked. + /// + /// The input iterator `replace_with` is only consumed when the `Splice` value is dropped. + /// + /// This is optimal if: + /// + /// * The tail (elements in the vector after `range`) is empty, + /// * or `replace_with` yields fewer or equal elements than `range`’s length + /// * or the lower bound of its `size_hint()` is exact. + /// + /// Otherwise, a temporary vector is allocated and the tail is moved twice. + /// + /// # Panics + /// + /// Panics if the starting point is greater than the end point or if + /// the end point is greater than the length of the vector. 
+ /// + /// # Examples + /// + /// ``` + /// let mut v = vec![1, 2, 3, 4]; + /// let new = [7, 8, 9]; + /// let u: Vec<_> = v.splice(1..3, new).collect(); + /// assert_eq!(v, &[1, 7, 8, 9, 4]); + /// assert_eq!(u, &[2, 3]); + /// ``` + #[cfg(not(no_global_oom_handling))] + #[inline] + #[stable(feature = "vec_splice", since = "1.21.0")] + pub fn splice(&mut self, range: R, replace_with: I) -> Splice<'_, I::IntoIter, A> + where + R: RangeBounds, + I: IntoIterator, + { + Splice { drain: self.drain(range), replace_with: replace_with.into_iter() } + } + + /// Creates an iterator which uses a closure to determine if an element should be removed. + /// + /// If the closure returns true, then the element is removed and yielded. + /// If the closure returns false, the element will remain in the vector and will not be yielded + /// by the iterator. + /// + /// Using this method is equivalent to the following code: + /// + /// ``` + /// # let some_predicate = |x: &mut i32| { *x == 2 || *x == 3 || *x == 6 }; + /// # let mut vec = vec![1, 2, 3, 4, 5, 6]; + /// let mut i = 0; + /// while i < vec.len() { + /// if some_predicate(&mut vec[i]) { + /// let val = vec.remove(i); + /// // your code here + /// } else { + /// i += 1; + /// } + /// } + /// + /// # assert_eq!(vec, vec![1, 4, 5]); + /// ``` + /// + /// But `drain_filter` is easier to use. `drain_filter` is also more efficient, + /// because it can backshift the elements of the array in bulk. + /// + /// Note that `drain_filter` also lets you mutate every element in the filter closure, + /// regardless of whether you choose to keep or remove it. 
+ /// + /// # Examples + /// + /// Splitting an array into evens and odds, reusing the original allocation: + /// + /// ``` + /// #![feature(drain_filter)] + /// let mut numbers = vec![1, 2, 3, 4, 5, 6, 8, 9, 11, 13, 14, 15]; + /// + /// let evens = numbers.drain_filter(|x| *x % 2 == 0).collect::>(); + /// let odds = numbers; + /// + /// assert_eq!(evens, vec![2, 4, 6, 8, 14]); + /// assert_eq!(odds, vec![1, 3, 5, 9, 11, 13, 15]); + /// ``` + #[unstable(feature = "drain_filter", reason = "recently added", issue = "43244")] + pub fn drain_filter(&mut self, filter: F) -> DrainFilter<'_, T, F, A> + where + F: FnMut(&mut T) -> bool, + { + let old_len = self.len(); + + // Guard against us getting leaked (leak amplification) + unsafe { + self.set_len(0); + } + + DrainFilter { vec: self, idx: 0, del: 0, old_len, pred: filter, panic_flag: false } + } +} + +/// Extend implementation that copies elements out of references before pushing them onto the Vec. +/// +/// This implementation is specialized for slice iterators, where it uses [`copy_from_slice`] to +/// append the entire slice at once. +/// +/// [`copy_from_slice`]: slice::copy_from_slice +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "extend_ref", since = "1.2.0")] +impl<'a, T: Copy + 'a, A: Allocator + 'a> Extend<&'a T> for Vec { + fn extend>(&mut self, iter: I) { + self.spec_extend(iter.into_iter()) + } + + #[inline] + fn extend_one(&mut self, &item: &'a T) { + self.push(item); + } + + #[inline] + fn extend_reserve(&mut self, additional: usize) { + self.reserve(additional); + } +} + +/// Implements comparison of vectors, [lexicographically](core::cmp::Ord#lexicographical-comparison). 
+#[stable(feature = "rust1", since = "1.0.0")] +impl PartialOrd for Vec { + #[inline] + fn partial_cmp(&self, other: &Self) -> Option { + PartialOrd::partial_cmp(&**self, &**other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl Eq for Vec {} + +/// Implements ordering of vectors, [lexicographically](core::cmp::Ord#lexicographical-comparison). +#[stable(feature = "rust1", since = "1.0.0")] +impl Ord for Vec { + #[inline] + fn cmp(&self, other: &Self) -> Ordering { + Ord::cmp(&**self, &**other) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +unsafe impl<#[may_dangle] T, A: Allocator> Drop for Vec { + fn drop(&mut self) { + unsafe { + // use drop for [T] + // use a raw slice to refer to the elements of the vector as weakest necessary type; + // could avoid questions of validity in certain cases + ptr::drop_in_place(ptr::slice_from_raw_parts_mut(self.as_mut_ptr(), self.len)) + } + // RawVec handles deallocation + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +#[rustc_const_unstable(feature = "const_default_impls", issue = "87864")] +impl const Default for Vec { + /// Creates an empty `Vec`. 
+ fn default() -> Vec { + Vec::new() + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl fmt::Debug for Vec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&**self, f) + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef> for Vec { + fn as_ref(&self) -> &Vec { + self + } +} + +#[stable(feature = "vec_as_mut", since = "1.5.0")] +impl AsMut> for Vec { + fn as_mut(&mut self) -> &mut Vec { + self + } +} + +#[stable(feature = "rust1", since = "1.0.0")] +impl AsRef<[T]> for Vec { + fn as_ref(&self) -> &[T] { + self + } +} + +#[stable(feature = "vec_as_mut", since = "1.5.0")] +impl AsMut<[T]> for Vec { + fn as_mut(&mut self) -> &mut [T] { + self + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl From<&[T]> for Vec { + /// Allocate a `Vec` and fill it by cloning `s`'s items. + /// + /// # Examples + /// + /// ``` + /// assert_eq!(Vec::from(&[1, 2, 3][..]), vec![1, 2, 3]); + /// ``` + #[cfg(not(test))] + fn from(s: &[T]) -> Vec { + s.to_vec() + } + #[cfg(test)] + fn from(s: &[T]) -> Vec { + crate::slice::to_vec(s, Global) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "vec_from_mut", since = "1.19.0")] +impl From<&mut [T]> for Vec { + /// Allocate a `Vec` and fill it by cloning `s`'s items. + /// + /// # Examples + /// + /// ``` + /// assert_eq!(Vec::from(&mut [1, 2, 3][..]), vec![1, 2, 3]); + /// ``` + #[cfg(not(test))] + fn from(s: &mut [T]) -> Vec { + s.to_vec() + } + #[cfg(test)] + fn from(s: &mut [T]) -> Vec { + crate::slice::to_vec(s, Global) + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "vec_from_array", since = "1.44.0")] +impl From<[T; N]> for Vec { + /// Allocate a `Vec` and move `s`'s items into it. 
+ /// + /// # Examples + /// + /// ``` + /// assert_eq!(Vec::from([1, 2, 3]), vec![1, 2, 3]); + /// ``` + #[cfg(not(test))] + fn from(s: [T; N]) -> Vec { + <[T]>::into_vec(box s) + } + + #[cfg(test)] + fn from(s: [T; N]) -> Vec { + crate::slice::into_vec(box s) + } +} + +#[stable(feature = "vec_from_cow_slice", since = "1.14.0")] +impl<'a, T> From> for Vec +where + [T]: ToOwned>, +{ + /// Convert a clone-on-write slice into a vector. + /// + /// If `s` already owns a `Vec`, it will be returned directly. + /// If `s` is borrowing a slice, a new `Vec` will be allocated and + /// filled by cloning `s`'s items into it. + /// + /// # Examples + /// + /// ``` + /// # use std::borrow::Cow; + /// let o: Cow<[i32]> = Cow::Owned(vec![1, 2, 3]); + /// let b: Cow<[i32]> = Cow::Borrowed(&[1, 2, 3]); + /// assert_eq!(Vec::from(o), Vec::from(b)); + /// ``` + fn from(s: Cow<'a, [T]>) -> Vec { + s.into_owned() + } +} + +// note: test pulls in libstd, which causes errors here +#[cfg(not(test))] +#[stable(feature = "vec_from_box", since = "1.18.0")] +impl From> for Vec { + /// Convert a boxed slice into a vector by transferring ownership of + /// the existing heap allocation. + /// + /// # Examples + /// + /// ``` + /// let b: Box<[i32]> = vec![1, 2, 3].into_boxed_slice(); + /// assert_eq!(Vec::from(b), vec![1, 2, 3]); + /// ``` + fn from(s: Box<[T], A>) -> Self { + s.into_vec() + } +} + +// note: test pulls in libstd, which causes errors here +#[cfg(not(no_global_oom_handling))] +#[cfg(not(test))] +#[stable(feature = "box_from_vec", since = "1.20.0")] +impl From> for Box<[T], A> { + /// Convert a vector into a boxed slice. + /// + /// If `v` has excess capacity, its items will be moved into a + /// newly-allocated buffer with exactly the right capacity. 
+ /// + /// # Examples + /// + /// ``` + /// assert_eq!(Box::from(vec![1, 2, 3]), vec![1, 2, 3].into_boxed_slice()); + /// ``` + fn from(v: Vec) -> Self { + v.into_boxed_slice() + } +} + +#[cfg(not(no_global_oom_handling))] +#[stable(feature = "rust1", since = "1.0.0")] +impl From<&str> for Vec { + /// Allocate a `Vec` and fill it with a UTF-8 string. + /// + /// # Examples + /// + /// ``` + /// assert_eq!(Vec::from("123"), vec![b'1', b'2', b'3']); + /// ``` + fn from(s: &str) -> Vec { + From::from(s.as_bytes()) + } +} + +#[stable(feature = "array_try_from_vec", since = "1.48.0")] +impl TryFrom> for [T; N] { + type Error = Vec; + + /// Gets the entire contents of the `Vec` as an array, + /// if its size exactly matches that of the requested array. + /// + /// # Examples + /// + /// ``` + /// assert_eq!(vec![1, 2, 3].try_into(), Ok([1, 2, 3])); + /// assert_eq!(>::new().try_into(), Ok([])); + /// ``` + /// + /// If the length doesn't match, the input comes back in `Err`: + /// ``` + /// let r: Result<[i32; 4], _> = (0..10).collect::>().try_into(); + /// assert_eq!(r, Err(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9])); + /// ``` + /// + /// If you're fine with just getting a prefix of the `Vec`, + /// you can call [`.truncate(N)`](Vec::truncate) first. + /// ``` + /// let mut v = String::from("hello world").into_bytes(); + /// v.sort(); + /// v.truncate(2); + /// let [a, b]: [_; 2] = v.try_into().unwrap(); + /// assert_eq!(a, b' '); + /// assert_eq!(b, b'd'); + /// ``` + fn try_from(mut vec: Vec) -> Result<[T; N], Vec> { + if vec.len() != N { + return Err(vec); + } + + // SAFETY: `.set_len(0)` is always sound. + unsafe { vec.set_len(0) }; + + // SAFETY: A `Vec`'s pointer is always aligned properly, and + // the alignment the array needs is the same as the items. + // We checked earlier that we have sufficient items. + // The items will not double-drop as the `set_len` + // tells the `Vec` not to also drop them. 
+ let array = unsafe { ptr::read(vec.as_ptr() as *const [T; N]) }; + Ok(array) + } +} diff --git a/rust/alloc/vec/partial_eq.rs b/rust/alloc/vec/partial_eq.rs new file mode 100644 index 00000000000000..50e1409610507e --- /dev/null +++ b/rust/alloc/vec/partial_eq.rs @@ -0,0 +1,47 @@ +use crate::alloc::Allocator; +#[cfg(not(no_global_oom_handling))] +use crate::borrow::Cow; + +use super::Vec; + +macro_rules! __impl_slice_eq1 { + ([$($vars:tt)*] $lhs:ty, $rhs:ty $(where $ty:ty: $bound:ident)?, #[$stability:meta]) => { + #[$stability] + impl PartialEq<$rhs> for $lhs + where + T: PartialEq, + $($ty: $bound)? + { + #[inline] + fn eq(&self, other: &$rhs) -> bool { self[..] == other[..] } + #[inline] + fn ne(&self, other: &$rhs) -> bool { self[..] != other[..] } + } + } +} + +__impl_slice_eq1! { [A: Allocator] Vec, Vec, #[stable(feature = "rust1", since = "1.0.0")] } +__impl_slice_eq1! { [A: Allocator] Vec, &[U], #[stable(feature = "rust1", since = "1.0.0")] } +__impl_slice_eq1! { [A: Allocator] Vec, &mut [U], #[stable(feature = "rust1", since = "1.0.0")] } +__impl_slice_eq1! { [A: Allocator] &[T], Vec, #[stable(feature = "partialeq_vec_for_ref_slice", since = "1.46.0")] } +__impl_slice_eq1! { [A: Allocator] &mut [T], Vec, #[stable(feature = "partialeq_vec_for_ref_slice", since = "1.46.0")] } +__impl_slice_eq1! { [A: Allocator] Vec, [U], #[stable(feature = "partialeq_vec_for_slice", since = "1.48.0")] } +__impl_slice_eq1! { [A: Allocator] [T], Vec, #[stable(feature = "partialeq_vec_for_slice", since = "1.48.0")] } +#[cfg(not(no_global_oom_handling))] +__impl_slice_eq1! { [A: Allocator] Cow<'_, [T]>, Vec where T: Clone, #[stable(feature = "rust1", since = "1.0.0")] } +#[cfg(not(no_global_oom_handling))] +__impl_slice_eq1! { [] Cow<'_, [T]>, &[U] where T: Clone, #[stable(feature = "rust1", since = "1.0.0")] } +#[cfg(not(no_global_oom_handling))] +__impl_slice_eq1! { [] Cow<'_, [T]>, &mut [U] where T: Clone, #[stable(feature = "rust1", since = "1.0.0")] } +__impl_slice_eq1! 
{ [A: Allocator, const N: usize] Vec, [U; N], #[stable(feature = "rust1", since = "1.0.0")] } +__impl_slice_eq1! { [A: Allocator, const N: usize] Vec, &[U; N], #[stable(feature = "rust1", since = "1.0.0")] } + +// NOTE: some less important impls are omitted to reduce code bloat +// FIXME(Centril): Reconsider this? +//__impl_slice_eq1! { [const N: usize] Vec, &mut [B; N], } +//__impl_slice_eq1! { [const N: usize] [A; N], Vec, } +//__impl_slice_eq1! { [const N: usize] &[A; N], Vec, } +//__impl_slice_eq1! { [const N: usize] &mut [A; N], Vec, } +//__impl_slice_eq1! { [const N: usize] Cow<'a, [A]>, [B; N], } +//__impl_slice_eq1! { [const N: usize] Cow<'a, [A]>, &[B; N], } +//__impl_slice_eq1! { [const N: usize] Cow<'a, [A]>, &mut [B; N], } diff --git a/rust/alloc/vec/set_len_on_drop.rs b/rust/alloc/vec/set_len_on_drop.rs new file mode 100644 index 00000000000000..8b66bc81212969 --- /dev/null +++ b/rust/alloc/vec/set_len_on_drop.rs @@ -0,0 +1,28 @@ +// Set the length of the vec when the `SetLenOnDrop` value goes out of scope. +// +// The idea is: The length field in SetLenOnDrop is a local variable +// that the optimizer will see does not alias with any stores through the Vec's data +// pointer. 
This is a workaround for alias analysis issue #32155 +pub(super) struct SetLenOnDrop<'a> { + len: &'a mut usize, + local_len: usize, +} + +impl<'a> SetLenOnDrop<'a> { + #[inline] + pub(super) fn new(len: &'a mut usize) -> Self { + SetLenOnDrop { local_len: *len, len } + } + + #[inline] + pub(super) fn increment_len(&mut self, increment: usize) { + self.local_len += increment; + } +} + +impl Drop for SetLenOnDrop<'_> { + #[inline] + fn drop(&mut self) { + *self.len = self.local_len; + } +} diff --git a/rust/alloc/vec/spec_extend.rs b/rust/alloc/vec/spec_extend.rs new file mode 100644 index 00000000000000..c3b4534096de5f --- /dev/null +++ b/rust/alloc/vec/spec_extend.rs @@ -0,0 +1,87 @@ +use crate::alloc::Allocator; +use core::iter::TrustedLen; +use core::ptr::{self}; +use core::slice::{self}; + +use super::{IntoIter, SetLenOnDrop, Vec}; + +// Specialization trait used for Vec::extend +pub(super) trait SpecExtend { + fn spec_extend(&mut self, iter: I); +} + +impl SpecExtend for Vec +where + I: Iterator, +{ + default fn spec_extend(&mut self, iter: I) { + self.extend_desugared(iter) + } +} + +impl SpecExtend for Vec +where + I: TrustedLen, +{ + default fn spec_extend(&mut self, iterator: I) { + // This is the case for a TrustedLen iterator. + let (low, high) = iterator.size_hint(); + if let Some(additional) = high { + debug_assert_eq!( + low, + additional, + "TrustedLen iterator's size hint is not exact: {:?}", + (low, high) + ); + self.reserve(additional); + unsafe { + let mut ptr = self.as_mut_ptr().add(self.len()); + let mut local_len = SetLenOnDrop::new(&mut self.len); + iterator.for_each(move |element| { + ptr::write(ptr, element); + ptr = ptr.offset(1); + // Since the loop executes user code which can panic we have to bump the pointer + // after each step. 
+ // NB can't overflow since we would have had to alloc the address space + local_len.increment_len(1); + }); + } + } else { + // Per TrustedLen contract a `None` upper bound means that the iterator length + // truly exceeds usize::MAX, which would eventually lead to a capacity overflow anyway. + // Since the other branch already panics eagerly (via `reserve()`) we do the same here. + // This avoids additional codegen for a fallback code path which would eventually + // panic anyway. + panic!("capacity overflow"); + } + } +} + +impl SpecExtend> for Vec { + fn spec_extend(&mut self, mut iterator: IntoIter) { + unsafe { + self.append_elements(iterator.as_slice() as _); + } + iterator.ptr = iterator.end; + } +} + +impl<'a, T: 'a, I, A: Allocator + 'a> SpecExtend<&'a T, I> for Vec +where + I: Iterator, + T: Clone, +{ + default fn spec_extend(&mut self, iterator: I) { + self.spec_extend(iterator.cloned()) + } +} + +impl<'a, T: 'a, A: Allocator + 'a> SpecExtend<&'a T, slice::Iter<'a, T>> for Vec +where + T: Copy, +{ + fn spec_extend(&mut self, iterator: slice::Iter<'a, T>) { + let slice = iterator.as_slice(); + unsafe { self.append_elements(slice) }; + } +} From 10fca43737603ab46add31681038b9012f8824fe Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 17:02:21 +0200 Subject: [PATCH 0022/1250] rust: adapt `alloc` crate to the kernel This customizes the subset of the Rust standard library `alloc` that was just imported as-is, mainly by: - Adding SPDX license identifiers. - Skipping `rc` and `sync` modules via new `cfg`s. - Skipping the `vec!` macro. - Adding fallible (`try_*`) versions of existing infallible methods (i.e. returning a `Result` instead of panicking). Since the standard library requires stable/unstable attributes, these additions are annotated with: #[stable(feature = "kernel", since = "1.0.0")] Using "kernel" as the feature allows the additions to be clearly marked. The "1.0.0" version is just a placeholder.
Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Matthew Bakhtiari Signed-off-by: Matthew Bakhtiari Signed-off-by: Miguel Ojeda --- rust/alloc/README.md | 33 ++++ rust/alloc/alloc.rs | 2 + rust/alloc/borrow.rs | 2 + rust/alloc/boxed.rs | 2 + rust/alloc/collections/mod.rs | 2 + rust/alloc/fmt.rs | 2 + rust/alloc/lib.rs | 5 +- rust/alloc/macros.rs | 4 +- rust/alloc/raw_vec.rs | 50 ++++- rust/alloc/slice.rs | 93 ++++++++- rust/alloc/str.rs | 19 ++ rust/alloc/string.rs | 6 +- rust/alloc/vec/drain.rs | 2 + rust/alloc/vec/drain_filter.rs | 2 + rust/alloc/vec/into_iter.rs | 2 + rust/alloc/vec/is_zero.rs | 2 + rust/alloc/vec/mod.rs | 313 +++++++++++++++++++++++++++++- rust/alloc/vec/partial_eq.rs | 2 + rust/alloc/vec/set_len_on_drop.rs | 2 + rust/alloc/vec/spec_extend.rs | 87 +++++++++ 20 files changed, 623 insertions(+), 9 deletions(-) create mode 100644 rust/alloc/README.md diff --git a/rust/alloc/README.md b/rust/alloc/README.md new file mode 100644 index 00000000000000..9cf0d074c82a6a --- /dev/null +++ b/rust/alloc/README.md @@ -0,0 +1,33 @@ +# `alloc` + +These source files come from the Rust standard library, hosted in +the https://github.com/rust-lang/rust repository, licensed under +"Apache-2.0 OR MIT" and adapted for kernel use. For copyright details, +see https://github.com/rust-lang/rust/blob/master/COPYRIGHT. + +Please note that these files should be kept as close as possible to +upstream. In general, only additions should be performed (e.g. new +methods). Eventually, changes should make it into upstream so that, +at some point, this fork can be dropped from the kernel tree. + + +## Rationale + +On one hand, kernel folks wanted to keep `alloc` in-tree to have more +freedom in both workflow and actual features if actually needed +(e.g. receiver types if we ended up using them), which is reasonable. 
+ +On the other hand, Rust folks wanted to keep `alloc` as close to +upstream as possible and avoid as much divergence as possible, which +is also reasonable. + +We agreed on a middle ground: we would keep a subset of `alloc` +in-tree that would be as small and as close as possible to upstream. +Then, upstream can start adding the functions that we add to `alloc` +etc., until we reach a point where the kernel already knows exactly +what it needs in `alloc` and all the new methods are merged into +upstream, so that we can drop `alloc` from the kernel tree and go back +to using the upstream one. + +By doing this, the kernel can go a bit faster now, and Rust can +slowly incorporate and discuss the changes as needed. diff --git a/rust/alloc/alloc.rs b/rust/alloc/alloc.rs index 9d4f9af91a5e19..cea3b747673fd6 100644 --- a/rust/alloc/alloc.rs +++ b/rust/alloc/alloc.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! Memory allocation APIs #![stable(feature = "alloc_module", since = "1.28.0")] diff --git a/rust/alloc/borrow.rs b/rust/alloc/borrow.rs index 63234ee91f0910..8e1d8a76464136 100644 --- a/rust/alloc/borrow.rs +++ b/rust/alloc/borrow.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! A module for working with borrowed data. #![stable(feature = "rust1", since = "1.0.0")] diff --git a/rust/alloc/boxed.rs b/rust/alloc/boxed.rs index f753189c683088..921fcef75e4b9b 100644 --- a/rust/alloc/boxed.rs +++ b/rust/alloc/boxed.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! A pointer type for heap allocation. //! //! [`Box`], casually referred to as a 'box', provides the simplest form of diff --git a/rust/alloc/collections/mod.rs b/rust/alloc/collections/mod.rs index 628a5b155673c9..1eec265b28f807 100644 --- a/rust/alloc/collections/mod.rs +++ b/rust/alloc/collections/mod.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! Collection types.
#![stable(feature = "rust1", since = "1.0.0")] diff --git a/rust/alloc/fmt.rs b/rust/alloc/fmt.rs index aeb7554f8e914e..be75e6637442e3 100644 --- a/rust/alloc/fmt.rs +++ b/rust/alloc/fmt.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! Utilities for formatting and printing `String`s. //! //! This module contains the runtime support for the [`format!`] syntax extension. diff --git a/rust/alloc/lib.rs b/rust/alloc/lib.rs index 6da32df57efb76..085dc005170a05 100644 --- a/rust/alloc/lib.rs +++ b/rust/alloc/lib.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! # The Rust core allocation and collections library //! //! This library provides smart pointers and collections for managing @@ -204,11 +206,12 @@ mod boxed { pub mod borrow; pub mod collections; pub mod fmt; +#[cfg(not(no_rc))] pub mod rc; pub mod slice; pub mod str; pub mod string; -#[cfg(target_has_atomic = "ptr")] +#[cfg(all(not(no_sync), target_has_atomic = "ptr"))] pub mod sync; #[cfg(all(not(no_global_oom_handling), target_has_atomic = "ptr"))] pub mod task; diff --git a/rust/alloc/macros.rs b/rust/alloc/macros.rs index d3e9e65c3fe57b..47ebcd5277d16c 100644 --- a/rust/alloc/macros.rs +++ b/rust/alloc/macros.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + /// Creates a [`Vec`] containing the arguments. /// /// `vec!` allows `Vec`s to be defined with the same syntax as array expressions. @@ -34,7 +36,7 @@ /// be mindful of side effects. 
/// /// [`Vec`]: crate::vec::Vec -#[cfg(not(test))] +#[cfg(all(not(no_global_oom_handling), not(test)))] #[macro_export] #[stable(feature = "rust1", since = "1.0.0")] #[rustc_diagnostic_item = "vec_macro"] diff --git a/rust/alloc/raw_vec.rs b/rust/alloc/raw_vec.rs index 8fa0242ca9a9f0..018c4657f58047 100644 --- a/rust/alloc/raw_vec.rs +++ b/rust/alloc/raw_vec.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + #![unstable(feature = "raw_vec_internals", reason = "unstable const warnings", issue = "none")] use core::alloc::LayoutError; @@ -18,10 +20,10 @@ use crate::collections::TryReserveErrorKind::*; #[cfg(test)] mod tests; -#[cfg(not(no_global_oom_handling))] enum AllocInit { /// The contents of the new memory are uninitialized. Uninitialized, + #[allow(dead_code)] /// The new memory is guaranteed to be zeroed. Zeroed, } @@ -132,6 +134,13 @@ impl RawVec { Self::allocate_in(capacity, AllocInit::Uninitialized, alloc) } + /// Like `try_with_capacity`, but parameterized over the choice of + /// allocator for the returned `RawVec`. + #[inline] + pub fn try_with_capacity_in(capacity: usize, alloc: A) -> Result { + Self::try_allocate_in(capacity, AllocInit::Uninitialized, alloc) + } + /// Like `with_capacity_zeroed`, but parameterized over the choice /// of allocator for the returned `RawVec`. #[cfg(not(no_global_oom_handling))] @@ -201,6 +210,29 @@ impl RawVec { } } + fn try_allocate_in(capacity: usize, init: AllocInit, alloc: A) -> Result { + if mem::size_of::() == 0 { + return Ok(Self::new_in(alloc)); + } + + let layout = Layout::array::(capacity).map_err(|_| CapacityOverflow)?; + alloc_guard(layout.size())?; + let result = match init { + AllocInit::Uninitialized => alloc.allocate(layout), + AllocInit::Zeroed => alloc.allocate_zeroed(layout), + }; + let ptr = result.map_err(|_| AllocError { layout, non_exhaustive: () })?; + + // Allocators currently return a `NonNull<[u8]>` whose length + // matches the size requested. 
If that ever changes, the capacity + // here should change to `ptr.len() / mem::size_of::()`. + Ok(Self { + ptr: unsafe { Unique::new_unchecked(ptr.cast().as_ptr()) }, + cap: capacity, + alloc, + }) + } + /// Reconstitutes a `RawVec` from a pointer, capacity, and allocator. /// /// # Safety @@ -309,6 +341,12 @@ impl RawVec { } } + /// The same as `reserve_for_push`, but returns on errors instead of panicking or aborting. + #[inline(never)] + pub fn try_reserve_for_push(&mut self, len: usize) -> Result<(), TryReserveError> { + self.grow_amortized(len, 1) + } + /// Ensures that the buffer contains at least enough space to hold `len + /// additional` elements. If it doesn't already, will reallocate the /// minimum possible amount of memory necessary. Generally this will be @@ -354,6 +392,16 @@ impl RawVec { pub fn shrink_to_fit(&mut self, cap: usize) { handle_reserve(self.shrink(cap)); } + + /// Tries to shrink the buffer down to the specified capacity. If the given amount + /// is 0, actually completely deallocates. + /// + /// # Panics + /// + /// Panics if the given amount is *larger* than the current capacity. + pub fn try_shrink_to_fit(&mut self, cap: usize) -> Result<(), TryReserveError> { + self.shrink(cap) + } } impl RawVec { diff --git a/rust/alloc/slice.rs b/rust/alloc/slice.rs index f0397d08f95a8f..8cb5170f639d61 100644 --- a/rust/alloc/slice.rs +++ b/rust/alloc/slice.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! A dynamically-sized view into a contiguous sequence, `[T]`. //! //! 
*[See also the slice primitive type](slice).* @@ -93,11 +95,11 @@ use core::mem::size_of; use core::ptr; use crate::alloc::Allocator; -#[cfg(not(no_global_oom_handling))] use crate::alloc::Global; #[cfg(not(no_global_oom_handling))] use crate::borrow::ToOwned; use crate::boxed::Box; +use crate::collections::TryReserveError; use crate::vec::Vec; #[unstable(feature = "slice_range", issue = "76393")] @@ -157,6 +159,7 @@ mod hack { use core::alloc::Allocator; use crate::boxed::Box; + use crate::collections::TryReserveError; use crate::vec::Vec; // We shouldn't add inline attribute to this since this is used in @@ -176,6 +179,11 @@ mod hack { T::to_vec(s, alloc) } + #[inline] + pub fn try_to_vec(s: &[T], alloc: A) -> Result, TryReserveError> { + T::try_to_vec(s, alloc) + } + #[cfg(not(no_global_oom_handling))] pub trait ConvertVec { fn to_vec(s: &[Self], alloc: A) -> Vec @@ -183,6 +191,12 @@ mod hack { Self: Sized; } + pub trait TryConvertVec { + fn try_to_vec(s: &[Self], alloc: A) -> Result, TryReserveError> + where + Self: Sized; + } + #[cfg(not(no_global_oom_handling))] impl ConvertVec for T { #[inline] @@ -235,6 +249,42 @@ mod hack { v } } + + impl TryConvertVec for T { + #[inline] + default fn try_to_vec(s: &[Self], alloc: A) -> Result, TryReserveError> { + struct DropGuard<'a, T, A: Allocator> { + vec: &'a mut Vec, + num_init: usize, + } + impl<'a, T, A: Allocator> Drop for DropGuard<'a, T, A> { + #[inline] + fn drop(&mut self) { + // SAFETY: + // items were marked initialized in the loop below + unsafe { + self.vec.set_len(self.num_init); + } + } + } + let mut vec = Vec::try_with_capacity_in(s.len(), alloc)?; + let mut guard = DropGuard { vec: &mut vec, num_init: 0 }; + let slots = guard.vec.spare_capacity_mut(); + // .take(slots.len()) is necessary for LLVM to remove bounds checks + // and has better codegen than zip. 
+ for (i, b) in s.iter().enumerate().take(slots.len()) { + guard.num_init = i; + slots[i].write(b.clone()); + } + core::mem::forget(guard); + // SAFETY: + // the vec was allocated and initialized above to at least this length. + unsafe { + vec.set_len(s.len()); + } + Ok(vec) + } + } } #[lang = "slice_alloc"] @@ -477,6 +527,24 @@ impl [T] { self.to_vec_in(Global) } + /// Tries to copy `self` into a new `Vec`. + /// + /// # Examples + /// + /// ``` + /// let s = [10, 40, 30]; + /// let x = s.try_to_vec().unwrap(); + /// // Here, `s` and `x` can be modified independently. + /// ``` + #[inline] + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_to_vec(&self) -> Result, TryReserveError> + where + T: Clone, + { + self.try_to_vec_in(Global) + } + /// Copies `self` into a new `Vec` with an allocator. /// /// # Examples @@ -501,6 +569,29 @@ impl [T] { hack::to_vec(self, alloc) } + /// Tries to copy `self` into a new `Vec` with an allocator. + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api)] + /// + /// use std::alloc::System; + /// + /// let s = [10, 40, 30]; + /// let x = s.try_to_vec_in(System).unwrap(); + /// // Here, `s` and `x` can be modified independently. + /// ``` + #[inline] + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_to_vec_in(&self, alloc: A) -> Result, TryReserveError> + where + T: Clone, + { + // N.B., see the `hack` module in this file for more details. + hack::try_to_vec(self, alloc) + } + /// Converts `self` into a vector without clones or allocation. /// /// The resulting vector can be converted back into a box via diff --git a/rust/alloc/str.rs b/rust/alloc/str.rs index 69495f31c32ca4..a1370c2d674c34 100644 --- a/rust/alloc/str.rs +++ b/rust/alloc/str.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! Unicode string slices. //! //! 
*[See also the `str` primitive type](str).* @@ -36,6 +38,7 @@ use core::unicode::conversions; use crate::borrow::ToOwned; use crate::boxed::Box; +use crate::collections::TryReserveError; use crate::slice::{Concat, Join, SliceIndex}; use crate::string::String; use crate::vec::Vec; @@ -590,6 +593,22 @@ impl str { // make_ascii_lowercase() preserves the UTF-8 invariant. unsafe { String::from_utf8_unchecked(bytes) } } + + /// Tries to create a `String`. + /// + /// # Examples + /// + /// Basic usage: + /// + /// ``` + /// let s: &str = "a"; + /// let ss: String = s.try_to_owned().unwrap(); + /// ``` + #[inline] + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_to_owned(&self) -> Result { + unsafe { Ok(String::from_utf8_unchecked(self.as_bytes().try_to_vec()?)) } + } } /// Converts a boxed slice of bytes to a boxed string slice without checking diff --git a/rust/alloc/string.rs b/rust/alloc/string.rs index 716bb4983a651f..5b9780cfc11f75 100644 --- a/rust/alloc/string.rs +++ b/rust/alloc/string.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! A UTF-8–encoded, growable string. //! //! 
This module contains the [`String`] type, the [`ToString`] trait for @@ -47,8 +49,8 @@ use core::char::{decode_utf16, REPLACEMENT_CHARACTER}; use core::fmt; use core::hash; #[cfg(not(no_global_oom_handling))] -use core::iter::FromIterator; -use core::iter::{from_fn, FusedIterator}; +use core::iter::{from_fn, FromIterator}; +use core::iter::FusedIterator; #[cfg(not(no_global_oom_handling))] use core::ops::Add; #[cfg(not(no_global_oom_handling))] diff --git a/rust/alloc/vec/drain.rs b/rust/alloc/vec/drain.rs index 1bff19d05c10d3..8116ba6dfd9ec7 100644 --- a/rust/alloc/vec/drain.rs +++ b/rust/alloc/vec/drain.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + use crate::alloc::{Allocator, Global}; use core::fmt; use core::iter::{FusedIterator, TrustedLen}; diff --git a/rust/alloc/vec/drain_filter.rs b/rust/alloc/vec/drain_filter.rs index 3c37c92ae44b0c..b04fce041622f3 100644 --- a/rust/alloc/vec/drain_filter.rs +++ b/rust/alloc/vec/drain_filter.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + use crate::alloc::{Allocator, Global}; use core::ptr::{self}; use core::slice::{self}; diff --git a/rust/alloc/vec/into_iter.rs b/rust/alloc/vec/into_iter.rs index f985fb78465b9a..11d01ac868ca9c 100644 --- a/rust/alloc/vec/into_iter.rs +++ b/rust/alloc/vec/into_iter.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + use crate::alloc::{Allocator, Global}; use crate::raw_vec::RawVec; use core::fmt; diff --git a/rust/alloc/vec/is_zero.rs b/rust/alloc/vec/is_zero.rs index 0efc4893c3c428..40e1e667c9fb3c 100644 --- a/rust/alloc/vec/is_zero.rs +++ b/rust/alloc/vec/is_zero.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + use crate::boxed::Box; #[rustc_specialization_trait] diff --git a/rust/alloc/vec/mod.rs b/rust/alloc/vec/mod.rs index c29aa0fec5b87f..a2b1dfca35614b 100644 --- a/rust/alloc/vec/mod.rs +++ b/rust/alloc/vec/mod.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + //! 
A contiguous growable array type with heap-allocated contents, written //! `Vec`. //! @@ -118,10 +120,8 @@ use self::spec_from_elem::SpecFromElem; #[cfg(not(no_global_oom_handling))] mod spec_from_elem; -#[cfg(not(no_global_oom_handling))] use self::set_len_on_drop::SetLenOnDrop; -#[cfg(not(no_global_oom_handling))] mod set_len_on_drop; #[cfg(not(no_global_oom_handling))] @@ -145,7 +145,8 @@ mod spec_from_iter; #[cfg(not(no_global_oom_handling))] use self::spec_extend::SpecExtend; -#[cfg(not(no_global_oom_handling))] +use self::spec_extend::TrySpecExtend; + mod spec_extend; /// A contiguous growable array type, written as `Vec`, short for 'vector'. @@ -470,6 +471,49 @@ impl Vec { Self::with_capacity_in(capacity, Global) } + /// Tries to construct a new, empty `Vec` with the specified capacity. + /// + /// The vector will be able to hold exactly `capacity` elements without + /// reallocating. If `capacity` is 0, the vector will not allocate. + /// + /// It is important to note that although the returned vector has the + /// *capacity* specified, the vector will have a zero *length*. For an + /// explanation of the difference between length and capacity, see + /// *[Capacity and reallocation]*. + /// + /// [Capacity and reallocation]: #capacity-and-reallocation + /// + /// # Examples + /// + /// ``` + /// let mut vec = Vec::try_with_capacity(10).unwrap(); + /// + /// // The vector contains no items, even though it has capacity for more + /// assert_eq!(vec.len(), 0); + /// assert_eq!(vec.capacity(), 10); + /// + /// // These are all done without reallocating... 
+ /// for i in 0..10 { + /// vec.push(i); + /// } + /// assert_eq!(vec.len(), 10); + /// assert_eq!(vec.capacity(), 10); + /// + /// // ...but this may make the vector reallocate + /// vec.push(11); + /// assert_eq!(vec.len(), 11); + /// assert!(vec.capacity() >= 11); + /// + /// let mut result = Vec::try_with_capacity(usize::MAX); + /// assert!(result.is_err()); + /// ``` + #[inline] + #[doc(alias = "malloc")] + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_with_capacity(capacity: usize) -> Result { + Self::try_with_capacity_in(capacity, Global) + } + /// Creates a `Vec` directly from the raw components of another vector. /// /// # Safety @@ -609,6 +653,53 @@ impl Vec { Vec { buf: RawVec::with_capacity_in(capacity, alloc), len: 0 } } + /// Tries to construct a new, empty `Vec` with the specified capacity + /// with the provided allocator. + /// + /// The vector will be able to hold exactly `capacity` elements without + /// reallocating. If `capacity` is 0, the vector will not allocate. + /// + /// It is important to note that although the returned vector has the + /// *capacity* specified, the vector will have a zero *length*. For an + /// explanation of the difference between length and capacity, see + /// *[Capacity and reallocation]*. + /// + /// [Capacity and reallocation]: #capacity-and-reallocation + /// + /// # Examples + /// + /// ``` + /// #![feature(allocator_api)] + /// + /// use std::alloc::System; + /// + /// let mut vec = Vec::try_with_capacity_in(10, System).unwrap(); + /// + /// // The vector contains no items, even though it has capacity for more + /// assert_eq!(vec.len(), 0); + /// assert_eq!(vec.capacity(), 10); + /// + /// // These are all done without reallocating... 
+ /// for i in 0..10 { + /// vec.push(i); + /// } + /// assert_eq!(vec.len(), 10); + /// assert_eq!(vec.capacity(), 10); + /// + /// // ...but this may make the vector reallocate + /// vec.push(11); + /// assert_eq!(vec.len(), 11); + /// assert!(vec.capacity() >= 11); + /// + /// let mut result = Vec::try_with_capacity_in(usize::MAX, System); + /// assert!(result.is_err()); + /// ``` + #[inline] + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_with_capacity_in(capacity: usize, alloc: A) -> Result { + Ok(Vec { buf: RawVec::try_with_capacity_in(capacity, alloc)?, len: 0 }) + } + /// Creates a `Vec` directly from the raw components of another vector. /// /// # Safety @@ -940,6 +1031,33 @@ impl Vec { } } + /// Tries to shrink the capacity of the vector as much as possible. + /// + /// It will drop down as close as possible to the length but the allocator + /// may still inform the vector that there is space for a few more elements. + /// + /// # Examples + /// + /// ``` + /// let mut vec = Vec::with_capacity(10); + /// vec.extend([1, 2, 3]); + /// assert_eq!(vec.capacity(), 10); + /// vec.try_shrink_to_fit().unwrap(); + /// assert!(vec.capacity() >= 3); + /// ``` + #[doc(alias = "realloc")] + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_shrink_to_fit(&mut self) -> Result<(), TryReserveError> { + // The capacity is never less than the length, and there's nothing to do when + // they are equal, so we can avoid the panic case in `RawVec::try_shrink_to_fit` + // by only calling it with a greater capacity. + if self.capacity() <= self.len { + return Ok(()); + } + + self.buf.try_shrink_to_fit(self.len) + } + /// Shrinks the capacity of the vector with a lower bound. /// /// The capacity will remain at least as large as both the length @@ -1002,6 +1120,41 @@ impl Vec { } } + /// Tries to convert the vector into [`Box<[T]>`][owned slice]. + /// + /// Note that this will drop any excess capacity. 
+ /// + /// [owned slice]: Box + /// + /// # Examples + /// + /// ``` + /// let v = vec![1, 2, 3]; + /// + /// let slice = v.try_into_boxed_slice().unwrap(); + /// ``` + /// + /// Any excess capacity is removed: + /// + /// ``` + /// let mut vec = Vec::with_capacity(10); + /// vec.extend([1, 2, 3]); + /// + /// assert_eq!(vec.capacity(), 10); + /// let slice = vec.try_into_boxed_slice().unwrap(); + /// assert_eq!(slice.into_vec().capacity(), 3); + /// ``` + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_into_boxed_slice(mut self) -> Result, TryReserveError> { + unsafe { + self.try_shrink_to_fit()?; + let me = ManuallyDrop::new(self); + let buf = ptr::read(&me.buf); + let len = me.len(); + Ok(buf.into_box(len).assume_init()) + } + } + /// Shortens the vector, keeping the first `len` elements and dropping /// the rest. /// @@ -1735,6 +1888,29 @@ impl Vec { } } + /// Tries to append an element to the back of a collection. + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![1, 2]; + /// vec.try_push(3).unwrap(); + /// assert_eq!(vec, [1, 2, 3]); + /// ``` + #[inline] + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_push(&mut self, value: T) -> Result<(), TryReserveError> { + if self.len == self.buf.capacity() { + self.buf.try_reserve_for_push(self.len)?; + } + unsafe { + let end = self.as_mut_ptr().add(self.len); + ptr::write(end, value); + self.len += 1; + } + Ok(()) + } + /// Removes the last element from a vector and returns it, or [`None`] if it /// is empty. /// @@ -1799,6 +1975,17 @@ impl Vec { self.len += count; } + /// Tries to append elements to `Self` from other buffer. 
+ #[inline] + unsafe fn try_append_elements(&mut self, other: *const [T]) -> Result<(), TryReserveError> { + let count = unsafe { (*other).len() }; + self.try_reserve(count)?; + let len = self.len(); + unsafe { ptr::copy_nonoverlapping(other as *const T, self.as_mut_ptr().add(len), count) }; + self.len += count; + Ok(()) + } + /// Removes the specified range from the vector in bulk, returning all /// removed elements as an iterator. If the iterator is dropped before /// being fully consumed, it drops the remaining removed elements. @@ -2209,6 +2396,44 @@ impl Vec { } } + /// Tries to resize the `Vec` in-place so that `len` is equal to `new_len`. + /// + /// If `new_len` is greater than `len`, the `Vec` is extended by the + /// difference, with each additional slot filled with `value`. + /// If `new_len` is less than `len`, the `Vec` is simply truncated. + /// + /// This method requires `T` to implement [`Clone`], + /// in order to be able to clone the passed value. + /// If you need more flexibility (or want to rely on [`Default`] instead of + /// [`Clone`]), use [`Vec::resize_with`]. + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec!["hello"]; + /// vec.try_resize(3, "world").unwrap(); + /// assert_eq!(vec, ["hello", "world", "world"]); + /// + /// let mut vec = vec![1, 2, 3, 4]; + /// vec.try_resize(2, 0).unwrap(); + /// assert_eq!(vec, [1, 2]); + /// + /// let mut vec = vec![42]; + /// let result = vec.try_resize(usize::MAX, 0); + /// assert!(result.is_err()); + /// ``` + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_resize(&mut self, new_len: usize, value: T) -> Result<(), TryReserveError> { + let len = self.len(); + + if new_len > len { + self.try_extend_with(new_len - len, ExtendElement(value)) + } else { + self.truncate(new_len); + Ok(()) + } + } + /// Clones and appends all elements in a slice to the `Vec`. 
/// /// Iterates over the slice `other`, clones each element, and then appends @@ -2234,6 +2459,30 @@ impl Vec { self.spec_extend(other.iter()) } + /// Tries to clone and append all elements in a slice to the `Vec`. + /// + /// Iterates over the slice `other`, clones each element, and then appends + /// it to this `Vec`. The `other` slice is traversed in-order. + /// + /// Note that this function is same as [`extend`] except that it is + /// specialized to work with slices instead. If and when Rust gets + /// specialization this function will likely be deprecated (but still + /// available). + /// + /// # Examples + /// + /// ``` + /// let mut vec = vec![1]; + /// vec.try_extend_from_slice(&[2, 3, 4]).unwrap(); + /// assert_eq!(vec, [1, 2, 3, 4]); + /// ``` + /// + /// [`extend`]: Vec::extend + #[stable(feature = "kernel", since = "1.0.0")] + pub fn try_extend_from_slice(&mut self, other: &[T]) -> Result<(), TryReserveError> { + self.try_spec_extend(other.iter()) + } + /// Copies elements from `src` range to the end of the vector. /// /// # Panics @@ -2328,6 +2577,36 @@ impl Vec { // len set by scope guard } } + + /// Try to extend the vector by `n` values, using the given generator. + fn try_extend_with>(&mut self, n: usize, mut value: E) -> Result<(), TryReserveError> { + self.try_reserve(n)?; + + unsafe { + let mut ptr = self.as_mut_ptr().add(self.len()); + // Use SetLenOnDrop to work around bug where compiler + // might not realize the store through `ptr` through self.set_len() + // don't alias. 
+ let mut local_len = SetLenOnDrop::new(&mut self.len); + + // Write all elements except the last one + for _ in 1..n { + ptr::write(ptr, value.next()); + ptr = ptr.offset(1); + // Increment the length in every step in case next() panics + local_len.increment_len(1); + } + + if n > 0 { + // We can write the last element directly without cloning needlessly + ptr::write(ptr, value.last()); + local_len.increment_len(1); + } + + // len set by scope guard + Ok(()) + } + } } impl Vec { @@ -2662,6 +2941,34 @@ impl Vec { } } + // leaf method to which various SpecFrom/SpecExtend implementations delegate when + // they have no further optimizations to apply + fn try_extend_desugared>(&mut self, mut iterator: I) -> Result<(), TryReserveError> { + // This is the case for a general iterator. + // + // This function should be the moral equivalent of: + // + // for item in iterator { + // self.push(item); + // } + while let Some(element) = iterator.next() { + let len = self.len(); + if len == self.capacity() { + let (lower, _) = iterator.size_hint(); + self.try_reserve(lower.saturating_add(1))?; + } + unsafe { + ptr::write(self.as_mut_ptr().add(len), element); + // Since next() executes user code which can panic we have to bump the length + // after each step. + // NB can't overflow since we would have had to alloc the address space + self.set_len(len + 1); + } + } + + Ok(()) + } + /// Creates a splicing iterator that replaces the specified range in the vector /// with the given `replace_with` iterator and yields the removed items. /// `replace_with` does not need to be the same length as `range`. 
diff --git a/rust/alloc/vec/partial_eq.rs b/rust/alloc/vec/partial_eq.rs index 50e1409610507e..273e99bed4888e 100644 --- a/rust/alloc/vec/partial_eq.rs +++ b/rust/alloc/vec/partial_eq.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + use crate::alloc::Allocator; #[cfg(not(no_global_oom_handling))] use crate::borrow::Cow; diff --git a/rust/alloc/vec/set_len_on_drop.rs b/rust/alloc/vec/set_len_on_drop.rs index 8b66bc81212969..448bf5076a0bf3 100644 --- a/rust/alloc/vec/set_len_on_drop.rs +++ b/rust/alloc/vec/set_len_on_drop.rs @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + // Set the length of the vec when the `SetLenOnDrop` value goes out of scope. // // The idea is: The length field in SetLenOnDrop is a local variable diff --git a/rust/alloc/vec/spec_extend.rs b/rust/alloc/vec/spec_extend.rs index c3b4534096de5f..729f4bb1709f4e 100644 --- a/rust/alloc/vec/spec_extend.rs +++ b/rust/alloc/vec/spec_extend.rs @@ -1,4 +1,7 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + use crate::alloc::Allocator; +use crate::collections::{TryReserveError, TryReserveErrorKind}; use core::iter::TrustedLen; use core::ptr::{self}; use core::slice::{self}; @@ -6,10 +9,17 @@ use core::slice::{self}; use super::{IntoIter, SetLenOnDrop, Vec}; // Specialization trait used for Vec::extend +#[cfg(not(no_global_oom_handling))] pub(super) trait SpecExtend { fn spec_extend(&mut self, iter: I); } +// Specialization trait used for Vec::try_extend +pub(super) trait TrySpecExtend { + fn try_spec_extend(&mut self, iter: I) -> Result<(), TryReserveError>; +} + +#[cfg(not(no_global_oom_handling))] impl SpecExtend for Vec where I: Iterator, @@ -19,6 +29,16 @@ where } } +impl TrySpecExtend for Vec +where + I: Iterator, +{ + default fn try_spec_extend(&mut self, iter: I) -> Result<(), TryReserveError> { + self.try_extend_desugared(iter) + } +} + +#[cfg(not(no_global_oom_handling))] impl SpecExtend for Vec where I: TrustedLen, @@ -57,6 +77,41 @@ where } } +impl 
TrySpecExtend for Vec +where + I: TrustedLen, +{ + default fn try_spec_extend(&mut self, iterator: I) -> Result<(), TryReserveError> { + // This is the case for a TrustedLen iterator. + let (low, high) = iterator.size_hint(); + if let Some(additional) = high { + debug_assert_eq!( + low, + additional, + "TrustedLen iterator's size hint is not exact: {:?}", + (low, high) + ); + self.try_reserve(additional)?; + unsafe { + let mut ptr = self.as_mut_ptr().add(self.len()); + let mut local_len = SetLenOnDrop::new(&mut self.len); + iterator.for_each(move |element| { + ptr::write(ptr, element); + ptr = ptr.offset(1); + // Since the loop executes user code which can panic we have to bump the pointer + // after each step. + // NB can't overflow since we would have had to alloc the address space + local_len.increment_len(1); + }); + } + Ok(()) + } else { + Err(TryReserveErrorKind::CapacityOverflow.into()) + } + } +} + +#[cfg(not(no_global_oom_handling))] impl SpecExtend> for Vec { fn spec_extend(&mut self, mut iterator: IntoIter) { unsafe { @@ -66,6 +121,17 @@ impl SpecExtend> for Vec { } } +impl TrySpecExtend> for Vec { + fn try_spec_extend(&mut self, mut iterator: IntoIter) -> Result<(), TryReserveError> { + unsafe { + self.try_append_elements(iterator.as_slice() as _)?; + } + iterator.ptr = iterator.end; + Ok(()) + } +} + +#[cfg(not(no_global_oom_handling))] impl<'a, T: 'a, I, A: Allocator + 'a> SpecExtend<&'a T, I> for Vec where I: Iterator, @@ -76,6 +142,17 @@ where } } +impl<'a, T: 'a, I, A: Allocator + 'a> TrySpecExtend<&'a T, I> for Vec +where + I: Iterator, + T: Clone, +{ + default fn try_spec_extend(&mut self, iterator: I) -> Result<(), TryReserveError> { + self.try_spec_extend(iterator.cloned()) + } +} + +#[cfg(not(no_global_oom_handling))] impl<'a, T: 'a, A: Allocator + 'a> SpecExtend<&'a T, slice::Iter<'a, T>> for Vec where T: Copy, @@ -85,3 +162,13 @@ where unsafe { self.append_elements(slice) }; } } + +impl<'a, T: 'a, A: Allocator + 'a> TrySpecExtend<&'a T, 
slice::Iter<'a, T>> for Vec +where + T: Copy, +{ + fn try_spec_extend(&mut self, iterator: slice::Iter<'a, T>) -> Result<(), TryReserveError> { + let slice = iterator.as_slice(); + unsafe { self.try_append_elements(slice) } + } +} From b25a3db0ec915f3c316a3ca9636fa6d139aba4a9 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Sat, 3 Jul 2021 17:30:16 +0200 Subject: [PATCH 0023/1250] rust: add `build_error` crate The `build_error` crate provides the `build_error` function which is then used to provide the `build_error!` and the `build_assert!` macros. `build_assert!` is intended to be used when `static_assert!` cannot be used, e.g. when the condition refers to generic parameters or parameters of an inline function. Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Signed-off-by: Gary Guo Co-developed-by: Miguel Ojeda Signed-off-by: Miguel Ojeda --- rust/build_error.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 rust/build_error.rs diff --git a/rust/build_error.rs b/rust/build_error.rs new file mode 100644 index 00000000000000..34e589149d3ef3 --- /dev/null +++ b/rust/build_error.rs @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Build-time error. +//! +//! This crate provides a function `build_error`, which will panic in +//! compile-time if executed in const context, and will cause a build error +//! if not executed at compile time and the optimizer does not optimise away the +//! call. +//! +//! It is used by `build_assert!` in the kernel crate, allowing checking of +//! conditions that could be checked statically, but could not be enforced in +//! Rust yet (e.g. perform some checks in const functions, but those +//! functions could still be called in the runtime). + +#![no_std] + +/// Panics if executed in const context, or triggers a build error if not. 
+#[inline(never)] +#[cold] +#[no_mangle] +#[track_caller] +pub const fn build_error(msg: &'static str) -> ! { + panic!("{}", msg); +} + +#[cfg(CONFIG_RUST_BUILD_ASSERT_WARN)] +#[link_section = ".gnu.warning.build_error"] +#[used] +static BUILD_ERROR_WARNING: [u8; 45] = *b"call to build_error present after compilation"; From 0393a048dbaa08566449f444901e7edff9b61e16 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 16:56:17 +0200 Subject: [PATCH 0024/1250] rust: add `macros` crate This crate contains all the procedural macros ("proc macros") shared across the kernel. Procedural macros make it possible to create syntax extensions. They run at compile time and can consume as well as produce Rust syntax. For instance, the `module!` macro that is used by Rust modules is implemented here. It makes it easy to declare the information equivalent to the `MODULE_*` macros in C modules, e.g.: module! { type: RustMinimal, name: b"rust_minimal", author: b"Rust for Linux Contributors", description: b"Rust minimal sample", license: b"GPL", } Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Finn Behrens Signed-off-by: Finn Behrens Co-developed-by: Adam Bratschi-Kaye Signed-off-by: Adam Bratschi-Kaye Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Sumera Priyadarsini Signed-off-by: Sumera Priyadarsini Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Matthew Bakhtiari Signed-off-by: Matthew Bakhtiari Signed-off-by: Miguel Ojeda --- rust/macros/helpers.rs | 79 ++++++ rust/macros/lib.rs | 94 ++++++ rust/macros/module.rs | 631 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 804 insertions(+) create mode 100644 rust/macros/helpers.rs create mode 100644 rust/macros/lib.rs create mode 100644 rust/macros/module.rs diff --git a/rust/macros/helpers.rs b/rust/macros/helpers.rs new file mode 100644 index 00000000000000..ad210563e5a6c1 --- /dev/null +++ b/rust/macros/helpers.rs @@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0 + +use proc_macro::{token_stream, Group, TokenTree}; + +pub(crate) fn try_ident(it: &mut token_stream::IntoIter) -> Option { + if let Some(TokenTree::Ident(ident)) = it.next() { + Some(ident.to_string()) + } else { + None + } +} + +pub(crate) fn try_literal(it: &mut token_stream::IntoIter) -> Option { + if let Some(TokenTree::Literal(literal)) = it.next() { + Some(literal.to_string()) + } else { + None + } +} + +pub(crate) fn try_byte_string(it: &mut token_stream::IntoIter) -> Option { + try_literal(it).and_then(|byte_string| { + if byte_string.starts_with("b\"") && byte_string.ends_with('\"') { + Some(byte_string[2..byte_string.len() - 1].to_string()) + } else { + None + } + }) +} + +pub(crate) fn expect_ident(it: &mut token_stream::IntoIter) -> String { + try_ident(it).expect("Expected Ident") +} + +pub(crate) fn expect_punct(it: &mut token_stream::IntoIter) -> char { + if let TokenTree::Punct(punct) = it.next().expect("Reached end of token stream for Punct") { + punct.as_char() + } else { + panic!("Expected Punct"); + } +} + +pub(crate) fn expect_literal(it: &mut token_stream::IntoIter) -> String { + try_literal(it).expect("Expected Literal") +} + +pub(crate) fn expect_group(it: &mut token_stream::IntoIter) -> Group { + if let TokenTree::Group(group) = it.next().expect("Reached end of token stream for Group") { + group + } else { + panic!("Expected Group"); + } +} + +pub(crate) fn expect_byte_string(it: &mut token_stream::IntoIter) -> String { + try_byte_string(it).expect("Expected byte string") +} + +pub(crate) fn expect_end(it: &mut token_stream::IntoIter) { + if it.next().is_some() { + panic!("Expected end"); + } +} + +pub(crate) fn get_literal(it: &mut token_stream::IntoIter, expected_name: &str) -> String { + assert_eq!(expect_ident(it), expected_name); + assert_eq!(expect_punct(it), ':'); + let literal = expect_literal(it); + assert_eq!(expect_punct(it), ','); + literal +} + +pub(crate) fn get_byte_string(it: &mut 
token_stream::IntoIter, expected_name: &str) -> String { + assert_eq!(expect_ident(it), expected_name); + assert_eq!(expect_punct(it), ':'); + let byte_string = expect_byte_string(it); + assert_eq!(expect_punct(it), ','); + byte_string +} diff --git a/rust/macros/lib.rs b/rust/macros/lib.rs new file mode 100644 index 00000000000000..d4ac221387a125 --- /dev/null +++ b/rust/macros/lib.rs @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Crate for all kernel procedural macros. + +mod helpers; +mod module; + +use proc_macro::TokenStream; + +/// Declares a kernel module. +/// +/// The `type` argument should be a type which implements the [`Module`] +/// trait. Also accepts various forms of kernel metadata. +/// +/// C header: [`include/linux/moduleparam.h`](../../../include/linux/moduleparam.h) +/// +/// [`Module`]: ../kernel/trait.Module.html +/// +/// # Examples +/// +/// ```ignore +/// use kernel::prelude::*; +/// +/// module!{ +/// type: MyModule, +/// name: b"my_kernel_module", +/// author: b"Rust for Linux Contributors", +/// description: b"My very own kernel module!", +/// license: b"GPL", +/// params: { +/// my_i32: i32 { +/// default: 42, +/// permissions: 0o000, +/// description: b"Example of i32", +/// }, +/// writeable_i32: i32 { +/// default: 42, +/// permissions: 0o644, +/// description: b"Example of i32", +/// }, +/// }, +/// } +/// +/// struct MyModule; +/// +/// impl kernel::Module for MyModule { +/// fn init(_name: &'static kernel::str::CStr, _module: &'static kernel::ThisModule) -> Result<Self> { +/// // If the parameter is writeable, then the kparam lock must be +/// // taken to read the parameter: +/// { +/// let lock = THIS_MODULE.kernel_param_lock(); +/// pr_info!("i32 param is: {}\n", writeable_i32.read(&lock)); +/// } +/// // If the parameter is read only, it can be read without locking +/// // the kernel parameters: +/// pr_info!("i32 param is: {}\n", my_i32.read()); +/// Ok(Self) +/// } +/// } +/// ``` +/// +/// # Supported argument types +/// - `type`: type which implements the [`Module`] trait (required).
+/// - `name`: byte array of the name of the kernel module (required). +/// - `author`: byte array of the author of the kernel module. +/// - `description`: byte array of the description of the kernel module. +/// - `license`: byte array of the license of the kernel module (required). +/// - `alias`: byte array of alias name of the kernel module. +/// - `alias_rtnl_link`: byte array of the `rtnl_link_alias` of the kernel module (mutually exclusive with `alias`). +/// - `params`: parameters for the kernel module, as described below. +/// +/// # Supported parameter types +/// +/// - `bool`: Corresponds to C `bool` param type. +/// - `i8`: No equivalent C param type. +/// - `u8`: Corresponds to C `char` param type. +/// - `i16`: Corresponds to C `short` param type. +/// - `u16`: Corresponds to C `ushort` param type. +/// - `i32`: Corresponds to C `int` param type. +/// - `u32`: Corresponds to C `uint` param type. +/// - `i64`: No equivalent C param type. +/// - `u64`: Corresponds to C `ullong` param type. +/// - `isize`: No equivalent C param type. +/// - `usize`: No equivalent C param type. +/// - `str`: Corresponds to C `charp` param type. Reading returns a byte slice. +/// - `ArrayParam<T, N>`: Corresponds to C parameters created using `module_param_array`. An array +/// of `T`'s of length at **most** `N`. +/// +/// `invbool` is unsupported: it was only ever used in a few modules. +/// Consider using a `bool` and inverting the logic instead.
+#[proc_macro] +pub fn module(ts: TokenStream) -> TokenStream { + module::module(ts) +} diff --git a/rust/macros/module.rs b/rust/macros/module.rs new file mode 100644 index 00000000000000..323934b24e7f9b --- /dev/null +++ b/rust/macros/module.rs @@ -0,0 +1,631 @@ +// SPDX-License-Identifier: GPL-2.0 + +use proc_macro::{token_stream, Delimiter, Group, Literal, TokenStream, TokenTree}; + +use crate::helpers::*; + +#[derive(Clone, PartialEq)] +enum ParamType { + Ident(String), + Array { vals: String, max_length: usize }, +} + +fn expect_array_fields(it: &mut token_stream::IntoIter) -> ParamType { + assert_eq!(expect_punct(it), '<'); + let vals = expect_ident(it); + assert_eq!(expect_punct(it), ','); + let max_length_str = expect_literal(it); + let max_length = max_length_str + .parse::() + .expect("Expected usize length"); + assert_eq!(expect_punct(it), '>'); + ParamType::Array { vals, max_length } +} + +fn expect_type(it: &mut token_stream::IntoIter) -> ParamType { + if let TokenTree::Ident(ident) = it + .next() + .expect("Reached end of token stream for param type") + { + match ident.to_string().as_ref() { + "ArrayParam" => expect_array_fields(it), + _ => ParamType::Ident(ident.to_string()), + } + } else { + panic!("Expected Param Type") + } +} + +struct ModInfoBuilder<'a> { + module: &'a str, + counter: usize, + buffer: String, +} + +impl<'a> ModInfoBuilder<'a> { + fn new(module: &'a str) -> Self { + ModInfoBuilder { + module, + counter: 0, + buffer: String::new(), + } + } + + fn emit_base(&mut self, field: &str, content: &str, builtin: bool) { + use std::fmt::Write; + + let string = if builtin { + // Built-in modules prefix their modinfo strings by `module.`. + format!( + "{module}.{field}={content}\0", + module = self.module, + field = field, + content = content + ) + } else { + // Loadable modules' modinfo strings go as-is. 
+ format!("{field}={content}\0", field = field, content = content) + }; + + write!( + &mut self.buffer, + " + {cfg} + #[doc(hidden)] + #[link_section = \".modinfo\"] + #[used] + pub static __{module}_{counter}: [u8; {length}] = *{string}; + ", + cfg = if builtin { + "#[cfg(not(MODULE))]" + } else { + "#[cfg(MODULE)]" + }, + module = self.module, + counter = self.counter, + length = string.len(), + string = Literal::byte_string(string.as_bytes()), + ) + .unwrap(); + + self.counter += 1; + } + + fn emit_only_builtin(&mut self, field: &str, content: &str) { + self.emit_base(field, content, true) + } + + fn emit_only_loadable(&mut self, field: &str, content: &str) { + self.emit_base(field, content, false) + } + + fn emit(&mut self, field: &str, content: &str) { + self.emit_only_builtin(field, content); + self.emit_only_loadable(field, content); + } + + fn emit_param(&mut self, field: &str, param: &str, content: &str) { + let content = format!("{param}:{content}", param = param, content = content); + self.emit(field, &content); + } +} + +fn permissions_are_readonly(perms: &str) -> bool { + let (radix, digits) = if let Some(n) = perms.strip_prefix("0x") { + (16, n) + } else if let Some(n) = perms.strip_prefix("0o") { + (8, n) + } else if let Some(n) = perms.strip_prefix("0b") { + (2, n) + } else { + (10, perms) + }; + match u32::from_str_radix(digits, radix) { + Ok(perms) => perms & 0o222 == 0, + Err(_) => false, + } +} + +fn param_ops_path(param_type: &str) -> &'static str { + match param_type { + "bool" => "kernel::module_param::PARAM_OPS_BOOL", + "i8" => "kernel::module_param::PARAM_OPS_I8", + "u8" => "kernel::module_param::PARAM_OPS_U8", + "i16" => "kernel::module_param::PARAM_OPS_I16", + "u16" => "kernel::module_param::PARAM_OPS_U16", + "i32" => "kernel::module_param::PARAM_OPS_I32", + "u32" => "kernel::module_param::PARAM_OPS_U32", + "i64" => "kernel::module_param::PARAM_OPS_I64", + "u64" => "kernel::module_param::PARAM_OPS_U64", + "isize" => 
"kernel::module_param::PARAM_OPS_ISIZE", + "usize" => "kernel::module_param::PARAM_OPS_USIZE", + "str" => "kernel::module_param::PARAM_OPS_STR", + t => panic!("Unrecognized type {}", t), + } +} + +fn try_simple_param_val( + param_type: &str, +) -> Box Option> { + match param_type { + "bool" => Box::new(try_ident), + "str" => Box::new(|param_it| { + try_byte_string(param_it) + .map(|s| format!("kernel::module_param::StringParam::Ref(b\"{}\")", s)) + }), + _ => Box::new(try_literal), + } +} + +fn get_default(param_type: &ParamType, param_it: &mut token_stream::IntoIter) -> String { + let try_param_val = match param_type { + ParamType::Ident(ref param_type) + | ParamType::Array { + vals: ref param_type, + max_length: _, + } => try_simple_param_val(param_type), + }; + assert_eq!(expect_ident(param_it), "default"); + assert_eq!(expect_punct(param_it), ':'); + let default = match param_type { + ParamType::Ident(_) => try_param_val(param_it).expect("Expected default param value"), + ParamType::Array { + vals: _, + max_length: _, + } => { + let group = expect_group(param_it); + assert_eq!(group.delimiter(), Delimiter::Bracket); + let mut default_vals = Vec::new(); + let mut it = group.stream().into_iter(); + + while let Some(default_val) = try_param_val(&mut it) { + default_vals.push(default_val); + match it.next() { + Some(TokenTree::Punct(punct)) => assert_eq!(punct.as_char(), ','), + None => break, + _ => panic!("Expected ',' or end of array default values"), + } + } + + let mut default_array = "kernel::module_param::ArrayParam::create(&[".to_string(); + default_array.push_str( + &default_vals + .iter() + .map(|val| val.to_string()) + .collect::>() + .join(","), + ); + default_array.push_str("])"); + default_array + } + }; + assert_eq!(expect_punct(param_it), ','); + default +} + +fn generated_array_ops_name(vals: &str, max_length: usize) -> String { + format!( + "__generated_array_ops_{vals}_{max_length}", + vals = vals, + max_length = max_length + ) +} + 
+#[derive(Debug, Default)] +struct ModuleInfo { + type_: String, + license: String, + name: String, + author: Option, + description: Option, + alias: Option, + params: Option, +} + +impl ModuleInfo { + fn parse(it: &mut token_stream::IntoIter) -> Self { + let mut info = ModuleInfo::default(); + + const EXPECTED_KEYS: &[&str] = &[ + "type", + "name", + "author", + "description", + "license", + "alias", + "alias_rtnl_link", + "params", + ]; + const REQUIRED_KEYS: &[&str] = &["type", "name", "license"]; + let mut seen_keys = Vec::new(); + + loop { + let key = match it.next() { + Some(TokenTree::Ident(ident)) => ident.to_string(), + Some(_) => panic!("Expected Ident or end"), + None => break, + }; + + if seen_keys.contains(&key) { + panic!( + "Duplicated key \"{}\". Keys can only be specified once.", + key + ); + } + + assert_eq!(expect_punct(it), ':'); + + match key.as_str() { + "type" => info.type_ = expect_ident(it), + "name" => info.name = expect_byte_string(it), + "author" => info.author = Some(expect_byte_string(it)), + "description" => info.description = Some(expect_byte_string(it)), + "license" => info.license = expect_byte_string(it), + "alias" => info.alias = Some(expect_byte_string(it)), + "alias_rtnl_link" => { + info.alias = Some(format!("rtnl-link-{}", expect_byte_string(it))) + } + "params" => info.params = Some(expect_group(it)), + _ => panic!( + "Unknown key \"{}\". Valid keys are: {:?}.", + key, EXPECTED_KEYS + ), + } + + assert_eq!(expect_punct(it), ','); + + seen_keys.push(key); + } + + expect_end(it); + + for key in REQUIRED_KEYS { + if !seen_keys.iter().any(|e| e == key) { + panic!("Missing required key \"{}\".", key); + } + } + + let mut ordered_keys: Vec<&str> = Vec::new(); + for key in EXPECTED_KEYS { + if seen_keys.iter().any(|e| e == key) { + ordered_keys.push(key); + } + } + + if seen_keys != ordered_keys { + panic!( + "Keys are not ordered as expected. 
Order them like: {:?}.", + ordered_keys + ); + } + + info + } +} + +pub(crate) fn module(ts: TokenStream) -> TokenStream { + let mut it = ts.into_iter(); + + let info = ModuleInfo::parse(&mut it); + + let mut modinfo = ModInfoBuilder::new(info.name.as_ref()); + if let Some(author) = info.author { + modinfo.emit("author", &author); + } + if let Some(description) = info.description { + modinfo.emit("description", &description); + } + modinfo.emit("license", &info.license); + if let Some(alias) = info.alias { + modinfo.emit("alias", &alias); + } + + // Built-in modules also export the `file` modinfo string. + let file = + std::env::var("RUST_MODFILE").expect("Unable to fetch RUST_MODFILE environmental variable"); + modinfo.emit_only_builtin("file", &file); + + let mut array_types_to_generate = Vec::new(); + if let Some(params) = info.params { + assert_eq!(params.delimiter(), Delimiter::Brace); + + let mut it = params.stream().into_iter(); + + loop { + let param_name = match it.next() { + Some(TokenTree::Ident(ident)) => ident.to_string(), + Some(_) => panic!("Expected Ident or end"), + None => break, + }; + + assert_eq!(expect_punct(&mut it), ':'); + let param_type = expect_type(&mut it); + let group = expect_group(&mut it); + assert_eq!(expect_punct(&mut it), ','); + + assert_eq!(group.delimiter(), Delimiter::Brace); + + let mut param_it = group.stream().into_iter(); + let param_default = get_default(&param_type, &mut param_it); + let param_permissions = get_literal(&mut param_it, "permissions"); + let param_description = get_byte_string(&mut param_it, "description"); + expect_end(&mut param_it); + + // TODO: More primitive types. + // TODO: Other kinds: unsafes, etc.
+ let (param_kernel_type, ops): (String, _) = match param_type { + ParamType::Ident(ref param_type) => ( + param_type.to_string(), + param_ops_path(param_type).to_string(), + ), + ParamType::Array { + ref vals, + max_length, + } => { + array_types_to_generate.push((vals.clone(), max_length)); + ( + format!("__rust_array_param_{}_{}", vals, max_length), + generated_array_ops_name(vals, max_length), + ) + } + }; + + modinfo.emit_param("parmtype", &param_name, &param_kernel_type); + modinfo.emit_param("parm", &param_name, &param_description); + let param_type_internal = match param_type { + ParamType::Ident(ref param_type) => match param_type.as_ref() { + "str" => "kernel::module_param::StringParam".to_string(), + other => other.to_string(), + }, + ParamType::Array { + ref vals, + max_length, + } => format!( + "kernel::module_param::ArrayParam<{vals}, {max_length}>", + vals = vals, + max_length = max_length + ), + }; + let read_func = if permissions_are_readonly(&param_permissions) { + format!( + " + fn read(&self) -> &<{param_type_internal} as kernel::module_param::ModuleParam>::Value {{ + // SAFETY: Parameters do not need to be locked because they are read only or sysfs is not enabled. + unsafe {{ <{param_type_internal} as kernel::module_param::ModuleParam>::value(&__{name}_{param_name}_value) }} + }} + ", + name = info.name, + param_name = param_name, + param_type_internal = param_type_internal, + ) + } else { + format!( + " + fn read<'lck>(&self, lock: &'lck kernel::KParamGuard) -> &'lck <{param_type_internal} as kernel::module_param::ModuleParam>::Value {{ + // SAFETY: Parameters are locked by `KParamGuard`.
+ unsafe {{ <{param_type_internal} as kernel::module_param::ModuleParam>::value(&__{name}_{param_name}_value) }} + }} + ", + name = info.name, + param_name = param_name, + param_type_internal = param_type_internal, + ) + }; + let kparam = format!( + " + kernel::bindings::kernel_param__bindgen_ty_1 {{ + arg: unsafe {{ &__{name}_{param_name}_value }} as *const _ as *mut kernel::c_types::c_void, + }}, + ", + name = info.name, + param_name = param_name, + ); + modinfo.buffer.push_str( + &format!( + " + static mut __{name}_{param_name}_value: {param_type_internal} = {param_default}; + + struct __{name}_{param_name}; + + impl __{name}_{param_name} {{ {read_func} }} + + const {param_name}: __{name}_{param_name} = __{name}_{param_name}; + + // Note: the C macro that generates the static structs for the `__param` section + // asks for them to be `aligned(sizeof(void *))`. However, that was put in place + // in 2003 in commit 38d5b085d2 (\"[PATCH] Fix over-alignment problem on x86-64\") + // to undo GCC over-alignment of static structs of >32 bytes. It seems that is + // not the case anymore, so we simplify to a transparent representation here + // in the expectation that it is not needed anymore. + // TODO: Revisit this to confirm the above comment and remove it if it happened. 
+ #[repr(transparent)] + struct __{name}_{param_name}_RacyKernelParam(kernel::bindings::kernel_param); + + unsafe impl Sync for __{name}_{param_name}_RacyKernelParam {{ + }} + + #[cfg(not(MODULE))] + const __{name}_{param_name}_name: *const kernel::c_types::c_char = b\"{name}.{param_name}\\0\" as *const _ as *const kernel::c_types::c_char; + + #[cfg(MODULE)] + const __{name}_{param_name}_name: *const kernel::c_types::c_char = b\"{param_name}\\0\" as *const _ as *const kernel::c_types::c_char; + + #[link_section = \"__param\"] + #[used] + static __{name}_{param_name}_struct: __{name}_{param_name}_RacyKernelParam = __{name}_{param_name}_RacyKernelParam(kernel::bindings::kernel_param {{ + name: __{name}_{param_name}_name, + // SAFETY: `__this_module` is constructed by the kernel at load time and will not be freed until the module is unloaded. + #[cfg(MODULE)] + mod_: unsafe {{ &kernel::bindings::__this_module as *const _ as *mut _ }}, + #[cfg(not(MODULE))] + mod_: core::ptr::null_mut(), + ops: unsafe {{ &{ops} }} as *const kernel::bindings::kernel_param_ops, + perm: {permissions}, + level: -1, + flags: 0, + __bindgen_anon_1: {kparam} + }}); + ", + name = info.name, + param_type_internal = param_type_internal, + read_func = read_func, + param_default = param_default, + param_name = param_name, + ops = ops, + permissions = param_permissions, + kparam = kparam, + ) + ); + } + } + + let mut generated_array_types = String::new(); + + for (vals, max_length) in array_types_to_generate { + let ops_name = generated_array_ops_name(&vals, max_length); + generated_array_types.push_str(&format!( + " + kernel::make_param_ops!( + {ops_name}, + kernel::module_param::ArrayParam<{vals}, {{ {max_length} }}> + ); + ", + ops_name = ops_name, + vals = vals, + max_length = max_length, + )); + } + + format!( + " + /// The module name. + /// + /// Used by the printing macros, e.g. [`info!`]. 
+ const __LOG_PREFIX: &[u8] = b\"{name}\\0\"; + + /// The \"Rust loadable module\" mark, for `scripts/is_rust_module.sh`. + // + // This may be best done another way later on, e.g. as a new modinfo + // key or a new section. For the moment, keep it simple. + #[cfg(MODULE)] + #[doc(hidden)] + #[used] + static __IS_RUST_MODULE: () = (); + + static mut __MOD: Option<{type_}> = None; + + // SAFETY: `__this_module` is constructed by the kernel at load time and will not be freed until the module is unloaded. + #[cfg(MODULE)] + static THIS_MODULE: kernel::ThisModule = unsafe {{ kernel::ThisModule::from_ptr(&kernel::bindings::__this_module as *const _ as *mut _) }}; + #[cfg(not(MODULE))] + static THIS_MODULE: kernel::ThisModule = unsafe {{ kernel::ThisModule::from_ptr(core::ptr::null_mut()) }}; + + // Loadable modules need to export the `{{init,cleanup}}_module` identifiers. + #[cfg(MODULE)] + #[doc(hidden)] + #[no_mangle] + pub extern \"C\" fn init_module() -> kernel::c_types::c_int {{ + __init() + }} + + #[cfg(MODULE)] + #[doc(hidden)] + #[no_mangle] + pub extern \"C\" fn cleanup_module() {{ + __exit() + }} + + // Built-in modules are initialized through an initcall pointer + // and the identifiers need to be unique. + #[cfg(not(MODULE))] + #[cfg(not(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS))] + #[doc(hidden)] + #[link_section = \"{initcall_section}\"] + #[used] + pub static __{name}_initcall: extern \"C\" fn() -> kernel::c_types::c_int = __{name}_init; + + #[cfg(not(MODULE))] + #[cfg(CONFIG_HAVE_ARCH_PREL32_RELOCATIONS)] + core::arch::global_asm!( + r#\".section \"{initcall_section}\", \"a\" + __{name}_initcall: + .long __{name}_init - . 
+ .previous + \"# + ); + + #[cfg(not(MODULE))] + #[doc(hidden)] + #[no_mangle] + pub extern \"C\" fn __{name}_init() -> kernel::c_types::c_int {{ + __init() + }} + + #[cfg(not(MODULE))] + #[doc(hidden)] + #[no_mangle] + pub extern \"C\" fn __{name}_exit() {{ + __exit() + }} + + fn __init() -> kernel::c_types::c_int {{ + match <{type_} as kernel::Module>::init(kernel::c_str!(\"{name}\"), &THIS_MODULE) {{ + Ok(m) => {{ + unsafe {{ + __MOD = Some(m); + }} + return 0; + }} + Err(e) => {{ + return e.to_kernel_errno(); + }} + }} + }} + + fn __exit() {{ + unsafe {{ + // Invokes `drop()` on `__MOD`, which should be used for cleanup. + __MOD = None; + }} + }} + + {modinfo} + + {generated_array_types} + ", + type_ = info.type_, + name = info.name, + modinfo = modinfo.buffer, + generated_array_types = generated_array_types, + initcall_section = ".initcall6.init" + ).parse().expect("Error parsing formatted string into token stream.") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_permissions_are_readonly() { + assert!(permissions_are_readonly("0b000000000")); + assert!(permissions_are_readonly("0o000")); + assert!(permissions_are_readonly("000")); + assert!(permissions_are_readonly("0x000")); + + assert!(!permissions_are_readonly("0b111111111")); + assert!(!permissions_are_readonly("0o777")); + assert!(!permissions_are_readonly("511")); + assert!(!permissions_are_readonly("0x1ff")); + + assert!(permissions_are_readonly("0o014")); + assert!(permissions_are_readonly("0o015")); + + assert!(!permissions_are_readonly("0o214")); + assert!(!permissions_are_readonly("0o024")); + assert!(!permissions_are_readonly("0o012")); + + assert!(!permissions_are_readonly("0o315")); + assert!(!permissions_are_readonly("0o065")); + assert!(!permissions_are_readonly("0o017")); + } +} From 8dff7ef93192716cd421a78a2ee7bea40409a142 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Fri, 11 Feb 2022 20:23:34 +0100 Subject: [PATCH 0025/1250] rust: add `kernel` crate's 
`sync` module This module contains the kernel APIs related to synchronisation that have been ported or wrapped for usage by Rust code in the kernel and is shared by all of them. Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Geoffrey Thomas Signed-off-by: Geoffrey Thomas Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Daniel Xu Signed-off-by: Daniel Xu Co-developed-by: Hsiang-Cheng Yang Signed-off-by: Hsiang-Cheng Yang Signed-off-by: Wedson Almeida Filho Co-developed-by: Miguel Ojeda Signed-off-by: Miguel Ojeda --- include/linux/spinlock.h | 25 +- rust/kernel/sync.rs | 161 +++++++++++ rust/kernel/sync/arc.rs | 503 ++++++++++++++++++++++++++++++++++ rust/kernel/sync/condvar.rs | 138 ++++++++++ rust/kernel/sync/guard.rs | 169 ++++++++++++ rust/kernel/sync/locked_by.rs | 111 ++++++++ rust/kernel/sync/mutex.rs | 153 +++++++++++ rust/kernel/sync/nowait.rs | 188 +++++++++++++ rust/kernel/sync/revocable.rs | 250 +++++++++++++++++ rust/kernel/sync/rwsem.rs | 197 +++++++++++++ rust/kernel/sync/seqlock.rs | 202 ++++++++++++++ rust/kernel/sync/smutex.rs | 295 ++++++++++++++++++++ rust/kernel/sync/spinlock.rs | 360 ++++++++++++++++++++++++ 13 files changed, 2745 insertions(+), 7 deletions(-) create mode 100644 rust/kernel/sync.rs create mode 100644 rust/kernel/sync/arc.rs create mode 100644 rust/kernel/sync/condvar.rs create mode 100644 rust/kernel/sync/guard.rs create mode 100644 rust/kernel/sync/locked_by.rs create mode 100644 rust/kernel/sync/mutex.rs create mode 100644 rust/kernel/sync/nowait.rs create mode 100644 rust/kernel/sync/revocable.rs create mode 100644 rust/kernel/sync/rwsem.rs create mode 100644 rust/kernel/sync/seqlock.rs create mode 100644 rust/kernel/sync/smutex.rs create mode 100644 rust/kernel/sync/spinlock.rs diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 5c0c5174155d05..cdcbf9d9c70c70 100644 --- 
a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -99,11 +99,17 @@ extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, struct lock_class_key *key, short inner); +static inline void _raw_spin_lock_init(raw_spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ + __raw_spin_lock_init(lock, name, key, LD_WAIT_SPIN); +} + # define raw_spin_lock_init(lock) \ do { \ static struct lock_class_key __key; \ \ - __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN); \ + _raw_spin_lock_init((lock), #lock, &__key); \ } while (0) #else @@ -326,12 +332,17 @@ static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) #ifdef CONFIG_DEBUG_SPINLOCK -# define spin_lock_init(lock) \ -do { \ - static struct lock_class_key __key; \ - \ - __raw_spin_lock_init(spinlock_check(lock), \ - #lock, &__key, LD_WAIT_CONFIG); \ +static inline void __spin_lock_init(spinlock_t *lock, const char *name, + struct lock_class_key *key) +{ + __raw_spin_lock_init(spinlock_check(lock), name, key, LD_WAIT_CONFIG); +} + +# define spin_lock_init(lock) \ +do { \ + static struct lock_class_key __key; \ + \ + __spin_lock_init(lock, #lock, &__key); \ } while (0) #else diff --git a/rust/kernel/sync.rs b/rust/kernel/sync.rs new file mode 100644 index 00000000000000..66536fe2ba18fe --- /dev/null +++ b/rust/kernel/sync.rs @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Synchronisation primitives. +//! +//! This module contains the kernel APIs related to synchronisation that have been ported or +//! wrapped for usage by Rust code in the kernel and is shared by all of them. +//! +//! # Example +//! +//! ``` +//! # use kernel::mutex_init; +//! # use kernel::sync::Mutex; +//! # use alloc::boxed::Box; +//! # use core::pin::Pin; +//! // SAFETY: `init` is called below. +//! let mut data = Pin::from(Box::try_new(unsafe { Mutex::new(10) }).unwrap()); +//! mutex_init!(data.as_mut(), "test::data"); +//! +//! assert_eq!(*data.lock(), 10); +//! 
*data.lock() = 20; +//! assert_eq!(*data.lock(), 20); +//! ``` + +use crate::{bindings, str::CStr}; +use core::pin::Pin; + +mod arc; +mod condvar; +mod guard; +mod locked_by; +mod mutex; +mod nowait; +mod revocable; +mod rwsem; +mod seqlock; +pub mod smutex; +mod spinlock; + +pub use arc::{Ref, RefBorrow, UniqueRef}; +pub use condvar::CondVar; +pub use guard::{Guard, Lock, LockFactory, LockInfo, LockIniter, ReadLock, WriteLock}; +pub use locked_by::LockedBy; +pub use mutex::{Mutex, RevocableMutex, RevocableMutexGuard}; +pub use nowait::{NoWaitLock, NoWaitLockGuard}; +pub use revocable::{Revocable, RevocableGuard}; +pub use rwsem::{RevocableRwSemaphore, RevocableRwSemaphoreGuard, RwSemaphore}; +pub use seqlock::{SeqLock, SeqLockReadGuard}; +pub use spinlock::{RawSpinLock, SpinLock}; + +/// Safely initialises an object that has an `init` function that takes a name and a lock class as +/// arguments, examples of these are [`Mutex`] and [`SpinLock`]. Each of them also provides a more +/// specialised name that uses this macro. +#[doc(hidden)] +#[macro_export] +macro_rules! init_with_lockdep { + ($obj:expr, $name:expr) => {{ + static mut CLASS1: core::mem::MaybeUninit<$crate::bindings::lock_class_key> = + core::mem::MaybeUninit::uninit(); + static mut CLASS2: core::mem::MaybeUninit<$crate::bindings::lock_class_key> = + core::mem::MaybeUninit::uninit(); + let obj = $obj; + let name = $crate::c_str!($name); + // SAFETY: `CLASS1` and `CLASS2` are never used by Rust code directly; the C portion of the + // kernel may change it though. + #[allow(unused_unsafe)] + unsafe { + $crate::sync::NeedsLockClass::init(obj, name, CLASS1.as_mut_ptr(), CLASS2.as_mut_ptr()) + }; + }}; +} + +/// A trait for types that need a lock class during initialisation. +/// +/// Implementers of this trait benefit from the [`init_with_lockdep`] macro that generates a new +/// class for each initialisation call site. 
+pub trait NeedsLockClass { + /// Initialises the type instance so that it can be safely used. + /// + /// Callers are encouraged to use the [`init_with_lockdep`] macro as it automatically creates a + /// new lock class on each usage. + /// + /// # Safety + /// + /// `key1` and `key2` must point to valid memory locations and remain valid until `self` is + /// dropped. + unsafe fn init( + self: Pin<&mut Self>, + name: &'static CStr, + key1: *mut bindings::lock_class_key, + key2: *mut bindings::lock_class_key, + ); +} + +/// Automatically initialises static instances of synchronisation primitives. +/// +/// The syntax resembles that of regular static variables, except that the value assigned is that +/// of the protected type (if one exists). In the examples below, all primitives except for +/// [`CondVar`] require the inner value to be supplied. +/// +/// # Examples +/// +/// ```ignore +/// # use kernel::{init_static_sync, sync::{CondVar, Mutex, RevocableMutex, SpinLock}}; +/// struct Test { +/// a: u32, +/// b: u32, +/// } +/// +/// init_static_sync! { +/// static A: Mutex<Test> = Test { a: 10, b: 20 }; +/// +/// /// Documentation for `B`. +/// pub static B: Mutex<u32> = 0; +/// +/// pub(crate) static C: SpinLock<Test> = Test { a: 10, b: 20 }; +/// static D: CondVar; +/// +/// static E: RevocableMutex<Test> = Test { a: 30, b: 40 }; +/// } +/// ``` +#[macro_export] +macro_rules! init_static_sync { + ($($(#[$outer:meta])* $v:vis static $id:ident : $t:ty $(= $value:expr)?;)*) => { + $( + $(#[$outer])* + $v static $id: $t = { + #[link_section = ".init_array"] + #[used] + static TMP: extern "C" fn() = { + extern "C" fn constructor() { + // SAFETY: This locally-defined function is only called from a constructor, + // which guarantees that `$id` is not accessible from other threads + // concurrently. + #[allow(clippy::cast_ref_to_mut)] + let mutable = unsafe { &mut *(&$id as *const _ as *mut $t) }; + // SAFETY: It's a shared static, so it cannot move.
+ let pinned = unsafe { core::pin::Pin::new_unchecked(mutable) }; + $crate::init_with_lockdep!(pinned, stringify!($id)); + } + constructor + }; + $crate::init_static_sync!(@call_new $t, $($value)?) + }; + )* + }; + (@call_new $t:ty, $value:expr) => {{ + let v = $value; + // SAFETY: the initialisation function is called by the constructor above. + unsafe { <$t>::new(v) } + }}; + (@call_new $t:ty,) => { + // SAFETY: the initialisation function is called by the constructor above. + unsafe { <$t>::new() } + }; +} + +/// Reschedules the caller's task if needed. +pub fn cond_resched() -> bool { + // SAFETY: No arguments, reschedules `current` if needed. + unsafe { bindings::cond_resched() != 0 } +} diff --git a/rust/kernel/sync/arc.rs b/rust/kernel/sync/arc.rs new file mode 100644 index 00000000000000..056d2bae632ad8 --- /dev/null +++ b/rust/kernel/sync/arc.rs @@ -0,0 +1,503 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A reference-counted pointer. +//! +//! This module implements a way for users to create reference-counted objects and pointers to +//! them. Such a pointer automatically increments and decrements the count, and drops the +//! underlying object when it reaches zero. It is also safe to use concurrently from multiple +//! threads. +//! +//! It is different from the standard library's [`Arc`] in a few ways: +//! 1. It is backed by the kernel's `refcount_t` type. +//! 2. It does not support weak references, which allows it to be half the size. +//! 3. It saturates the reference count instead of aborting when it goes over a threshold. +//! 4. It does not provide a `get_mut` method, so the ref counted object is pinned. +//! +//! 
[`Arc`]: https://doc.rust-lang.org/std/sync/struct.Arc.html + +use crate::{bindings, error::code::*, Error, Opaque, Result}; +use alloc::{ + alloc::{alloc, dealloc}, + vec::Vec, +}; +use core::{ + alloc::Layout, + convert::{AsRef, TryFrom}, + marker::{PhantomData, Unsize}, + mem::{ManuallyDrop, MaybeUninit}, + ops::{Deref, DerefMut}, + pin::Pin, + ptr::{self, NonNull}, +}; + +/// A reference-counted pointer to an instance of `T`. +/// +/// The reference count is incremented when new instances of [`Ref`] are created, and decremented +/// when they are dropped. When the count reaches zero, the underlying `T` is also dropped. +/// +/// # Invariants +/// +/// The reference count on an instance of [`Ref`] is always non-zero. +/// The object pointed to by [`Ref`] is always pinned. +pub struct Ref { + ptr: NonNull>, + _p: PhantomData>, +} + +#[repr(C)] +struct RefInner { + refcount: Opaque, + data: T, +} + +// This is to allow [`Ref`] (and variants) to be used as the type of `self`. +impl core::ops::Receiver for Ref {} + +// This is to allow [`RefBorrow`] (and variants) to be used as the type of `self`. +impl core::ops::Receiver for RefBorrow<'_, T> {} + +// This is to allow coercion from `Ref` to `Ref` if `T` can be converted to the +// dynamically-sized type (DST) `U`. +impl, U: ?Sized> core::ops::CoerceUnsized> for Ref {} + +// This is to allow `Ref` to be dispatched on when `Ref` can be coerced into `Ref`. +impl, U: ?Sized> core::ops::DispatchFromDyn> for Ref {} + +// SAFETY: It is safe to send `Ref` to another thread when the underlying `T` is `Sync` because +// it effectively means sharing `&T` (which is safe because `T` is `Sync`); additionally, it needs +// `T` to be `Send` because any thread that has a `Ref` may ultimately access `T` directly, for +// example, when the reference count reaches zero and `T` is dropped. 
+unsafe impl Send for Ref {} + +// SAFETY: It is safe to send `&Ref` to another thread when the underlying `T` is `Sync` for +// the same reason as above. `T` needs to be `Send` as well because a thread can clone a `&Ref` +// into a `Ref`, which may lead to `T` being accessed by the same reasoning as above. +unsafe impl Sync for Ref {} + +impl Ref { + /// Constructs a new reference counted instance of `T`. + pub fn try_new(contents: T) -> Result { + let layout = Layout::new::>(); + // SAFETY: The layout size is guaranteed to be non-zero because `RefInner` contains the + // reference count. + let inner = NonNull::new(unsafe { alloc(layout) }) + .ok_or(ENOMEM)? + .cast::>(); + + // INVARIANT: The refcount is initialised to a non-zero value. + let value = RefInner { + // SAFETY: Just an FFI call that returns a `refcount_t` initialised to 1. + refcount: Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }), + data: contents, + }; + // SAFETY: `inner` is writable and properly aligned. + unsafe { inner.as_ptr().write(value) }; + + // SAFETY: We just created `inner` with a reference count of 1, which is owned by the new + // `Ref` object. + Ok(unsafe { Self::from_inner(inner) }) + } + + /// Deconstructs a [`Ref`] object into a `usize`. + /// + /// It can be reconstructed once via [`Ref::from_usize`]. + pub fn into_usize(obj: Self) -> usize { + ManuallyDrop::new(obj).ptr.as_ptr() as _ + } + + /// Borrows a [`Ref`] instance previously deconstructed via [`Ref::into_usize`]. + /// + /// # Safety + /// + /// `encoded` must have been returned by a previous call to [`Ref::into_usize`]. Additionally, + /// [`Ref::from_usize`] can only be called after *all* instances of [`RefBorrow`] have been + /// dropped. + pub unsafe fn borrow_usize<'a>(encoded: usize) -> RefBorrow<'a, T> { + // SAFETY: By the safety requirement of this function, we know that `encoded` came from + // a previous call to `Ref::into_usize`. 
+ let inner = NonNull::new(encoded as *mut RefInner).unwrap(); + + // SAFETY: The safety requirements ensure that the object remains alive for the lifetime of + // the returned value. There is no way to create mutable references to the object. + unsafe { RefBorrow::new(inner) } + } + + /// Recreates a [`Ref`] instance previously deconstructed via [`Ref::into_usize`]. + /// + /// # Safety + /// + /// `encoded` must have been returned by a previous call to [`Ref::into_usize`]. Additionally, + /// it can only be called once for each previous call to [`Ref::into_usize`]. + pub unsafe fn from_usize(encoded: usize) -> Self { + // SAFETY: By the safety invariants we know that `encoded` came from `Ref::into_usize`, so + // the reference count held then will be owned by the new `Ref` object. + unsafe { Self::from_inner(NonNull::new(encoded as _).unwrap()) } + } +} + +impl Ref { + /// Constructs a new [`Ref`] from an existing [`RefInner`]. + /// + /// # Safety + /// + /// The caller must ensure that `inner` points to a valid location and has a non-zero reference + /// count, one of which will be owned by the new [`Ref`] instance. + unsafe fn from_inner(inner: NonNull>) -> Self { + // INVARIANT: By the safety requirements, the invariants hold. + Ref { + ptr: inner, + _p: PhantomData, + } + } + + /// Determines if two reference-counted pointers point to the same underlying instance of `T`. + pub fn ptr_eq(a: &Self, b: &Self) -> bool { + ptr::eq(a.ptr.as_ptr(), b.ptr.as_ptr()) + } + + /// Deconstructs a [`Ref`] object into a raw pointer. + /// + /// It can be reconstructed once via [`Ref::from_raw`]. + pub fn into_raw(obj: Self) -> *const T { + let ret = &*obj as *const T; + core::mem::forget(obj); + ret + } + + /// Recreates a [`Ref`] instance previously deconstructed via [`Ref::into_raw`]. + /// + /// This code relies on the `repr(C)` layout of structs as described in + /// . 
+ /// + /// # Safety + /// + /// `ptr` must have been returned by a previous call to [`Ref::into_raw`]. Additionally, it + /// can only be called once for each previous call to [`Ref::into_raw`]. + pub unsafe fn from_raw(ptr: *const T) -> Self { + // SAFETY: The safety requirement ensures that the pointer is valid. + let align = core::mem::align_of_val(unsafe { &*ptr }); + let offset = Layout::new::>() + .align_to(align) + .unwrap() + .pad_to_align() + .size(); + // SAFETY: The pointer is in bounds because by the safety requirements `ptr` came from + // `Ref::into_raw`, so it is a pointer `offset` bytes from the beginning of the allocation. + let data = unsafe { (ptr as *const u8).sub(offset) }; + let metadata = ptr::metadata(ptr as *const RefInner); + let ptr = ptr::from_raw_parts_mut(data as _, metadata); + // SAFETY: By the safety requirements we know that `ptr` came from `Ref::into_raw`, so the + // reference count held then will be owned by the new `Ref` object. + unsafe { Self::from_inner(NonNull::new(ptr).unwrap()) } + } + + /// Returns a [`RefBorrow`] from the given [`Ref`]. + /// + /// This is useful when the argument of a function call is a [`RefBorrow`] (e.g., in a method + /// receiver), but we have a [`Ref`] instead. Getting a [`RefBorrow`] is free when optimised. + #[inline] + pub fn as_ref_borrow(&self) -> RefBorrow<'_, T> { + // SAFETY: The constraint that lifetime of the shared reference must outlive that of + // the returned `RefBorrow` ensures that the object remains alive. + unsafe { RefBorrow::new(self.ptr) } + } +} + +impl Deref for Ref { + type Target = T; + + fn deref(&self) -> &Self::Target { + // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is + // safe to dereference it. + unsafe { &self.ptr.as_ref().data } + } +} + +impl Clone for Ref { + fn clone(&self) -> Self { + // INVARIANT: C `refcount_inc` saturates the refcount, so it cannot overflow to zero. 
+ // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is + // safe to increment the refcount. + unsafe { bindings::refcount_inc(self.ptr.as_ref().refcount.get()) }; + + // SAFETY: We just incremented the refcount. This increment is now owned by the new `Ref`. + unsafe { Self::from_inner(self.ptr) } + } +} + +impl AsRef for Ref { + fn as_ref(&self) -> &T { + // SAFETY: By the type invariant, there is necessarily a reference to the object, so it is + // safe to dereference it. + unsafe { &self.ptr.as_ref().data } + } +} + +impl Drop for Ref { + fn drop(&mut self) { + // SAFETY: By the type invariant, there is necessarily a reference to the object. We cannot + // touch `refcount` after it's decremented to a non-zero value because another thread/CPU + // may concurrently decrement it to zero and free it. It is ok to have a raw pointer to + // freed/invalid memory as long as it is never dereferenced. + let refcount = unsafe { self.ptr.as_ref() }.refcount.get(); + + // INVARIANT: If the refcount reaches zero, there are no other instances of `Ref`, and + // this instance is being dropped, so the broken invariant is not observable. + // SAFETY: Also by the type invariant, we are allowed to decrement the refcount. + let is_zero = unsafe { bindings::refcount_dec_and_test(refcount) }; + if is_zero { + // The count reached zero, we must free the memory. + + // SAFETY: This thread holds the only remaining reference to `self`, so it is safe to + // get a mutable reference to it. + let inner = unsafe { self.ptr.as_mut() }; + let layout = Layout::for_value(inner); + // SAFETY: The value stored in inner is valid. + unsafe { core::ptr::drop_in_place(inner) }; + // SAFETY: The pointer was initialised from the result of a call to `alloc`. 
+ unsafe { dealloc(self.ptr.cast().as_ptr(), layout) }; + } + } +} + +impl TryFrom> for Ref<[T]> { + type Error = Error; + + fn try_from(mut v: Vec) -> Result { + let value_layout = Layout::array::(v.len())?; + let layout = Layout::new::>() + .extend(value_layout)? + .0 + .pad_to_align(); + // SAFETY: The layout size is guaranteed to be non-zero because `RefInner` contains the + // reference count. + let ptr = NonNull::new(unsafe { alloc(layout) }).ok_or(ENOMEM)?; + let inner = + core::ptr::slice_from_raw_parts_mut(ptr.as_ptr() as _, v.len()) as *mut RefInner<[T]>; + + // SAFETY: Just an FFI call that returns a `refcount_t` initialised to 1. + let count = Opaque::new(unsafe { bindings::REFCOUNT_INIT(1) }); + // SAFETY: `inner.refcount` is writable and properly aligned. + unsafe { core::ptr::addr_of_mut!((*inner).refcount).write(count) }; + // SAFETY: The contents of `v` are readable and properly aligned; `inner.data` is writable + // and properly aligned. There is no overlap between the two because `inner` is a new + // allocation. + unsafe { + core::ptr::copy_nonoverlapping( + v.as_ptr(), + core::ptr::addr_of_mut!((*inner).data) as *mut [T] as *mut T, + v.len(), + ) + }; + // SAFETY: We're setting the new length to zero, so it is <= to capacity, and old_len..0 is + // an empty range (so satisfies vacuously the requirement of being initialised). + unsafe { v.set_len(0) }; + // SAFETY: We just created `inner` with a reference count of 1, which is owned by the new + // `Ref` object. + Ok(unsafe { Self::from_inner(NonNull::new(inner).unwrap()) }) + } +} + +impl From> for Ref { + fn from(item: UniqueRef) -> Self { + item.inner + } +} + +impl From> for Pin> { + fn from(obj: UniqueRef) -> Self { + // SAFETY: It is not possible to move/replace `T` inside a `Pin>` (unless `T` + // is `Unpin`), so it is ok to convert it to `Pin>`. 
+ unsafe { Pin::new_unchecked(obj) } + } +} + +impl From>> for Ref { + fn from(item: Pin>) -> Self { + // SAFETY: The type invariants of `Ref` guarantee that the data is pinned. + unsafe { Pin::into_inner_unchecked(item).inner } + } +} + +/// A borrowed [`Ref`] with manually-managed lifetime. +/// +/// # Invariants +/// +/// There are no mutable references to the underlying [`Ref`], and it remains valid for the lifetime +/// of the [`RefBorrow`] instance. +pub struct RefBorrow<'a, T: ?Sized + 'a> { + inner: NonNull>, + _p: PhantomData<&'a ()>, +} + +impl Clone for RefBorrow<'_, T> { + fn clone(&self) -> Self { + *self + } +} + +impl Copy for RefBorrow<'_, T> {} + +impl RefBorrow<'_, T> { + /// Creates a new [`RefBorrow`] instance. + /// + /// # Safety + /// + /// Callers must ensure the following for the lifetime of the returned [`RefBorrow`] instance: + /// 1. That `obj` remains valid; + /// 2. That no mutable references to `obj` are created. + unsafe fn new(inner: NonNull>) -> Self { + // INVARIANT: The safety requirements guarantee the invariants. + Self { + inner, + _p: PhantomData, + } + } +} + +impl From> for Ref { + fn from(b: RefBorrow<'_, T>) -> Self { + // SAFETY: The existence of `b` guarantees that the refcount is non-zero. `ManuallyDrop` + // guarantees that `drop` isn't called, so it's ok that the temporary `Ref` doesn't own the + // increment. + ManuallyDrop::new(unsafe { Ref::from_inner(b.inner) }) + .deref() + .clone() + } +} + +impl Deref for RefBorrow<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + // SAFETY: By the type invariant, the underlying object is still alive with no mutable + // references to it, so it is safe to create a shared reference. + unsafe { &self.inner.as_ref().data } + } +} + +/// A refcounted object that is known to have a refcount of 1. +/// +/// It is mutable and can be converted to a [`Ref`] so that it can be shared. +/// +/// # Invariants +/// +/// `inner` always has a reference count of 1. 
+/// +/// # Examples +/// +/// In the following example, we make changes to the inner object before turning it into a +/// `Ref` object (after which point, it cannot be mutated directly). Note that `x.into()` +/// cannot fail. +/// +/// ``` +/// use kernel::sync::{Ref, UniqueRef}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn test() -> Result> { +/// let mut x = UniqueRef::try_new(Example { a: 10, b: 20 })?; +/// x.a += 1; +/// x.b += 1; +/// Ok(x.into()) +/// } +/// +/// # test(); +/// ``` +/// +/// In the following example we first allocate memory for a ref-counted `Example` but we don't +/// initialise it on allocation. We do initialise it later with a call to [`UniqueRef::write`], +/// followed by a conversion to `Ref`. This is particularly useful when allocation happens +/// in one context (e.g., sleepable) and initialisation in another (e.g., atomic): +/// +/// ``` +/// use kernel::sync::{Ref, UniqueRef}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn test() -> Result> { +/// let x = UniqueRef::try_new_uninit()?; +/// Ok(x.write(Example { a: 10, b: 20 }).into()) +/// } +/// +/// # test(); +/// ``` +/// +/// In the last example below, the caller gets a pinned instance of `Example` while converting to +/// `Ref`; this is useful in scenarios where one needs a pinned reference during +/// initialisation, for example, when initialising fields that are wrapped in locks. +/// +/// ``` +/// use kernel::sync::{Ref, UniqueRef}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn test() -> Result> { +/// let mut pinned = Pin::from(UniqueRef::try_new(Example { a: 10, b: 20 })?); +/// // We can modify `pinned` because it is `Unpin`. +/// pinned.as_mut().a += 1; +/// Ok(pinned.into()) +/// } +/// +/// # test(); +/// ``` +pub struct UniqueRef { + inner: Ref, +} + +impl UniqueRef { + /// Tries to allocate a new [`UniqueRef`] instance. 
+ pub fn try_new(value: T) -> Result { + Ok(Self { + // INVARIANT: The newly-created object has a ref-count of 1. + inner: Ref::try_new(value)?, + }) + } + + /// Tries to allocate a new [`UniqueRef`] instance whose contents are not initialised yet. + pub fn try_new_uninit() -> Result>> { + Ok(UniqueRef::> { + // INVARIANT: The newly-created object has a ref-count of 1. + inner: Ref::try_new(MaybeUninit::uninit())?, + }) + } +} + +impl UniqueRef> { + /// Converts a `UniqueRef>` into a `UniqueRef` by writing a value into it. + pub fn write(mut self, value: T) -> UniqueRef { + self.deref_mut().write(value); + let inner = ManuallyDrop::new(self).inner.ptr; + UniqueRef { + // SAFETY: The new `Ref` is taking over `ptr` from `self.inner` (which won't be + // dropped). The types are compatible because `MaybeUninit` is compatible with `T`. + inner: unsafe { Ref::from_inner(inner.cast()) }, + } + } +} + +impl Deref for UniqueRef { + type Target = T; + + fn deref(&self) -> &Self::Target { + self.inner.deref() + } +} + +impl DerefMut for UniqueRef { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: By the `Ref` type invariant, there is necessarily a reference to the object, so + // it is safe to dereference it. Additionally, we know there is only one reference when + // it's inside a `UniqueRef`, so it is safe to get a mutable reference. + unsafe { &mut self.inner.ptr.as_mut().data } + } +} diff --git a/rust/kernel/sync/condvar.rs b/rust/kernel/sync/condvar.rs new file mode 100644 index 00000000000000..7f8aa1c55a1925 --- /dev/null +++ b/rust/kernel/sync/condvar.rs @@ -0,0 +1,138 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A condition variable. +//! +//! This module allows Rust code to use the kernel's [`struct wait_queue_head`] as a condition +//! variable. 
+ +use super::{Guard, Lock, LockInfo, NeedsLockClass}; +use crate::{bindings, str::CStr, task::Task, Opaque}; +use core::{marker::PhantomPinned, pin::Pin}; + +/// Safely initialises a [`CondVar`] with the given name, generating a new lock class. +#[macro_export] +macro_rules! condvar_init { + ($condvar:expr, $name:literal) => { + $crate::init_with_lockdep!($condvar, $name) + }; +} + +// TODO: `bindgen` is not generating this constant. Figure out why. +const POLLFREE: u32 = 0x4000; + +/// Exposes the kernel's [`struct wait_queue_head`] as a condition variable. It allows the caller to +/// atomically release the given lock and go to sleep. It reacquires the lock when it wakes up. And +/// it wakes up when notified by another thread (via [`CondVar::notify_one`] or +/// [`CondVar::notify_all`]) or because the thread received a signal. +/// +/// [`struct wait_queue_head`]: ../../../include/linux/wait.h +pub struct CondVar { + pub(crate) wait_list: Opaque, + + /// A condvar needs to be pinned because it contains a [`struct list_head`] that is + /// self-referential, so it cannot be safely moved once it is initialised. + _pin: PhantomPinned, +} + +// SAFETY: `CondVar` only uses a `struct wait_queue_head`, which is safe to use on any thread. +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for CondVar {} + +// SAFETY: `CondVar` only uses a `struct wait_queue_head`, which is safe to use on multiple threads +// concurrently. +unsafe impl Sync for CondVar {} + +impl CondVar { + /// Constructs a new conditional variable. + /// + /// # Safety + /// + /// The caller must call `CondVar::init` before using the conditional variable. + pub const unsafe fn new() -> Self { + Self { + wait_list: Opaque::uninit(), + _pin: PhantomPinned, + } + } + + /// Atomically releases the given lock (whose ownership is proven by the guard) and puts the + /// thread to sleep. 
It wakes up when notified by [`CondVar::notify_one`] or + /// [`CondVar::notify_all`], or when the thread receives a signal. + /// + /// Returns whether there is a signal pending. + #[must_use = "wait returns if a signal is pending, so the caller must check the return value"] + pub fn wait, I: LockInfo>(&self, guard: &mut Guard<'_, L, I>) -> bool { + let lock = guard.lock; + let wait = Opaque::::uninit(); + + // SAFETY: `wait` points to valid memory. + unsafe { bindings::init_wait(wait.get()) }; + + // SAFETY: Both `wait` and `wait_list` point to valid memory. + unsafe { + bindings::prepare_to_wait_exclusive( + self.wait_list.get(), + wait.get(), + bindings::TASK_INTERRUPTIBLE as _, + ) + }; + + // SAFETY: The guard is evidence that the caller owns the lock. + unsafe { lock.unlock(&mut guard.context) }; + + // SAFETY: No arguments, switches to another thread. + unsafe { bindings::schedule() }; + + lock.relock(&mut guard.context); + + // SAFETY: Both `wait` and `wait_list` point to valid memory. + unsafe { bindings::finish_wait(self.wait_list.get(), wait.get()) }; + + Task::current().signal_pending() + } + + /// Calls the kernel function to notify the appropriate number of threads with the given flags. + fn notify(&self, count: i32, flags: u32) { + // SAFETY: `wait_list` points to valid memory. + unsafe { + bindings::__wake_up( + self.wait_list.get(), + bindings::TASK_NORMAL, + count, + flags as _, + ) + }; + } + + /// Wakes a single waiter up, if any. This is not 'sticky' in the sense that if no thread is + /// waiting, the notification is lost completely (as opposed to automatically waking up the + /// next waiter). + pub fn notify_one(&self) { + self.notify(1, 0); + } + + /// Wakes all waiters up, if any. This is not 'sticky' in the sense that if no thread is + /// waiting, the notification is lost completely (as opposed to automatically waking up the + /// next waiter). + pub fn notify_all(&self) { + self.notify(0, 0); + } + + /// Wakes all waiters up. 
If they were added by `epoll`, they are also removed from the list of + /// waiters. This is useful when cleaning up a condition variable that may be waited on by + /// threads that use `epoll`. + pub fn free_waiters(&self) { + self.notify(1, bindings::POLLHUP | POLLFREE); + } +} + +impl NeedsLockClass for CondVar { + unsafe fn init( + self: Pin<&mut Self>, + name: &'static CStr, + key: *mut bindings::lock_class_key, + _: *mut bindings::lock_class_key, + ) { + unsafe { bindings::__init_waitqueue_head(self.wait_list.get(), name.as_char_ptr(), key) }; + } +} diff --git a/rust/kernel/sync/guard.rs b/rust/kernel/sync/guard.rs new file mode 100644 index 00000000000000..b825e0cf70b004 --- /dev/null +++ b/rust/kernel/sync/guard.rs @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A generic lock guard and trait. +//! +//! This module contains a lock guard that can be used with any locking primitive that implements +//! the ([`Lock`]) trait. It also contains the definition of the trait, which can be leveraged by +//! other constructs to work on generic locking primitives. + +use super::NeedsLockClass; +use crate::{bindings, str::CStr, Bool, False, True}; +use core::pin::Pin; + +/// Allows mutual exclusion primitives that implement the [`Lock`] trait to automatically unlock +/// when a guard goes out of scope. It also provides a safe and convenient way to access the data +/// protected by the lock. +#[must_use = "the lock unlocks immediately when the guard is unused"] +pub struct Guard<'a, L: Lock + ?Sized, I: LockInfo = WriteLock> { + pub(crate) lock: &'a L, + pub(crate) context: L::GuardContext, +} + +// SAFETY: `Guard` is sync when the data protected by the lock is also sync. This is more +// conservative than the default compiler implementation; more details can be found on +// https://github.com/rust-lang/rust/issues/41622 -- it refers to `MutexGuard` from the standard +// library. 
+unsafe impl Sync for Guard<'_, L, I> +where + L: Lock + ?Sized, + L::Inner: Sync, + I: LockInfo, +{ +} + +impl + ?Sized, I: LockInfo> core::ops::Deref for Guard<'_, L, I> { + type Target = L::Inner; + + fn deref(&self) -> &Self::Target { + // SAFETY: The caller owns the lock, so it is safe to deref the protected data. + unsafe { &*self.lock.locked_data().get() } + } +} + +impl + ?Sized, I: LockInfo> core::ops::DerefMut for Guard<'_, L, I> { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: The caller owns the lock, so it is safe to deref the protected data. + unsafe { &mut *self.lock.locked_data().get() } + } +} + +impl + ?Sized, I: LockInfo> Drop for Guard<'_, L, I> { + fn drop(&mut self) { + // SAFETY: The caller owns the lock, so it is safe to unlock it. + unsafe { self.lock.unlock(&mut self.context) }; + } +} + +impl<'a, L: Lock + ?Sized, I: LockInfo> Guard<'a, L, I> { + /// Constructs a new immutable lock guard. + /// + /// # Safety + /// + /// The caller must ensure that it owns the lock. + pub(crate) unsafe fn new(lock: &'a L, context: L::GuardContext) -> Self { + Self { lock, context } + } +} + +/// Specifies properties of a lock. +pub trait LockInfo { + /// Determines if the data protected by a lock is writable. + type Writable: Bool; +} + +/// A marker for locks that only allow reading. +pub struct ReadLock; +impl LockInfo for ReadLock { + type Writable = False; +} + +/// A marker for locks that allow reading and writing. +pub struct WriteLock; +impl LockInfo for WriteLock { + type Writable = True; +} + +/// A generic mutual exclusion primitive. +/// +/// [`Guard`] is written such that any mutual exclusion primitive that can implement this trait can +/// also benefit from having an automatic way to unlock itself. 
+/// +/// # Safety +/// +/// - Implementers of this trait with the [`WriteLock`] marker must ensure that only one thread/CPU +/// may access the protected data once the lock is held, that is, between calls to `lock_noguard` +/// and `unlock`. +/// - Implementers of all other markers must ensure that a mutable reference to the protected data +/// is not active in any thread/CPU because at least one shared reference is active between calls +/// to `lock_noguard` and `unlock`. +pub unsafe trait Lock { + /// The type of the data protected by the lock. + type Inner: ?Sized; + + /// The type of context, if any, that needs to be stored in the guard. + type GuardContext; + + /// Acquires the lock, making the caller its owner. + #[must_use] + fn lock_noguard(&self) -> Self::GuardContext; + + /// Reacquires the lock, making the caller its owner. + /// + /// The guard context before the last unlock is passed in. + /// + /// Locks that don't require this state on relock can simply use the default implementation + /// that calls [`Lock::lock_noguard`]. + fn relock(&self, ctx: &mut Self::GuardContext) { + *ctx = self.lock_noguard(); + } + + /// Releases the lock, giving up ownership of the lock. + /// + /// # Safety + /// + /// It must only be called by the current owner of the lock. + unsafe fn unlock(&self, context: &mut Self::GuardContext); + + /// Returns the data protected by the lock. + fn locked_data(&self) -> &core::cell::UnsafeCell; +} + +/// A creator of instances of a mutual exclusion (lock) primitive. +pub trait LockFactory { + /// The parametrised type of the mutual exclusion primitive that can be created by this factory. + type LockedType; + + /// Constructs a new instance of the mutual exclusion primitive. + /// + /// # Safety + /// + /// The caller must call [`LockIniter::init_lock`] before using the lock. + unsafe fn new_lock(data: T) -> Self::LockedType; +} + +/// A lock that can be initialised with a single lock class key. 
+pub trait LockIniter { + /// Initialises the lock instance so that it can be safely used. + /// + /// # Safety + /// + /// `key` must point to a valid memory location that will remain valid until the lock is + /// dropped. + unsafe fn init_lock( + self: Pin<&mut Self>, + name: &'static CStr, + key: *mut bindings::lock_class_key, + ); +} + +impl NeedsLockClass for L { + unsafe fn init( + self: Pin<&mut Self>, + name: &'static CStr, + key: *mut bindings::lock_class_key, + _: *mut bindings::lock_class_key, + ) { + // SAFETY: The safety requirements of this function satisfy those of `init_lock`. + unsafe { self.init_lock(name, key) }; + } +} diff --git a/rust/kernel/sync/locked_by.rs b/rust/kernel/sync/locked_by.rs new file mode 100644 index 00000000000000..334935fb1e37c7 --- /dev/null +++ b/rust/kernel/sync/locked_by.rs @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A wrapper for data protected by a lock that does not wrap it. + +use super::{Guard, Lock}; +use core::{cell::UnsafeCell, ops::Deref, ptr}; + +/// Allows access to some data to be serialised by a lock that does not wrap it. +/// +/// In most cases, data protected by a lock is wrapped by the appropriate lock type, e.g., +/// [`super::Mutex`] or [`super::SpinLock`]. [`LockedBy`] is meant for cases when this is not +/// possible. For example, if a container has a lock and some data in the contained elements needs +/// to be protected by the same lock. +/// +/// [`LockedBy`] wraps the data in lieu of another locking primitive, and only allows access to it +/// when the caller shows evidence that 'external' lock is locked. +/// +/// # Example +/// +/// The following is an example for illustrative purposes: `InnerDirectory::bytes_used` is an +/// aggregate of all `InnerFile::bytes_used` and must be kept consistent; so we wrap `InnerFile` in +/// a `LockedBy` so that it shares a lock with `InnerDirectory`. 
This allows us to enforce at +/// compile-time that access to `InnerFile` is only granted when an `InnerDirectory` is also +/// locked; we enforce at run time that the right `InnerDirectory` is locked. +/// +/// ``` +/// use kernel::sync::{LockedBy, Mutex}; +/// +/// struct InnerFile { +/// bytes_used: u64, +/// } +/// +/// struct File { +/// name: String, +/// inner: LockedBy>, +/// } +/// +/// struct InnerDirectory { +/// /// The sum of the bytes used by all files. +/// bytes_used: u64, +/// files: Vec, +/// } +/// +/// struct Directory { +/// name: String, +/// inner: Mutex, +/// } +/// ``` +pub struct LockedBy { + owner: *const L::Inner, + data: UnsafeCell, +} + +// SAFETY: `LockedBy` can be transferred across thread boundaries iff the data it protects can. +unsafe impl Send for LockedBy {} + +// SAFETY: `LockedBy` serialises the interior mutability it provides, so it is `Sync` as long as the +// data it protects is `Send`. +unsafe impl Sync for LockedBy {} + +impl LockedBy { + /// Constructs a new instance of [`LockedBy`]. + /// + /// It stores a raw pointer to the owner that is never dereferenced. It is only used to ensure + /// that the right owner is being used to access the protected data. If the owner is freed, the + /// data becomes inaccessible; if another instance of the owner is allocated *on the same + /// memory location*, the data becomes accessible again: none of this affects memory safety + /// because in any case at most one thread (or CPU) can access the protected data at a time. + pub fn new(owner: &L, data: T) -> Self { + Self { + owner: owner.locked_data().get(), + data: UnsafeCell::new(data), + } + } +} + +impl LockedBy { + /// Returns a reference to the protected data when the caller provides evidence (via a + /// [`Guard`]) that the owner is locked. 
+ pub fn access<'a>(&'a self, guard: &'a Guard<'_, L>) -> &'a T { + if !ptr::eq(guard.deref(), self.owner) { + panic!("guard does not match owner"); + } + + // SAFETY: `guard` is evidence that the owner is locked; a shared reborrow suffices. + unsafe { &*self.data.get() } + } + + /// Returns a mutable reference to the protected data when the caller provides evidence (via a + /// mutable [`Guard`]) that the owner is locked mutably. + pub fn access_mut<'a>(&'a self, guard: &'a mut Guard<'_, L>) -> &'a mut T { + if !ptr::eq(guard.deref().deref(), self.owner) { + panic!("guard does not match owner"); + } + + // SAFETY: `guard` is evidence that the owner is locked. + unsafe { &mut *self.data.get() } + } + + /// Returns a mutable reference to the protected data when the caller provides evidence (via a + /// mutable owner) that the owner is locked mutably. Showing a mutable reference to the owner + /// is sufficient because we know no other references can exist to it. + pub fn access_from_mut<'a>(&'a self, owner: &'a mut L::Inner) -> &'a mut T { + if !ptr::eq(owner, self.owner) { + panic!("mismatched owners"); + } + + // SAFETY: `owner` is evidence that there is only one reference to the owner. + unsafe { &mut *self.data.get() } + } +} diff --git a/rust/kernel/sync/mutex.rs b/rust/kernel/sync/mutex.rs new file mode 100644 index 00000000000000..fac846b00b0812 --- /dev/null +++ b/rust/kernel/sync/mutex.rs @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A kernel mutex. +//! +//! This module allows Rust code to use the kernel's [`struct mutex`]. + +use super::{Guard, Lock, LockFactory, LockIniter, WriteLock}; +use crate::{bindings, str::CStr, Opaque}; +use core::{cell::UnsafeCell, marker::PhantomPinned, pin::Pin}; + +/// Safely initialises a [`Mutex`] with the given name, generating a new lock class. +#[macro_export] +macro_rules! mutex_init { + ($mutex:expr, $name:literal) => { + $crate::init_with_lockdep!($mutex, $name) + }; +} + +/// Exposes the kernel's [`struct mutex`]. 
When multiple threads attempt to lock the same mutex, +/// only one at a time is allowed to progress, the others will block (sleep) until the mutex is +/// unlocked, at which point another thread will be allowed to wake up and make progress. +/// +/// A [`Mutex`] must first be initialised with a call to [`Mutex::init_lock`] before it can be +/// used. The [`mutex_init`] macro is provided to automatically assign a new lock class to a mutex +/// instance. +/// +/// Since it may block, [`Mutex`] needs to be used with care in atomic contexts. +/// +/// [`struct mutex`]: ../../../include/linux/mutex.h +pub struct Mutex { + /// The kernel `struct mutex` object. + mutex: Opaque, + + /// A mutex needs to be pinned because it contains a [`struct list_head`] that is + /// self-referential, so it cannot be safely moved once it is initialised. + _pin: PhantomPinned, + + /// The data protected by the mutex. + data: UnsafeCell, +} + +// SAFETY: `Mutex` can be transferred across thread boundaries iff the data it protects can. +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for Mutex {} + +// SAFETY: `Mutex` serialises the interior mutability it provides, so it is `Sync` as long as the +// data it protects is `Send`. +unsafe impl Sync for Mutex {} + +impl Mutex { + /// Constructs a new mutex. + /// + /// # Safety + /// + /// The caller must call [`Mutex::init_lock`] before using the mutex. + pub const unsafe fn new(t: T) -> Self { + Self { + mutex: Opaque::uninit(), + data: UnsafeCell::new(t), + _pin: PhantomPinned, + } + } +} + +impl Mutex { + /// Locks the mutex and gives the caller access to the data protected by it. Only one thread at + /// a time is allowed to access the protected data. + pub fn lock(&self) -> Guard<'_, Self> { + let ctx = self.lock_noguard(); + // SAFETY: The mutex was just acquired. 
+ unsafe { Guard::new(self, ctx) } + } +} + +impl LockFactory for Mutex { + type LockedType = Mutex; + + unsafe fn new_lock(data: U) -> Mutex { + // SAFETY: The safety requirements of `new_lock` also require that `init_lock` be called. + unsafe { Mutex::new(data) } + } +} + +impl LockIniter for Mutex { + unsafe fn init_lock( + self: Pin<&mut Self>, + name: &'static CStr, + key: *mut bindings::lock_class_key, + ) { + unsafe { bindings::__mutex_init(self.mutex.get(), name.as_char_ptr(), key) }; + } +} + +pub struct EmptyGuardContext; + +// SAFETY: The underlying kernel `struct mutex` object ensures mutual exclusion. +unsafe impl Lock for Mutex { + type Inner = T; + type GuardContext = EmptyGuardContext; + + fn lock_noguard(&self) -> EmptyGuardContext { + // SAFETY: `mutex` points to valid memory. + unsafe { bindings::mutex_lock(self.mutex.get()) }; + EmptyGuardContext + } + + unsafe fn unlock(&self, _: &mut EmptyGuardContext) { + // SAFETY: The safety requirements of the function ensure that the mutex is owned by the + // caller. + unsafe { bindings::mutex_unlock(self.mutex.get()) }; + } + + fn locked_data(&self) -> &UnsafeCell { + &self.data + } +} + +/// A revocable mutex. +/// +/// That is, a mutex to which access can be revoked at runtime. It is a specialisation of the more +/// generic [`super::revocable::Revocable`]. +/// +/// # Examples +/// +/// ``` +/// # use kernel::sync::RevocableMutex; +/// # use kernel::revocable_init; +/// # use core::pin::Pin; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn read_sum(v: &RevocableMutex) -> Option { +/// let guard = v.try_write()?; +/// Some(guard.a + guard.b) +/// } +/// +/// // SAFETY: We call `revocable_init` immediately below. +/// let mut v = unsafe { RevocableMutex::new(Example { a: 10, b: 20 }) }; +/// // SAFETY: We never move out of `v`. 
+/// let pinned = unsafe { Pin::new_unchecked(&mut v) }; +/// revocable_init!(pinned, "example::v"); +/// assert_eq!(read_sum(&v), Some(30)); +/// v.revoke(); +/// assert_eq!(read_sum(&v), None); +/// ``` +pub type RevocableMutex = super::revocable::Revocable, T>; + +/// A guard for a revocable mutex. +pub type RevocableMutexGuard<'a, T, I = WriteLock> = + super::revocable::RevocableGuard<'a, Mutex<()>, T, I>; diff --git a/rust/kernel/sync/nowait.rs b/rust/kernel/sync/nowait.rs new file mode 100644 index 00000000000000..c9ee2f9a360743 --- /dev/null +++ b/rust/kernel/sync/nowait.rs @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A lock that never waits. + +use core::cell::UnsafeCell; +use core::sync::atomic::{AtomicU8, Ordering}; + +const LOCKED: u8 = 1; +const CONTENDED: u8 = 2; + +/// A lock that only offers a [`try_lock`](NoWaitLock::try_lock) method. +/// +/// That is, on contention it doesn't offer a way for the caller to block waiting for the current +/// owner to release the lock. This is useful for best-effort kind of scenarios where waiting is +/// never needed: in such cases, users don't need a full-featured mutex or spinlock. +/// +/// When the lock is released via call to [`NoWaitLockGuard::unlock`], it indicates to the caller +/// whether there was contention (i.e., if another thread tried and failed to acquire this lock). +/// If the return value is `false`, there was definitely no contention but if it is `true`, it's +/// possible that the contention was when attempting to acquire the lock. +/// +/// # Examples +/// +/// ``` +/// use kernel::sync::NoWaitLock; +/// +/// #[derive(PartialEq)] +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// let x = NoWaitLock::new(Example{ a: 10, b: 20 }); +/// +/// // Modifying the protected value. 
+/// { +/// let mut guard = x.try_lock().unwrap(); +/// assert_eq!(guard.a, 10); +/// assert_eq!(guard.b, 20); +/// guard.a += 20; +/// guard.b += 20; +/// assert_eq!(guard.a, 30); +/// assert_eq!(guard.b, 40); +/// } +/// +/// // Reading the protected value. +/// { +/// let guard = x.try_lock().unwrap(); +/// assert_eq!(guard.a, 30); +/// assert_eq!(guard.b, 40); +/// } +/// +/// // Second acquire fails, but succeeds after the guard is dropped. +/// { +/// let guard = x.try_lock().unwrap(); +/// assert!(x.try_lock().is_none()); +/// +/// drop(guard); +/// assert!(x.try_lock().is_some()); +/// } +/// ``` +/// +/// The following examples use the [`NoWaitLockGuard::unlock`] to release the lock and check for +/// contention. +/// +/// ``` +/// use kernel::sync::NoWaitLock; +/// +/// #[derive(PartialEq)] +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// let x = NoWaitLock::new(Example{ a: 10, b: 20 }); +/// +/// // No contention when lock is released. +/// let guard = x.try_lock().unwrap(); +/// assert_eq!(guard.unlock(), false); +/// +/// // Contention detected. +/// let guard = x.try_lock().unwrap(); +/// assert!(x.try_lock().is_none()); +/// assert_eq!(guard.unlock(), true); +/// +/// // No contention again. +/// let guard = x.try_lock().unwrap(); +/// assert_eq!(guard.a, 10); +/// assert_eq!(guard.b, 20); +/// assert_eq!(guard.unlock(), false); +/// ``` +pub struct NoWaitLock { + state: AtomicU8, + data: UnsafeCell, +} + +// SAFETY: `NoWaitLock` can be transferred across thread boundaries iff the data it protects can. +unsafe impl Send for NoWaitLock {} + +// SAFETY: `NoWaitLock` only allows a single thread at a time to access the interior mutability it +// provides, so it is `Sync` as long as the data it protects is `Send`. +unsafe impl Sync for NoWaitLock {} + +impl NoWaitLock { + /// Creates a new instance of the no-wait lock. 
+ pub fn new(data: T) -> Self { + Self { + state: AtomicU8::new(0), + data: UnsafeCell::new(data), + } + } +} + +impl<T: ?Sized> NoWaitLock<T> { + /// Tries to acquire the lock. + /// + /// If no other thread/CPU currently owns the lock, it returns a guard that can be used to + /// access the protected data. Otherwise (i.e., the lock is already owned), it returns `None`. + pub fn try_lock(&self) -> Option<NoWaitLockGuard<'_, T>> { + // Fast path -- just set the LOCKED bit. + // + // Acquire ordering matches the release in `NoWaitLockGuard::drop` or + // `NoWaitLockGuard::unlock`. + if self.state.fetch_or(LOCKED, Ordering::Acquire) & LOCKED == 0 { + // INVARIANTS: The thread that manages to set the `LOCKED` bit becomes the owner. + return Some(NoWaitLockGuard { lock: self }); + } + + // Set the `CONTENDED` bit; acquire ordering is needed as this may take the lock too. + // + // If the `LOCKED` bit has since been reset, the lock was released and the caller becomes + // the owner of the lock. It will see the `CONTENDED` bit when it releases the lock even if + // there was no additional contention but this is allowed by the interface. + if self.state.fetch_or(CONTENDED | LOCKED, Ordering::Acquire) & LOCKED == 0 { + // INVARIANTS: The thread that manages to set the `LOCKED` bit becomes the owner. + Some(NoWaitLockGuard { lock: self }) + } else { + None + } + } +} + +/// A guard for the holder of the no-wait lock. +/// +/// # Invariants +/// +/// Only the current owner can have an instance of [`NoWaitLockGuard`]. +pub struct NoWaitLockGuard<'a, T: ?Sized> { + lock: &'a NoWaitLock<T>, +} + +impl<T: ?Sized> NoWaitLockGuard<'_, T> { + /// Unlocks the no-wait lock. + /// + /// The return value indicates whether there was contention while the lock was held, that is, + /// whether another thread tried (and failed) to acquire the lock. + pub fn unlock(self) -> bool { + // Matches the acquire in `NoWaitLock::try_lock`.
+ let contention = self.lock.state.swap(0, Ordering::Release) & CONTENDED != 0; + core::mem::forget(self); + contention + } +} + +impl core::ops::Deref for NoWaitLockGuard<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + // SAFETY: The type invariant guarantees that only the owner has an instance of the guard, + // so the owner is the only one that can call this function. + unsafe { &*self.lock.data.get() } + } +} + +impl core::ops::DerefMut for NoWaitLockGuard<'_, T> { + fn deref_mut(&mut self) -> &mut Self::Target { + // SAFETY: The type invariant guarantees that only the owner has an instance of the guard, + // so the owner is the only one that can call this function. + unsafe { &mut *self.lock.data.get() } + } +} + +impl Drop for NoWaitLockGuard<'_, T> { + fn drop(&mut self) { + // Matches the acquire in `NoWaitLock::try_lock`. + self.lock.state.store(0, Ordering::Release); + } +} diff --git a/rust/kernel/sync/revocable.rs b/rust/kernel/sync/revocable.rs new file mode 100644 index 00000000000000..ddaa86e123f2b3 --- /dev/null +++ b/rust/kernel/sync/revocable.rs @@ -0,0 +1,250 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Synchronisation primitives where access to their contents can be revoked at runtime. + +use crate::{ + bindings, + str::CStr, + sync::{Guard, Lock, LockFactory, LockInfo, NeedsLockClass, ReadLock, WriteLock}, + True, +}; +use core::{ + mem::MaybeUninit, + ops::{Deref, DerefMut}, + pin::Pin, +}; + +/// The state within the revocable synchronisation primitive. +/// +/// We don't use simply `Option` because we need to drop in-place because the contents are +/// implicitly pinned. +/// +/// # Invariants +/// +/// The `is_available` field determines if `data` is initialised. +pub struct Inner { + is_available: bool, + data: MaybeUninit, +} + +impl Inner { + fn new(data: T) -> Self { + // INVARIANT: `data` is initialised and `is_available` is `true`, so the state matches. 
+ Self { + is_available: true, + data: MaybeUninit::new(data), + } + } + + fn drop_in_place(&mut self) { + if !self.is_available { + // Already dropped. + return; + } + + // INVARIANT: `data` is being dropped and `is_available` is set to `false`, so the state + // matches. + self.is_available = false; + + // SAFETY: By the type invariants, `data` is valid because `is_available` was true. + unsafe { self.data.assume_init_drop() }; + } +} + +impl Drop for Inner { + fn drop(&mut self) { + self.drop_in_place(); + } +} + +/// Revocable synchronisation primitive. +/// +/// That is, it wraps synchronisation primitives so that access to their contents can be revoked at +/// runtime, rendering them inacessible. +/// +/// Once access is revoked and all concurrent users complete (i.e., all existing instances of +/// [`RevocableGuard`] are dropped), the wrapped object is also dropped. +/// +/// For better ergonomics, we advise the use of specialisations of this struct, for example, +/// [`super::RevocableMutex`] and [`super::RevocableRwSemaphore`]. Callers that do not need to +/// sleep while holding on to a guard should use [`crate::revocable::Revocable`] instead, which is +/// more efficient as it uses RCU to keep objects alive. +/// +/// # Examples +/// +/// ``` +/// # use kernel::sync::{Mutex, Revocable}; +/// # use kernel::revocable_init; +/// # use core::pin::Pin; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn add_two(v: &Revocable, Example>) -> Option { +/// let mut guard = v.try_write()?; +/// guard.a += 2; +/// guard.b += 2; +/// Some(guard.a + guard.b) +/// } +/// +/// // SAFETY: We call `revocable_init` immediately below. +/// let mut v = unsafe { Revocable::, Example>::new(Example { a: 10, b: 20 }) }; +/// // SAFETY: We never move out of `v`. 
+/// let pinned = unsafe { Pin::new_unchecked(&mut v) }; +/// revocable_init!(pinned, "example::v"); +/// assert_eq!(add_two(&v), Some(34)); +/// v.revoke(); +/// assert_eq!(add_two(&v), None); +/// ``` +pub struct Revocable { + inner: F::LockedType>, +} + +/// Safely initialises a [`Revocable`] instance with the given name, generating a new lock class. +#[macro_export] +macro_rules! revocable_init { + ($mutex:expr, $name:literal) => { + $crate::init_with_lockdep!($mutex, $name) + }; +} + +impl Revocable { + /// Creates a new revocable instance of the given lock. + /// + /// # Safety + /// + /// The caller must call [`Revocable::init`] before using the revocable synch primitive. + pub unsafe fn new(data: T) -> Self { + Self { + // SAFETY: The safety requirements of this function require that `Revocable::init` + // be called before the returned object can be used. Lock initialisation is called + // from `Revocable::init`. + inner: unsafe { F::new_lock(Inner::new(data)) }, + } + } +} + +impl NeedsLockClass for Revocable +where + F::LockedType>: NeedsLockClass, +{ + unsafe fn init( + self: Pin<&mut Self>, + name: &'static CStr, + key1: *mut bindings::lock_class_key, + key2: *mut bindings::lock_class_key, + ) { + // SAFETY: `inner` is pinned when `self` is. + let inner = unsafe { self.map_unchecked_mut(|r| &mut r.inner) }; + + // SAFETY: The safety requirements of this function satisfy the ones for `inner.init` + // (they're the same). + unsafe { inner.init(name, key1, key2) }; + } +} + +impl Revocable +where + F::LockedType>: Lock>, +{ + /// Revokes access to and drops the wrapped object. + /// + /// Revocation and dropping happen after ongoing accessors complete. + pub fn revoke(&self) { + self.lock().drop_in_place(); + } + + /// Tries to lock the \[revocable\] wrapped object in write (exclusive) mode. + /// + /// Returns `None` if the object has been revoked and is therefore no longer accessible. 
+ /// + /// Returns a guard that gives access to the object otherwise; the object is guaranteed to + /// remain accessible while the guard is alive. Callers are allowed to sleep while holding on + /// to the returned guard. + pub fn try_write(&self) -> Option> { + let inner = self.lock(); + if !inner.is_available { + return None; + } + Some(RevocableGuard::new(inner)) + } + + fn lock(&self) -> Guard<'_, F::LockedType>> { + let ctx = self.inner.lock_noguard(); + // SAFETY: The lock was acquired in the call above. + unsafe { Guard::new(&self.inner, ctx) } + } +} + +impl Revocable +where + F::LockedType>: Lock>, +{ + /// Tries to lock the \[revocable\] wrapped object in read (shared) mode. + /// + /// Returns `None` if the object has been revoked and is therefore no longer accessible. + /// + /// Returns a guard that gives access to the object otherwise; the object is guaranteed to + /// remain accessible while the guard is alive. Callers are allowed to sleep while holding on + /// to the returned guard. + pub fn try_read(&self) -> Option> { + let ctx = self.inner.lock_noguard(); + // SAFETY: The lock was acquired in the call above. + let inner = unsafe { Guard::new(&self.inner, ctx) }; + if !inner.is_available { + return None; + } + Some(RevocableGuard::new(inner)) + } +} + +/// A guard that allows access to a revocable object and keeps it alive. +pub struct RevocableGuard<'a, F: LockFactory, T, I: LockInfo> +where + F::LockedType>: Lock>, +{ + guard: Guard<'a, F::LockedType>, I>, +} + +impl<'a, F: LockFactory, T, I: LockInfo> RevocableGuard<'a, F, T, I> +where + F::LockedType>: Lock>, +{ + fn new(guard: Guard<'a, F::LockedType>, I>) -> Self { + Self { guard } + } +} + +impl> RevocableGuard<'_, F, T, I> +where + F::LockedType>: Lock>, +{ + /// Returns a pinned mutable reference to the wrapped object. 
+ pub fn as_pinned_mut(&mut self) -> Pin<&mut T> { + // SAFETY: Revocable mutexes must be pinned, so we choose to always project the data as + // pinned as well (i.e., we guarantee we never move it). + unsafe { Pin::new_unchecked(&mut *self) } + } +} + +impl Deref for RevocableGuard<'_, F, T, I> +where + F::LockedType>: Lock>, +{ + type Target = T; + + fn deref(&self) -> &Self::Target { + unsafe { &*self.guard.data.as_ptr() } + } +} + +impl> DerefMut for RevocableGuard<'_, F, T, I> +where + F::LockedType>: Lock>, +{ + fn deref_mut(&mut self) -> &mut Self::Target { + unsafe { &mut *self.guard.data.as_mut_ptr() } + } +} diff --git a/rust/kernel/sync/rwsem.rs b/rust/kernel/sync/rwsem.rs new file mode 100644 index 00000000000000..eb220e4972cfa5 --- /dev/null +++ b/rust/kernel/sync/rwsem.rs @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A kernel read/write mutex. +//! +//! This module allows Rust code to use the kernel's [`struct rw_semaphore`]. +//! +//! C header: [`include/linux/rwsem.h`](../../../../include/linux/rwsem.h) + +use super::{mutex::EmptyGuardContext, Guard, Lock, LockFactory, LockIniter, ReadLock, WriteLock}; +use crate::{bindings, str::CStr, Opaque}; +use core::{cell::UnsafeCell, marker::PhantomPinned, pin::Pin}; + +/// Safely initialises a [`RwSemaphore`] with the given name, generating a new lock class. +#[macro_export] +macro_rules! rwsemaphore_init { + ($rwsem:expr, $name:literal) => { + $crate::init_with_lockdep!($rwsem, $name) + }; +} + +/// Exposes the kernel's [`struct rw_semaphore`]. +/// +/// It's a read/write mutex. That is, it allows multiple readers to acquire it concurrently, but +/// only one writer at a time. On contention, waiters sleep. +/// +/// A [`RwSemaphore`] must first be initialised with a call to [`RwSemaphore::init_lock`] before it +/// can be used. The [`rwsemaphore_init`] macro is provided to automatically assign a new lock +/// class to an [`RwSemaphore`] instance. 
+/// +/// Since it may block, [`RwSemaphore`] needs to be used with care in atomic contexts. +/// +/// [`struct rw_semaphore`]: ../../../include/linux/rwsem.h +pub struct RwSemaphore { + /// The kernel `struct rw_semaphore` object. + rwsem: Opaque, + + /// An rwsem needs to be pinned because it contains a [`struct list_head`] that is + /// self-referential, so it cannot be safely moved once it is initialised. + _pin: PhantomPinned, + + /// The data protected by the rwsem. + data: UnsafeCell, +} + +// SAFETY: `RwSemaphore` can be transferred across thread boundaries iff the data it protects can. +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for RwSemaphore {} + +// SAFETY: `RwSemaphore` requires that the protected type be `Sync` for it to be `Sync` as well +// because the read mode allows multiple threads to access the protected data concurrently. It +// requires `Send` because the write lock allows a `&mut T` to be accessible from an arbitrary +// thread. +unsafe impl Sync for RwSemaphore {} + +impl RwSemaphore { + /// Constructs a new rw semaphore. + /// + /// # Safety + /// + /// The caller must call [`RwSemaphore::init_lock`] before using the rw semaphore. + pub unsafe fn new(t: T) -> Self { + Self { + rwsem: Opaque::uninit(), + data: UnsafeCell::new(t), + _pin: PhantomPinned, + } + } +} + +impl RwSemaphore { + /// Locks the rw semaphore in write (exclusive) mode and gives the caller access to the data + /// protected by it. Only one thread at a time is allowed to access the protected data. + pub fn write(&self) -> Guard<'_, Self> { + let ctx = ::lock_noguard(self); + // SAFETY: The rw semaphore was just acquired in write mode. + unsafe { Guard::new(self, ctx) } + } + + /// Locks the rw semaphore in read (shared) mode and gives the caller access to the data + /// protected by it. Multiple readers are allowed to access the protected data concurrently.
+ pub fn read(&self) -> Guard<'_, Self, ReadLock> { + let ctx = >::lock_noguard(self); + // SAFETY: The rw semaphore was just acquired in read mode. + unsafe { Guard::new(self, ctx) } + } +} + +impl LockFactory for RwSemaphore { + type LockedType = RwSemaphore; + + unsafe fn new_lock(data: U) -> RwSemaphore { + // SAFETY: The safety requirements of `new_lock` also require that `init_lock` be called. + unsafe { RwSemaphore::new(data) } + } +} + +impl LockIniter for RwSemaphore { + unsafe fn init_lock( + self: Pin<&mut Self>, + name: &'static CStr, + key: *mut bindings::lock_class_key, + ) { + unsafe { bindings::__init_rwsem(self.rwsem.get(), name.as_char_ptr(), key) }; + } +} + +// SAFETY: The underlying kernel `struct rw_semaphore` object ensures mutual exclusion because it's +// acquired in write mode. +unsafe impl Lock for RwSemaphore { + type Inner = T; + type GuardContext = EmptyGuardContext; + + fn lock_noguard(&self) -> EmptyGuardContext { + // SAFETY: `rwsem` points to valid memory. + unsafe { bindings::down_write(self.rwsem.get()) }; + EmptyGuardContext + } + + unsafe fn unlock(&self, _: &mut EmptyGuardContext) { + // SAFETY: The safety requirements of the function ensure that the rw semaphore is owned by + // the caller. + unsafe { bindings::up_write(self.rwsem.get()) }; + } + + fn locked_data(&self) -> &UnsafeCell { + &self.data + } +} + +// SAFETY: The underlying kernel `struct rw_semaphore` object ensures that only shared references +// are accessible from other threads because it's acquired in read mode. +unsafe impl Lock for RwSemaphore { + type Inner = T; + type GuardContext = EmptyGuardContext; + + fn lock_noguard(&self) -> EmptyGuardContext { + // SAFETY: `rwsem` points to valid memory. + unsafe { bindings::down_read(self.rwsem.get()) }; + EmptyGuardContext + } + + unsafe fn unlock(&self, _: &mut EmptyGuardContext) { + // SAFETY: The safety requirements of the function ensure that the rw semaphore is owned by + // the caller. 
+ unsafe { bindings::up_read(self.rwsem.get()) }; + } + + fn locked_data(&self) -> &UnsafeCell { + &self.data + } +} + +/// A revocable rw semaphore. +/// +/// That is, a read/write semaphore to which access can be revoked at runtime. It is a +/// specialisation of the more generic [`super::revocable::Revocable`]. +/// +/// # Examples +/// +/// ``` +/// # use kernel::sync::RevocableRwSemaphore; +/// # use kernel::revocable_init; +/// # use core::pin::Pin; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn read_sum(v: &RevocableRwSemaphore<Example>) -> Option<u32> { +/// let guard = v.try_read()?; +/// Some(guard.a + guard.b) +/// } +/// +/// fn add_two(v: &RevocableRwSemaphore<Example>) -> Option<u32> { +/// let mut guard = v.try_write()?; +/// guard.a += 2; +/// guard.b += 2; +/// Some(guard.a + guard.b) +/// } +/// +/// // SAFETY: We call `revocable_init` immediately below. +/// let mut v = unsafe { RevocableRwSemaphore::new(Example { a: 10, b: 20 }) }; +/// // SAFETY: We never move out of `v`. +/// let pinned = unsafe { Pin::new_unchecked(&mut v) }; +/// revocable_init!(pinned, "example::v"); +/// assert_eq!(read_sum(&v), Some(30)); +/// assert_eq!(add_two(&v), Some(34)); +/// v.revoke(); +/// assert_eq!(read_sum(&v), None); +/// assert_eq!(add_two(&v), None); +/// ``` +pub type RevocableRwSemaphore = super::revocable::Revocable, T>; + +/// A guard for a revocable rw semaphore. +pub type RevocableRwSemaphoreGuard<'a, T, I = WriteLock> = + super::revocable::RevocableGuard<'a, RwSemaphore<()>, T, I>; diff --git a/rust/kernel/sync/seqlock.rs b/rust/kernel/sync/seqlock.rs new file mode 100644 index 00000000000000..f42d649823c667 --- /dev/null +++ b/rust/kernel/sync/seqlock.rs @@ -0,0 +1,202 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A kernel sequential lock (seqlock). +//! +//! This module allows Rust code to use the sequential locks based on the kernel's `seqcount_t` and +//! any locks implementing the [`LockFactory`] trait. +//! +//! See <../../../Documentation/locking/seqlock.rst>.
+ +use super::{Guard, Lock, LockFactory, LockIniter, NeedsLockClass, ReadLock}; +use crate::{bindings, str::CStr, Opaque}; +use core::{cell::UnsafeCell, marker::PhantomPinned, ops::Deref, pin::Pin}; + +/// Exposes sequential locks backed by the kernel's `seqcount_t`. +/// +/// The write-side critical section is protected by a lock implementing the [`LockFactory`] trait. +/// +/// # Examples +/// +///``` +/// use kernel::sync::{SeqLock, SpinLock}; +/// use core::sync::atomic::{AtomicU32, Ordering}; +/// +/// struct Example { +/// a: AtomicU32, +/// b: AtomicU32, +/// } +/// +/// fn get_sum(v: &SeqLock>) -> u32 { +/// // Use `access` to access the fields of `Example`. +/// v.access(|e| e.a.load(Ordering::Relaxed) + e.b.load(Ordering::Relaxed)) +/// } +/// +/// fn get_sum_with_guard(v: &SeqLock>) -> u32 { +/// // Use `read` and `need_retry` in a loop to access the fields of `Example`. +/// loop { +/// let guard = v.read(); +/// let sum = guard.a.load(Ordering::Relaxed) + guard.b.load(Ordering::Relaxed); +/// if !guard.need_retry() { +/// break sum; +/// } +/// } +/// } +/// +/// fn inc_each(v: &SeqLock>) { +/// // Use a write-side guard to access the fields of `Example`. +/// let guard = v.write(); +/// let a = guard.a.load(Ordering::Relaxed); +/// guard.a.store(a + 1, Ordering::Relaxed); +/// let b = guard.b.load(Ordering::Relaxed); +/// guard.b.store(b + 1, Ordering::Relaxed); +/// } +/// ``` +pub struct SeqLock { + _p: PhantomPinned, + count: Opaque, + write_lock: L, +} + +// SAFETY: `SeqLock` can be transferred across thread boundaries iff the data it protects and the +// underlying lock can. +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for SeqLock where L::Inner: Send {} + +// SAFETY: `SeqLock` allows concurrent access to the data it protects by both readers and writers, +// so it requires that the data it protects be `Sync`, as well as the underlying lock. 
+unsafe impl Sync for SeqLock where L::Inner: Sync {} + +impl SeqLock { + /// Constructs a new instance of [`SeqLock`]. + /// + /// # Safety + /// + /// The caller must call [`SeqLock::init`] before using the seqlock. + pub unsafe fn new(data: L::Inner) -> Self + where + L: LockFactory = L>, + L::Inner: Sized, + { + Self { + _p: PhantomPinned, + count: Opaque::uninit(), + // SAFETY: `L::init_lock` is called from `SeqLock::init`, which is required to be + // called by the function's safety requirements. + write_lock: unsafe { L::new_lock(data) }, + } + } +} + +impl SeqLock { + /// Accesses the protected data in read mode. + /// + /// Readers and writers are allowed to run concurrently, so callers must check if they need to + /// refetch the values before they are used (e.g., because a writer changed them concurrently, + /// rendering them potentially inconsistent). The check is performed via calls to + /// [`SeqLockReadGuard::need_retry`]. + pub fn read(&self) -> SeqLockReadGuard<'_, L> { + SeqLockReadGuard { + lock: self, + // SAFETY: `count` contains valid memory. + start_count: unsafe { bindings::read_seqcount_begin(self.count.get()) }, + } + } + + /// Accesses the protected data in read mode. + /// + /// The provided closure is called repeatedly if it may have accessed inconsistent data (e.g., + /// because a concurrent writer modified it). This is a wrapper around [`SeqLock::read`] and + /// [`SeqLockReadGuard::need_retry`] in a loop. + pub fn access R, R>(&self, cb: F) -> R { + loop { + let guard = self.read(); + let ret = cb(&guard); + if !guard.need_retry() { + return ret; + } + } + } + + /// Locks the underlying lock and returns a guard that allows access to the protected data. + /// + /// The guard is not mutable though because readers are still allowed to concurrently access + /// the data. The protected data structure needs to provide interior mutability itself (e.g., + /// via atomic types) for the individual fields that can be mutated. 
+ pub fn write(&self) -> Guard<'_, Self, ReadLock> { + let ctx = self.lock_noguard(); + // SAFETY: The seqlock was just acquired. + unsafe { Guard::new(self, ctx) } + } +} + +impl NeedsLockClass for SeqLock { + unsafe fn init( + mut self: Pin<&mut Self>, + name: &'static CStr, + key1: *mut bindings::lock_class_key, + key2: *mut bindings::lock_class_key, + ) { + // SAFETY: `write_lock` is pinned when `self` is. + let pinned = unsafe { self.as_mut().map_unchecked_mut(|s| &mut s.write_lock) }; + // SAFETY: `key1` is valid by the safety requirements of this function. + unsafe { pinned.init_lock(name, key1) }; + // SAFETY: `key2` is valid by the safety requirements of this function. + unsafe { bindings::__seqcount_init(self.count.get(), name.as_char_ptr(), key2) }; + } +} + +// SAFETY: The underlying lock ensures mutual exclusion. +unsafe impl Lock for SeqLock { + type Inner = L::Inner; + type GuardContext = L::GuardContext; + + fn lock_noguard(&self) -> L::GuardContext { + let ctx = self.write_lock.lock_noguard(); + // SAFETY: `count` contains valid memory. + unsafe { bindings::write_seqcount_begin(self.count.get()) }; + ctx + } + + fn relock(&self, ctx: &mut L::GuardContext) { + self.write_lock.relock(ctx); + // SAFETY: `count` contains valid memory. + unsafe { bindings::write_seqcount_begin(self.count.get()) }; + } + + unsafe fn unlock(&self, ctx: &mut L::GuardContext) { + // SAFETY: The safety requirements of the function ensure that lock is owned by the caller. + unsafe { bindings::write_seqcount_end(self.count.get()) }; + // SAFETY: The safety requirements of the function ensure that lock is owned by the caller. + unsafe { self.write_lock.unlock(ctx) }; + } + + fn locked_data(&self) -> &UnsafeCell { + self.write_lock.locked_data() + } +} + +/// Allows read-side access to data protected by a sequential lock. 
+pub struct SeqLockReadGuard<'a, L: Lock + ?Sized> { + lock: &'a SeqLock, + start_count: u32, +} + +impl SeqLockReadGuard<'_, L> { + /// Determines whether the caller needs to retry reading values. + /// + /// It returns `true` when a concurrent writer ran between the guard being created and + /// [`Self::need_retry`] being called. + pub fn need_retry(&self) -> bool { + // SAFETY: `count` is valid because the guard guarantees that the lock remains alive. + unsafe { bindings::read_seqcount_retry(self.lock.count.get(), self.start_count) != 0 } + } +} + +impl Deref for SeqLockReadGuard<'_, L> { + type Target = L::Inner; + + fn deref(&self) -> &Self::Target { + // SAFETY: We only ever allow shared access to the protected data. + unsafe { &*self.lock.locked_data().get() } + } +} diff --git a/rust/kernel/sync/smutex.rs b/rust/kernel/sync/smutex.rs new file mode 100644 index 00000000000000..4f6797361ab320 --- /dev/null +++ b/rust/kernel/sync/smutex.rs @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A simple mutex implementation. +//! +//! Unlike [`super::Mutex`], this implementation does not require pinning, so the +//! ergonomics are much improved, though the implementation is not as feature-rich as the C-based +//! one. The main advantage is that it doesn't impose unsafe blocks on callers. +//! +//! The mutex is made up of 2 words in addition to the data it protects. The first one is accessed +//! concurrently by threads trying to acquire and release the mutex, it contains a "stack" of +//! waiters and a "locked" bit; the second one is only accessible by the thread holding the mutex, +//! it contains a queue of waiters. Waiters are moved from the stack to the queue when the mutex is +//! next unlocked while the stack is non-empty and the queue is empty. A single waiter is popped +//! from the wait queue when the owner of the mutex unlocks it. +//! +//! The initial state of the mutex is `<locked=0, stack=[], queue=[]>`, meaning that it isn't +//!
locked and both the waiter stack and queue are empty. +//! +//! A lock operation transitions the mutex to state `<locked=1, stack=[], queue=[]>`. +//! +//! An unlock operation transitions the mutex back to the initial state, however, an attempt to +//! lock the mutex while it's already locked results in a waiter being created (on the stack) and +//! pushed onto the stack, so the state is `<locked=1, stack=[W1], queue=[]>`. +//! +//! Another thread trying to lock the mutex results in another waiter being pushed onto the stack, +//! so the state becomes `<locked=1, stack=[W2, W1], queue=[]>`. +//! +//! In such states (queue is empty but stack is non-empty), the unlock operation is performed in +//! three steps: +//! 1. The stack is popped (but the mutex remains locked), so the state is: +//! `<locked=1, stack=[], queue=[]>` +//! 2. The stack is turned into a queue by reversing it, so the state is: +//! `<locked=1, stack=[], queue=[W1, W2]>` +//! 3. Finally, the lock is released, and the first waiter is awakened, so the state is: +//! `<locked=0, stack=[], queue=[W2]>` +//! +//! The mutex remains accessible to any threads attempting to lock it in any of the intermediate +//! states above. For example, while it is locked, other threads may add waiters to the stack +//! (which is ok because we want to release the ones on the queue first); another example is that +//! another thread may acquire the mutex before waiter W1 in the example above, this makes the +//! mutex unfair but this is desirable because the thread is running already and may in fact +//! release the lock before W1 manages to get scheduled -- it also mitigates the lock convoy +//! problem when the releasing thread wants to immediately acquire the lock again: it will be +//! allowed to do so (as long as W1 doesn't get to it first). +//! +//! When the waiter queue is non-empty, unlocking the mutex always results in the first waiter being +//! popped from the queue and awakened.
+ +use super::{mutex::EmptyGuardContext, Guard, Lock, LockFactory, LockIniter}; +use crate::{bindings, str::CStr, Opaque}; +use core::sync::atomic::{AtomicUsize, Ordering}; +use core::{cell::UnsafeCell, pin::Pin}; + +/// The value that is OR'd into the [`Mutex::waiter_stack`] when the mutex is locked. +const LOCKED: usize = 1; + +/// A simple mutex. +/// +/// This is mutual-exclusion primitive. It guarantees that only one thread at a time may access the +/// data it protects. When multiple threads attempt to lock the same mutex, only one at a time is +/// allowed to progress, the others will block (sleep) until the mutex is unlocked, at which point +/// another thread will be allowed to wake up and make progress. +/// +/// # Examples +/// +/// ``` +/// # use kernel::{Result, sync::Ref, sync::smutex::Mutex}; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// static EXAMPLE: Mutex = Mutex::new(Example{ a: 10, b: 20 }); +/// +/// fn inc_a(example: &Mutex) { +/// let mut guard = example.lock(); +/// guard.a += 1; +/// } +/// +/// fn sum(example: &Mutex) -> u32 { +/// let guard = example.lock(); +/// guard.a + guard.b +/// } +/// +/// fn try_new(a: u32, b: u32) -> Result>> { +/// Ref::try_new(Mutex::new(Example {a, b})) +/// } +/// +/// assert_eq!(EXAMPLE.lock().a, 10); +/// assert_eq!(sum(&EXAMPLE), 30); +/// +/// inc_a(&EXAMPLE); +/// +/// assert_eq!(EXAMPLE.lock().a, 11); +/// assert_eq!(sum(&EXAMPLE), 31); +/// +/// # try_new(42, 43); +/// ``` +pub struct Mutex { + /// A stack of waiters. + /// + /// It is accessed atomically by threads lock/unlocking the mutex. Additionally, the + /// least-significant bit is used to indicate whether the mutex is locked or not. + waiter_stack: AtomicUsize, + + /// A queue of waiters. + /// + /// This is only accessible to the holder of the mutex. When the owner of the mutex is + /// unlocking it, it will move waiters from the stack to the queue when the queue is empty and + /// the stack non-empty. 
+ waiter_queue: UnsafeCell<*mut Waiter>, + + /// The data protected by the mutex. + data: UnsafeCell, +} + +// SAFETY: `Mutex` can be transferred across thread boundaries iff the data it protects can. +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for Mutex {} + +// SAFETY: `Mutex` serialises the interior mutability it provides, so it is `Sync` as long as the +// data it protects is `Send`. +unsafe impl Sync for Mutex {} + +impl Mutex { + /// Creates a new instance of the mutex. + pub const fn new(data: T) -> Self { + Self { + waiter_stack: AtomicUsize::new(0), + waiter_queue: UnsafeCell::new(core::ptr::null_mut()), + data: UnsafeCell::new(data), + } + } +} + +impl Mutex { + /// Locks the mutex and gives the caller access to the data protected by it. Only one thread at + /// a time is allowed to access the protected data. + pub fn lock(&self) -> Guard<'_, Self> { + let ctx = self.lock_noguard(); + // SAFETY: The mutex was just acquired. + unsafe { Guard::new(self, ctx) } + } +} + +impl LockFactory for Mutex { + type LockedType = Mutex; + + unsafe fn new_lock(data: U) -> Mutex { + Mutex::new(data) + } +} + +impl LockIniter for Mutex { + unsafe fn init_lock( + self: Pin<&mut Self>, + _name: &'static CStr, + _key: *mut bindings::lock_class_key, + ) { + } +} + +// SAFETY: The mutex implementation ensures mutual exclusion. +unsafe impl Lock for Mutex { + type Inner = T; + type GuardContext = EmptyGuardContext; + + fn lock_noguard(&self) -> EmptyGuardContext { + loop { + // Try the fast path: the caller owns the mutex if we manage to set the `LOCKED` bit. + // + // The `acquire` order matches with one of the `release` ones in `unlock`. + if self.waiter_stack.fetch_or(LOCKED, Ordering::Acquire) & LOCKED == 0 { + return EmptyGuardContext; + } + + // Slow path: we'll likely need to wait, so initialise a local waiter struct. 
+ let mut waiter = Waiter { + completion: Opaque::uninit(), + next: core::ptr::null_mut(), + }; + + // SAFETY: The completion object was just allocated on the stack and is valid for + // writes. + unsafe { bindings::init_completion(waiter.completion.get()) }; + + // Try to enqueue the waiter by pushing into onto the waiter stack. We want to do it + // only while the mutex is locked by another thread. + loop { + // We use relaxed here because we're just reading the value we'll CAS later (which + // has a stronger ordering on success). + let mut v = self.waiter_stack.load(Ordering::Relaxed); + if v & LOCKED == 0 { + // The mutex was released by another thread, so try to acquire it. + // + // The `acquire` order matches with one of the `release` ones in `unlock`. + v = self.waiter_stack.fetch_or(LOCKED, Ordering::Acquire); + if v & LOCKED == 0 { + return EmptyGuardContext; + } + } + + waiter.next = (v & !LOCKED) as _; + + // The `release` order matches with `acquire` in `unlock` when the stack is swapped + // out. We use release order here to ensure that the other thread can see our + // waiter fully initialised. + if self + .waiter_stack + .compare_exchange( + v, + (&mut waiter as *mut _ as usize) | LOCKED, + Ordering::Release, + Ordering::Relaxed, + ) + .is_ok() + { + break; + } + } + + // Wait for the owner to lock to wake this thread up. + // + // SAFETY: Completion object was previously initialised with `init_completion` and + // remains valid. + unsafe { bindings::wait_for_completion(waiter.completion.get()) }; + } + } + + unsafe fn unlock(&self, _: &mut EmptyGuardContext) { + // SAFETY: The caller owns the mutex, so it is safe to manipulate the local wait queue. + let mut waiter = unsafe { *self.waiter_queue.get() }; + loop { + // If we have a non-empty local queue of waiters, pop the first one, release the mutex, + // and wake it up (the popped waiter). 
+ if !waiter.is_null() { + // SAFETY: The caller owns the mutex, so it is safe to manipulate the local wait + // queue. + unsafe { *self.waiter_queue.get() = (*waiter).next }; + + // The `release` order matches with one of the `acquire` ones in `lock_noguard`. + self.waiter_stack.fetch_and(!LOCKED, Ordering::Release); + + // Wake up the first waiter. + // + // SAFETY: The completion object was initialised before being added to the wait + // stack and is only removed above, when called completed. So it is safe for + // writes. + unsafe { bindings::complete_all((*waiter).completion.get()) }; + return; + } + + // Try the fast path when there are no local waiters. + // + // The `release` order matches with one of the `acquire` ones in `lock_noguard`. + if self + .waiter_stack + .compare_exchange(LOCKED, 0, Ordering::Release, Ordering::Relaxed) + .is_ok() + { + return; + } + + // We don't have a local queue, so pull the whole stack off, reverse it, and use it as a + // local queue. Since we're manipulating this queue, we need to keep ownership of the + // mutex. + // + // The `acquire` order matches with the `release` one in `lock_noguard` where a waiter + // is pushed onto the stack. It ensures that we see the fully-initialised waiter. + let mut stack = + (self.waiter_stack.swap(LOCKED, Ordering::Acquire) & !LOCKED) as *mut Waiter; + while !stack.is_null() { + // SAFETY: The caller still owns the mutex, so it is safe to manipulate the + // elements of the wait queue, which will soon become that wait queue. + let next = unsafe { (*stack).next }; + + // SAFETY: Same as above. 
+ unsafe { (*stack).next = waiter }; + + waiter = stack; + stack = next; + } + } + } + + fn locked_data(&self) -> &UnsafeCell { + &self.data + } +} + +struct Waiter { + completion: Opaque, + next: *mut Waiter, +} diff --git a/rust/kernel/sync/spinlock.rs b/rust/kernel/sync/spinlock.rs new file mode 100644 index 00000000000000..fb324d63127f1e --- /dev/null +++ b/rust/kernel/sync/spinlock.rs @@ -0,0 +1,360 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! A kernel spinlock. +//! +//! This module allows Rust code to use the kernel's [`struct spinlock`]. +//! +//! See . + +use super::{mutex::EmptyGuardContext, Guard, Lock, LockFactory, LockInfo, LockIniter, WriteLock}; +use crate::{bindings, c_types, str::CStr, Opaque, True}; +use core::{cell::UnsafeCell, marker::PhantomPinned, pin::Pin}; + +/// Safely initialises a [`SpinLock`] with the given name, generating a new lock class. +#[macro_export] +macro_rules! spinlock_init { + ($spinlock:expr, $name:literal) => { + $crate::init_with_lockdep!($spinlock, $name) + }; +} + +/// Exposes the kernel's [`spinlock_t`]. When multiple CPUs attempt to lock the same spinlock, only +/// one at a time is allowed to progress, the others will block (spinning) until the spinlock is +/// unlocked, at which point another CPU will be allowed to make progress. +/// +/// A [`SpinLock`] must first be initialised with a call to [`SpinLock::init_lock`] before it can be +/// used. The [`spinlock_init`] macro is provided to automatically assign a new lock class to a +/// spinlock instance. +/// +/// There are two ways to acquire the lock: +/// - [`SpinLock::lock`], which doesn't manage interrupt state, so it should be used in only two +/// cases: (a) when the caller knows that interrupts are disabled, or (b) when callers never use +/// it in atomic context (e.g., interrupt handlers), in which case it is ok for interrupts to be +/// enabled. 
+/// - [`SpinLock::lock_irqdisable`], which disables interrupts if they are enabled before +/// acquiring the lock. When the lock is released, the interrupt state is automatically returned +/// to its value before [`SpinLock::lock_irqdisable`] was called. +/// +/// # Examples +/// +/// ``` +/// # use kernel::sync::SpinLock; +/// # use core::pin::Pin; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// // Function that acquires spinlock without changing interrupt state. +/// fn lock_example(value: &SpinLock) { +/// let mut guard = value.lock(); +/// guard.a = 10; +/// guard.b = 20; +/// } +/// +/// // Function that acquires spinlock and disables interrupts while holding it. +/// fn lock_irqdisable_example(value: &SpinLock) { +/// let mut guard = value.lock_irqdisable(); +/// guard.a = 30; +/// guard.b = 40; +/// } +/// +/// // Initialises a spinlock. +/// // SAFETY: `spinlock_init` is called below. +/// let mut value = unsafe { SpinLock::new(Example { a: 1, b: 2 }) }; +/// // SAFETY: We don't move `value`. +/// kernel::spinlock_init!(unsafe { Pin::new_unchecked(&mut value) }, "value"); +/// +/// // Calls the example functions. +/// assert_eq!(value.lock().a, 1); +/// lock_example(&value); +/// assert_eq!(value.lock().a, 10); +/// lock_irqdisable_example(&value); +/// assert_eq!(value.lock().a, 30); +/// ``` +/// +/// [`spinlock_t`]: ../../../include/linux/spinlock.h +pub struct SpinLock { + spin_lock: Opaque, + + /// Spinlocks are architecture-defined. So we conservatively require them to be pinned in case + /// some architecture uses self-references now or in the future. + _pin: PhantomPinned, + + data: UnsafeCell, +} + +// SAFETY: `SpinLock` can be transferred across thread boundaries iff the data it protects can. +unsafe impl Send for SpinLock {} + +// SAFETY: `SpinLock` serialises the interior mutability it provides, so it is `Sync` as long as the +// data it protects is `Send`. 
+unsafe impl Sync for SpinLock {} + +impl SpinLock { + /// Constructs a new spinlock. + /// + /// # Safety + /// + /// The caller must call [`SpinLock::init_lock`] before using the spinlock. + pub const unsafe fn new(t: T) -> Self { + Self { + spin_lock: Opaque::uninit(), + data: UnsafeCell::new(t), + _pin: PhantomPinned, + } + } +} + +impl SpinLock { + /// Locks the spinlock and gives the caller access to the data protected by it. Only one thread + /// at a time is allowed to access the protected data. + pub fn lock(&self) -> Guard<'_, Self, WriteLock> { + let ctx = >::lock_noguard(self); + // SAFETY: The spinlock was just acquired. + unsafe { Guard::new(self, ctx) } + } + + /// Locks the spinlock and gives the caller access to the data protected by it. Additionally it + /// disables interrupts (if they are enabled). + /// + /// When the lock in unlocked, the interrupt state (enabled/disabled) is restored. + pub fn lock_irqdisable(&self) -> Guard<'_, Self, DisabledInterrupts> { + let ctx = >::lock_noguard(self); + // SAFETY: The spinlock was just acquired. + unsafe { Guard::new(self, ctx) } + } +} + +impl LockFactory for SpinLock { + type LockedType = SpinLock; + + unsafe fn new_lock(data: U) -> SpinLock { + // SAFETY: The safety requirements of `new_lock` also require that `init_lock` be called. + unsafe { SpinLock::new(data) } + } +} + +impl LockIniter for SpinLock { + unsafe fn init_lock( + self: Pin<&mut Self>, + name: &'static CStr, + key: *mut bindings::lock_class_key, + ) { + unsafe { bindings::__spin_lock_init(self.spin_lock.get(), name.as_char_ptr(), key) }; + } +} + +/// A type state indicating that interrupts were disabled. +pub struct DisabledInterrupts; +impl LockInfo for DisabledInterrupts { + type Writable = True; +} + +// SAFETY: The underlying kernel `spinlock_t` object ensures mutual exclusion. 
+unsafe impl Lock for SpinLock { + type Inner = T; + type GuardContext = EmptyGuardContext; + + fn lock_noguard(&self) -> EmptyGuardContext { + // SAFETY: `spin_lock` points to valid memory. + unsafe { bindings::spin_lock(self.spin_lock.get()) }; + EmptyGuardContext + } + + unsafe fn unlock(&self, _: &mut EmptyGuardContext) { + // SAFETY: The safety requirements of the function ensure that the spinlock is owned by + // the caller. + unsafe { bindings::spin_unlock(self.spin_lock.get()) } + } + + fn locked_data(&self) -> &UnsafeCell { + &self.data + } +} + +// SAFETY: The underlying kernel `spinlock_t` object ensures mutual exclusion. +unsafe impl Lock for SpinLock { + type Inner = T; + type GuardContext = c_types::c_ulong; + + fn lock_noguard(&self) -> c_types::c_ulong { + // SAFETY: `spin_lock` points to valid memory. + unsafe { bindings::spin_lock_irqsave(self.spin_lock.get()) } + } + + unsafe fn unlock(&self, ctx: &mut c_types::c_ulong) { + // SAFETY: The safety requirements of the function ensure that the spinlock is owned by + // the caller. + unsafe { bindings::spin_unlock_irqrestore(self.spin_lock.get(), *ctx) } + } + + fn locked_data(&self) -> &UnsafeCell { + &self.data + } +} + +/// Safely initialises a [`RawSpinLock`] with the given name, generating a new lock class. +#[macro_export] +macro_rules! rawspinlock_init { + ($spinlock:expr, $name:literal) => { + $crate::init_with_lockdep!($spinlock, $name) + }; +} + +/// Exposes the kernel's [`raw_spinlock_t`]. +/// +/// It is very similar to [`SpinLock`], except that it is guaranteed not to sleep even on RT +/// variants of the kernel. +/// +/// # Examples +/// +/// ``` +/// # use kernel::sync::RawSpinLock; +/// # use core::pin::Pin; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// // Function that acquires the raw spinlock without changing interrupt state. 
+/// fn lock_example(value: &RawSpinLock) { +/// let mut guard = value.lock(); +/// guard.a = 10; +/// guard.b = 20; +/// } +/// +/// // Function that acquires the raw spinlock and disables interrupts while holding it. +/// fn lock_irqdisable_example(value: &RawSpinLock) { +/// let mut guard = value.lock_irqdisable(); +/// guard.a = 30; +/// guard.b = 40; +/// } +/// +/// // Initialises a raw spinlock and calls the example functions. +/// fn spinlock_example() { +/// // SAFETY: `rawspinlock_init` is called below. +/// let mut value = unsafe { RawSpinLock::new(Example { a: 1, b: 2 }) }; +/// // SAFETY: We don't move `value`. +/// kernel::rawspinlock_init!(unsafe { Pin::new_unchecked(&mut value) }, "value"); +/// lock_example(&value); +/// lock_irqdisable_example(&value); +/// } +/// ``` +/// +/// [`raw_spinlock_t`]: ../../../include/linux/spinlock.h +pub struct RawSpinLock { + spin_lock: Opaque, + + // Spinlocks are architecture-defined. So we conservatively require them to be pinned in case + // some architecture uses self-references now or in the future. + _pin: PhantomPinned, + + data: UnsafeCell, +} + +// SAFETY: `RawSpinLock` can be transferred across thread boundaries iff the data it protects can. +unsafe impl Send for RawSpinLock {} + +// SAFETY: `RawSpinLock` serialises the interior mutability it provides, so it is `Sync` as long as +// the data it protects is `Send`. +unsafe impl Sync for RawSpinLock {} + +impl RawSpinLock { + /// Constructs a new raw spinlock. + /// + /// # Safety + /// + /// The caller must call [`RawSpinLock::init_lock`] before using the raw spinlock. + pub const unsafe fn new(t: T) -> Self { + Self { + spin_lock: Opaque::uninit(), + data: UnsafeCell::new(t), + _pin: PhantomPinned, + } + } +} + +impl RawSpinLock { + /// Locks the raw spinlock and gives the caller access to the data protected by it. Only one + /// thread at a time is allowed to access the protected data. 
+ pub fn lock(&self) -> Guard<'_, Self, WriteLock> { + let ctx = >::lock_noguard(self); + // SAFETY: The raw spinlock was just acquired. + unsafe { Guard::new(self, ctx) } + } + + /// Locks the raw spinlock and gives the caller access to the data protected by it. + /// Additionally it disables interrupts (if they are enabled). + /// + /// When the lock in unlocked, the interrupt state (enabled/disabled) is restored. + pub fn lock_irqdisable(&self) -> Guard<'_, Self, DisabledInterrupts> { + let ctx = >::lock_noguard(self); + // SAFETY: The raw spinlock was just acquired. + unsafe { Guard::new(self, ctx) } + } +} + +impl LockFactory for RawSpinLock { + type LockedType = RawSpinLock; + + unsafe fn new_lock(data: U) -> RawSpinLock { + // SAFETY: The safety requirements of `new_lock` also require that `init_lock` be called. + unsafe { RawSpinLock::new(data) } + } +} + +impl LockIniter for RawSpinLock { + unsafe fn init_lock( + self: Pin<&mut Self>, + name: &'static CStr, + key: *mut bindings::lock_class_key, + ) { + unsafe { bindings::_raw_spin_lock_init(self.spin_lock.get(), name.as_char_ptr(), key) }; + } +} + +// SAFETY: The underlying kernel `raw_spinlock_t` object ensures mutual exclusion. +unsafe impl Lock for RawSpinLock { + type Inner = T; + type GuardContext = EmptyGuardContext; + + fn lock_noguard(&self) -> EmptyGuardContext { + // SAFETY: `spin_lock` points to valid memory. + unsafe { bindings::raw_spin_lock(self.spin_lock.get()) }; + EmptyGuardContext + } + + unsafe fn unlock(&self, _: &mut EmptyGuardContext) { + // SAFETY: The safety requirements of the function ensure that the raw spinlock is owned by + // the caller. + unsafe { bindings::raw_spin_unlock(self.spin_lock.get()) }; + } + + fn locked_data(&self) -> &UnsafeCell { + &self.data + } +} + +// SAFETY: The underlying kernel `raw_spinlock_t` object ensures mutual exclusion. 
+unsafe impl Lock for RawSpinLock { + type Inner = T; + type GuardContext = c_types::c_ulong; + + fn lock_noguard(&self) -> c_types::c_ulong { + // SAFETY: `spin_lock` points to valid memory. + unsafe { bindings::raw_spin_lock_irqsave(self.spin_lock.get()) } + } + + unsafe fn unlock(&self, ctx: &mut c_types::c_ulong) { + // SAFETY: The safety requirements of the function ensure that the raw spinlock is owned by + // the caller. + unsafe { bindings::raw_spin_unlock_irqrestore(self.spin_lock.get(), *ctx) }; + } + + fn locked_data(&self) -> &UnsafeCell { + &self.data + } +} From bc3e7f438ab481cd72502e2c58958142ae55aae6 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Fri, 11 Feb 2022 20:25:34 +0100 Subject: [PATCH 0026/1250] rust: add `kernel` crate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `kernel` crate currently includes all the abstractions that wrap kernel features written in C. These abstractions call the C side of the kernel via the generated bindings with the `bindgen` tool. Modules developed in Rust should never call the bindings themselves. In the future, as the abstractions grow in number, we may need to split this crate into several, possibly following a similar subdivision in subsystems as the kernel itself and/or moving the code to the actual subsystems. 
Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Geoffrey Thomas Signed-off-by: Geoffrey Thomas Co-developed-by: Finn Behrens Signed-off-by: Finn Behrens Co-developed-by: Adam Bratschi-Kaye Signed-off-by: Adam Bratschi-Kaye Co-developed-by: Michael Ellerman Signed-off-by: Michael Ellerman Co-developed-by: Sumera Priyadarsini Signed-off-by: Sumera Priyadarsini Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boris-Chengbiao Zhou Signed-off-by: Boris-Chengbiao Zhou Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Co-developed-by: Fox Chen Signed-off-by: Fox Chen Co-developed-by: Dan Robertson Signed-off-by: Dan Robertson Co-developed-by: Viktor Garske Signed-off-by: Viktor Garske Co-developed-by: Dariusz Sosnowski Signed-off-by: Dariusz Sosnowski Co-developed-by: Léo Lanteri Thauvin Signed-off-by: Léo Lanteri Thauvin Co-developed-by: Niklas Mohrin Signed-off-by: Niklas Mohrin Co-developed-by: Gioh Kim Signed-off-by: Gioh Kim Co-developed-by: Daniel Xu Signed-off-by: Daniel Xu Co-developed-by: Milan Landaverde Signed-off-by: Milan Landaverde Co-developed-by: Morgan Bartlett Signed-off-by: Morgan Bartlett Co-developed-by: Maciej Falkowski Signed-off-by: Maciej Falkowski Co-developed-by: Jiapeng Chong Signed-off-by: Jiapeng Chong Co-developed-by: Nándor István Krácser Signed-off-by: Nándor István Krácser Co-developed-by: David Gow Signed-off-by: David Gow Signed-off-by: Wedson Almeida Filho Co-developed-by: Miguel Ojeda Signed-off-by: Miguel Ojeda --- rust/kernel/allocator.rs | 65 +++ rust/kernel/amba.rs | 257 ++++++++++ rust/kernel/bindings.rs | 47 ++ rust/kernel/bindings_helper.h | 46 ++ rust/kernel/build_assert.rs | 82 ++++ rust/kernel/c_types.rs | 119 +++++ rust/kernel/chrdev.rs | 207 ++++++++ rust/kernel/clk.rs | 79 ++++ rust/kernel/cred.rs | 46 ++ rust/kernel/device.rs | 546 +++++++++++++++++++++ rust/kernel/driver.rs | 442 +++++++++++++++++ 
rust/kernel/error.rs | 565 ++++++++++++++++++++++ rust/kernel/file.rs | 860 ++++++++++++++++++++++++++++++++++ rust/kernel/gpio.rs | 478 +++++++++++++++++++ rust/kernel/hwrng.rs | 242 ++++++++++ rust/kernel/io_buffer.rs | 153 ++++++ rust/kernel/io_mem.rs | 275 +++++++++++ rust/kernel/iov_iter.rs | 81 ++++ rust/kernel/irq.rs | 411 ++++++++++++++++ rust/kernel/kasync.rs | 6 + rust/kernel/kasync/net.rs | 322 +++++++++++++ rust/kernel/kunit.rs | 91 ++++ rust/kernel/lib.rs | 261 +++++++++++ rust/kernel/linked_list.rs | 247 ++++++++++ rust/kernel/miscdev.rs | 291 ++++++++++++ rust/kernel/mm.rs | 149 ++++++ rust/kernel/module_param.rs | 498 ++++++++++++++++++++ rust/kernel/net.rs | 392 ++++++++++++++++ rust/kernel/net/filter.rs | 447 ++++++++++++++++++ rust/kernel/of.rs | 63 +++ rust/kernel/pages.rs | 144 ++++++ rust/kernel/platform.rs | 223 +++++++++ rust/kernel/power.rs | 118 +++++ rust/kernel/prelude.rs | 36 ++ rust/kernel/print.rs | 405 ++++++++++++++++ rust/kernel/random.rs | 42 ++ rust/kernel/raw_list.rs | 361 ++++++++++++++ rust/kernel/rbtree.rs | 563 ++++++++++++++++++++++ rust/kernel/revocable.rs | 161 +++++++ rust/kernel/security.rs | 38 ++ rust/kernel/static_assert.rs | 38 ++ rust/kernel/std_vendor.rs | 160 +++++++ rust/kernel/str.rs | 597 +++++++++++++++++++++++ rust/kernel/sysctl.rs | 199 ++++++++ rust/kernel/task.rs | 175 +++++++ rust/kernel/types.rs | 679 +++++++++++++++++++++++++++ rust/kernel/user_ptr.rs | 175 +++++++ 47 files changed, 11882 insertions(+) create mode 100644 rust/kernel/allocator.rs create mode 100644 rust/kernel/amba.rs create mode 100644 rust/kernel/bindings.rs create mode 100644 rust/kernel/bindings_helper.h create mode 100644 rust/kernel/build_assert.rs create mode 100644 rust/kernel/c_types.rs create mode 100644 rust/kernel/chrdev.rs create mode 100644 rust/kernel/clk.rs create mode 100644 rust/kernel/cred.rs create mode 100644 rust/kernel/device.rs create mode 100644 rust/kernel/driver.rs create mode 100644 rust/kernel/error.rs 
create mode 100644 rust/kernel/file.rs create mode 100644 rust/kernel/gpio.rs create mode 100644 rust/kernel/hwrng.rs create mode 100644 rust/kernel/io_buffer.rs create mode 100644 rust/kernel/io_mem.rs create mode 100644 rust/kernel/iov_iter.rs create mode 100644 rust/kernel/irq.rs create mode 100644 rust/kernel/kasync.rs create mode 100644 rust/kernel/kasync/net.rs create mode 100644 rust/kernel/kunit.rs create mode 100644 rust/kernel/lib.rs create mode 100644 rust/kernel/linked_list.rs create mode 100644 rust/kernel/miscdev.rs create mode 100644 rust/kernel/mm.rs create mode 100644 rust/kernel/module_param.rs create mode 100644 rust/kernel/net.rs create mode 100644 rust/kernel/net/filter.rs create mode 100644 rust/kernel/of.rs create mode 100644 rust/kernel/pages.rs create mode 100644 rust/kernel/platform.rs create mode 100644 rust/kernel/power.rs create mode 100644 rust/kernel/prelude.rs create mode 100644 rust/kernel/print.rs create mode 100644 rust/kernel/random.rs create mode 100644 rust/kernel/raw_list.rs create mode 100644 rust/kernel/rbtree.rs create mode 100644 rust/kernel/revocable.rs create mode 100644 rust/kernel/security.rs create mode 100644 rust/kernel/static_assert.rs create mode 100644 rust/kernel/std_vendor.rs create mode 100644 rust/kernel/str.rs create mode 100644 rust/kernel/sysctl.rs create mode 100644 rust/kernel/task.rs create mode 100644 rust/kernel/types.rs create mode 100644 rust/kernel/user_ptr.rs diff --git a/rust/kernel/allocator.rs b/rust/kernel/allocator.rs new file mode 100644 index 00000000000000..4c5d2fc6f206a0 --- /dev/null +++ b/rust/kernel/allocator.rs @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Allocator support. 
+ +use core::alloc::{GlobalAlloc, Layout}; +use core::ptr; + +use crate::bindings; +use crate::c_types; + +struct KernelAllocator; + +unsafe impl GlobalAlloc for KernelAllocator { + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + // `krealloc()` is used instead of `kmalloc()` because the latter is + // an inline function and cannot be bound to as a result. + unsafe { bindings::krealloc(ptr::null(), layout.size(), bindings::GFP_KERNEL) as *mut u8 } + } + + unsafe fn dealloc(&self, ptr: *mut u8, _layout: Layout) { + unsafe { + bindings::kfree(ptr as *const c_types::c_void); + } + } +} + +#[global_allocator] +static ALLOCATOR: KernelAllocator = KernelAllocator; + +// `rustc` only generates these for some crate types. Even then, we would need +// to extract the object file that has them from the archive. For the moment, +// let's generate them ourselves instead. +// +// Note that `#[no_mangle]` implies exported too, nowadays. +#[no_mangle] +fn __rust_alloc(size: usize, _align: usize) -> *mut u8 { + unsafe { bindings::krealloc(core::ptr::null(), size, bindings::GFP_KERNEL) as *mut u8 } +} + +#[no_mangle] +fn __rust_dealloc(ptr: *mut u8, _size: usize, _align: usize) { + unsafe { bindings::kfree(ptr as *const c_types::c_void) }; +} + +#[no_mangle] +fn __rust_realloc(ptr: *mut u8, _old_size: usize, _align: usize, new_size: usize) -> *mut u8 { + unsafe { + bindings::krealloc( + ptr as *const c_types::c_void, + new_size, + bindings::GFP_KERNEL, + ) as *mut u8 + } +} + +#[no_mangle] +fn __rust_alloc_zeroed(size: usize, _align: usize) -> *mut u8 { + unsafe { + bindings::krealloc( + core::ptr::null(), + size, + bindings::GFP_KERNEL | bindings::__GFP_ZERO, + ) as *mut u8 + } +} diff --git a/rust/kernel/amba.rs b/rust/kernel/amba.rs new file mode 100644 index 00000000000000..7ca5358d2580f5 --- /dev/null +++ b/rust/kernel/amba.rs @@ -0,0 +1,257 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Amba devices and drivers. +//! +//! 
C header: [`include/linux/amba/bus.h`](../../../../include/linux/amba/bus.h) + +use crate::{ + bindings, c_types, device, driver, error::from_kernel_result, io_mem::Resource, power, + str::CStr, to_result, types::PointerWrapper, Result, ThisModule, +}; + +/// A registration of an amba driver. +pub type Registration = driver::Registration>; + +/// Id of an Amba device. +#[derive(Clone, Copy)] +pub struct DeviceId { + /// Device id. + pub id: u32, + + /// Mask that identifies which bits are valid in the device id. + pub mask: u32, +} + +// SAFETY: `ZERO` is all zeroed-out and `to_rawid` stores `offset` in `amba_id::data`. +unsafe impl const driver::RawDeviceId for DeviceId { + type RawType = bindings::amba_id; + const ZERO: Self::RawType = bindings::amba_id { + id: 0, + mask: 0, + data: core::ptr::null_mut(), + }; + + fn to_rawid(&self, offset: isize) -> Self::RawType { + bindings::amba_id { + id: self.id, + mask: self.mask, + data: offset as _, + } + } +} + +/// An amba driver. +pub trait Driver { + /// Data stored on device by driver. + type Data: PointerWrapper + Send + Sync + driver::DeviceRemoval = (); + + /// The type that implements the power-management operations. + /// + /// The default is a type that implements no power-management operations. Drivers that do + /// implement them need to specify the type (commonly [`Self`]). + type PowerOps: power::Operations = power::NoOperations; + + /// The type holding information about each device id supported by the driver. + type IdInfo: 'static = (); + + /// The table of device ids supported by the driver. + const ID_TABLE: Option> = None; + + /// Probes for the device with the given id. + fn probe(dev: &mut Device, id_info: Option<&Self::IdInfo>) -> Result; + + /// Cleans any resources up that are associated with the device. + /// + /// This is called when the driver is detached from the device. + fn remove(_data: &Self::Data) {} +} + +/// An adapter for the registration of Amba drivers. 
+pub struct Adapter(T); + +impl driver::DriverOps for Adapter { + type RegType = bindings::amba_driver; + + unsafe fn register( + reg: *mut bindings::amba_driver, + name: &'static CStr, + module: &'static ThisModule, + ) -> Result { + // SAFETY: By the safety requirements of this function (defined in the trait definition), + // `reg` is non-null and valid. + let amba = unsafe { &mut *reg }; + amba.drv.name = name.as_char_ptr(); + amba.drv.owner = module.0; + amba.probe = Some(probe_callback::); + amba.remove = Some(remove_callback::); + if let Some(t) = T::ID_TABLE { + amba.id_table = t.as_ref(); + } + if cfg!(CONFIG_PM) { + // SAFETY: `probe_callback` sets the driver data after calling `T::Data::into_pointer`, + // and we guarantee that `T::Data` is the same as `T::PowerOps::Data` by a constraint + // in the type declaration. + amba.drv.pm = unsafe { power::OpsTable::::build() }; + } + // SAFETY: By the safety requirements of this function, `reg` is valid and fully + // initialised. + to_result(|| unsafe { bindings::amba_driver_register(reg) }) + } + + unsafe fn unregister(reg: *mut bindings::amba_driver) { + // SAFETY: By the safety requirements of this function (defined in the trait definition), + // `reg` was passed (and updated) by a previous successful call to `amba_driver_register`. + unsafe { bindings::amba_driver_unregister(reg) }; + } +} + +unsafe extern "C" fn probe_callback( + adev: *mut bindings::amba_device, + aid: *const bindings::amba_id, +) -> c_types::c_int { + from_kernel_result! { + // SAFETY: `adev` is valid by the contract with the C code. `dev` is alive only for the + // duration of this call, so it is guaranteed to remain alive for the lifetime of `dev`. + let mut dev = unsafe { Device::from_ptr(adev) }; + // SAFETY: `aid` is valid by the requirements the contract with the C code. 
+ let offset = unsafe { (*aid).data }; + let info = if offset.is_null() { + None + } else { + // SAFETY: The offset comes from a previous call to `offset_from` in `IdArray::new`, + // which guarantees that the resulting pointer is within the table. + let ptr = unsafe { aid.cast::().offset(offset as _).cast::>() }; + // SAFETY: The id table has a static lifetime, so `ptr` is guaranteed to be valid for + // read. + unsafe { (&*ptr).as_ref() } + }; + let data = T::probe(&mut dev, info)?; + let ptr = T::Data::into_pointer(data); + // SAFETY: `adev` is valid for write by the contract with the C code. + unsafe { bindings::amba_set_drvdata(adev, ptr as _) }; + Ok(0) + } +} + +unsafe extern "C" fn remove_callback(adev: *mut bindings::amba_device) { + // SAFETY: `adev` is valid by the contract with the C code. + let ptr = unsafe { bindings::amba_get_drvdata(adev) }; + // SAFETY: The value returned by `amba_get_drvdata` was stored by a previous call to + // `amba_set_drvdata` in `probe_callback` above; the value comes from a call to + // `T::Data::into_pointer`. + let data = unsafe { T::Data::from_pointer(ptr) }; + T::remove(&data); + ::device_remove(&data); +} + +/// An Amba device. +/// +/// # Invariants +/// +/// The field `ptr` is non-null and valid for the lifetime of the object. +pub struct Device { + ptr: *mut bindings::amba_device, + res: Option, +} + +impl Device { + /// Creates a new device from the given pointer. + /// + /// # Safety + /// + /// `ptr` must be non-null and valid. It must remain valid for the lifetime of the returned + /// instance. + unsafe fn from_ptr(ptr: *mut bindings::amba_device) -> Self { + // SAFETY: The safety requirements of the function ensure that `ptr` is valid. + let dev = unsafe { &mut *ptr }; + // INVARIANT: The safety requirements of the function ensure the lifetime invariant. + Self { + ptr, + res: Resource::new(dev.res.start, dev.res.end), + } + } + + /// Returns the io mem resource associated with the device, if there is one. 
+ /// + /// Ownership of the resource is transferred to the caller, so subsequent calls to this + /// function will return [`None`]. + pub fn take_resource(&mut self) -> Option { + self.res.take() + } + + /// Returns the index-th irq associated with the device, if one exists. + pub fn irq(&self, index: usize) -> Option { + // SAFETY: By the type invariants, `self.ptr` is valid for read. + let dev = unsafe { &*self.ptr }; + if index >= dev.irq.len() || dev.irq[index] == 0 { + None + } else { + Some(dev.irq[index]) + } + } +} + +// SAFETY: The device returned by `raw_device` is the raw Amba device. +unsafe impl device::RawDevice for Device { + fn raw_device(&self) -> *mut bindings::device { + // SAFETY: By the type invariants, we know that `self.ptr` is non-null and valid. + unsafe { &mut (*self.ptr).dev } + } +} + +/// Declares a kernel module that exposes a single amba driver. +/// +/// # Examples +/// +/// ```ignore +/// # use kernel::{amba, define_amba_id_table, module_amba_driver}; +/// # +/// struct MyDriver; +/// impl amba::Driver for MyDriver { +/// // [...] +/// # fn probe(_dev: &mut amba::Device, _id: Option<&Self::IdInfo>) -> Result { +/// # Ok(()) +/// # } +/// # define_amba_id_table! {(), [ +/// # ({ id: 0x00041061, mask: 0x000fffff }, None), +/// # ]} +/// } +/// +/// module_amba_driver! { +/// type: MyDriver, +/// name: b"module_name", +/// author: b"Author name", +/// license: b"GPL", +/// } +/// ``` +#[macro_export] +macro_rules! module_amba_driver { + ($($f:tt)*) => { + $crate::module_driver!(, $crate::amba::Adapter, { $($f)* }); + }; +} + +/// Defines the id table for amba devices. +/// +/// # Examples +/// +/// ``` +/// # use kernel::{amba, define_amba_id_table}; +/// # +/// # struct Sample; +/// # impl kernel::amba::Driver for Sample { +/// # fn probe(_dev: &mut amba::Device, _id: Option<&Self::IdInfo>) -> Result { +/// # Ok(()) +/// # } +/// define_amba_id_table! 
{(), [ +/// ({ id: 0x00041061, mask: 0x000fffff }, None), +/// ]} +/// # } +/// ``` +#[macro_export] +macro_rules! define_amba_id_table { + ($data_type:ty, $($t:tt)*) => { + type IdInfo = $data_type; + $crate::define_id_table!(ID_TABLE, $crate::amba::DeviceId, $data_type, $($t)*); + }; +} diff --git a/rust/kernel/bindings.rs b/rust/kernel/bindings.rs new file mode 100644 index 00000000000000..29a21030688e56 --- /dev/null +++ b/rust/kernel/bindings.rs @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Bindings. +//! +//! Imports the bindings generated by `bindgen`. + +// See https://github.com/rust-lang/rust-bindgen/issues/1651. +#![cfg_attr(test, allow(deref_nullptr))] +#![cfg_attr(test, allow(unaligned_references))] +#![cfg_attr(test, allow(unsafe_op_in_unsafe_fn))] +#![allow( + clippy::all, + non_camel_case_types, + non_upper_case_globals, + non_snake_case, + improper_ctypes, + unreachable_pub, + unsafe_op_in_unsafe_fn +)] + +mod bindings_raw { + // Use glob import here to expose all helpers. + // Symbols defined within the module will take precedence over the glob import. + pub use super::bindings_helper::*; + use crate::c_types; + include!(concat!(env!("OBJTREE"), "/rust/bindings_generated.rs")); +} + +// When both a directly exposed symbol and a helper exist for the same function, +// the directly exposed symbol is preferred and the helper becomes dead code, so +// ignore the warning here. +#[allow(dead_code)] +mod bindings_helper { + // Import the generated bindings for types. 
+ use super::bindings_raw::*; + use crate::c_types; + include!(concat!( + env!("OBJTREE"), + "/rust/bindings_helpers_generated.rs" + )); +} + +pub use bindings_raw::*; + +pub const GFP_KERNEL: gfp_t = BINDINGS_GFP_KERNEL; +pub const __GFP_ZERO: gfp_t = BINDINGS___GFP_ZERO; +pub const __GFP_HIGHMEM: gfp_t = ___GFP_HIGHMEM; diff --git a/rust/kernel/bindings_helper.h b/rust/kernel/bindings_helper.h new file mode 100644 index 00000000000000..73100fa139ebb5 --- /dev/null +++ b/rust/kernel/bindings_helper.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Header that contains the code (mostly headers) for which Rust bindings + * will be automatically generated by `bindgen`. + * + * Sorted alphabetically. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* `bindgen` gets confused at certain things. */ +const gfp_t BINDINGS_GFP_KERNEL = GFP_KERNEL; +const gfp_t BINDINGS___GFP_ZERO = __GFP_ZERO; +const __poll_t BINDINGS_EPOLLIN = EPOLLIN; +const __poll_t BINDINGS_EPOLLOUT = EPOLLOUT; +const __poll_t BINDINGS_EPOLLERR = EPOLLERR; +const __poll_t BINDINGS_EPOLLHUP = EPOLLHUP; diff --git a/rust/kernel/build_assert.rs b/rust/kernel/build_assert.rs new file mode 100644 index 00000000000000..18cffec7d03723 --- /dev/null +++ b/rust/kernel/build_assert.rs @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Build-time assert. + +/// Fails the build if the code path calling `build_error!` can possibly be executed. +/// +/// If the macro is executed in const context, `build_error!` will panic. +/// If the compiler or optimizer cannot guarantee that `build_error!` can never +/// be called, a build error will be triggered. 
+/// +/// # Examples +/// ``` +/// # use kernel::build_error; +/// #[inline] +/// fn foo(a: usize) -> usize { +/// a.checked_add(1).unwrap_or_else(|| build_error!("overflow")) +/// } +/// +/// assert_eq!(foo(usize::MAX - 1), usize::MAX); // OK. +/// // foo(usize::MAX); // Fails to compile. +/// ``` +#[macro_export] +macro_rules! build_error { + () => {{ + $crate::build_error("") + }}; + ($msg:expr) => {{ + $crate::build_error($msg) + }}; +} + +/// Asserts that a boolean expression is `true` at compile time. +/// +/// If the condition is evaluated to `false` in const context, `build_assert!` +/// will panic. If the compiler or optimizer cannot guarantee the condition will +/// be evaluated to `true`, a build error will be triggered. +/// +/// [`static_assert!`] should be preferred to `build_assert!` whenever possible. +/// +/// # Examples +/// +/// These examples show that different types of [`assert!`] will trigger errors +/// at different stages of compilation. It is preferred to err as early as +/// possible, so [`static_assert!`] should be used whenever possible. +// TODO: Could be `compile_fail` when supported. +/// ```ignore +/// fn foo() { +/// static_assert!(1 > 1); // Compile-time error +/// build_assert!(1 > 1); // Build-time error +/// assert!(1 > 1); // Run-time error +/// } +/// ``` +/// +/// When the condition refers to generic parameters or parameters of an inline function, +/// [`static_assert!`] cannot be used. Use `build_assert!` in this scenario. +/// ``` +/// fn foo<const N: usize>() { +/// // `static_assert!(N > 1);` is not allowed +/// build_assert!(N > 1); // Build-time check +/// assert!(N > 1); // Run-time check +/// } +/// +/// #[inline] +/// fn bar(n: usize) { +/// // `static_assert!(n > 1);` is not allowed +/// build_assert!(n > 1); // Build-time check +/// assert!(n > 1); // Run-time check +/// } +/// ``` +#[macro_export] +macro_rules! build_assert { + ($cond:expr $(,)?) 
=> {{ + if !$cond { + $crate::build_error(concat!("assertion failed: ", stringify!($cond))); + } + }}; + ($cond:expr, $msg:expr) => {{ + if !$cond { + $crate::build_error($msg); + } + }}; +} diff --git a/rust/kernel/c_types.rs b/rust/kernel/c_types.rs new file mode 100644 index 00000000000000..07593a3ba8bedb --- /dev/null +++ b/rust/kernel/c_types.rs @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! C types for the bindings. +//! +//! The bindings generated by `bindgen` use these types to map to the C ones. +//! +//! C's standard integer types may differ in width depending on +//! the architecture, thus we need to conditionally compile those. + +#![allow(non_camel_case_types)] + +#[cfg(any(target_arch = "arm", target_arch = "x86", target_arch = "riscv32",))] +mod c { + /// C `void` type. + pub type c_void = core::ffi::c_void; + + /// C `char` type. + pub type c_char = i8; + + /// C `signed char` type. + pub type c_schar = i8; + + /// C `unsigned char` type. + pub type c_uchar = u8; + + /// C `short` type. + pub type c_short = i16; + + /// C `unsigned short` type. + pub type c_ushort = u16; + + /// C `int` type. + pub type c_int = i32; + + /// C `unsigned int` type. + pub type c_uint = u32; + + /// C `long` type. + pub type c_long = i32; + + /// C `unsigned long` type. + pub type c_ulong = u32; + + /// C `long long` type. + pub type c_longlong = i64; + + /// C `unsigned long long` type. + pub type c_ulonglong = u64; + + /// C `ssize_t` type (typically defined in `` by POSIX). + /// + /// For some 32-bit architectures like this one, the kernel defines it as + /// `int`, i.e. it is an [`i32`]. + pub type c_ssize_t = isize; + + /// C `size_t` type (typically defined in ``). + /// + /// For some 32-bit architectures like this one, the kernel defines it as + /// `unsigned int`, i.e. it is an [`u32`]. 
+ pub type c_size_t = usize; +} + +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "powerpc64", + target_arch = "riscv64", +))] +mod c { + /// C `void` type. + pub type c_void = core::ffi::c_void; + + /// C `char` type. + pub type c_char = i8; + + /// C `signed char` type. + pub type c_schar = i8; + + /// C `unsigned char` type. + pub type c_uchar = u8; + + /// C `short` type. + pub type c_short = i16; + + /// C `unsigned short` type. + pub type c_ushort = u16; + + /// C `int` type. + pub type c_int = i32; + + /// C `unsigned int` type. + pub type c_uint = u32; + + /// C `long` type. + pub type c_long = i64; + + /// C `unsigned long` type. + pub type c_ulong = u64; + + /// C `long long` type. + pub type c_longlong = i64; + + /// C `unsigned long long` type. + pub type c_ulonglong = u64; + + /// C `ssize_t` type (typically defined in `` by POSIX). + /// + /// For 64-bit architectures like this one, the kernel defines it as + /// `long`, i.e. it is an [`i64`]. + pub type c_ssize_t = isize; + + /// C `size_t` type (typically defined in ``). + /// + /// For 64-bit architectures like this one, the kernel defines it as + /// `unsigned long`, i.e. it is an [`u64`]. + pub type c_size_t = usize; +} + +pub use c::*; diff --git a/rust/kernel/chrdev.rs b/rust/kernel/chrdev.rs new file mode 100644 index 00000000000000..85a52c2d0b8ad9 --- /dev/null +++ b/rust/kernel/chrdev.rs @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Character devices. +//! +//! Also called "char devices", `chrdev`, `cdev`. +//! +//! C header: [`include/linux/cdev.h`](../../../../include/linux/cdev.h) +//! +//! Reference: + +use alloc::boxed::Box; +use core::convert::TryInto; +use core::marker::PhantomPinned; +use core::pin::Pin; + +use crate::bindings; +use crate::c_types; +use crate::error::{code::*, Error, Result}; +use crate::file; +use crate::str::CStr; + +/// Character device. +/// +/// # Invariants +/// +/// - [`self.0`] is valid and non-null. 
+/// - [`(*self.0).ops`] is valid, non-null and has static lifetime. +/// - [`(*self.0).owner`] is valid and, if non-null, has module lifetime. +struct Cdev(*mut bindings::cdev); + +impl Cdev { + fn alloc( + fops: &'static bindings::file_operations, + module: &'static crate::ThisModule, + ) -> Result { + // SAFETY: FFI call. + let cdev = unsafe { bindings::cdev_alloc() }; + if cdev.is_null() { + return Err(ENOMEM); + } + // SAFETY: `cdev` is valid and non-null since `cdev_alloc()` + // returned a valid pointer which was null-checked. + unsafe { + (*cdev).ops = fops; + (*cdev).owner = module.0; + } + // INVARIANTS: + // - [`self.0`] is valid and non-null. + // - [`(*self.0).ops`] is valid, non-null and has static lifetime, + // because it was coerced from a reference with static lifetime. + // - [`(*self.0).owner`] is valid and, if non-null, has module lifetime, + // guaranteed by the [`ThisModule`] invariant. + Ok(Self(cdev)) + } + + fn add(&mut self, dev: bindings::dev_t, count: c_types::c_uint) -> Result { + // SAFETY: According to the type invariants: + // - [`self.0`] can be safely passed to [`bindings::cdev_add`]. + // - [`(*self.0).ops`] will live at least as long as [`self.0`]. + // - [`(*self.0).owner`] will live at least as long as the + // module, which is an implicit requirement. + let rc = unsafe { bindings::cdev_add(self.0, dev, count) }; + if rc != 0 { + return Err(Error::from_kernel_errno(rc)); + } + Ok(()) + } +} + +impl Drop for Cdev { + fn drop(&mut self) { + // SAFETY: [`self.0`] is valid and non-null by the type invariants. + unsafe { + bindings::cdev_del(self.0); + } + } +} + +struct RegistrationInner { + dev: bindings::dev_t, + used: usize, + cdevs: [Option; N], + _pin: PhantomPinned, +} + +/// Character device registration. +/// +/// May contain up to a fixed number (`N`) of devices. Must be pinned. 
+pub struct Registration { + name: &'static CStr, + minors_start: u16, + this_module: &'static crate::ThisModule, + inner: Option>, +} + +impl Registration<{ N }> { + /// Creates a [`Registration`] object for a character device. + /// + /// This does *not* register the device: see [`Self::register()`]. + /// + /// This associated function is intended to be used when you need to avoid + /// a memory allocation, e.g. when the [`Registration`] is a member of + /// a bigger structure inside your [`crate::Module`] instance. If you + /// are going to pin the registration right away, call + /// [`Self::new_pinned()`] instead. + pub fn new( + name: &'static CStr, + minors_start: u16, + this_module: &'static crate::ThisModule, + ) -> Self { + Registration { + name, + minors_start, + this_module, + inner: None, + } + } + + /// Creates a pinned [`Registration`] object for a character device. + /// + /// This does *not* register the device: see [`Self::register()`]. + pub fn new_pinned( + name: &'static CStr, + minors_start: u16, + this_module: &'static crate::ThisModule, + ) -> Result>> { + Ok(Pin::from(Box::try_new(Self::new( + name, + minors_start, + this_module, + ))?)) + } + + /// Registers a character device. + /// + /// You may call this once per device type, up to `N` times. + pub fn register>(self: Pin<&mut Self>) -> Result { + // SAFETY: We must ensure that we never move out of `this`. + let this = unsafe { self.get_unchecked_mut() }; + if this.inner.is_none() { + let mut dev: bindings::dev_t = 0; + // SAFETY: Calling unsafe function. `this.name` has `'static` + // lifetime. 
+ let res = unsafe { + bindings::alloc_chrdev_region( + &mut dev, + this.minors_start.into(), + N.try_into()?, + this.name.as_char_ptr(), + ) + }; + if res != 0 { + return Err(Error::from_kernel_errno(res)); + } + const NONE: Option = None; + this.inner = Some(RegistrationInner { + dev, + used: 0, + cdevs: [NONE; N], + _pin: PhantomPinned, + }); + } + + let mut inner = this.inner.as_mut().unwrap(); + if inner.used == N { + return Err(EINVAL); + } + + // SAFETY: The adapter doesn't retrieve any state yet, so it's compatible with any + // registration. + let fops = unsafe { file::OperationsVtable::::build() }; + let mut cdev = Cdev::alloc(fops, this.this_module)?; + cdev.add(inner.dev + inner.used as bindings::dev_t, 1)?; + inner.cdevs[inner.used].replace(cdev); + inner.used += 1; + Ok(()) + } +} + +impl file::OpenAdapter<()> for Registration<{ N }> { + unsafe fn convert(_inode: *mut bindings::inode, _file: *mut bindings::file) -> *const () { + // TODO: Update the SAFETY comment on the call to `FileOperationsVTable::build` above once + // this is updated to retrieve state. + &() + } +} + +// SAFETY: `Registration` does not expose any of its state across threads +// (it is fine for multiple threads to have a shared reference to it). +unsafe impl Sync for Registration<{ N }> {} + +impl Drop for Registration<{ N }> { + fn drop(&mut self) { + if let Some(inner) = self.inner.as_mut() { + // Replicate kernel C behaviour: drop [`Cdev`]s before calling + // [`bindings::unregister_chrdev_region`]. + for i in 0..inner.used { + inner.cdevs[i].take(); + } + // SAFETY: [`self.inner`] is Some, so [`inner.dev`] was previously + // created using [`bindings::alloc_chrdev_region`]. + unsafe { + bindings::unregister_chrdev_region(inner.dev, N.try_into().unwrap()); + } + } + } +} diff --git a/rust/kernel/clk.rs b/rust/kernel/clk.rs new file mode 100644 index 00000000000000..465462b9bc854e --- /dev/null +++ b/rust/kernel/clk.rs @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! 
Common clock framework. +//! +//! C header: [`include/linux/clk.h`](../../../../include/linux/clk.h) + +use crate::{bindings, error::Result, to_result}; +use core::mem::ManuallyDrop; + +/// Represents `struct clk *`. +/// +/// # Invariants +/// +/// The pointer is valid. +pub struct Clk(*mut bindings::clk); + +impl Clk { + /// Creates new clock structure from a raw pointer. + /// + /// # Safety + /// + /// The pointer must be valid. + pub unsafe fn new(clk: *mut bindings::clk) -> Self { + Self(clk) + } + + /// Returns value of the rate field of `struct clk`. + pub fn get_rate(&self) -> usize { + // SAFETY: The pointer is valid by the type invariant. + unsafe { bindings::clk_get_rate(self.0) as usize } + } + + /// Prepares and enables the underlying hardware clock. + /// + /// This function should not be called in atomic context. + pub fn prepare_enable(self) -> Result { + // SAFETY: The pointer is valid by the type invariant. + to_result(|| unsafe { bindings::clk_prepare_enable(self.0) })?; + Ok(EnabledClk(self)) + } +} + +impl Drop for Clk { + fn drop(&mut self) { + // SAFETY: The pointer is valid by the type invariant. + unsafe { bindings::clk_put(self.0) }; + } +} + +// SAFETY: `Clk` is not restricted to a single thread so it is safe +// to move it between threads. +unsafe impl Send for Clk {} + +/// A clock variant that is prepared and enabled. +pub struct EnabledClk(Clk); + +impl EnabledClk { + /// Returns value of the rate field of `struct clk`. + pub fn get_rate(&self) -> usize { + self.0.get_rate() + } + + /// Disables and later unprepares the underlying hardware clock prematurely. + /// + /// This function should not be called in atomic context. + pub fn disable_unprepare(self) -> Clk { + let mut clk = ManuallyDrop::new(self); + // SAFETY: The pointer is valid by the type invariant. 
+ unsafe { bindings::clk_disable_unprepare(clk.0 .0) }; + core::mem::replace(&mut clk.0, Clk(core::ptr::null_mut())) + } +} + +impl Drop for EnabledClk { + fn drop(&mut self) { + // SAFETY: The pointer is valid by the type invariant. + unsafe { bindings::clk_disable_unprepare(self.0 .0) }; + } +} diff --git a/rust/kernel/cred.rs b/rust/kernel/cred.rs new file mode 100644 index 00000000000000..beacc71d92ac7f --- /dev/null +++ b/rust/kernel/cred.rs @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Credentials management. +//! +//! C header: [`include/linux/cred.h`](../../../../include/linux/cred.h) +//! +//! Reference: + +use crate::{bindings, AlwaysRefCounted}; +use core::cell::UnsafeCell; + +/// Wraps the kernel's `struct cred`. +/// +/// # Invariants +/// +/// Instances of this type are always ref-counted, that is, a call to `get_cred` ensures that the +/// allocation remains valid at least until the matching call to `put_cred`. +#[repr(transparent)] +pub struct Credential(pub(crate) UnsafeCell); + +impl Credential { + /// Creates a reference to a [`Credential`] from a valid pointer. + /// + /// # Safety + /// + /// The caller must ensure that `ptr` is valid and remains valid for the lifetime of the + /// returned [`Credential`] reference. + pub(crate) unsafe fn from_ptr<'a>(ptr: *const bindings::cred) -> &'a Self { + // SAFETY: The safety requirements guarantee the validity of the dereference, while the + // `Credential` type being transparent makes the cast ok. + unsafe { &*ptr.cast() } + } +} + +// SAFETY: The type invariants guarantee that `Credential` is always ref-counted. +unsafe impl AlwaysRefCounted for Credential { + fn inc_ref(&self) { + // SAFETY: The existence of a shared reference means that the refcount is nonzero. + unsafe { bindings::get_cred(self.0.get()) }; + } + + unsafe fn dec_ref(obj: core::ptr::NonNull) { + // SAFETY: The safety requirements guarantee that the refcount is nonzero. 
+ unsafe { bindings::put_cred(obj.cast().as_ptr()) }; + } +} diff --git a/rust/kernel/device.rs b/rust/kernel/device.rs new file mode 100644 index 00000000000000..236d278f55767c --- /dev/null +++ b/rust/kernel/device.rs @@ -0,0 +1,546 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Generic devices that are part of the kernel's driver model. +//! +//! C header: [`include/linux/device.h`](../../../../include/linux/device.h) + +#[cfg(CONFIG_COMMON_CLK)] +use crate::{clk::Clk, error::from_kernel_err_ptr}; + +use crate::{ + bindings, + revocable::{Revocable, RevocableGuard}, + str::CStr, + sync::{NeedsLockClass, RevocableMutex, RevocableMutexGuard, UniqueRef}, + Result, +}; +use core::{ + fmt, + ops::{Deref, DerefMut}, + pin::Pin, +}; + +#[cfg(CONFIG_PRINTK)] +use crate::{c_str, c_types}; + +/// A raw device. +/// +/// # Safety +/// +/// Implementers must ensure that the `*mut device` returned by [`RawDevice::raw_device`] is +/// related to `self`, that is, actions on it will affect `self`. For example, if one calls +/// `get_device`, then the refcount on the device represented by `self` will be incremented. +/// +/// Additionally, implementers must ensure that the device is never renamed. Commit a5462516aa994 +/// has details on why `device_rename` should not be used. +pub unsafe trait RawDevice { + /// Returns the raw `struct device` related to `self`. + fn raw_device(&self) -> *mut bindings::device; + + /// Returns the name of the device. + fn name(&self) -> &CStr { + let ptr = self.raw_device(); + + // SAFETY: `ptr` is valid because `self` keeps it alive. + let name = unsafe { bindings::dev_name(ptr) }; + + // SAFETY: The name of the device remains valid while it is alive (because the device is + // never renamed, per the safety requirement of this trait). This is guaranteed to be the + // case because the reference to `self` outlives the one of the returned `CStr` (enforced + // by the compiler because of their lifetimes). 
+ unsafe { CStr::from_char_ptr(name) } + } + + /// Looks up a clock producer consumed by this device. + /// + /// Returns a managed reference to the clock producer. + #[cfg(CONFIG_COMMON_CLK)] + fn clk_get(&self, id: Option<&CStr>) -> Result { + let id_ptr = match id { + Some(cstr) => cstr.as_char_ptr(), + None => core::ptr::null(), + }; + + // SAFETY: `id_ptr` is optional and may be either a valid pointer + // from the type invariant or NULL otherwise. + let clk_ptr = unsafe { from_kernel_err_ptr(bindings::clk_get(self.raw_device(), id_ptr)) }?; + + // SAFETY: Clock is initialized with a valid pointer returned from `bindings::clk_get` call. + unsafe { Ok(Clk::new(clk_ptr)) } + } + + /// Prints an emergency-level message (level 0) prefixed with device information. + /// + /// More details are available from [`dev_emerg`]. + fn pr_emerg(&self, args: fmt::Arguments<'_>) { + // SAFETY: `klevel` is null-terminated, uses one of the kernel constants. + unsafe { self.printk(bindings::KERN_EMERG, args) }; + } + + /// Prints an alert-level message (level 1) prefixed with device information. + /// + /// More details are available from [`dev_alert`]. + fn pr_alert(&self, args: fmt::Arguments<'_>) { + // SAFETY: `klevel` is null-terminated, uses one of the kernel constants. + unsafe { self.printk(bindings::KERN_ALERT, args) }; + } + + /// Prints a critical-level message (level 2) prefixed with device information. + /// + /// More details are available from [`dev_crit`]. + fn pr_crit(&self, args: fmt::Arguments<'_>) { + // SAFETY: `klevel` is null-terminated, uses one of the kernel constants. + unsafe { self.printk(bindings::KERN_CRIT, args) }; + } + + /// Prints an error-level message (level 3) prefixed with device information. + /// + /// More details are available from [`dev_err`]. + fn pr_err(&self, args: fmt::Arguments<'_>) { + // SAFETY: `klevel` is null-terminated, uses one of the kernel constants. 
+ unsafe { self.printk(bindings::KERN_ERR, args) }; + } + + /// Prints a warning-level message (level 4) prefixed with device information. + /// + /// More details are available from [`dev_warn`]. + fn pr_warn(&self, args: fmt::Arguments<'_>) { + // SAFETY: `klevel` is null-terminated, uses one of the kernel constants. + unsafe { self.printk(bindings::KERN_WARNING, args) }; + } + + /// Prints a notice-level message (level 5) prefixed with device information. + /// + /// More details are available from [`dev_notice`]. + fn pr_notice(&self, args: fmt::Arguments<'_>) { + // SAFETY: `klevel` is null-terminated, uses one of the kernel constants. + unsafe { self.printk(bindings::KERN_NOTICE, args) }; + } + + /// Prints an info-level message (level 6) prefixed with device information. + /// + /// More details are available from [`dev_info`]. + fn pr_info(&self, args: fmt::Arguments<'_>) { + // SAFETY: `klevel` is null-terminated, uses one of the kernel constants. + unsafe { self.printk(bindings::KERN_INFO, args) }; + } + + /// Prints a debug-level message (level 7) prefixed with device information. + /// + /// More details are available from [`dev_dbg`]. + fn pr_dbg(&self, args: fmt::Arguments<'_>) { + if cfg!(debug_assertions) { + // SAFETY: `klevel` is null-terminated, uses one of the kernel constants. + unsafe { self.printk(bindings::KERN_DEBUG, args) }; + } + } + + /// Prints the provided message to the console. + /// + /// # Safety + /// + /// Callers must ensure that `klevel` is null-terminated; in particular, one of the + /// `KERN_*`constants, for example, `KERN_CRIT`, `KERN_ALERT`, etc. + #[cfg_attr(not(CONFIG_PRINTK), allow(unused_variables))] + unsafe fn printk(&self, klevel: &[u8], msg: fmt::Arguments<'_>) { + // SAFETY: `klevel` is null-terminated and one of the kernel constants. `self.raw_device` + // is valid because `self` is valid. The "%pA" format string expects a pointer to + // `fmt::Arguments`, which is what we're passing as the last argument. 
+ #[cfg(CONFIG_PRINTK)] + unsafe { + bindings::_dev_printk( + klevel as *const _ as *const c_types::c_char, + self.raw_device(), + c_str!("%pA").as_char_ptr(), + &msg as *const _ as *const c_types::c_void, + ) + }; + } +} + +/// A ref-counted device. +/// +/// # Invariants +/// +/// `ptr` is valid, non-null, and has a non-zero reference count. One of the references is owned by +/// `self`, and will be decremented when `self` is dropped. +pub struct Device { + pub(crate) ptr: *mut bindings::device, +} + +// SAFETY: `Device` only holds a pointer to a C device, which is safe to be used from any thread. +unsafe impl Send for Device {} + +// SAFETY: `Device` only holds a pointer to a C device, references to which are safe to be used +// from any thread. +unsafe impl Sync for Device {} + +impl Device { + /// Creates a new device instance. + /// + /// # Safety + /// + /// Callers must ensure that `ptr` is valid, non-null, and has a non-zero reference count. + pub unsafe fn new(ptr: *mut bindings::device) -> Self { + // SAFETY: By the safety requirements, `ptr` is valid and its refcount will be incremented. + unsafe { bindings::get_device(ptr) }; + // INVARIANT: The safety requirements satisfy all but one invariant, which is that `self` + // owns a reference. This is satisfied by the call to `get_device` above. + Self { ptr } + } + + /// Creates a new device instance from an existing [`RawDevice`] instance. + pub fn from_dev(dev: &dyn RawDevice) -> Self { + // SAFETY: The requirements are satisfied by the existence of `RawDevice` and its safety + // requirements. + unsafe { Self::new(dev.raw_device()) } + } +} + +// SAFETY: The device returned by `raw_device` is the one for which we hold a reference. +unsafe impl RawDevice for Device { + fn raw_device(&self) -> *mut bindings::device { + self.ptr + } +} + +impl Drop for Device { + fn drop(&mut self) { + // SAFETY: By the type invariants, we know that `self` owns a reference, so it is safe to + // relinquish it now. 
+ unsafe { bindings::put_device(self.ptr) }; + } +} + +/// Device data. +/// +/// When a device is removed (for whatever reason, for example, because the device was unplugged or +/// because the user decided to unbind the driver), the driver is given a chance to clean its state +/// up, and all io resources should ideally not be used anymore. +/// +/// However, the device data is reference-counted because other subsystems hold pointers to it. So +/// some device state must be freed and not used anymore, while others must remain accessible. +/// +/// This struct separates the device data into three categories: +/// 1. Registrations: are destroyed when the device is removed, but before the io resources +/// become inaccessible. +/// 2. Io resources: are available until the device is removed. +/// 3. General data: remain available as long as the ref count is nonzero. +/// +/// This struct implements the `DeviceRemoval` trait so that it can clean resources up even if not +/// explicitly called by the device drivers. +pub struct Data { + registrations: RevocableMutex, + resources: Revocable, + general: V, +} + +/// Safely creates a new reference-counted instance of [`Data`]. +#[doc(hidden)] +#[macro_export] +macro_rules! new_device_data { + ($reg:expr, $res:expr, $gen:expr, $name:literal) => {{ + static mut CLASS1: core::mem::MaybeUninit<$crate::bindings::lock_class_key> = + core::mem::MaybeUninit::uninit(); + static mut CLASS2: core::mem::MaybeUninit<$crate::bindings::lock_class_key> = + core::mem::MaybeUninit::uninit(); + let regs = $reg; + let res = $res; + let gen = $gen; + let name = $crate::c_str!($name); + // SAFETY: `CLASS1` and `CLASS2` are never used by Rust code directly; the C portion of the + // kernel may change them though. + unsafe { + $crate::device::Data::try_new( + regs, + res, + gen, + name, + CLASS1.as_mut_ptr(), + CLASS2.as_mut_ptr(), + ) + } + }}; +} + +impl Data { + /// Creates a new instance of `Data`. 
+ /// + /// It is recommended that the [`new_device_data`] macro be used as it automatically creates + /// the lock classes. + /// + /// # Safety + /// + /// `key1` and `key2` must point to valid memory locations and remain valid until `self` is + /// dropped. + pub unsafe fn try_new( + registrations: T, + resources: U, + general: V, + name: &'static CStr, + key1: *mut bindings::lock_class_key, + key2: *mut bindings::lock_class_key, + ) -> Result>> { + let mut ret = Pin::from(UniqueRef::try_new(Self { + // SAFETY: We call `RevocableMutex::init` below. + registrations: unsafe { RevocableMutex::new(registrations) }, + resources: Revocable::new(resources), + general, + })?); + + // SAFETY: `Data::registrations` is pinned when `Data` is. + let pinned = unsafe { ret.as_mut().map_unchecked_mut(|d| &mut d.registrations) }; + + // SAFETY: The safety requirements of this function satisfy those of `RevocableMutex::init`. + unsafe { pinned.init(name, key1, key2) }; + Ok(ret) + } + + /// Returns the resources if they're still available. + pub fn resources(&self) -> Option> { + self.resources.try_access() + } + + /// Returns the locked registrations if they're still available. + pub fn registrations(&self) -> Option> { + self.registrations.try_write() + } +} + +impl crate::driver::DeviceRemoval for Data { + fn device_remove(&self) { + // We revoke the registrations first so that resources are still available to them during + // unregistration. + self.registrations.revoke(); + + // Release resources now. General data remains available. + self.resources.revoke(); + } +} + +impl Deref for Data { + type Target = V; + + fn deref(&self) -> &V { + &self.general + } +} + +impl DerefMut for Data { + fn deref_mut(&mut self) -> &mut V { + &mut self.general + } +} + +#[doc(hidden)] +#[macro_export] +macro_rules! 
dev_printk { + ($method:ident, $dev:expr, $($f:tt)*) => { + { + // We have an explicit `use` statement here so that callers of this macro are not + // required to explicitly use the `RawDevice` trait to use its functions. + use $crate::device::RawDevice; + ($dev).$method(core::format_args!($($f)*)); + } + } +} + +/// Prints an emergency-level message (level 0) prefixed with device information. +/// +/// This level should be used if the system is unusable. +/// +/// Equivalent to the kernel's `dev_emerg` macro. +/// +/// Mimics the interface of [`std::print!`]. More information about the syntax is available from +/// [`core::fmt`] and [`alloc::format!`]. +/// +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// # use kernel::device::Device; +/// +/// fn example(dev: &Device) { +/// dev_emerg!(dev, "hello {}\n", "there"); +/// } +/// ``` +#[macro_export] +macro_rules! dev_emerg { + ($($f:tt)*) => { $crate::dev_printk!(pr_emerg, $($f)*); } +} + +/// Prints an alert-level message (level 1) prefixed with device information. +/// +/// This level should be used if action must be taken immediately. +/// +/// Equivalent to the kernel's `dev_alert` macro. +/// +/// Mimics the interface of [`std::print!`]. More information about the syntax is available from +/// [`core::fmt`] and [`alloc::format!`]. +/// +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// # use kernel::device::Device; +/// +/// fn example(dev: &Device) { +/// dev_alert!(dev, "hello {}\n", "there"); +/// } +/// ``` +#[macro_export] +macro_rules! dev_alert { + ($($f:tt)*) => { $crate::dev_printk!(pr_alert, $($f)*); } +} + +/// Prints a critical-level message (level 2) prefixed with device information. +/// +/// This level should be used in critical conditions. +/// +/// Equivalent to the kernel's `dev_crit` macro. +/// +/// Mimics the interface of [`std::print!`]. 
More information about the syntax is available from +/// [`core::fmt`] and [`alloc::format!`]. +/// +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// # use kernel::device::Device; +/// +/// fn example(dev: &Device) { +/// dev_crit!(dev, "hello {}\n", "there"); +/// } +/// ``` +#[macro_export] +macro_rules! dev_crit { + ($($f:tt)*) => { $crate::dev_printk!(pr_crit, $($f)*); } +} + +/// Prints an error-level message (level 3) prefixed with device information. +/// +/// This level should be used in error conditions. +/// +/// Equivalent to the kernel's `dev_err` macro. +/// +/// Mimics the interface of [`std::print!`]. More information about the syntax is available from +/// [`core::fmt`] and [`alloc::format!`]. +/// +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// # use kernel::device::Device; +/// +/// fn example(dev: &Device) { +/// dev_err!(dev, "hello {}\n", "there"); +/// } +/// ``` +#[macro_export] +macro_rules! dev_err { + ($($f:tt)*) => { $crate::dev_printk!(pr_err, $($f)*); } +} + +/// Prints a warning-level message (level 4) prefixed with device information. +/// +/// This level should be used in warning conditions. +/// +/// Equivalent to the kernel's `dev_warn` macro. +/// +/// Mimics the interface of [`std::print!`]. More information about the syntax is available from +/// [`core::fmt`] and [`alloc::format!`]. +/// +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// # use kernel::device::Device; +/// +/// fn example(dev: &Device) { +/// dev_warn!(dev, "hello {}\n", "there"); +/// } +/// ``` +#[macro_export] +macro_rules! dev_warn { + ($($f:tt)*) => { $crate::dev_printk!(pr_warn, $($f)*); } +} + +/// Prints a notice-level message (level 5) prefixed with device information. +/// +/// This level should be used in normal but significant conditions. 
+/// +/// Equivalent to the kernel's `dev_notice` macro. +/// +/// Mimics the interface of [`std::print!`]. More information about the syntax is available from +/// [`core::fmt`] and [`alloc::format!`]. +/// +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// # use kernel::device::Device; +/// +/// fn example(dev: &Device) { +/// dev_notice!(dev, "hello {}\n", "there"); +/// } +/// ``` +#[macro_export] +macro_rules! dev_notice { + ($($f:tt)*) => { $crate::dev_printk!(pr_notice, $($f)*); } +} + +/// Prints an info-level message (level 6) prefixed with device information. +/// +/// This level should be used for informational messages. +/// +/// Equivalent to the kernel's `dev_info` macro. +/// +/// Mimics the interface of [`std::print!`]. More information about the syntax is available from +/// [`core::fmt`] and [`alloc::format!`]. +/// +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// # use kernel::device::Device; +/// +/// fn example(dev: &Device) { +/// dev_info!(dev, "hello {}\n", "there"); +/// } +/// ``` +#[macro_export] +macro_rules! dev_info { + ($($f:tt)*) => { $crate::dev_printk!(pr_info, $($f)*); } +} + +/// Prints a debug-level message (level 7) prefixed with device information. +/// +/// This level should be used for debug messages. +/// +/// Equivalent to the kernel's `dev_dbg` macro, except that it doesn't support dynamic debug yet. +/// +/// Mimics the interface of [`std::print!`]. More information about the syntax is available from +/// [`core::fmt`] and [`alloc::format!`]. +/// +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// # use kernel::device::Device; +/// +/// fn example(dev: &Device) { +/// dev_dbg!(dev, "hello {}\n", "there"); +/// } +/// ``` +#[macro_export] +macro_rules! 
dev_dbg { + ($($f:tt)*) => { $crate::dev_printk!(pr_dbg, $($f)*); } +} diff --git a/rust/kernel/driver.rs b/rust/kernel/driver.rs new file mode 100644 index 00000000000000..0ae9f4d3dbc53d --- /dev/null +++ b/rust/kernel/driver.rs @@ -0,0 +1,442 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Generic support for drivers of different buses (e.g., PCI, Platform, Amba, etc.). +//! +//! Each bus/subsystem is expected to implement [`DriverOps`], which allows drivers to register +//! using the [`Registration`] class. + +use crate::{error::code::*, str::CStr, sync::Ref, Result, ThisModule}; +use alloc::boxed::Box; +use core::{cell::UnsafeCell, marker::PhantomData, ops::Deref, pin::Pin}; + +/// A subsystem (e.g., PCI, Platform, Amba, etc.) that allows drivers to be written for it. +pub trait DriverOps { + /// The type that holds information about the registration. This is typically a struct defined + /// by the C portion of the kernel. + type RegType: Default; + + /// Registers a driver. + /// + /// # Safety + /// + /// `reg` must point to valid, initialised, and writable memory. It may be modified by this + /// function to hold registration state. + /// + /// On success, `reg` must remain pinned and valid until the matching call to + /// [`DriverOps::unregister`]. + unsafe fn register( + reg: *mut Self::RegType, + name: &'static CStr, + module: &'static ThisModule, + ) -> Result; + + /// Unregisters a driver previously registered with [`DriverOps::register`]. + /// + /// # Safety + /// + /// `reg` must point to valid writable memory, initialised by a previous successful call to + /// [`DriverOps::register`]. + unsafe fn unregister(reg: *mut Self::RegType); +} + +/// The registration of a driver. +pub struct Registration { + is_registered: bool, + concrete_reg: UnsafeCell, +} + +// SAFETY: `Registration` has no fields or methods accessible via `&Registration`, so it is safe to +// share references to it with multiple threads as nothing can be done. 
+unsafe impl Sync for Registration {} + +impl Registration { + /// Creates a new instance of the registration object. + pub fn new() -> Self { + Self { + is_registered: false, + concrete_reg: UnsafeCell::new(T::RegType::default()), + } + } + + /// Allocates a pinned registration object and registers it. + /// + /// Returns a pinned heap-allocated representation of the registration. + pub fn new_pinned(name: &'static CStr, module: &'static ThisModule) -> Result>> { + let mut reg = Pin::from(Box::try_new(Self::new())?); + reg.as_mut().register(name, module)?; + Ok(reg) + } + + /// Registers a driver with its subsystem. + /// + /// It must be pinned because the memory block that represents the registration is potentially + /// self-referential. + pub fn register( + self: Pin<&mut Self>, + name: &'static CStr, + module: &'static ThisModule, + ) -> Result { + // SAFETY: We never move out of `this`. + let this = unsafe { self.get_unchecked_mut() }; + if this.is_registered { + // Already registered. + return Err(EINVAL); + } + + // SAFETY: `concrete_reg` was initialised via its default constructor. It is only freed + // after `Self::drop` is called, which first calls `T::unregister`. + unsafe { T::register(this.concrete_reg.get(), name, module) }?; + + this.is_registered = true; + Ok(()) + } +} + +impl Default for Registration { + fn default() -> Self { + Self::new() + } +} + +impl Drop for Registration { + fn drop(&mut self) { + if self.is_registered { + // SAFETY: This path only runs if a previous call to `T::register` completed + // successfully. + unsafe { T::unregister(self.concrete_reg.get()) }; + } + } +} + +/// Conversion from a device id to a raw device id. +/// +/// This is meant to be implemented by buses/subsystems so that they can use [`IdTable`] to +/// guarantee (at compile-time) zero-termination of device id tables provided by drivers. 
+/// +/// # Safety +/// +/// Implementers must ensure that: +/// - [`RawDeviceId::ZERO`] is actually a zeroed-out version of the raw device id. +/// - [`RawDeviceId::to_rawid`] stores `offset` in the context/data field of the raw device id so +/// that buses can recover the pointer to the data. +pub unsafe trait RawDeviceId { + /// The raw type that holds the device id. + /// + /// Id tables created from [`Self`] are going to hold this type in its zero-terminated array. + type RawType: Copy; + + /// A zeroed-out representation of the raw device id. + /// + /// Id tables created from [`Self`] use [`Self::ZERO`] as the sentinel to indicate the end of + /// the table. + const ZERO: Self::RawType; + + /// Converts an id into a raw id. + /// + /// `offset` is the offset from the memory location where the raw device id is stored to the + /// location where its associated context information is stored. Implementations must store + /// this in the appropriate context/data field of the raw type. + fn to_rawid(&self, offset: isize) -> Self::RawType; +} + +/// A zero-terminated device id array, followed by context data. +#[repr(C)] +pub struct IdArray { + ids: [T::RawType; N], + sentinel: T::RawType, + id_infos: [Option; N], +} + +impl IdArray { + /// Creates a new instance of the array. + /// + /// The contents are derived from the given identifiers and context information. + pub const fn new(ids: [T; N], infos: [Option; N]) -> Self + where + T: ~const RawDeviceId + Copy, + { + let mut array = Self { + ids: [T::ZERO; N], + sentinel: T::ZERO, + id_infos: infos, + }; + let mut i = 0usize; + while i < N { + // SAFETY: Both pointers are within `array` (or one byte beyond), consequently they are + // derived from the same allocated object. We are using a `u8` pointer, whose size is 1, 
+ let offset = unsafe { + (&array.id_infos[i] as *const _ as *const u8) + .offset_from(&array.ids[i] as *const _ as _) + }; + array.ids[i] = ids[i].to_rawid(offset); + i += 1; + } + array + } + + /// Returns an `IdTable` backed by `self`. + /// + /// This is used to essentially erase the array size. + pub const fn as_table(&self) -> IdTable<'_, T, U> { + IdTable { + first: &self.ids[0], + _p: PhantomData, + } + } +} + +/// A device id table. +/// +/// The table is guaranteed to be zero-terminated and to be followed by an array of context data of +/// type `Option`. +pub struct IdTable<'a, T: RawDeviceId, U> { + first: &'a T::RawType, + _p: PhantomData<&'a U>, +} + +impl const AsRef for IdTable<'_, T, U> { + fn as_ref(&self) -> &T::RawType { + self.first + } +} + +/// Counts the number of parenthesis-delimited, comma-separated items. +/// +/// # Examples +/// +/// ``` +/// # use kernel::count_paren_items; +/// +/// assert_eq!(0, count_paren_items!()); +/// assert_eq!(1, count_paren_items!((A))); +/// assert_eq!(1, count_paren_items!((A),)); +/// assert_eq!(2, count_paren_items!((A), (B))); +/// assert_eq!(2, count_paren_items!((A), (B),)); +/// assert_eq!(3, count_paren_items!((A), (B), (C))); +/// assert_eq!(3, count_paren_items!((A), (B), (C),)); +/// ``` +#[macro_export] +macro_rules! count_paren_items { + (($($item:tt)*), $($remaining:tt)*) => { 1 + $crate::count_paren_items!($($remaining)*) }; + (($($item:tt)*)) => { 1 }; + () => { 0 }; +} + +/// Converts a comma-separated list of pairs into an array with the first element. That is, it +/// discards the second element of the pair. +/// +/// Additionally, it automatically introduces a type if the first element is warpped in curly +/// braces, for example, if it's `{v: 10}`, it becomes `X { v: 10 }`; this is to avoid repeating +/// the type. 
+/// +/// # Examples +/// +/// ``` +/// # use kernel::first_item; +/// +/// #[derive(PartialEq, Debug)] +/// struct X { +/// v: u32, +/// } +/// +/// assert_eq!([] as [X; 0], first_item!(X, )); +/// assert_eq!([X { v: 10 }], first_item!(X, ({ v: 10 }, Y))); +/// assert_eq!([X { v: 10 }], first_item!(X, ({ v: 10 }, Y),)); +/// assert_eq!([X { v: 10 }], first_item!(X, (X { v: 10 }, Y))); +/// assert_eq!([X { v: 10 }], first_item!(X, (X { v: 10 }, Y),)); +/// assert_eq!([X { v: 10 }, X { v: 20 }], first_item!(X, ({ v: 10 }, Y), ({ v: 20 }, Y))); +/// assert_eq!([X { v: 10 }, X { v: 20 }], first_item!(X, ({ v: 10 }, Y), ({ v: 20 }, Y),)); +/// assert_eq!([X { v: 10 }, X { v: 20 }], first_item!(X, (X { v: 10 }, Y), (X { v: 20 }, Y))); +/// assert_eq!([X { v: 10 }, X { v: 20 }], first_item!(X, (X { v: 10 }, Y), (X { v: 20 }, Y),)); +/// assert_eq!([X { v: 10 }, X { v: 20 }, X { v: 30 }], +/// first_item!(X, ({ v: 10 }, Y), ({ v: 20 }, Y), ({v: 30}, Y))); +/// assert_eq!([X { v: 10 }, X { v: 20 }, X { v: 30 }], +/// first_item!(X, ({ v: 10 }, Y), ({ v: 20 }, Y), ({v: 30}, Y),)); +/// assert_eq!([X { v: 10 }, X { v: 20 }, X { v: 30 }], +/// first_item!(X, (X { v: 10 }, Y), (X { v: 20 }, Y), (X {v: 30}, Y))); +/// assert_eq!([X { v: 10 }, X { v: 20 }, X { v: 30 }], +/// first_item!(X, (X { v: 10 }, Y), (X { v: 20 }, Y), (X {v: 30}, Y),)); +/// ``` +#[macro_export] +macro_rules! first_item { + ($id_type:ty, $(({$($first:tt)*}, $second:expr)),* $(,)?) => { + { + type IdType = $id_type; + [$(IdType{$($first)*},)*] + } + }; + ($id_type:ty, $(($first:expr, $second:expr)),* $(,)?) => { [$($first,)*] }; +} + +/// Converts a comma-separated list of pairs into an array with the second element. That is, it +/// discards the first element of the pair. 
+/// +/// # Examples +/// +/// ``` +/// # use kernel::second_item; +/// +/// assert_eq!([] as [u32; 0], second_item!()); +/// assert_eq!([10u32], second_item!((X, 10u32))); +/// assert_eq!([10u32], second_item!((X, 10u32),)); +/// assert_eq!([10u32], second_item!(({X}, 10u32))); +/// assert_eq!([10u32], second_item!(({X}, 10u32),)); +/// assert_eq!([10u32, 20], second_item!((X, 10u32), (X, 20))); +/// assert_eq!([10u32, 20], second_item!((X, 10u32), (X, 20),)); +/// assert_eq!([10u32, 20], second_item!(({X}, 10u32), ({X}, 20))); +/// assert_eq!([10u32, 20], second_item!(({X}, 10u32), ({X}, 20),)); +/// assert_eq!([10u32, 20, 30], second_item!((X, 10u32), (X, 20), (X, 30))); +/// assert_eq!([10u32, 20, 30], second_item!((X, 10u32), (X, 20), (X, 30),)); +/// assert_eq!([10u32, 20, 30], second_item!(({X}, 10u32), ({X}, 20), ({X}, 30))); +/// assert_eq!([10u32, 20, 30], second_item!(({X}, 10u32), ({X}, 20), ({X}, 30),)); +/// ``` +#[macro_export] +macro_rules! second_item { + ($(({$($first:tt)*}, $second:expr)),* $(,)?) => { [$($second,)*] }; + ($(($first:expr, $second:expr)),* $(,)?) => { [$($second,)*] }; +} + +/// Defines a new constant [`IdArray`] with a concise syntax. +/// +/// It is meant to be used by buses and subsystems to create a similar macro with their device id +/// type already specified, i.e., with fewer parameters to the end user. +/// +/// # Examples +/// +// TODO: Exported but not usable by kernel modules (requires `const_trait_impl`). +/// ```ignore +/// #![feature(const_trait_impl)] +/// # use kernel::{define_id_array, driver::RawDeviceId}; +/// +/// #[derive(Copy, Clone)] +/// struct Id(u32); +/// +/// // SAFETY: `ZERO` is all zeroes and `to_rawid` stores `offset` as the second element of the raw +/// // device id pair. 
+/// unsafe impl const RawDeviceId for Id { +/// type RawType = (u64, isize); +/// const ZERO: Self::RawType = (0, 0); +/// fn to_rawid(&self, offset: isize) -> Self::RawType { +/// (self.0 as u64 + 1, offset) +/// } +/// } +/// +/// define_id_array!(A1, Id, (), []); +/// define_id_array!(A2, Id, &'static [u8], [(Id(10), None)]); +/// define_id_array!(A3, Id, &'static [u8], [(Id(10), Some(b"id1")), ]); +/// define_id_array!(A4, Id, &'static [u8], [(Id(10), Some(b"id1")), (Id(20), Some(b"id2"))]); +/// define_id_array!(A5, Id, &'static [u8], [(Id(10), Some(b"id1")), (Id(20), Some(b"id2")), ]); +/// define_id_array!(A6, Id, &'static [u8], [(Id(10), None), (Id(20), Some(b"id2")), ]); +/// define_id_array!(A7, Id, &'static [u8], [(Id(10), Some(b"id1")), (Id(20), None), ]); +/// define_id_array!(A8, Id, &'static [u8], [(Id(10), None), (Id(20), None), ]); +/// ``` +#[macro_export] +macro_rules! define_id_array { + ($table_name:ident, $id_type:ty, $data_type:ty, [ $($t:tt)* ]) => { + const $table_name: + $crate::driver::IdArray<$id_type, $data_type, { $crate::count_paren_items!($($t)*) }> = + $crate::driver::IdArray::new( + $crate::first_item!($id_type, $($t)*), $crate::second_item!($($t)*)); + }; +} + +/// Defines a new constant [`IdTable`] with a concise syntax. +/// +/// It is meant to be used by buses and subsystems to create a similar macro with their device id +/// type already specified, i.e., with fewer parameters to the end user. +/// +/// # Examples +/// +// TODO: Exported but not usable by kernel modules (requires `const_trait_impl`). +/// ```ignore +/// #![feature(const_trait_impl)] +/// # use kernel::{define_id_table, driver::RawDeviceId}; +/// +/// #[derive(Copy, Clone)] +/// struct Id(u32); +/// +/// // SAFETY: `ZERO` is all zeroes and `to_rawid` stores `offset` as the second element of the raw +/// // device id pair. 
+/// unsafe impl const RawDeviceId for Id { +/// type RawType = (u64, isize); +/// const ZERO: Self::RawType = (0, 0); +/// fn to_rawid(&self, offset: isize) -> Self::RawType { +/// (self.0 as u64 + 1, offset) +/// } +/// } +/// +/// define_id_table!(T1, Id, &'static [u8], [(Id(10), None)]); +/// define_id_table!(T2, Id, &'static [u8], [(Id(10), Some(b"id1")), ]); +/// define_id_table!(T3, Id, &'static [u8], [(Id(10), Some(b"id1")), (Id(20), Some(b"id2"))]); +/// define_id_table!(T4, Id, &'static [u8], [(Id(10), Some(b"id1")), (Id(20), Some(b"id2")), ]); +/// define_id_table!(T5, Id, &'static [u8], [(Id(10), None), (Id(20), Some(b"id2")), ]); +/// define_id_table!(T6, Id, &'static [u8], [(Id(10), Some(b"id1")), (Id(20), None), ]); +/// define_id_table!(T7, Id, &'static [u8], [(Id(10), None), (Id(20), None), ]); +/// ``` +#[macro_export] +macro_rules! define_id_table { + ($table_name:ident, $id_type:ty, $data_type:ty, [ $($t:tt)* ]) => { + const $table_name: Option<$crate::driver::IdTable<'static, $id_type, $data_type>> = { + $crate::define_id_array!(ARRAY, $id_type, $data_type, [ $($t)* ]); + Some(ARRAY.as_table()) + }; + }; +} + +/// Custom code within device removal. +pub trait DeviceRemoval { + /// Cleans resources up when the device is removed. + /// + /// This is called when a device is removed and offers implementers the chance to run some code + /// that cleans state up. + fn device_remove(&self); +} + +impl DeviceRemoval for () { + fn device_remove(&self) {} +} + +impl DeviceRemoval for Ref { + fn device_remove(&self) { + self.deref().device_remove(); + } +} + +impl DeviceRemoval for Box { + fn device_remove(&self) { + self.deref().device_remove(); + } +} + +/// A kernel module that only registers the given driver on init. +/// +/// This is a helper struct to make it easier to define single-functionality modules, in this case, +/// modules that offer a single driver. 
+pub struct Module { + _driver: Pin>>, +} + +impl crate::Module for Module { + fn init(name: &'static CStr, module: &'static ThisModule) -> Result { + Ok(Self { + _driver: Registration::new_pinned(name, module)?, + }) + } +} + +/// Declares a kernel module that exposes a single driver. +/// +/// It is meant to be used as a helper by other subsystems so they can more easily expose their own +/// macros. +#[macro_export] +macro_rules! module_driver { + (<$gen_type:ident>, $driver_ops:ty, { type: $type:ty, $($f:tt)* }) => { + type Ops<$gen_type> = $driver_ops; + type ModuleType = $crate::driver::Module>; + $crate::prelude::module! { + type: ModuleType, + $($f)* + } + } +} diff --git a/rust/kernel/error.rs b/rust/kernel/error.rs new file mode 100644 index 00000000000000..55029cf09a9172 --- /dev/null +++ b/rust/kernel/error.rs @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Kernel errors. +//! +//! C header: [`include/uapi/asm-generic/errno-base.h`](../../../include/uapi/asm-generic/errno-base.h) + +use crate::str::CStr; +use crate::{bindings, c_types}; +use alloc::{ + alloc::{AllocError, LayoutError}, + collections::TryReserveError, +}; +use core::convert::From; +use core::fmt; +use core::num::TryFromIntError; +use core::str::{self, Utf8Error}; + +/// Contains the C-compatible error codes. +pub mod code { + macro_rules! declare_err { + ($err:tt $(,)? 
$($doc:expr),+) => { + $( + #[doc = $doc] + )* + pub const $err: super::Error = super::Error(-(crate::bindings::$err as i32)); + }; + } + + declare_err!(EPERM, "Operation not permitted."); + + declare_err!(ENOENT, "No such file or directory."); + + declare_err!(ESRCH, "No such process."); + + declare_err!(EINTR, "Interrupted system call."); + + declare_err!(EIO, "I/O error."); + + declare_err!(ENXIO, "No such device or address."); + + declare_err!(E2BIG, "Argument list too long."); + + declare_err!(ENOEXEC, "Exec format error."); + + declare_err!(EBADF, "Bad file number."); + + declare_err!(ECHILD, "No child processes."); + + declare_err!(EAGAIN, "Try again."); + + declare_err!(ENOMEM, "Out of memory."); + + declare_err!(EACCES, "Permission denied."); + + declare_err!(EFAULT, "Bad address."); + + declare_err!(ENOTBLK, "Block device required."); + + declare_err!(EBUSY, "Device or resource busy."); + + declare_err!(EEXIST, "File exists."); + + declare_err!(EXDEV, "Cross-device link."); + + declare_err!(ENODEV, "No such device."); + + declare_err!(ENOTDIR, "Not a directory."); + + declare_err!(EISDIR, "Is a directory."); + + declare_err!(EINVAL, "Invalid argument."); + + declare_err!(ENFILE, "File table overflow."); + + declare_err!(EMFILE, "Too many open files."); + + declare_err!(ENOTTY, "Not a typewriter."); + + declare_err!(ETXTBSY, "Text file busy."); + + declare_err!(EFBIG, "File too large."); + + declare_err!(ENOSPC, "No space left on device."); + + declare_err!(ESPIPE, "Illegal seek."); + + declare_err!(EROFS, "Read-only file system."); + + declare_err!(EMLINK, "Too many links."); + + declare_err!(EPIPE, "Broken pipe."); + + declare_err!(EDOM, "Math argument out of domain of func."); + + declare_err!(ERANGE, "Math result not representable."); + + declare_err!(EDEADLK, "Resource deadlock would occur"); + + declare_err!(ENAMETOOLONG, "File name too long"); + + declare_err!(ENOLCK, "No record locks available"); + + declare_err!( + ENOSYS, + "Invalid system call 
number.", + "", + "This error code is special: arch syscall entry code will return", + "[`ENOSYS`] if users try to call a syscall that doesn't exist.", + "To keep failures of syscalls that really do exist distinguishable from", + "failures due to attempts to use a nonexistent syscall, syscall", + "implementations should refrain from returning [`ENOSYS`]." + ); + + declare_err!(ENOTEMPTY, "Directory not empty."); + + declare_err!(ELOOP, "Too many symbolic links encountered."); + + declare_err!(EWOULDBLOCK, "Operation would block."); + + declare_err!(ENOMSG, "No message of desired type."); + + declare_err!(EIDRM, "Identifier removed."); + + declare_err!(ECHRNG, "Channel number out of range."); + + declare_err!(EL2NSYNC, "Level 2 not synchronized."); + + declare_err!(EL3HLT, "Level 3 halted."); + + declare_err!(EL3RST, "Level 3 reset."); + + declare_err!(ELNRNG, "Link number out of range."); + + declare_err!(EUNATCH, "Protocol driver not attached."); + + declare_err!(ENOCSI, "No CSI structure available."); + + declare_err!(EL2HLT, "Level 2 halted."); + + declare_err!(EBADE, "Invalid exchange."); + + declare_err!(EBADR, "Invalid request descriptor."); + + declare_err!(EXFULL, "Exchange full."); + + declare_err!(ENOANO, "No anode."); + + declare_err!(EBADRQC, "Invalid request code."); + + declare_err!(EBADSLT, "Invalid slot."); + + declare_err!(EDEADLOCK, "Resource deadlock would occur."); + + declare_err!(EBFONT, "Bad font file format."); + + declare_err!(ENOSTR, "Device not a stream."); + + declare_err!(ENODATA, "No data available."); + + declare_err!(ETIME, "Timer expired."); + + declare_err!(ENOSR, "Out of streams resources."); + + declare_err!(ENONET, "Machine is not on the network."); + + declare_err!(ENOPKG, "Package not installed."); + + declare_err!(EREMOTE, "Object is remote."); + + declare_err!(ENOLINK, "Link has been severed."); + + declare_err!(EADV, "Advertise error."); + + declare_err!(ESRMNT, "Srmount error."); + + declare_err!(ECOMM, "Communication 
error on send."); + + declare_err!(EPROTO, "Protocol error."); + + declare_err!(EMULTIHOP, "Multihop attempted."); + + declare_err!(EDOTDOT, "RFS specific error."); + + declare_err!(EBADMSG, "Not a data message."); + + declare_err!(EOVERFLOW, "Value too large for defined data type."); + + declare_err!(ENOTUNIQ, "Name not unique on network."); + + declare_err!(EBADFD, "File descriptor in bad state."); + + declare_err!(EREMCHG, "Remote address changed."); + + declare_err!(ELIBACC, "Can not access a needed shared library."); + + declare_err!(ELIBBAD, "Accessing a corrupted shared library."); + + declare_err!(ELIBSCN, ".lib section in a.out corrupted."); + + declare_err!(ELIBMAX, "Attempting to link in too many shared libraries."); + + declare_err!(ELIBEXEC, "Cannot exec a shared library directly."); + + declare_err!(EILSEQ, "Illegal byte sequence."); + + declare_err!(ERESTART, "Interrupted system call should be restarted."); + + declare_err!(ESTRPIPE, "Streams pipe error."); + + declare_err!(EUSERS, "Too many users."); + + declare_err!(ENOTSOCK, "Socket operation on non-socket."); + + declare_err!(EDESTADDRREQ, "Destination address required."); + + declare_err!(EMSGSIZE, "Message too long."); + + declare_err!(EPROTOTYPE, "Protocol wrong type for socket."); + + declare_err!(ENOPROTOOPT, "Protocol not available."); + + declare_err!(EPROTONOSUPPORT, "Protocol not supported."); + + declare_err!(ESOCKTNOSUPPORT, "Socket type not supported."); + + declare_err!(EOPNOTSUPP, "Operation not supported on transport endpoint."); + + declare_err!(EPFNOSUPPORT, "Protocol family not supported."); + + declare_err!(EAFNOSUPPORT, "Address family not supported by protocol."); + + declare_err!(EADDRINUSE, "Address already in use."); + + declare_err!(EADDRNOTAVAIL, "Cannot assign requested address."); + + declare_err!(ENETDOWN, "Network is down."); + + declare_err!(ENETUNREACH, "Network is unreachable."); + + declare_err!(ENETRESET, "Network dropped connection because of reset."); + + 
declare_err!(ECONNABORTED, "Software caused connection abort."); + + declare_err!(ECONNRESET, "Connection reset by peer."); + + declare_err!(ENOBUFS, "No buffer space available."); + + declare_err!(EISCONN, "Transport endpoint is already connected."); + + declare_err!(ENOTCONN, "Transport endpoint is not connected."); + + declare_err!(ESHUTDOWN, "Cannot send after transport endpoint shutdown."); + + declare_err!(ETOOMANYREFS, "Too many references: cannot splice."); + + declare_err!(ETIMEDOUT, "Connection timed out."); + + declare_err!(ECONNREFUSED, "Connection refused."); + + declare_err!(EHOSTDOWN, "Host is down."); + + declare_err!(EHOSTUNREACH, "No route to host."); + + declare_err!(EALREADY, "Operation already in progress."); + + declare_err!(EINPROGRESS, "Operation now in progress."); + + declare_err!(ESTALE, "Stale file handle."); + + declare_err!(EUCLEAN, "Structure needs cleaning."); + + declare_err!(ENOTNAM, "Not a XENIX named type file."); + + declare_err!(ENAVAIL, "No XENIX semaphores available."); + + declare_err!(EISNAM, "Is a named type file."); + + declare_err!(EREMOTEIO, "Remote I/O error."); + + declare_err!(EDQUOT, "Quota exceeded."); + + declare_err!(ENOMEDIUM, "No medium found."); + + declare_err!(EMEDIUMTYPE, "Wrong medium type."); + + declare_err!(ECANCELED, "Operation Canceled."); + + declare_err!(ENOKEY, "Required key not available."); + + declare_err!(EKEYEXPIRED, "Key has expired."); + + declare_err!(EKEYREVOKED, "Key has been revoked."); + + declare_err!(EKEYREJECTED, "Key was rejected by service."); + + declare_err!(EOWNERDEAD, "Owner died.", "", "For robust mutexes."); + + declare_err!(ENOTRECOVERABLE, "State not recoverable."); + + declare_err!(ERFKILL, "Operation not possible due to RF-kill."); + + declare_err!(EHWPOISON, "Memory page has hardware error."); + + declare_err!(ERESTARTSYS, "Restart the system call."); + + declare_err!(ENOTSUPP, "Operation is not supported."); +} + +/// Generic integer kernel error. 
+/// +/// The kernel defines a set of integer generic error codes based on C and +/// POSIX ones. These codes may have a more specific meaning in some contexts. +/// +/// # Invariants +/// +/// The value is a valid `errno` (i.e. `>= -MAX_ERRNO && < 0`). +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Error(c_types::c_int); + +impl Error { + /// Creates an [`Error`] from a kernel error code. + /// + /// It is a bug to pass an out-of-range `errno`. `EINVAL` would + /// be returned in such a case. + pub(crate) fn from_kernel_errno(errno: c_types::c_int) -> Error { + if errno < -(bindings::MAX_ERRNO as i32) || errno >= 0 { + // TODO: Make it a `WARN_ONCE` once available. + crate::pr_warn!( + "attempted to create `Error` with out of range `errno`: {}", + errno + ); + return code::EINVAL; + } + + // INVARIANT: The check above ensures the type invariant + // will hold. + Error(errno) + } + + /// Creates an [`Error`] from a kernel error code. + /// + /// # Safety + /// + /// `errno` must be within error code range (i.e. `>= -MAX_ERRNO && < 0`). + pub(crate) unsafe fn from_kernel_errno_unchecked(errno: c_types::c_int) -> Error { + // INVARIANT: The contract ensures the type invariant + // will hold. + Error(errno) + } + + /// Returns the kernel error code. + pub fn to_kernel_errno(self) -> c_types::c_int { + self.0 + } + + /// Returns a string representing the error, if one exists. + #[cfg(not(testlib))] + pub fn name(&self) -> Option<&'static CStr> { + // SAFETY: Just an FFI call, there are no extra safety requirements. + let ptr = unsafe { bindings::errname(-self.0) }; + if ptr.is_null() { + None + } else { + // SAFETY: The string returned by `errname` is static and `NUL`-terminated. + Some(unsafe { CStr::from_char_ptr(ptr) }) + } + } + + /// Returns a string representing the error, if one exists. 
+ /// + /// When `testlib` is configured, this always returns `None` to avoid the dependency on a + /// kernel function so that tests that use this (e.g., by calling [`Result::unwrap`]) can still + /// run in userspace. + #[cfg(testlib)] + pub fn name(&self) -> Option<&'static CStr> { + None + } +} + +impl fmt::Debug for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.name() { + // Print out number if no name can be found. + None => f.debug_tuple("Error").field(&-self.0).finish(), + // SAFETY: These strings are ASCII-only. + Some(name) => f + .debug_tuple(unsafe { str::from_utf8_unchecked(name) }) + .finish(), + } + } +} + +impl From for Error { + fn from(_: TryFromIntError) -> Error { + code::EINVAL + } +} + +impl From for Error { + fn from(_: Utf8Error) -> Error { + code::EINVAL + } +} + +impl From for Error { + fn from(_: TryReserveError) -> Error { + code::ENOMEM + } +} + +impl From for Error { + fn from(_: LayoutError) -> Error { + code::ENOMEM + } +} + +impl From for Error { + fn from(_: core::fmt::Error) -> Error { + code::EINVAL + } +} + +impl From for Error { + fn from(e: core::convert::Infallible) -> Error { + match e {} + } +} + +/// A [`Result`] with an [`Error`] error type. +/// +/// To be used as the return type for functions that may fail. +/// +/// # Error codes in C and Rust +/// +/// In C, it is common that functions indicate success or failure through +/// their return value; modifying or returning extra data through non-`const` +/// pointer parameters. In particular, in the kernel, functions that may fail +/// typically return an `int` that represents a generic error code. We model +/// those as [`Error`]. +/// +/// In Rust, it is idiomatic to model functions that may fail as returning +/// a [`Result`]. Since in the kernel many functions return an error code, +/// [`Result`] is a type alias for a [`core::result::Result`] that uses +/// [`Error`] as its error type. 
+/// +/// Note that even if a function does not return anything when it succeeds, +/// it should still be modeled as returning a `Result` rather than +/// just an [`Error`]. +pub type Result = core::result::Result; + +impl From for Error { + fn from(_: AllocError) -> Error { + code::ENOMEM + } +} + +// # Invariant: `-bindings::MAX_ERRNO` fits in an `i16`. +crate::static_assert!(bindings::MAX_ERRNO <= -(i16::MIN as i32) as u32); + +pub(crate) fn from_kernel_result_helper(r: Result) -> T +where + T: From, +{ + match r { + Ok(v) => v, + // NO-OVERFLOW: negative `errno`s are no smaller than `-bindings::MAX_ERRNO`, + // `-bindings::MAX_ERRNO` fits in an `i16` as per invariant above, + // therefore a negative `errno` always fits in an `i16` and will not overflow. + Err(e) => T::from(e.to_kernel_errno() as i16), + } +} + +/// Transforms a [`crate::error::Result`] to a kernel C integer result. +/// +/// This is useful when calling Rust functions that return [`crate::error::Result`] +/// from inside `extern "C"` functions that need to return an integer +/// error result. +/// +/// `T` should be convertible from an `i16` via `From`. +/// +/// # Examples +/// +/// ```ignore +/// # use kernel::from_kernel_result; +/// # use kernel::c_types; +/// # use kernel::bindings; +/// unsafe extern "C" fn probe_callback( +/// pdev: *mut bindings::platform_device, +/// ) -> c_types::c_int { +/// from_kernel_result! { +/// let ptr = devm_alloc(pdev)?; +/// bindings::platform_set_drvdata(pdev, ptr); +/// Ok(0) +/// } +/// } +/// ``` +macro_rules! from_kernel_result { + ($($tt:tt)*) => {{ + $crate::error::from_kernel_result_helper((|| { + $($tt)* + })()) + }}; +} + +pub(crate) use from_kernel_result; + +/// Transform a kernel "error pointer" to a normal pointer. +/// +/// Some kernel C API functions return an "error pointer" which optionally +/// embeds an `errno`. Callers are supposed to check the returned pointer +/// for errors. 
This function performs the check and converts the "error pointer" +/// to a normal pointer in an idiomatic fashion. +/// +/// # Examples +/// +/// ```ignore +/// # use kernel::from_kernel_err_ptr; +/// # use kernel::c_types; +/// # use kernel::bindings; +/// fn devm_platform_ioremap_resource( +/// pdev: &mut PlatformDevice, +/// index: u32, +/// ) -> Result<*mut c_types::c_void> { +/// // SAFETY: FFI call. +/// unsafe { +/// from_kernel_err_ptr(bindings::devm_platform_ioremap_resource( +/// pdev.to_ptr(), +/// index, +/// )) +/// } +/// } +/// ``` +// TODO: Remove `dead_code` marker once an in-kernel client is available. +#[allow(dead_code)] +pub(crate) fn from_kernel_err_ptr(ptr: *mut T) -> Result<*mut T> { + // CAST: Casting a pointer to `*const c_types::c_void` is always valid. + let const_ptr: *const c_types::c_void = ptr.cast(); + // SAFETY: The FFI function does not deref the pointer. + if unsafe { bindings::IS_ERR(const_ptr) } { + // SAFETY: The FFI function does not deref the pointer. + let err = unsafe { bindings::PTR_ERR(const_ptr) }; + // CAST: If `IS_ERR()` returns `true`, + // then `PTR_ERR()` is guaranteed to return a + // negative value greater-or-equal to `-bindings::MAX_ERRNO`, + // which always fits in an `i16`, as per the invariant above. + // And an `i16` always fits in an `i32`. So casting `err` to + // an `i32` can never overflow, and is always valid. + // + // SAFETY: `IS_ERR()` ensures `err` is a + // negative value greater-or-equal to `-bindings::MAX_ERRNO`. + return Err(unsafe { Error::from_kernel_errno_unchecked(err as i32) }); + } + Ok(ptr) +} + +/// Calls a kernel function that returns an integer error code on failure and converts the result +/// to a [`Result`]. 
+pub fn to_result(func: impl FnOnce() -> c_types::c_int) -> Result { + let err = func(); + if err < 0 { + Err(Error::from_kernel_errno(err)) + } else { + Ok(()) + } +} diff --git a/rust/kernel/file.rs b/rust/kernel/file.rs new file mode 100644 index 00000000000000..e1b3b324bb3db0 --- /dev/null +++ b/rust/kernel/file.rs @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Files and file descriptors. +//! +//! C headers: [`include/linux/fs.h`](../../../../include/linux/fs.h) and +//! [`include/linux/file.h`](../../../../include/linux/file.h) + +use crate::{ + bindings, c_types, + cred::Credential, + error::{code::*, from_kernel_result, Error, Result}, + io_buffer::{IoBufferReader, IoBufferWriter}, + iov_iter::IovIter, + mm, + sync::CondVar, + types::PointerWrapper, + user_ptr::{UserSlicePtr, UserSlicePtrReader, UserSlicePtrWriter}, + ARef, AlwaysRefCounted, +}; +use core::convert::{TryFrom, TryInto}; +use core::{cell::UnsafeCell, marker, mem, ptr}; + +/// Wraps the kernel's `struct file`. +/// +/// # Invariants +/// +/// Instances of this type are always ref-counted, that is, a call to `get_file` ensures that the +/// allocation remains valid at least until the matching call to `fput`. +#[repr(transparent)] +pub struct File(pub(crate) UnsafeCell); + +// TODO: Accessing fields of `struct file` through the pointer is UB because other threads may be +// writing to them. However, this is how the C code currently operates: naked reads and writes to +// fields. Even if we used relaxed atomics on the Rust side, we can't force this on the C side. +impl File { + /// Constructs a new [`struct file`] wrapper from a file descriptor. + /// + /// The file descriptor belongs to the current process. + pub fn from_fd(fd: u32) -> Result> { + // SAFETY: FFI call, there are no requirements on `fd`. + let ptr = ptr::NonNull::new(unsafe { bindings::fget(fd) }).ok_or(EBADF)?; + + // SAFETY: `fget` increments the refcount before returning. 
+ Ok(unsafe { ARef::from_raw(ptr.cast()) }) + } + + /// Creates a reference to a [`File`] from a valid pointer. + /// + /// # Safety + /// + /// The caller must ensure that `ptr` is valid and remains valid for the lifetime of the + /// returned [`File`] instance. + pub(crate) unsafe fn from_ptr<'a>(ptr: *const bindings::file) -> &'a File { + // SAFETY: The safety requirements guarantee the validity of the dereference, while the + // `File` type being transparent makes the cast ok. + unsafe { &*ptr.cast() } + } + + /// Returns the current seek/cursor/pointer position (`struct file::f_pos`). + pub fn pos(&self) -> u64 { + // SAFETY: The file is valid because the shared reference guarantees a nonzero refcount. + unsafe { core::ptr::addr_of!((*self.0.get()).f_pos).read() as _ } + } + + /// Returns whether the file is in blocking mode. + pub fn is_blocking(&self) -> bool { + self.flags() & bindings::O_NONBLOCK == 0 + } + + /// Returns the credentials of the task that originally opened the file. + pub fn cred(&self) -> &Credential { + // SAFETY: The file is valid because the shared reference guarantees a nonzero refcount. + let ptr = unsafe { core::ptr::addr_of!((*self.0.get()).f_cred).read() }; + // SAFETY: The lifetimes of `self` and `Credential` are tied, so it is guaranteed that + // the credential pointer remains valid (because the file is still alive, and it doesn't + // change over the lifetime of a file). + unsafe { Credential::from_ptr(ptr) } + } + + /// Returns the flags associated with the file. + pub fn flags(&self) -> u32 { + // SAFETY: The file is valid because the shared reference guarantees a nonzero refcount. + unsafe { core::ptr::addr_of!((*self.0.get()).f_flags).read() } + } +} + +// SAFETY: The type invariants guarantee that `File` is always ref-counted. +unsafe impl AlwaysRefCounted for File { + fn inc_ref(&self) { + // SAFETY: The existence of a shared reference means that the refcount is nonzero. 
+ unsafe { bindings::get_file(self.0.get()) }; + } + + unsafe fn dec_ref(obj: ptr::NonNull) { + // SAFETY: The safety requirements guarantee that the refcount is nonzero. + unsafe { bindings::fput(obj.cast().as_ptr()) } + } +} + +/// A file descriptor reservation. +/// +/// This allows the creation of a file descriptor in two steps: first, we reserve a slot for it, +/// then we commit or drop the reservation. The first step may fail (e.g., the current process ran +/// out of available slots), but commit and drop never fail (and are mutually exclusive). +pub struct FileDescriptorReservation { + fd: u32, +} + +impl FileDescriptorReservation { + /// Creates a new file descriptor reservation. + pub fn new(flags: u32) -> Result { + // SAFETY: FFI call, there are no safety requirements on `flags`. + let fd = unsafe { bindings::get_unused_fd_flags(flags) }; + if fd < 0 { + return Err(Error::from_kernel_errno(fd)); + } + Ok(Self { fd: fd as _ }) + } + + /// Returns the file descriptor number that was reserved. + pub fn reserved_fd(&self) -> u32 { + self.fd + } + + /// Commits the reservation. + /// + /// The previously reserved file descriptor is bound to `file`. + pub fn commit(self, file: ARef) { + // SAFETY: `self.fd` was previously returned by `get_unused_fd_flags`, and `file.ptr` is + // guaranteed to have an owned ref count by its type invariants. + unsafe { bindings::fd_install(self.fd, file.0.get()) }; + + // `fd_install` consumes both the file descriptor and the file reference, so we cannot run + // the destructors. + core::mem::forget(self); + core::mem::forget(file); + } +} + +impl Drop for FileDescriptorReservation { + fn drop(&mut self) { + // SAFETY: `self.fd` was returned by a previous call to `get_unused_fd_flags`. + unsafe { bindings::put_unused_fd(self.fd) }; + } +} + +/// Wraps the kernel's `struct poll_table_struct`. +/// +/// # Invariants +/// +/// The pointer `PollTable::ptr` is null or valid. 
+pub struct PollTable { + ptr: *mut bindings::poll_table_struct, +} + +impl PollTable { + /// Constructs a new `struct poll_table_struct` wrapper. + /// + /// # Safety + /// + /// The pointer `ptr` must be either null or a valid pointer for the lifetime of the object. + unsafe fn from_ptr(ptr: *mut bindings::poll_table_struct) -> Self { + Self { ptr } + } + + /// Associates the given file and condition variable to this poll table. It means notifying the + /// condition variable will notify the poll table as well; additionally, the association + /// between the condition variable and the file will automatically be undone by the kernel when + /// the file is destructed. To unilaterally remove the association before then, one can call + /// [`CondVar::free_waiters`]. + /// + /// # Safety + /// + /// If the condition variable is destroyed before the file, then [`CondVar::free_waiters`] must + /// be called to ensure that all waiters are flushed out. + pub unsafe fn register_wait<'a>(&self, file: &'a File, cv: &'a CondVar) { + if self.ptr.is_null() { + return; + } + + // SAFETY: `PollTable::ptr` is guaranteed to be valid by the type invariants and the null + // check above. + let table = unsafe { &*self.ptr }; + if let Some(proc) = table._qproc { + // SAFETY: All pointers are known to be valid. + unsafe { proc(file.0.get() as _, cv.wait_list.get(), self.ptr) } + } + } +} + +/// Equivalent to [`std::io::SeekFrom`]. +/// +/// [`std::io::SeekFrom`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html +pub enum SeekFrom { + /// Equivalent to C's `SEEK_SET`. + Start(u64), + + /// Equivalent to C's `SEEK_END`. + End(i64), + + /// Equivalent to C's `SEEK_CUR`. + Current(i64), +} + +pub(crate) struct OperationsVtable(marker::PhantomData, marker::PhantomData); + +impl, T: Operations> OperationsVtable { + /// Called by the VFS when an inode should be opened. + /// + /// Calls `T::open` on the returned value of `A::convert`.
+ /// + /// # Safety + /// + /// The returned value of `A::convert` must be a valid non-null pointer and + /// `T::open` must return a valid non-null pointer on an `Ok` result. + unsafe extern "C" fn open_callback( + inode: *mut bindings::inode, + file: *mut bindings::file, + ) -> c_types::c_int { + from_kernel_result! { + // SAFETY: `A::convert` must return a valid non-null pointer that + // should point to data in the inode or file that lives longer + // than the following use of `T::open`. + let arg = unsafe { A::convert(inode, file) }; + // SAFETY: The C contract guarantees that `file` is valid. Additionally, + // `fileref` never outlives this function, so it is guaranteed to be + // valid. + let fileref = unsafe { File::from_ptr(file) }; + // SAFETY: `arg` was previously returned by `A::convert` and must + // be a valid non-null pointer. + let ptr = T::open(unsafe { &*arg }, fileref)?.into_pointer(); + // SAFETY: The C contract guarantees that `private_data` is available + // for implementers of the file operations (no other C code accesses + // it), so we know that there are no concurrent threads/CPUs accessing + // it (it's not visible to any other Rust code). + unsafe { (*file).private_data = ptr as *mut c_types::c_void }; + Ok(0) + } + } + + unsafe extern "C" fn read_callback( + file: *mut bindings::file, + buf: *mut c_types::c_char, + len: c_types::c_size_t, + offset: *mut bindings::loff_t, + ) -> c_types::c_ssize_t { + from_kernel_result! { + let mut data = unsafe { UserSlicePtr::new(buf as *mut c_types::c_void, len).writer() }; + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. `T::Data::from_pointer` is only called by the + // `release` callback, which the C API guarantees that will be called only when all + // references to `file` have been released, so we know it can't be called while this + // function is running.
+ let f = unsafe { T::Data::borrow((*file).private_data) }; + // No `FMODE_UNSIGNED_OFFSET` support, so `offset` must be in [0, 2^63). + // See discussion in https://github.com/fishinabarrel/linux-kernel-module-rust/pull/113 + let read = T::read( + f, + unsafe { File::from_ptr(file) }, + &mut data, + unsafe { *offset }.try_into()?, + )?; + unsafe { (*offset) += bindings::loff_t::try_from(read).unwrap() }; + Ok(read as _) + } + } + + unsafe extern "C" fn read_iter_callback( + iocb: *mut bindings::kiocb, + raw_iter: *mut bindings::iov_iter, + ) -> isize { + from_kernel_result! { + let mut iter = unsafe { IovIter::from_ptr(raw_iter) }; + let file = unsafe { (*iocb).ki_filp }; + let offset = unsafe { (*iocb).ki_pos }; + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. `T::Data::from_pointer` is only called by the + // `release` callback, which the C API guarantees that will be called only when all + // references to `file` have been released, so we know it can't be called while this + // function is running. + let f = unsafe { T::Data::borrow((*file).private_data) }; + let read = + T::read(f, unsafe { File::from_ptr(file) }, &mut iter, offset.try_into()?)?; + unsafe { (*iocb).ki_pos += bindings::loff_t::try_from(read).unwrap() }; + Ok(read as _) + } + } + + unsafe extern "C" fn write_callback( + file: *mut bindings::file, + buf: *const c_types::c_char, + len: c_types::c_size_t, + offset: *mut bindings::loff_t, + ) -> c_types::c_ssize_t { + from_kernel_result! { + let mut data = unsafe { UserSlicePtr::new(buf as *mut c_types::c_void, len).reader() }; + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. 
`T::Data::from_pointer` is only called by the + // `release` callback, which the C API guarantees that will be called only when all + // references to `file` have been released, so we know it can't be called while this + // function is running. + let f = unsafe { T::Data::borrow((*file).private_data) }; + // No `FMODE_UNSIGNED_OFFSET` support, so `offset` must be in [0, 2^63). + // See discussion in https://github.com/fishinabarrel/linux-kernel-module-rust/pull/113 + let written = T::write( + f, + unsafe { File::from_ptr(file) }, + &mut data, + unsafe { *offset }.try_into()? + )?; + unsafe { (*offset) += bindings::loff_t::try_from(written).unwrap() }; + Ok(written as _) + } + } + + unsafe extern "C" fn write_iter_callback( + iocb: *mut bindings::kiocb, + raw_iter: *mut bindings::iov_iter, + ) -> isize { + from_kernel_result! { + let mut iter = unsafe { IovIter::from_ptr(raw_iter) }; + let file = unsafe { (*iocb).ki_filp }; + let offset = unsafe { (*iocb).ki_pos }; + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. `T::Data::from_pointer` is only called by the + // `release` callback, which the C API guarantees that will be called only when all + // references to `file` have been released, so we know it can't be called while this + // function is running. 
+ let f = unsafe { T::Data::borrow((*file).private_data) }; + let written = + T::write(f, unsafe { File::from_ptr(file) }, &mut iter, offset.try_into()?)?; + unsafe { (*iocb).ki_pos += bindings::loff_t::try_from(written).unwrap() }; + Ok(written as _) + } + } + + unsafe extern "C" fn release_callback( + _inode: *mut bindings::inode, + file: *mut bindings::file, + ) -> c_types::c_int { + let ptr = mem::replace(unsafe { &mut (*file).private_data }, ptr::null_mut()); + T::release(unsafe { T::Data::from_pointer(ptr as _) }, unsafe { + File::from_ptr(file) + }); + 0 + } + + unsafe extern "C" fn llseek_callback( + file: *mut bindings::file, + offset: bindings::loff_t, + whence: c_types::c_int, + ) -> bindings::loff_t { + from_kernel_result! { + let off = match whence as u32 { + bindings::SEEK_SET => SeekFrom::Start(offset.try_into()?), + bindings::SEEK_CUR => SeekFrom::Current(offset), + bindings::SEEK_END => SeekFrom::End(offset), + _ => return Err(EINVAL), + }; + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. `T::Data::from_pointer` is only called by the + // `release` callback, which the C API guarantees that will be called only when all + // references to `file` have been released, so we know it can't be called while this + // function is running. + let f = unsafe { T::Data::borrow((*file).private_data) }; + let off = T::seek(f, unsafe { File::from_ptr(file) }, off)?; + Ok(off as bindings::loff_t) + } + } + + unsafe extern "C" fn unlocked_ioctl_callback( + file: *mut bindings::file, + cmd: c_types::c_uint, + arg: c_types::c_ulong, + ) -> c_types::c_long { + from_kernel_result! { + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. 
`T::Data::from_pointer` is only called by the + // `release` callback, which the C API guarantees that will be called only when all + // references to `file` have been released, so we know it can't be called while this + // function is running. + let f = unsafe { T::Data::borrow((*file).private_data) }; + let mut cmd = IoctlCommand::new(cmd as _, arg as _); + let ret = T::ioctl(f, unsafe { File::from_ptr(file) }, &mut cmd)?; + Ok(ret as _) + } + } + + unsafe extern "C" fn compat_ioctl_callback( + file: *mut bindings::file, + cmd: c_types::c_uint, + arg: c_types::c_ulong, + ) -> c_types::c_long { + from_kernel_result! { + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. `T::Data::from_pointer` is only called by the + // `release` callback, which the C API guarantees that will be called only when all + // references to `file` have been released, so we know it can't be called while this + // function is running. + let f = unsafe { T::Data::borrow((*file).private_data) }; + let mut cmd = IoctlCommand::new(cmd as _, arg as _); + let ret = T::compat_ioctl(f, unsafe { File::from_ptr(file) }, &mut cmd)?; + Ok(ret as _) + } + } + + unsafe extern "C" fn mmap_callback( + file: *mut bindings::file, + vma: *mut bindings::vm_area_struct, + ) -> c_types::c_int { + from_kernel_result! { + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. `T::Data::from_pointer` is only called by the + // `release` callback, which the C API guarantees that will be called only when all + // references to `file` have been released, so we know it can't be called while this + // function is running. + let f = unsafe { T::Data::borrow((*file).private_data) }; + + // SAFETY: The C API guarantees that `vma` is valid for the duration of this call. + // `area` only lives within this call, so it is guaranteed to be valid. 
+ let mut area = unsafe { mm::virt::Area::from_ptr(vma) }; + + // SAFETY: The C API guarantees that `file` is valid for the duration of this call, + // which is longer than the lifetime of the file reference. + T::mmap(f, unsafe { File::from_ptr(file) }, &mut area)?; + Ok(0) + } + } + + unsafe extern "C" fn fsync_callback( + file: *mut bindings::file, + start: bindings::loff_t, + end: bindings::loff_t, + datasync: c_types::c_int, + ) -> c_types::c_int { + from_kernel_result! { + let start = start.try_into()?; + let end = end.try_into()?; + let datasync = datasync != 0; + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. `T::Data::from_pointer` is only called by the + // `release` callback, which the C API guarantees that will be called only when all + // references to `file` have been released, so we know it can't be called while this + // function is running. + let f = unsafe { T::Data::borrow((*file).private_data) }; + let res = T::fsync(f, unsafe { File::from_ptr(file) }, start, end, datasync)?; + Ok(res.try_into().unwrap()) + } + } + + unsafe extern "C" fn poll_callback( + file: *mut bindings::file, + wait: *mut bindings::poll_table_struct, + ) -> bindings::__poll_t { + // SAFETY: `private_data` was initialised by `open_callback` with a value returned by + // `T::Data::into_pointer`. `T::Data::from_pointer` is only called by the `release` + // callback, which the C API guarantees that will be called only when all references to + // `file` have been released, so we know it can't be called while this function is running. 
+ let f = unsafe { T::Data::borrow((*file).private_data) }; + match T::poll(f, unsafe { File::from_ptr(file) }, unsafe { + &PollTable::from_ptr(wait) + }) { + Ok(v) => v, + Err(_) => bindings::POLLERR, + } + } + + const VTABLE: bindings::file_operations = bindings::file_operations { + open: Some(Self::open_callback), + release: Some(Self::release_callback), + read: if T::TO_USE.read { + Some(Self::read_callback) + } else { + None + }, + write: if T::TO_USE.write { + Some(Self::write_callback) + } else { + None + }, + llseek: if T::TO_USE.seek { + Some(Self::llseek_callback) + } else { + None + }, + + check_flags: None, + compat_ioctl: if T::TO_USE.compat_ioctl { + Some(Self::compat_ioctl_callback) + } else { + None + }, + copy_file_range: None, + fallocate: None, + fadvise: None, + fasync: None, + flock: None, + flush: None, + fsync: if T::TO_USE.fsync { + Some(Self::fsync_callback) + } else { + None + }, + get_unmapped_area: None, + iterate: None, + iterate_shared: None, + iopoll: None, + lock: None, + mmap: if T::TO_USE.mmap { + Some(Self::mmap_callback) + } else { + None + }, + mmap_supported_flags: 0, + owner: ptr::null_mut(), + poll: if T::TO_USE.poll { + Some(Self::poll_callback) + } else { + None + }, + read_iter: if T::TO_USE.read_iter { + Some(Self::read_iter_callback) + } else { + None + }, + remap_file_range: None, + sendpage: None, + setlease: None, + show_fdinfo: None, + splice_read: None, + splice_write: None, + unlocked_ioctl: if T::TO_USE.ioctl { + Some(Self::unlocked_ioctl_callback) + } else { + None + }, + write_iter: if T::TO_USE.write_iter { + Some(Self::write_iter_callback) + } else { + None + }, + }; + + /// Builds an instance of [`struct file_operations`]. + /// + /// # Safety + /// + /// The caller must ensure that the adapter is compatible with the way the device is registered. 
+ pub(crate) const unsafe fn build() -> &'static bindings::file_operations { + &Self::VTABLE + } +} + +/// Represents which fields of [`struct file_operations`] should be populated with pointers. +pub struct ToUse { + /// The `read` field of [`struct file_operations`]. + pub read: bool, + + /// The `read_iter` field of [`struct file_operations`]. + pub read_iter: bool, + + /// The `write` field of [`struct file_operations`]. + pub write: bool, + + /// The `write_iter` field of [`struct file_operations`]. + pub write_iter: bool, + + /// The `llseek` field of [`struct file_operations`]. + pub seek: bool, + + /// The `unlocked_ioctl` field of [`struct file_operations`]. + pub ioctl: bool, + + /// The `compat_ioctl` field of [`struct file_operations`]. + pub compat_ioctl: bool, + + /// The `fsync` field of [`struct file_operations`]. + pub fsync: bool, + + /// The `mmap` field of [`struct file_operations`]. + pub mmap: bool, + + /// The `poll` field of [`struct file_operations`]. + pub poll: bool, +} + +/// A constant version where all values are set to `false`, that is, all supported fields will +/// be set to null pointers. +pub const USE_NONE: ToUse = ToUse { + read: false, + read_iter: false, + write: false, + write_iter: false, + seek: false, + ioctl: false, + compat_ioctl: false, + fsync: false, + mmap: false, + poll: false, +}; + +/// Defines the [`Operations::TO_USE`] field based on a list of fields to be populated. +#[macro_export] +macro_rules! declare_file_operations { + () => { + const TO_USE: $crate::file::ToUse = $crate::file::USE_NONE; + }; + ($($i:ident),+) => { + const TO_USE: kernel::file::ToUse = + $crate::file::ToUse { + $($i: true),+ , + ..$crate::file::USE_NONE + }; + }; +} + +/// Allows the handling of ioctls defined with the `_IO`, `_IOR`, `_IOW`, and `_IOWR` macros. +/// +/// For each macro, there is a handler function that takes the appropriate types as arguments.
+pub trait IoctlHandler: Sync { + /// The type of the first argument to each associated function. + type Target<'a>; + + /// Handles ioctls defined with the `_IO` macro, that is, with no buffer as argument. + fn pure(_this: Self::Target<'_>, _file: &File, _cmd: u32, _arg: usize) -> Result { + Err(EINVAL) + } + + /// Handles ioctls defined with the `_IOR` macro, that is, with an output buffer provided as + /// argument. + fn read( + _this: Self::Target<'_>, + _file: &File, + _cmd: u32, + _writer: &mut UserSlicePtrWriter, + ) -> Result { + Err(EINVAL) + } + + /// Handles ioctls defined with the `_IOW` macro, that is, with an input buffer provided as + /// argument. + fn write( + _this: Self::Target<'_>, + _file: &File, + _cmd: u32, + _reader: &mut UserSlicePtrReader, + ) -> Result { + Err(EINVAL) + } + + /// Handles ioctls defined with the `_IOWR` macro, that is, with a buffer for both input and + /// output provided as argument. + fn read_write( + _this: Self::Target<'_>, + _file: &File, + _cmd: u32, + _data: UserSlicePtr, + ) -> Result { + Err(EINVAL) + } +} + +/// Represents an ioctl command. +/// +/// It can use the components of an ioctl command to dispatch ioctls using +/// [`IoctlCommand::dispatch`]. +pub struct IoctlCommand { + cmd: u32, + arg: usize, + user_slice: Option, +} + +impl IoctlCommand { + /// Constructs a new [`IoctlCommand`]. + fn new(cmd: u32, arg: usize) -> Self { + let size = (cmd >> bindings::_IOC_SIZESHIFT) & bindings::_IOC_SIZEMASK; + + // SAFETY: We only create one instance of the user slice per ioctl call, so TOCTOU issues + // are not possible. + let user_slice = Some(unsafe { UserSlicePtr::new(arg as _, size as _) }); + Self { + cmd, + arg, + user_slice, + } + } + + /// Dispatches the given ioctl to the appropriate handler based on the value of the command. It + /// also creates a [`UserSlicePtr`], [`UserSlicePtrReader`], or [`UserSlicePtrWriter`] + /// depending on the direction of the buffer of the command. 
+ /// + /// It is meant to be used in implementations of [`Operations::ioctl`] and + /// [`Operations::compat_ioctl`]. + pub fn dispatch( + &mut self, + handler: T::Target<'_>, + file: &File, + ) -> Result { + let dir = (self.cmd >> bindings::_IOC_DIRSHIFT) & bindings::_IOC_DIRMASK; + if dir == bindings::_IOC_NONE { + return T::pure(handler, file, self.cmd, self.arg); + } + + let data = self.user_slice.take().ok_or(EINVAL)?; + const READ_WRITE: u32 = bindings::_IOC_READ | bindings::_IOC_WRITE; + match dir { + bindings::_IOC_WRITE => T::write(handler, file, self.cmd, &mut data.reader()), + bindings::_IOC_READ => T::read(handler, file, self.cmd, &mut data.writer()), + READ_WRITE => T::read_write(handler, file, self.cmd, data), + _ => Err(EINVAL), + } + } + + /// Returns the raw 32-bit value of the command and the ptr-sized argument. + pub fn raw(&self) -> (u32, usize) { + (self.cmd, self.arg) + } +} + +/// Trait for extracting file open arguments from kernel data structures. +/// +/// This is meant to be implemented by registration managers. +pub trait OpenAdapter { + /// Converts untyped data stored in [`struct inode`] and [`struct file`] (when [`struct + /// file_operations::open`] is called) into the given type. For example, for `miscdev` + /// devices, a pointer to the registered [`struct miscdev`] is stored in [`struct + /// file::private_data`]. + /// + /// # Safety + /// + /// This function must be called only when [`struct file_operations::open`] is being called for + /// a file that was registered by the implementer. The returned pointer must be valid and + /// not-null. + unsafe fn convert(_inode: *mut bindings::inode, _file: *mut bindings::file) -> *const T; +} + +/// Corresponds to the kernel's `struct file_operations`. +/// +/// You implement this trait whenever you would create a `struct file_operations`. +/// +/// File descriptors may be used from multiple threads/processes concurrently, so your type must be +/// [`Sync`]. 
It must also be [`Send`] because [`Operations::release`] will be called from the +/// thread that decrements that associated file's refcount to zero. +pub trait Operations { + /// The methods to use to populate [`struct file_operations`]. + const TO_USE: ToUse; + + /// The type of the context data returned by [`Operations::open`] and made available to + /// other methods. + type Data: PointerWrapper + Send + Sync = (); + + /// The type of the context data passed to [`Operations::open`]. + type OpenData: Sync = (); + + /// Creates a new instance of this file. + /// + /// Corresponds to the `open` function pointer in `struct file_operations`. + fn open(context: &Self::OpenData, file: &File) -> Result; + + /// Cleans up after the last reference to the file goes away. + /// + /// Note that context data is moved, so it will be freed automatically unless the + /// implementation moves it elsewhere. + /// + /// Corresponds to the `release` function pointer in `struct file_operations`. + fn release(_data: Self::Data, _file: &File) {} + + /// Reads data from this file to the caller's buffer. + /// + /// Corresponds to the `read` and `read_iter` function pointers in `struct file_operations`. + fn read( + _data: ::Borrowed<'_>, + _file: &File, + _writer: &mut impl IoBufferWriter, + _offset: u64, + ) -> Result { + Err(EINVAL) + } + + /// Writes data from the caller's buffer to this file. + /// + /// Corresponds to the `write` and `write_iter` function pointers in `struct file_operations`. + fn write( + _data: ::Borrowed<'_>, + _file: &File, + _reader: &mut impl IoBufferReader, + _offset: u64, + ) -> Result { + Err(EINVAL) + } + + /// Changes the position of the file. + /// + /// Corresponds to the `llseek` function pointer in `struct file_operations`. + fn seek( + _data: ::Borrowed<'_>, + _file: &File, + _offset: SeekFrom, + ) -> Result { + Err(EINVAL) + } + + /// Performs IO control operations that are specific to the file. 
+ /// + /// Corresponds to the `unlocked_ioctl` function pointer in `struct file_operations`. + fn ioctl( + _data: ::Borrowed<'_>, + _file: &File, + _cmd: &mut IoctlCommand, + ) -> Result { + Err(ENOTTY) + } + + /// Performs 32-bit IO control operations that are specific to the file on 64-bit kernels. + /// + /// Corresponds to the `compat_ioctl` function pointer in `struct file_operations`. + fn compat_ioctl( + _data: ::Borrowed<'_>, + _file: &File, + _cmd: &mut IoctlCommand, + ) -> Result { + Err(ENOTTY) + } + + /// Syncs pending changes to this file. + /// + /// Corresponds to the `fsync` function pointer in `struct file_operations`. + fn fsync( + _data: ::Borrowed<'_>, + _file: &File, + _start: u64, + _end: u64, + _datasync: bool, + ) -> Result { + Err(EINVAL) + } + + /// Maps areas of the caller's virtual memory with device/file memory. + /// + /// Corresponds to the `mmap` function pointer in `struct file_operations`. + fn mmap( + _data: ::Borrowed<'_>, + _file: &File, + _vma: &mut mm::virt::Area, + ) -> Result { + Err(EINVAL) + } + + /// Checks the state of the file and optionally registers for notification when the state + /// changes. + /// + /// Corresponds to the `poll` function pointer in `struct file_operations`. + fn poll( + _data: ::Borrowed<'_>, + _file: &File, + _table: &PollTable, + ) -> Result { + Ok(bindings::POLLIN | bindings::POLLOUT | bindings::POLLRDNORM | bindings::POLLWRNORM) + } +} diff --git a/rust/kernel/gpio.rs b/rust/kernel/gpio.rs new file mode 100644 index 00000000000000..2e4365dfcf7414 --- /dev/null +++ b/rust/kernel/gpio.rs @@ -0,0 +1,478 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Support for gpio device drivers. +//!
C header: [`include/linux/gpio/driver.h`](../../../../include/linux/gpio/driver.h) + +use crate::{ + bindings, c_types, device, error::code::*, error::from_kernel_result, types::PointerWrapper, + Error, Result, +}; +use core::{ + cell::UnsafeCell, + marker::{PhantomData, PhantomPinned}, + pin::Pin, +}; + +#[cfg(CONFIG_GPIOLIB_IRQCHIP)] +pub use irqchip::{ChipWithIrqChip, RegistrationWithIrqChip}; + +/// The direction of a gpio line. +pub enum LineDirection { + /// Direction is input. + In = bindings::GPIO_LINE_DIRECTION_IN as _, + + /// Direction is output. + Out = bindings::GPIO_LINE_DIRECTION_OUT as _, +} + +/// A gpio chip. +pub trait Chip { + /// Context data associated with the gpio chip. + /// + /// It determines the type of the context data passed to each of the methods of the trait. + type Data: PointerWrapper + Sync + Send; + + /// The methods to use to populate [`struct gpio_chip`]. This is typically populated with + /// [`declare_gpio_chip_operations`]. + const TO_USE: ToUse; + + /// Returns the direction of the given gpio line. + fn get_direction( + _data: ::Borrowed<'_>, + _offset: u32, + ) -> Result { + Err(ENOTSUPP) + } + + /// Configures the direction as input of the given gpio line. + fn direction_input( + _data: ::Borrowed<'_>, + _offset: u32, + ) -> Result { + Err(EIO) + } + + /// Configures the direction as output of the given gpio line. + /// + /// The value that will be initially output is also specified. + fn direction_output( + _data: ::Borrowed<'_>, + _offset: u32, + _value: bool, + ) -> Result { + Err(ENOTSUPP) + } + + /// Returns the current value of the given gpio line. + fn get(_data: ::Borrowed<'_>, _offset: u32) -> Result { + Err(EIO) + } + + /// Sets the value of the given gpio line. + fn set(_data: ::Borrowed<'_>, _offset: u32, _value: bool) {} +} + +/// Represents which fields of [`struct gpio_chip`] should be populated with pointers. +/// +/// This is typically populated with the [`declare_gpio_chip_operations`] macro. 
+pub struct ToUse { + /// The `get_direction` field of [`struct gpio_chip`]. + pub get_direction: bool, + + /// The `direction_input` field of [`struct gpio_chip`]. + pub direction_input: bool, + + /// The `direction_output` field of [`struct gpio_chip`]. + pub direction_output: bool, + + /// The `get` field of [`struct gpio_chip`]. + pub get: bool, + + /// The `set` field of [`struct gpio_chip`]. + pub set: bool, +} + +/// A constant version where all values are set to `false`, that is, all supported fields will be +/// set to null pointers. +pub const USE_NONE: ToUse = ToUse { + get_direction: false, + direction_input: false, + direction_output: false, + get: false, + set: false, +}; + +/// Defines the [`Chip::TO_USE`] field based on a list of fields to be populated. +#[macro_export] +macro_rules! declare_gpio_chip_operations { + () => { + const TO_USE: $crate::gpio::ToUse = $crate::gpio::USE_NONE; + }; + ($($i:ident),+) => { + #[allow(clippy::needless_update)] + const TO_USE: $crate::gpio::ToUse = + $crate::gpio::ToUse { + $($i: true),+ , + ..$crate::gpio::USE_NONE + }; + }; +} + +/// A registration of a gpio chip. +pub struct Registration { + gc: UnsafeCell, + parent: Option, + _p: PhantomData, + _pin: PhantomPinned, +} + +impl Registration { + /// Creates a new [`Registration`] but does not register it yet. + /// + /// It is allowed to move. + pub fn new() -> Self { + Self { + parent: None, + gc: UnsafeCell::new(bindings::gpio_chip::default()), + _pin: PhantomPinned, + _p: PhantomData, + } + } + + /// Registers a gpio chip with the rest of the kernel. + pub fn register( + self: Pin<&mut Self>, + gpio_count: u16, + base: Option, + parent: &dyn device::RawDevice, + data: T::Data, + ) -> Result { + if self.parent.is_some() { + // Already registered. + return Err(EINVAL); + } + + // SAFETY: We never move out of `this`. + let this = unsafe { self.get_unchecked_mut() }; + { + let gc = this.gc.get_mut(); + + // Set up the callbacks. 
+ gc.request = Some(bindings::gpiochip_generic_request); + gc.free = Some(bindings::gpiochip_generic_free); + if T::TO_USE.get_direction { + gc.get_direction = Some(get_direction_callback::); + } + if T::TO_USE.direction_input { + gc.direction_input = Some(direction_input_callback::); + } + if T::TO_USE.direction_output { + gc.direction_output = Some(direction_output_callback::); + } + if T::TO_USE.get { + gc.get = Some(get_callback::); + } + if T::TO_USE.set { + gc.set = Some(set_callback::); + } + + // When a base is not explicitly given, use -1 for one to be picked. + if let Some(b) = base { + gc.base = b; + } else { + gc.base = -1; + } + + gc.ngpio = gpio_count; + gc.parent = parent.raw_device(); + gc.label = parent.name().as_char_ptr(); + + // TODO: Define `gc.owner` as well. + } + + let data_pointer = ::into_pointer(data); + // SAFETY: `gc` was initialised above, so it is valid. + let ret = unsafe { + bindings::gpiochip_add_data_with_key( + this.gc.get(), + data_pointer as _, + core::ptr::null_mut(), + core::ptr::null_mut(), + ) + }; + if ret < 0 { + // SAFETY: `data_pointer` was returned by `into_pointer` above. + unsafe { T::Data::from_pointer(data_pointer) }; + return Err(Error::from_kernel_errno(ret)); + } + + this.parent = Some(device::Device::from_dev(parent)); + Ok(()) + } +} + +// SAFETY: `Registration` doesn't offer any methods or access to fields when shared between threads +// or CPUs, so it is safe to share it. +unsafe impl Sync for Registration {} + +// SAFETY: Registration with and unregistration from the gpio subsystem can happen from any thread. +// Additionally, `T::Data` (which is dropped during unregistration) is `Send`, so it is ok to move +// `Registration` to different threads. 
+#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for Registration {} + +impl Default for Registration { + fn default() -> Self { + Self::new() + } +} + +impl Drop for Registration { + /// Removes the registration from the kernel if it has completed successfully before. + fn drop(&mut self) { + if self.parent.is_some() { + // Get a pointer to the data stored in chip before destroying it. + // SAFETY: `gc` was initialised during registration, which is guaranteed to have succeeded (because + // `parent` is `Some(_)`), so it remains valid. + let data_pointer = unsafe { bindings::gpiochip_get_data(self.gc.get()) }; + + // SAFETY: By the same argument above, `gc` is still valid. + unsafe { bindings::gpiochip_remove(self.gc.get()) }; + + // Free data as well. + // SAFETY: `data_pointer` was returned by `into_pointer` during registration. + unsafe { ::from_pointer(data_pointer) }; + } + } +} + +unsafe extern "C" fn get_direction_callback( + gc: *mut bindings::gpio_chip, + offset: c_types::c_uint, +) -> c_types::c_int { + from_kernel_result! { + // SAFETY: The value stored as chip data was returned by `into_pointer` during registration. + let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc)) }; + Ok(T::get_direction(data, offset)? as i32) + } +} + +unsafe extern "C" fn direction_input_callback( + gc: *mut bindings::gpio_chip, + offset: c_types::c_uint, +) -> c_types::c_int { + from_kernel_result! { + // SAFETY: The value stored as chip data was returned by `into_pointer` during registration. + let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc)) }; + T::direction_input(data, offset)?; + Ok(0) + } +} + +unsafe extern "C" fn direction_output_callback( + gc: *mut bindings::gpio_chip, + offset: c_types::c_uint, + value: c_types::c_int, +) -> c_types::c_int { + from_kernel_result! { + // SAFETY: The value stored as chip data was returned by `into_pointer` during registration. 
+ let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc)) }; + T::direction_output(data, offset, value != 0)?; + Ok(0) + } +} + +unsafe extern "C" fn get_callback( + gc: *mut bindings::gpio_chip, + offset: c_types::c_uint, +) -> c_types::c_int { + from_kernel_result! { + // SAFETY: The value stored as chip data was returned by `into_pointer` during registration. + let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc)) }; + let v = T::get(data, offset)?; + Ok(v as _) + } +} + +unsafe extern "C" fn set_callback( + gc: *mut bindings::gpio_chip, + offset: c_types::c_uint, + value: c_types::c_int, +) { + // SAFETY: The value stored as chip data was returned by `into_pointer` during registration. + let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc)) }; + T::set(data, offset, value != 0); +} + +#[cfg(CONFIG_GPIOLIB_IRQCHIP)] +mod irqchip { + use super::*; + use crate::irq; + + /// A gpio chip that includes an irq chip. + pub trait ChipWithIrqChip: Chip { + /// Implements the irq flow for the gpio chip. + fn handle_irq_flow( + _data: ::Borrowed<'_>, + _desc: &irq::Descriptor, + _domain: &irq::Domain, + ); + } + + /// A registration of a gpio chip that includes an irq chip. + pub struct RegistrationWithIrqChip { + reg: Registration, + irq_chip: UnsafeCell, + parent_irq: u32, + } + + impl RegistrationWithIrqChip { + /// Creates a new [`RegistrationWithIrqChip`] but does not register it yet. + /// + /// It is allowed to move. + pub fn new() -> Self { + Self { + reg: Registration::new(), + irq_chip: UnsafeCell::new(bindings::irq_chip::default()), + parent_irq: 0, + } + } + + /// Registers a gpio chip and its irq chip with the rest of the kernel. + pub fn register>( + mut self: Pin<&mut Self>, + gpio_count: u16, + base: Option, + parent: &dyn device::RawDevice, + data: T::Data, + parent_irq: u32, + ) -> Result { + if self.reg.parent.is_some() { + // Already registered. 
+ return Err(EINVAL); + } + + // SAFETY: We never move out of `this`. + let this = unsafe { self.as_mut().get_unchecked_mut() }; + + // Initialise the irq_chip. + { + let irq_chip = this.irq_chip.get_mut(); + irq_chip.name = parent.name().as_char_ptr(); + + // SAFETY: The gpio subsystem configures a pointer to `gpio_chip` as the irq chip + // data, so we use `IrqChipAdapter` to convert to the `T::Data`, which is the same + // as `irq::Chip::Data` per the bound above. + unsafe { irq::init_chip::>(irq_chip) }; + } + + // Initialise gc irq state. + { + let girq = &mut this.reg.gc.get_mut().irq; + girq.chip = this.irq_chip.get(); + // SAFETY: By leaving `parent_handler_data` set to `null`, the gpio subsystem + // initialises it to a pointer to the gpio chip, which is what `FlowHandler` + // expects. + girq.parent_handler = unsafe { irq::new_flow_handler::>() }; + girq.num_parents = 1; + girq.parents = &mut this.parent_irq; + this.parent_irq = parent_irq; + girq.default_type = bindings::IRQ_TYPE_NONE; + girq.handler = Some(bindings::handle_bad_irq); + } + + // SAFETY: `reg` is pinned when `self` is. + let pinned = unsafe { self.map_unchecked_mut(|r| &mut r.reg) }; + pinned.register(gpio_count, base, parent, data) + } + } + + impl Default for RegistrationWithIrqChip { + fn default() -> Self { + Self::new() + } + } + + // SAFETY: `RegistrationWithIrqChip` doesn't offer any methods or access to fields when shared + // between threads or CPUs, so it is safe to share it. + unsafe impl Sync for RegistrationWithIrqChip {} + + // SAFETY: Registration with and unregistration from the gpio subsystem (including irq chips for + // them) can happen from any thread. Additionally, `T::Data` (which is dropped during + // unregistration) is `Send`, so it is ok to move `Registration` to different threads. 
+ #[allow(clippy::non_send_fields_in_send_ty)] + unsafe impl Send for RegistrationWithIrqChip where T::Data: Send {} + + struct FlowHandler(PhantomData); + + impl irq::FlowHandler for FlowHandler { + type Data = *mut bindings::gpio_chip; + + fn handle_irq_flow(gc: *mut bindings::gpio_chip, desc: &irq::Descriptor) { + // SAFETY: `FlowHandler` is only used in gpio chips, and it is removed when the gpio is + // unregistered, so we know that `gc` must still be valid. We also know that the value + // stored as gpio data was returned by `T::Data::into_pointer` again because + // `FlowHandler` is a private structure only used in this way. + let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc)) }; + + // SAFETY: `gc` is valid (see comment above), so we can dereference it. + let domain = unsafe { irq::Domain::from_ptr((*gc).irq.domain) }; + + T::handle_irq_flow(data, desc, &domain); + } + } + + /// Adapter from an irq chip with `gpio_chip` pointer as context to one where the gpio chip + /// data is passed as context. + struct IrqChipAdapter(PhantomData); + + impl irq::Chip for IrqChipAdapter { + type Data = *mut bindings::gpio_chip; + const TO_USE: irq::ToUse = T::TO_USE; + + fn ack(gc: *mut bindings::gpio_chip, irq_data: &irq::IrqData) { + // SAFETY: `IrqChipAdapter` is a private struct, only used when the data stored in the + // gpio chip is known to come from `T::Data`, and only valid while the gpio chip is + // registered, so `gc` is valid. + let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc as _)) }; + T::ack(data, irq_data); + } + + fn mask(gc: *mut bindings::gpio_chip, irq_data: &irq::IrqData) { + // SAFETY: `IrqChipAdapter` is a private struct, only used when the data stored in the + // gpio chip is known to come from `T::Data`, and only valid while the gpio chip is + // registered, so `gc` is valid. 
+ let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc as _)) }; + T::mask(data, irq_data); + } + + fn unmask(gc: *mut bindings::gpio_chip, irq_data: &irq::IrqData) { + // SAFETY: `IrqChipAdapter` is a private struct, only used when the data stored in the + // gpio chip is known to come from `T::Data`, and only valid while the gpio chip is + // registered, so `gc` is valid. + let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc as _)) }; + T::unmask(data, irq_data); + } + + fn set_type( + gc: *mut bindings::gpio_chip, + irq_data: &mut irq::LockedIrqData, + flow_type: u32, + ) -> Result { + // SAFETY: `IrqChipAdapter` is a private struct, only used when the data stored in the + // gpio chip is known to come from `T::Data`, and only valid while the gpio chip is + // registered, so `gc` is valid. + let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc as _)) }; + T::set_type(data, irq_data, flow_type) + } + + fn set_wake(gc: *mut bindings::gpio_chip, irq_data: &irq::IrqData, on: bool) -> Result { + // SAFETY: `IrqChipAdapter` is a private struct, only used when the data stored in the + // gpio chip is known to come from `T::Data`, and only valid while the gpio chip is + // registered, so `gc` is valid. + let data = unsafe { T::Data::borrow(bindings::gpiochip_get_data(gc as _)) }; + T::set_wake(data, irq_data, on) + } + } +} diff --git a/rust/kernel/hwrng.rs b/rust/kernel/hwrng.rs new file mode 100644 index 00000000000000..a50de951063147 --- /dev/null +++ b/rust/kernel/hwrng.rs @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Hardware Random Number Generator. +//! +//! 
C header: [`include/linux/hw_random.h`](../../../../include/linux/hw_random.h) + +use alloc::{boxed::Box, slice::from_raw_parts_mut}; + +use crate::{ + bindings, c_types, error::code::*, error::from_kernel_result, str::CString, to_result, + types::PointerWrapper, Result, ScopeGuard, +}; + +use core::{cell::UnsafeCell, fmt, marker::PhantomData, pin::Pin}; + +/// This trait is implemented in order to provide callbacks to `struct hwrng`. +pub trait Operations { + /// The methods to use to populate [`struct hwrng`]. + const TO_USE: ToUse; + + /// The pointer type that will be used to hold user-defined data type. + type Data: PointerWrapper + Send + Sync = (); + + /// Initialization callback, can be left undefined. + fn init(_data: ::Borrowed<'_>) -> Result { + Err(EINVAL) + } + + /// Cleanup callback, can be left undefined. + fn cleanup(_data: Self::Data) {} + + /// Read data into the provided buffer. + /// Drivers can fill up to max bytes of data into the buffer. + /// The buffer is aligned for any type and its size is a multiple of 4 and >= 32 bytes. + fn read( + data: ::Borrowed<'_>, + buffer: &mut [u8], + wait: bool, + ) -> Result; +} + +/// Registration structure for Hardware Random Number Generator driver. +pub struct Registration { + hwrng: UnsafeCell, + name: Option, + registered: bool, + _p: PhantomData, +} + +impl Registration { + /// Creates new instance of registration. + /// + /// The data must be registered. + pub fn new() -> Self { + Self { + hwrng: UnsafeCell::new(bindings::hwrng::default()), + name: None, + registered: false, + _p: PhantomData, + } + } + + /// Returns a registered and pinned, heap-allocated representation of the registration. + pub fn new_pinned( + name: fmt::Arguments<'_>, + quality: u16, + data: T::Data, + ) -> Result>> { + let mut reg = Pin::from(Box::try_new(Self::new())?); + reg.as_mut().register(name, quality, data)?; + Ok(reg) + } + + /// Registers a hwrng device within the rest of the kernel. 
+ /// + /// It must be pinned because the memory block that represents + /// the registration may be self-referential. + pub fn register( + self: Pin<&mut Self>, + name: fmt::Arguments<'_>, + quality: u16, + data: T::Data, + ) -> Result { + // SAFETY: We never move out of `this`. + let this = unsafe { self.get_unchecked_mut() }; + + if this.registered { + return Err(EINVAL); + } + + let data_pointer = data.into_pointer(); + + // SAFETY: `data_pointer` comes from the call to `data.into_pointer()` above. + let guard = ScopeGuard::new(|| unsafe { + T::Data::from_pointer(data_pointer); + }); + + let name = CString::try_from_fmt(name)?; + + // SAFETY: Registration is pinned and contains allocated and set to zero `bindings::hwrng` structure. + Self::init_hwrng( + unsafe { &mut *this.hwrng.get() }, + &name, + quality, + data_pointer, + ); + + // SAFETY: `bindings::hwrng` is initialized above which guarantees safety. + to_result(|| unsafe { bindings::hwrng_register(this.hwrng.get()) })?; + + this.registered = true; + this.name = Some(name); + guard.dismiss(); + Ok(()) + } + + fn init_hwrng( + hwrng: &mut bindings::hwrng, + name: &CString, + quality: u16, + data: *const c_types::c_void, + ) { + hwrng.name = name.as_char_ptr(); + + hwrng.init = if T::TO_USE.init { + Some(Self::init_callback) + } else { + None + }; + hwrng.cleanup = if T::TO_USE.cleanup { + Some(Self::cleanup_callback) + } else { + None + }; + hwrng.data_present = None; + hwrng.data_read = None; + hwrng.read = Some(Self::read_callback); + + hwrng.priv_ = data as _; + hwrng.quality = quality; + + // SAFETY: All fields are properly initialized as + // remaining fields `list`, `ref` and `cleanup_done` are already + // zeroed by `bindings::hwrng::default()` call. + } + + unsafe extern "C" fn init_callback(rng: *mut bindings::hwrng) -> c_types::c_int { + from_kernel_result! { + // SAFETY: `priv` private data field was initialized during creation of + // the `bindings::hwrng` in `Self::init_hwrng` method. 
This callback is only + // called once the driver is registered. + let data = unsafe { T::Data::borrow((*rng).priv_ as *const _) }; + T::init(data)?; + Ok(0) + } + } + + unsafe extern "C" fn cleanup_callback(rng: *mut bindings::hwrng) { + // SAFETY: `priv` private data field was initialized during creation of + // the `bindings::hwrng` in `Self::init_hwrng` method. This callback is only + // called once the driver is registered. + let data = unsafe { T::Data::from_pointer((*rng).priv_ as *const _) }; + T::cleanup(data); + } + + unsafe extern "C" fn read_callback( + rng: *mut bindings::hwrng, + data: *mut c_types::c_void, + max: usize, + wait: bindings::bool_, + ) -> c_types::c_int { + from_kernel_result! { + // SAFETY: `priv` private data field was initialized during creation of + // the `bindings::hwrng` in `Self::init_hwrng` method. This callback is only + // called once the driver is registered. + let drv_data = unsafe { T::Data::borrow((*rng).priv_ as *const _) }; + + // SAFETY: Slice is created from `data` and `max` arguments that are C's buffer + // along with its size in bytes that are safe for this conversion. + let buffer = unsafe { from_raw_parts_mut(data as *mut u8, max) }; + let ret = T::read(drv_data, buffer, wait)?; + Ok(ret as _) + } + } +} + +impl Default for Registration { + fn default() -> Self { + Self::new() + } +} + +/// Represents which callbacks of [`struct hwrng`] should be populated with pointers. +pub struct ToUse { + /// The `init` field of [`struct hwrng`]. + pub init: bool, + + /// The `cleanup` field of [`struct hwrng`]. + pub cleanup: bool, +} + +/// A constant version where all values are set to `false`, that is, all supported fields will +/// be set to null pointers. +pub const USE_NONE: ToUse = ToUse { + init: false, + cleanup: false, +}; + +/// Defines the [`Operations::TO_USE`] field based on a list of fields to be populated. +#[macro_export] +macro_rules! 
declare_hwrng_operations { + () => { + const TO_USE: $crate::hwrng::ToUse = $crate::hwrng::USE_NONE; + }; + ($($i:ident),+) => { + #[allow(clippy::needless_update)] + const TO_USE: $crate::hwrng::ToUse = + $crate::hwrng::ToUse { + $($i: true),+ , + ..$crate::hwrng::USE_NONE + }; + }; +} + +// SAFETY: `Registration` does not expose any of its state across threads. +unsafe impl Sync for Registration {} + +// SAFETY: `Registration` is not restricted to a single thread, +// its `T::Data` is also `Send` so it may be moved to different threads. +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for Registration {} + +impl Drop for Registration { + /// Removes the registration from the kernel if it has completed successfully before. + fn drop(&mut self) { + // SAFETY: The instance of Registration is unregistered only + // after being initialized and registered before. + if self.registered { + unsafe { bindings::hwrng_unregister(self.hwrng.get()) }; + } + } +} diff --git a/rust/kernel/io_buffer.rs b/rust/kernel/io_buffer.rs new file mode 100644 index 00000000000000..ccecc4763aca30 --- /dev/null +++ b/rust/kernel/io_buffer.rs @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Buffers used in IO. + +use crate::Result; +use alloc::vec::Vec; +use core::mem::{size_of, MaybeUninit}; + +/// Represents a buffer to be read from during IO. +pub trait IoBufferReader { + /// Returns the number of bytes left to be read from the io buffer. + /// + /// Note that even reading less than this number of bytes may fail. + fn len(&self) -> usize; + + /// Returns `true` if no data is available in the io buffer. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Reads raw data from the io buffer into a raw kernel buffer. + /// + /// # Safety + /// + /// The output buffer must be valid. + unsafe fn read_raw(&mut self, out: *mut u8, len: usize) -> Result; + + /// Reads all data remaining in the io buffer. 
+ /// + /// Returns `EFAULT` if the address does not currently point to mapped, readable memory. + fn read_all(&mut self) -> Result> { + let mut data = Vec::::new(); + data.try_resize(self.len(), 0)?; + + // SAFETY: The output buffer is valid as we just allocated it. + unsafe { self.read_raw(data.as_mut_ptr(), data.len())? }; + Ok(data) + } + + /// Reads a byte slice from the io buffer. + /// + /// Returns `EFAULT` if the byte slice is bigger than the remaining size of the user slice or + /// if the address does not currently point to mapped, readable memory. + fn read_slice(&mut self, data: &mut [u8]) -> Result { + // SAFETY: The output buffer is valid as it's coming from a live reference. + unsafe { self.read_raw(data.as_mut_ptr(), data.len()) } + } + + /// Reads the contents of a plain old data (POD) type from the io buffer. + fn read(&mut self) -> Result { + let mut out = MaybeUninit::::uninit(); + // SAFETY: The buffer is valid as it was just allocated. + unsafe { self.read_raw(out.as_mut_ptr() as _, size_of::()) }?; + // SAFETY: We just initialised the data. + Ok(unsafe { out.assume_init() }) + } +} + +/// Represents a buffer to be written to during IO. +pub trait IoBufferWriter { + /// Returns the number of bytes left to be written into the io buffer. + /// + /// Note that even writing less than this number of bytes may fail. + fn len(&self) -> usize; + + /// Returns `true` if the io buffer cannot hold any additional data. + fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Writes zeroes to the io buffer. + /// + /// Differently from the other write functions, `clear` will zero as much as it can and update + /// the writer internal state to reflect this. It will, however, return an error if it cannot + /// clear `len` bytes. + /// + /// For example, if a caller requests that 100 bytes be cleared but a segfault happens after + /// 20 bytes, then EFAULT is returned and the writer is advanced by 20 bytes. 
+ fn clear(&mut self, len: usize) -> Result; + + /// Writes a byte slice into the io buffer. + /// + /// Returns `EFAULT` if the byte slice is bigger than the remaining size of the io buffer or if + /// the address does not currently point to mapped, writable memory. + fn write_slice(&mut self, data: &[u8]) -> Result { + // SAFETY: The input buffer is valid as it's coming from a live reference. + unsafe { self.write_raw(data.as_ptr(), data.len()) } + } + + /// Writes raw data to the io buffer from a raw kernel buffer. + /// + /// # Safety + /// + /// The input buffer must be valid. + unsafe fn write_raw(&mut self, data: *const u8, len: usize) -> Result; + + /// Writes the contents of the given data into the io buffer. + fn write(&mut self, data: &T) -> Result { + // SAFETY: The input buffer is valid as it's coming from a live + // reference to a type that implements `WritableToBytes`. + unsafe { self.write_raw(data as *const T as _, size_of::()) } + } +} + +/// Specifies that a type is safely readable from byte slices. +/// +/// Not all types can be safely read from byte slices; examples from +/// <https://doc.rust-lang.org/reference/behavior-considered-undefined.html> include `bool` +/// that must be either `0` or `1`, and `char` that cannot be a surrogate or above `char::MAX`. +/// +/// # Safety +/// +/// Implementers must ensure that the type is made up only of types that can be safely read from +/// arbitrary byte sequences (e.g., `u32`, `u64`, etc.). +pub unsafe trait ReadableFromBytes {} + +// SAFETY: All bit patterns are acceptable values of the types below. 
+unsafe impl ReadableFromBytes for u8 {} +unsafe impl ReadableFromBytes for u16 {} +unsafe impl ReadableFromBytes for u32 {} +unsafe impl ReadableFromBytes for u64 {} +unsafe impl ReadableFromBytes for usize {} +unsafe impl ReadableFromBytes for i8 {} +unsafe impl ReadableFromBytes for i16 {} +unsafe impl ReadableFromBytes for i32 {} +unsafe impl ReadableFromBytes for i64 {} +unsafe impl ReadableFromBytes for isize {} + +/// Specifies that a type is safely writable to byte slices. +/// +/// This means that we don't read undefined values (which leads to UB) in preparation for writing +/// to the byte slice. It also ensures that no potentially sensitive information is leaked into the +/// byte slices. +/// +/// # Safety +/// +/// A type must not include padding bytes and must be fully initialised to safely implement +/// [`WritableToBytes`] (i.e., it doesn't contain [`MaybeUninit`] fields). A composition of +/// writable types in a structure is not necessarily writable because it may result in padding +/// bytes. +pub unsafe trait WritableToBytes {} + +// SAFETY: Initialised instances of the following types have no uninitialised portions. +unsafe impl WritableToBytes for u8 {} +unsafe impl WritableToBytes for u16 {} +unsafe impl WritableToBytes for u32 {} +unsafe impl WritableToBytes for u64 {} +unsafe impl WritableToBytes for usize {} +unsafe impl WritableToBytes for i8 {} +unsafe impl WritableToBytes for i16 {} +unsafe impl WritableToBytes for i32 {} +unsafe impl WritableToBytes for i64 {} +unsafe impl WritableToBytes for isize {} diff --git a/rust/kernel/io_mem.rs b/rust/kernel/io_mem.rs new file mode 100644 index 00000000000000..25096fe436753a --- /dev/null +++ b/rust/kernel/io_mem.rs @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Memory-mapped IO. +//! +//! 
C header: [`include/asm-generic/io.h`](../../../../include/asm-generic/io.h) + +#![allow(dead_code)] + +use crate::{bindings, error::code::*, Result}; +use core::convert::TryInto; + +/// Represents a memory resource. +pub struct Resource { + offset: bindings::resource_size_t, + size: bindings::resource_size_t, +} + +impl Resource { + pub(crate) fn new( + start: bindings::resource_size_t, + end: bindings::resource_size_t, + ) -> Option { + if start == 0 { + return None; + } + Some(Self { + offset: start, + size: end.checked_sub(start)?.checked_add(1)?, + }) + } +} + +/// Represents a memory block of at least `SIZE` bytes. +/// +/// # Invariants +/// +/// `ptr` is a non-null and valid address of at least `SIZE` bytes and returned by an `ioremap` +/// variant. `ptr` is also 8-byte aligned. +/// +/// # Examples +/// +/// ``` +/// # use kernel::prelude::*; +/// use kernel::io_mem::{IoMem, Resource}; +/// +/// fn test(res: Resource) -> Result { +/// // Create an io mem block of at least 100 bytes. +/// // SAFETY: No DMA operations are initiated through `mem`. +/// let mem = unsafe { IoMem::<100>::try_new(res) }?; +/// +/// // Read one byte from offset 10. +/// let v = mem.readb(10); +/// +/// // Write value to offset 20. +/// mem.writeb(v, 20); +/// +/// Ok(()) +/// } +/// +/// ``` +pub struct IoMem { + ptr: usize, +} + +macro_rules! define_read { + ($(#[$attr:meta])* $name:ident, $try_name:ident, $type_name:ty) => { + /// Reads IO data from the given offset, known at compile time. + /// + /// If the offset is not known at compile time, the build will fail. + $(#[$attr])* + pub fn $name(&self, offset: usize) -> $type_name { + Self::check_offset::<$type_name>(offset); + let ptr = self.ptr.wrapping_add(offset); + // SAFETY: The type invariants guarantee that `ptr` is a valid pointer. The check above + // guarantees that the code won't build if `offset` makes the read go out of bounds + // (including the type size). 
+ unsafe { bindings::$name(ptr as _) } + } + + /// Reads IO data from the given offset. + /// + /// It fails if/when the offset (plus the type size) is out of bounds. + $(#[$attr])* + pub fn $try_name(&self, offset: usize) -> Result<$type_name> { + if !Self::offset_ok::<$type_name>(offset) { + return Err(EINVAL); + } + let ptr = self.ptr.wrapping_add(offset); + // SAFETY: The type invariants guarantee that `ptr` is a valid pointer. The check above + // returns an error if `offset` would make the read go out of bounds (including the + // type size). + Ok(unsafe { bindings::$name(ptr as _) }) + } + }; +} + +macro_rules! define_write { + ($(#[$attr:meta])* $name:ident, $try_name:ident, $type_name:ty) => { + /// Writes IO data to the given offset, known at compile time. + /// + /// If the offset is not known at compile time, the build will fail. + $(#[$attr])* + pub fn $name(&self, value: $type_name, offset: usize) { + Self::check_offset::<$type_name>(offset); + let ptr = self.ptr.wrapping_add(offset); + // SAFETY: The type invariants guarantee that `ptr` is a valid pointer. The check above + // guarantees that the code won't link if `offset` makes the write go out of bounds + // (including the type size). + unsafe { bindings::$name(value, ptr as _) } + } + + /// Writes IO data to the given offset. + /// + /// It fails if/when the offset (plus the type size) is out of bounds. + $(#[$attr])* + pub fn $try_name(&self, value: $type_name, offset: usize) -> Result { + if !Self::offset_ok::<$type_name>(offset) { + return Err(EINVAL); + } + let ptr = self.ptr.wrapping_add(offset); + // SAFETY: The type invariants guarantee that `ptr` is a valid pointer. The check above + // returns an error if `offset` would make the write go out of bounds (including the + // type size). + unsafe { bindings::$name(value, ptr as _) }; + Ok(()) + } + }; +} + +impl IoMem { + /// Tries to create a new instance of a memory block. 
+ /// + /// The resource described by `res` is mapped into the CPU's address space so that it can be + /// accessed directly. It is also consumed by this function so that it can't be mapped again + /// to a different address. + /// + /// # Safety + /// + /// Callers must ensure that either (a) the resulting interface cannot be used to initiate DMA + /// operations, or (b) that DMA operations initiated via the returned interface use DMA handles + /// allocated through the `dma` module. + pub unsafe fn try_new(res: Resource) -> Result { + // Check that the resource has at least `SIZE` bytes in it. + if res.size < SIZE.try_into()? { + return Err(EINVAL); + } + + // To be able to check pointers at compile time based only on offsets, we need to guarantee + // that the base pointer is minimally aligned. So we conservatively expect at least 8 bytes. + if res.offset % 8 != 0 { + crate::pr_err!("Physical address is not 64-bit aligned: {:x}", res.offset); + return Err(EDOM); + } + + // Try to map the resource. + // SAFETY: Just mapping the memory range. + let addr = unsafe { bindings::ioremap(res.offset, res.size as _) }; + if addr.is_null() { + Err(ENOMEM) + } else { + // INVARIANT: `addr` is non-null and was returned by `ioremap`, so it is valid. It is + // also 8-byte aligned because we checked it above. 
+ Ok(Self { ptr: addr as usize }) + } + } + + const fn offset_ok(offset: usize) -> bool { + let type_size = core::mem::size_of::(); + if let Some(end) = offset.checked_add(type_size) { + end <= SIZE && offset % type_size == 0 + } else { + false + } + } + + fn offset_ok_of_val(offset: usize, value: &T) -> bool { + let value_size = core::mem::size_of_val(value); + let value_alignment = core::mem::align_of_val(value); + if let Some(end) = offset.checked_add(value_size) { + end <= SIZE && offset % value_alignment == 0 + } else { + false + } + } + + const fn check_offset(offset: usize) { + crate::build_assert!(Self::offset_ok::(offset), "IoMem offset overflow"); + } + + /// Copy memory block from an i/o memory by filling the specified buffer with it. + /// + /// # Examples + /// ``` + /// use kernel::io_mem::{self, IoMem, Resource}; + /// + /// fn test(res: Resource) -> Result { + /// // Create an i/o memory block of at least 100 bytes. + /// let mem = unsafe { IoMem::<100>::try_new(res) }?; + /// + /// let mut buffer: [u8; 32] = [0; 32]; + /// + /// // Memcpy 16 bytes from an offset 10 of i/o memory block into the buffer. + /// mem.try_memcpy_fromio(&mut buffer[..16], 10)?; + /// + /// Ok(()) + /// } + /// ``` + pub fn try_memcpy_fromio(&self, buffer: &mut [u8], offset: usize) -> Result { + if !Self::offset_ok_of_val(offset, buffer) { + return Err(EINVAL); + } + + let ptr = self.ptr.wrapping_add(offset); + + // SAFETY: + // - The type invariants guarantee that `ptr` is a valid pointer. + // - The bounds of `buffer` are checked with a call to `offset_ok_of_val()`. 
+ unsafe { + bindings::memcpy_fromio( + buffer.as_mut_ptr() as *mut _, + ptr as *const _, + buffer.len() as _, + ) + }; + Ok(()) + } + + define_read!(readb, try_readb, u8); + define_read!(readw, try_readw, u16); + define_read!(readl, try_readl, u32); + define_read!( + #[cfg(CONFIG_64BIT)] + readq, + try_readq, + u64 + ); + + define_read!(readb_relaxed, try_readb_relaxed, u8); + define_read!(readw_relaxed, try_readw_relaxed, u16); + define_read!(readl_relaxed, try_readl_relaxed, u32); + define_read!( + #[cfg(CONFIG_64BIT)] + readq_relaxed, + try_readq_relaxed, + u64 + ); + + define_write!(writeb, try_writeb, u8); + define_write!(writew, try_writew, u16); + define_write!(writel, try_writel, u32); + define_write!( + #[cfg(CONFIG_64BIT)] + writeq, + try_writeq, + u64 + ); + + define_write!(writeb_relaxed, try_writeb_relaxed, u8); + define_write!(writew_relaxed, try_writew_relaxed, u16); + define_write!(writel_relaxed, try_writel_relaxed, u32); + define_write!( + #[cfg(CONFIG_64BIT)] + writeq_relaxed, + try_writeq_relaxed, + u64 + ); +} + +impl Drop for IoMem { + fn drop(&mut self) { + // SAFETY: By the type invariant, `self.ptr` is a value returned by a previous successful + // call to `ioremap`. + unsafe { bindings::iounmap(self.ptr as _) }; + } +} diff --git a/rust/kernel/iov_iter.rs b/rust/kernel/iov_iter.rs new file mode 100644 index 00000000000000..b9b8dc882bd057 --- /dev/null +++ b/rust/kernel/iov_iter.rs @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! IO vector iterators. +//! +//! C header: [`include/linux/uio.h`](../../../../include/linux/uio.h) + +use crate::{ + bindings, + error::code::*, + io_buffer::{IoBufferReader, IoBufferWriter}, + Result, +}; + +/// Wraps the kernel's `struct iov_iter`. +/// +/// # Invariants +/// +/// The pointer `IovIter::ptr` is non-null and valid. 
+pub struct IovIter { + ptr: *mut bindings::iov_iter, +} + +impl IovIter { + fn common_len(&self) -> usize { + // SAFETY: `IovIter::ptr` is guaranteed to be valid by the type invariants. + unsafe { (*self.ptr).count } + } + + /// Constructs a new [`struct iov_iter`] wrapper. + /// + /// # Safety + /// + /// The pointer `ptr` must be non-null and valid for the lifetime of the object. + pub(crate) unsafe fn from_ptr(ptr: *mut bindings::iov_iter) -> Self { + // INVARIANTS: the safety contract ensures the type invariant will hold. + Self { ptr } + } +} + +impl IoBufferWriter for IovIter { + fn len(&self) -> usize { + self.common_len() + } + + fn clear(&mut self, mut len: usize) -> Result { + while len > 0 { + // SAFETY: `IovIter::ptr` is guaranteed to be valid by the type invariants. + let written = unsafe { bindings::iov_iter_zero(len, self.ptr) }; + if written == 0 { + return Err(EFAULT); + } + + len -= written; + } + Ok(()) + } + + unsafe fn write_raw(&mut self, data: *const u8, len: usize) -> Result { + let res = unsafe { bindings::copy_to_iter(data as _, len, self.ptr) }; + if res != len { + Err(EFAULT) + } else { + Ok(()) + } + } +} + +impl IoBufferReader for IovIter { + fn len(&self) -> usize { + self.common_len() + } + + unsafe fn read_raw(&mut self, out: *mut u8, len: usize) -> Result { + let res = unsafe { bindings::copy_from_iter(out as _, len, self.ptr) }; + if res != len { + Err(EFAULT) + } else { + Ok(()) + } + } +} diff --git a/rust/kernel/irq.rs b/rust/kernel/irq.rs new file mode 100644 index 00000000000000..b1d067de69258d --- /dev/null +++ b/rust/kernel/irq.rs @@ -0,0 +1,411 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Interrupts and interrupt chips. +//! +//! See . +//! +//! C headers: [`include/linux/irq.h`](../../../../include/linux/irq.h) and +//! [`include/linux/interrupt.h`](../../../../include/linux/interrupt.h). 
+ +#![allow(dead_code)] + +use crate::{bindings, c_types, error::from_kernel_result, types::PointerWrapper, Error, Result}; +use core::ops::Deref; + +/// The type of irq hardware numbers. +pub type HwNumber = bindings::irq_hw_number_t; + +/// Wraps the kernel's `struct irq_data`. +/// +/// # Invariants +/// +/// The pointer `IrqData::ptr` is non-null and valid. +pub struct IrqData { + ptr: *mut bindings::irq_data, +} + +impl IrqData { + /// Creates a new `IrqData` instance from a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `ptr` is non-null and valid when the function is called, and that + /// it remains valid for the lifetime of the return [`IrqData`] instance. + unsafe fn from_ptr(ptr: *mut bindings::irq_data) -> Self { + // INVARIANTS: By the safety requirements, the instance we're creating satisfies the type + // invariants. + Self { ptr } + } + + /// Returns the hardware irq number. + pub fn hwirq(&self) -> HwNumber { + // SAFETY: By the type invariants, it's ok to dereference `ptr`. + unsafe { (*self.ptr).hwirq } + } +} + +/// Wraps the kernel's `struct irq_data` when it is locked. +/// +/// Being locked allows additional operations to be performed on the data. +pub struct LockedIrqData(IrqData); + +impl LockedIrqData { + /// Sets the high-level irq flow handler to the builtin one for level-triggered irqs. + pub fn set_level_handler(&mut self) { + // SAFETY: By the type invariants of `self.0`, we know `self.0.ptr` is valid. + unsafe { bindings::irq_set_handler_locked(self.0.ptr, Some(bindings::handle_level_irq)) }; + } + + /// Sets the high-level irq flow handler to the builtin one for edge-triggered irqs. + pub fn set_edge_handler(&mut self) { + // SAFETY: By the type invariants of `self.0`, we know `self.0.ptr` is valid. + unsafe { bindings::irq_set_handler_locked(self.0.ptr, Some(bindings::handle_edge_irq)) }; + } + + /// Sets the high-level irq flow handler to the builtin one for bad irqs. 
+ pub fn set_bad_handler(&mut self) { + // SAFETY: By the type invariants of `self.0`, we know `self.0.ptr` is valid. + unsafe { bindings::irq_set_handler_locked(self.0.ptr, Some(bindings::handle_bad_irq)) }; + } +} + +impl Deref for LockedIrqData { + type Target = IrqData; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// Extra information returned by some of the [`Chip`] methods on success. +pub enum ExtraResult { + /// Indicates that the caller (irq core) will update the descriptor state. + None = bindings::IRQ_SET_MASK_OK as _, + + /// Indicates that the callee (irq chip implementation) already updated the descriptor state. + NoCopy = bindings::IRQ_SET_MASK_OK_NOCOPY as _, + + /// Same as [`ExtraResult::None`] in terms of updating descriptor state. It is used in stacked + /// irq chips to indicate that descendant chips should be skipped. + Done = bindings::IRQ_SET_MASK_OK_DONE as _, +} + +/// An irq chip. +/// +/// It is a trait for the functions defined in [`struct irq_chip`]. +/// +/// [`struct irq_chip`]: ../../../include/linux/irq.h +pub trait Chip: Sized { + /// The type of the context data stored in the irq chip and made available on each callback. + type Data: PointerWrapper; + + /// The methods to use to populate [`struct irq_chip`]. This is typically populated with + /// [`declare_irq_chip_operations`]. + const TO_USE: ToUse; + + /// Called at the start of a new interrupt. + fn ack(data: ::Borrowed<'_>, irq_data: &IrqData); + + /// Masks an interrupt source. + fn mask(data: ::Borrowed<'_>, irq_data: &IrqData); + + /// Unmasks an interrupt source. + fn unmask(_data: ::Borrowed<'_>, irq_data: &IrqData); + + /// Sets the flow type of an interrupt. + /// + /// The flow type is a combination of the constants in [`Type`]. + fn set_type( + _data: ::Borrowed<'_>, + _irq_data: &mut LockedIrqData, + _flow_type: u32, + ) -> Result { + Ok(ExtraResult::None) + } + + /// Enables or disables power-management wake-on of an interrupt. 
+ fn set_wake( + _data: ::Borrowed<'_>, + _irq_data: &IrqData, + _on: bool, + ) -> Result { + Ok(()) + } +} + +/// Initialises `chip` with the callbacks defined in `T`. +/// +/// # Safety +/// +/// The caller must ensure that the value stored in the irq chip data is the result of calling +/// [`PointerWrapper::into_pointer`] for the [`T::Data`] type. +pub(crate) unsafe fn init_chip(chip: &mut bindings::irq_chip) { + chip.irq_ack = Some(irq_ack_callback::); + chip.irq_mask = Some(irq_mask_callback::); + chip.irq_unmask = Some(irq_unmask_callback::); + + if T::TO_USE.set_type { + chip.irq_set_type = Some(irq_set_type_callback::); + } + + if T::TO_USE.set_wake { + chip.irq_set_wake = Some(irq_set_wake_callback::); + } +} + +/// Represents which fields of [`struct irq_chip`] should be populated with pointers. +/// +/// This is typically populated with the [`declare_irq_chip_operations`] macro. +pub struct ToUse { + /// The `irq_set_type` field of [`struct irq_chip`]. + pub set_type: bool, + + /// The `irq_set_wake` field of [`struct irq_chip`]. + pub set_wake: bool, +} + +/// A constant version where all values are set to `false`, that is, all supported fields will +/// be set to null pointers. +pub const USE_NONE: ToUse = ToUse { + set_type: false, + set_wake: false, +}; + +/// Defines the [`Chip::TO_USE`] field based on a list of fields to be populated. +#[macro_export] +macro_rules! declare_irq_chip_operations { + () => { + const TO_USE: $crate::irq::ToUse = $crate::irq::USE_NONE; + }; + ($($i:ident),+) => { + #[allow(clippy::needless_update)] + const TO_USE: $crate::irq::ToUse = + $crate::irq::ToUse { + $($i: true),+ , + ..$crate::irq::USE_NONE + }; + }; +} + +/// Enables or disables power-management wake-on for the given irq number. +pub fn set_wake(irq: u32, on: bool) -> Result { + // SAFETY: Just an FFI call, there are no extra requirements for safety.
+ let ret = unsafe { bindings::irq_set_irq_wake(irq, on as _) }; + if ret < 0 { + Err(Error::from_kernel_errno(ret)) + } else { + Ok(()) + } +} + +unsafe extern "C" fn irq_ack_callback(irq_data: *mut bindings::irq_data) { + // SAFETY: The safety requirements of `init_chip`, which is the only place that uses this + // callback, ensure that the value stored as irq chip data comes from a previous call to + // `PointerWrapper::into_pointer`. + let data = unsafe { T::Data::borrow(bindings::irq_data_get_irq_chip_data(irq_data)) }; + + // SAFETY: The value returned by `IrqData` is only valid until the end of this function, and + // `irq_data` is guaranteed to be valid until then (by the contract with C code). + T::ack(data, unsafe { &IrqData::from_ptr(irq_data) }) +} + +unsafe extern "C" fn irq_mask_callback(irq_data: *mut bindings::irq_data) { + // SAFETY: The safety requirements of `init_chip`, which is the only place that uses this + // callback, ensure that the value stored as irq chip data comes from a previous call to + // `PointerWrapper::into_pointer`. + let data = unsafe { T::Data::borrow(bindings::irq_data_get_irq_chip_data(irq_data)) }; + + // SAFETY: The value returned by `IrqData` is only valid until the end of this function, and + // `irq_data` is guaranteed to be valid until then (by the contract with C code). + T::mask(data, unsafe { &IrqData::from_ptr(irq_data) }) +} + +unsafe extern "C" fn irq_unmask_callback(irq_data: *mut bindings::irq_data) { + // SAFETY: The safety requirements of `init_chip`, which is the only place that uses this + // callback, ensure that the value stored as irq chip data comes from a previous call to + // `PointerWrapper::into_pointer`. + let data = unsafe { T::Data::borrow(bindings::irq_data_get_irq_chip_data(irq_data)) }; + + // SAFETY: The value returned by `IrqData` is only valid until the end of this function, and + // `irq_data` is guaranteed to be valid until then (by the contract with C code). 
+ T::unmask(data, unsafe { &IrqData::from_ptr(irq_data) }) +} + +unsafe extern "C" fn irq_set_type_callback( + irq_data: *mut bindings::irq_data, + flow_type: c_types::c_uint, +) -> c_types::c_int { + from_kernel_result! { + // SAFETY: The safety requirements of `init_chip`, which is the only place that uses this + // callback, ensure that the value stored as irq chip data comes from a previous call to + // `PointerWrapper::into_pointer`. + let data = unsafe { T::Data::borrow(bindings::irq_data_get_irq_chip_data(irq_data)) }; + + // SAFETY: The value returned by `IrqData` is only valid until the end of this function, and + // `irq_data` is guaranteed to be valid until then (by the contract with C code). + let ret = T::set_type(data, &mut LockedIrqData(unsafe { IrqData::from_ptr(irq_data) }), flow_type)?; + Ok(ret as _) + } +} + +unsafe extern "C" fn irq_set_wake_callback( + irq_data: *mut bindings::irq_data, + on: c_types::c_uint, +) -> c_types::c_int { + from_kernel_result! { + // SAFETY: The safety requirements of `init_chip`, which is the only place that uses this + // callback, ensure that the value stored as irq chip data comes from a previous call to + // `PointerWrapper::into_pointer`. + let data = unsafe { T::Data::borrow(bindings::irq_data_get_irq_chip_data(irq_data)) }; + + // SAFETY: The value returned by `IrqData` is only valid until the end of this function, and + // `irq_data` is guaranteed to be valid until then (by the contract with C code). + T::set_wake(data, unsafe { &IrqData::from_ptr(irq_data) }, on != 0)?; + Ok(0) + } +} + +/// Contains constants that describe how an interrupt can be triggered. +/// +/// It is tagged with `non_exhaustive` to prevent users from instantiating it. +#[non_exhaustive] +pub struct Type; + +impl Type { + /// The interrupt cannot be triggered. + pub const NONE: u32 = bindings::IRQ_TYPE_NONE; + + /// The interrupt is triggered when the signal goes from low to high.
+ pub const EDGE_RISING: u32 = bindings::IRQ_TYPE_EDGE_RISING; + + /// The interrupt is triggered when the signal goes from high to low. + pub const EDGE_FALLING: u32 = bindings::IRQ_TYPE_EDGE_FALLING; + + /// The interrupt is triggered when the signal goes from low to high and when it goes from high + /// to low. + pub const EDGE_BOTH: u32 = bindings::IRQ_TYPE_EDGE_BOTH; + + /// The interrupt is triggered while the signal is held high. + pub const LEVEL_HIGH: u32 = bindings::IRQ_TYPE_LEVEL_HIGH; + + /// The interrupt is triggered while the signal is held low. + pub const LEVEL_LOW: u32 = bindings::IRQ_TYPE_LEVEL_LOW; +} + +/// Wraps the kernel's `struct irq_desc`. +/// +/// # Invariants +/// +/// The pointer `Descriptor::ptr` is non-null and valid. +pub struct Descriptor { + pub(crate) ptr: *mut bindings::irq_desc, +} + +impl Descriptor { + /// Constructs a new `struct irq_desc` wrapper. + /// + /// # Safety + /// + /// The pointer `ptr` must be non-null and valid for the lifetime of the returned object. + unsafe fn from_ptr(ptr: *mut bindings::irq_desc) -> Self { + // INVARIANT: The safety requirements ensure the invariant. + Self { ptr } + } + + /// Calls `chained_irq_enter` and returns a guard that calls `chained_irq_exit` once dropped. + /// + /// It is meant to be used by chained irq handlers to dispatch irqs to the next handlers. + pub fn enter_chained(&self) -> ChainedGuard<'_> { + // SAFETY: By the type invariants, `ptr` is always non-null and valid. + let irq_chip = unsafe { bindings::irq_desc_get_chip(self.ptr) }; + + // SAFETY: By the type invariants, `ptr` is always non-null and valid. `irq_chip` was just + // returned from `ptr`, so it is still valid too. + unsafe { bindings::chained_irq_enter(irq_chip, self.ptr) }; + ChainedGuard { + desc: self, + irq_chip, + } + } +} + +/// A guard to call `chained_irq_exit` after `chained_irq_enter` was called. +/// +/// It is also used as evidence that a previous `chained_irq_enter` was called.
So there are no +/// public constructors and it is only created after indeed calling `chained_irq_enter`. +pub struct ChainedGuard<'a> { + desc: &'a Descriptor, + irq_chip: *mut bindings::irq_chip, +} + +impl Drop for ChainedGuard<'_> { + fn drop(&mut self) { + // SAFETY: The lifetime of `ChainedGuard` guarantees that `self.desc` remains valid, so it + // also guarantees `irq_chip` (which was returned from it) and `self.desc.ptr` (guaranteed + // by the type invariants). + unsafe { bindings::chained_irq_exit(self.irq_chip, self.desc.ptr) }; + } +} + +/// Wraps the kernel's `struct irq_domain`. +/// +/// # Invariants +/// +/// The pointer `Domain::ptr` is non-null and valid. +#[cfg(CONFIG_IRQ_DOMAIN)] +pub struct Domain { + ptr: *mut bindings::irq_domain, +} + +#[cfg(CONFIG_IRQ_DOMAIN)] +impl Domain { + /// Constructs a new `struct irq_domain` wrapper. + /// + /// # Safety + /// + /// The pointer `ptr` must be non-null and valid for the lifetime of the returned object. + pub(crate) unsafe fn from_ptr(ptr: *mut bindings::irq_domain) -> Self { + // INVARIANT: The safety requirements ensure the invariant. + Self { ptr } + } + + /// Invokes the chained handler of the given hw irq of the given domain. + /// + /// It requires evidence that `chained_irq_enter` was called, which is done by passing a + /// `ChainedGuard` instance. + pub fn generic_handle_chained(&self, hwirq: u32, _guard: &ChainedGuard<'_>) { + // SAFETY: `ptr` is valid by the type invariants. + unsafe { bindings::generic_handle_domain_irq(self.ptr, hwirq) }; + } +} + +/// A high-level irq flow handler. +pub trait FlowHandler { + /// The data associated with the handler. + type Data: PointerWrapper; + + /// Implements the irq flow for the given descriptor. + fn handle_irq_flow(data: ::Borrowed<'_>, desc: &Descriptor); +} + +/// Returns the raw irq flow handler corresponding to the (high-level) one defined in `T`.
+/// +/// # Safety +/// +/// The caller must ensure that the value stored in the irq handler data (as returned by +/// `irq_desc_get_handler_data`) is the result of calling [`PointerWrapper::into_pointer`] for the +/// [`T::Data`] type. +pub(crate) unsafe fn new_flow_handler() -> bindings::irq_flow_handler_t { + Some(irq_flow_handler::) +} + +unsafe extern "C" fn irq_flow_handler(desc: *mut bindings::irq_desc) { + // SAFETY: By the safety requirements of `new_flow_handler`, we know that the value returned by + // `irq_desc_get_handler_data` comes from calling `T::Data::into_pointer`. `desc` is valid by + // the C API contract. + let data = unsafe { T::Data::borrow(bindings::irq_desc_get_handler_data(desc)) }; + + // SAFETY: The C API guarantees that `desc` is valid for the duration of this call, which + // outlives the `Descriptor` returned by `from_ptr`. + T::handle_irq_flow(data, &unsafe { Descriptor::from_ptr(desc) }); +} diff --git a/rust/kernel/kasync.rs b/rust/kernel/kasync.rs new file mode 100644 index 00000000000000..4b57116bebc57a --- /dev/null +++ b/rust/kernel/kasync.rs @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Kernel async functionality. + +#[cfg(CONFIG_NET)] +pub mod net; diff --git a/rust/kernel/kasync/net.rs b/rust/kernel/kasync/net.rs new file mode 100644 index 00000000000000..f7d15559e73840 --- /dev/null +++ b/rust/kernel/kasync/net.rs @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Async networking. + +use crate::{bindings, c_types, error::code::*, net, sync::NoWaitLock, types::Opaque, Result}; +use core::{ + future::Future, + marker::{PhantomData, PhantomPinned}, + ops::Deref, + pin::Pin, + task::{Context, Poll, Waker}, +}; + +/// A socket listening on a TCP port. +/// +/// The [`TcpListener::accept`] method is meant to be used in async contexts. +pub struct TcpListener { + listener: net::TcpListener, +} + +impl TcpListener { + /// Creates a new TCP listener.
+ /// + /// It is configured to listen on the given socket address for the given namespace. + pub fn try_new(ns: &net::Namespace, addr: &net::SocketAddr) -> Result { + Ok(Self { + listener: net::TcpListener::try_new(ns, addr)?, + }) + } + + /// Accepts a new connection. + /// + /// Returns a future that when ready indicates the result of the accept operation; on success, + /// it contains the newly-accepted tcp stream. + pub fn accept(&self) -> impl Future> + '_ { + SocketFuture::from_listener( + self, + bindings::BINDINGS_EPOLLIN | bindings::BINDINGS_EPOLLERR, + || { + Ok(TcpStream { + stream: self.listener.accept(false)?, + }) + }, + ) + } +} + +impl Deref for TcpListener { + type Target = net::TcpListener; + + fn deref(&self) -> &Self::Target { + &self.listener + } +} + +/// A connected TCP socket. +/// +/// The potentially blocking methods (e.g., [`TcpStream::read`], [`TcpStream::write`]) are meant +/// to be used in async contexts. +/// +/// # Examples +/// +/// ``` +/// # use kernel::prelude::*; +/// # use kernel::kasync::net::TcpStream; +/// async fn echo_server(stream: TcpStream) -> Result { +/// let mut buf = [0u8; 1024]; +/// loop { +/// let n = stream.read(&mut buf).await?; +/// if n == 0 { +/// return Ok(()); +/// } +/// stream.write_all(&buf[..n]).await?; +/// } +/// } +/// ``` +pub struct TcpStream { + stream: net::TcpStream, +} + +impl TcpStream { + /// Reads data from a connected socket. + /// + /// Returns a future that when ready indicates the result of the read operation; on success, it + /// contains the number of bytes read, which will be zero if the connection is closed. + pub fn read<'a>(&'a self, buf: &'a mut [u8]) -> impl Future> + 'a { + SocketFuture::from_stream( + self, + bindings::BINDINGS_EPOLLIN | bindings::BINDINGS_EPOLLHUP | bindings::BINDINGS_EPOLLERR, + || self.stream.read(buf, false), + ) + } + + /// Writes data to the connected socket. 
+ /// + /// Returns a future that when ready indicates the result of the write operation; on success, it + /// contains the number of bytes written. + pub fn write<'a>(&'a self, buf: &'a [u8]) -> impl Future> + 'a { + SocketFuture::from_stream( + self, + bindings::BINDINGS_EPOLLOUT | bindings::BINDINGS_EPOLLHUP | bindings::BINDINGS_EPOLLERR, + || self.stream.write(buf, false), + ) + } + + /// Writes all the data to the connected socket. + /// + /// Returns a future that when ready indicates the result of the write operation; on success, it + /// has written all the data. + pub async fn write_all<'a>(&'a self, buf: &'a [u8]) -> Result { + let mut rem = buf; + + while !rem.is_empty() { + let n = self.write(rem).await?; + rem = &rem[n..]; + } + + Ok(()) + } +} + +impl Deref for TcpStream { + type Target = net::TcpStream; + + fn deref(&self) -> &Self::Target { + &self.stream + } +} + +/// A future for a socket operation. +/// +/// # Invariants +/// +/// `sock` is always non-null and valid for the duration of the lifetime of the instance. +struct SocketFuture<'a, Out, F: FnMut() -> Result + Send + 'a> { + sock: *mut bindings::socket, + mask: u32, + is_queued: bool, + wq_entry: Opaque, + waker: NoWaitLock>, + _p: PhantomData<&'a ()>, + _pin: PhantomPinned, + operation: F, +} + +// SAFETY: A kernel socket can be used from any thread, `wq_entry` is only used on drop and when +// `is_queued` is initially `false`. +unsafe impl Result + Send> Send for SocketFuture<'_, Out, F> {} + +impl<'a, Out, F: FnMut() -> Result + Send + 'a> SocketFuture<'a, Out, F> { + /// Creates a new socket future. + /// + /// # Safety + /// + /// Callers must ensure that `sock` is non-null, valid, and remains valid for the lifetime + /// (`'a`) of the returned instance. 
+ unsafe fn new(sock: *mut bindings::socket, mask: u32, operation: F) -> Self { + Self { + sock, + mask, + is_queued: false, + wq_entry: Opaque::uninit(), + waker: NoWaitLock::new(None), + operation, + _p: PhantomData, + _pin: PhantomPinned, + } + } + + /// Creates a new socket future for a tcp listener. + fn from_listener(listener: &'a TcpListener, mask: u32, operation: F) -> Self { + // SAFETY: The socket is guaranteed to remain valid because it is bound to the reference to + // the listener (whose existence guarantees the socket remains valid). + unsafe { Self::new(listener.listener.sock, mask, operation) } + } + + /// Creates a new socket future for a tcp stream. + fn from_stream(stream: &'a TcpStream, mask: u32, operation: F) -> Self { + // SAFETY: The socket is guaranteed to remain valid because it is bound to the reference to + // the stream (whose existence guarantees the socket remains valid). + unsafe { Self::new(stream.stream.sock, mask, operation) } + } + + /// Callback called when the socket changes state. + /// + /// If the state matches the one we're waiting on, we wake up the task so that the future can be + /// polled again. + unsafe extern "C" fn wake_callback( + wq_entry: *mut bindings::wait_queue_entry, + _mode: c_types::c_uint, + _flags: c_types::c_int, + key: *mut c_types::c_void, + ) -> c_types::c_int { + let mask = key as u32; + + // SAFETY: The future is valid while this callback is called because we remove from the + // queue on drop. + // + // There is a potential soundness issue here because we're generating a shared reference to + // `Self` while `Self::poll` has a mutable (unique) reference. However, for `!Unpin` types + // (like `Self`), `&mut T` is treated as `*mut T` per + // https://github.com/rust-lang/rust/issues/63818 -- so we avoid the unsoundness. Once a + // more definitive solution is available, we can change this to use it. 
+ let s = unsafe { &*crate::container_of!(wq_entry, Self, wq_entry) }; + if mask & s.mask == 0 { + // Nothing to do as this notification doesn't interest us. + return 0; + } + + // If we can't acquire the waker lock, the waker is in the process of being modified. Our + // attempt to acquire the lock will be reported to the lock owner, so it will trigger the + // wake up. + if let Some(guard) = s.waker.try_lock() { + if let Some(ref w) = *guard { + let cloned = w.clone(); + drop(guard); + cloned.wake(); + return 1; + } + } + 0 + } + + /// Poll the future once. + /// + /// It calls the operation and converts `EAGAIN` errors into a pending state. + fn poll_once(self: Pin<&mut Self>) -> Poll> { + // SAFETY: We never move out of `this`. + let this = unsafe { self.get_unchecked_mut() }; + match (this.operation)() { + Ok(s) => Poll::Ready(Ok(s)), + Err(e) => { + if e == EAGAIN { + Poll::Pending + } else { + Poll::Ready(Err(e)) + } + } + } + } + + /// Updates the waker stored in the future. + /// + /// It automatically triggers a wake up on races with the reactor. + fn set_waker(&self, waker: &Waker) { + if let Some(mut guard) = self.waker.try_lock() { + let old = core::mem::replace(&mut *guard, Some(waker.clone())); + let contention = guard.unlock(); + drop(old); + if !contention { + return; + } + } + + // We either couldn't store the waker because the existing one is being awakened, or the + // reactor tried to acquire the lock while we held it (contention). In either case, we just + // wake it up to ensure we don't miss any notification. + waker.wake_by_ref(); + } +} + +impl Result + Send> Future for SocketFuture<'_, Out, F> { + type Output = Result; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + match self.as_mut().poll_once() { + Poll::Ready(r) => Poll::Ready(r), + Poll::Pending => { + // Store away the latest waker every time we may `Pending`. 
+ self.set_waker(cx.waker()); + if self.is_queued { + // Nothing else to do as the waiter is already queued. + return Poll::Pending; + } + + // SAFETY: We never move out of `this`. + let this = unsafe { self.as_mut().get_unchecked_mut() }; + + this.is_queued = true; + + // SAFETY: `wq_entry` is valid for write. + unsafe { + bindings::init_waitqueue_func_entry( + this.wq_entry.get(), + Some(Self::wake_callback), + ) + }; + + // SAFETY: `wq_entry` was just initialised above and is valid for read/write. + // By the type invariants, the socket is always valid. + unsafe { + bindings::add_wait_queue( + core::ptr::addr_of_mut!((*this.sock).wq.wait), + this.wq_entry.get(), + ) + }; + + // If the future wasn't queued yet, we need to poll again in case it reached + // the desired state between the last poll and being queued (in which case we + // would have missed the notification). + self.poll_once() + } + } + } +} + +impl Result + Send> Drop for SocketFuture<'_, Out, F> { + fn drop(&mut self) { + if !self.is_queued { + return; + } + + // SAFETY: `wq_entry` is initialised because `is_queued` is set to `true`, so it is valid + // for read/write. By the type invariants, the socket is always valid. + unsafe { + bindings::remove_wait_queue( + core::ptr::addr_of_mut!((*self.sock).wq.wait), + self.wq_entry.get(), + ) + }; + } +} diff --git a/rust/kernel/kunit.rs b/rust/kernel/kunit.rs new file mode 100644 index 00000000000000..5f3e102962c3b8 --- /dev/null +++ b/rust/kernel/kunit.rs @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! KUnit-based macros for Rust unit tests. +//! +//! C header: [`include/kunit/test.h`](../../../../../include/kunit/test.h) +//! +//! Reference: + +/// Asserts that a boolean expression is `true` at runtime. +/// +/// Public but hidden since it should only be used from generated tests. +/// +/// Unlike the one in `core`, this one does not panic; instead, it is mapped to the KUnit +/// facilities. See [`assert!`] for more details.
+#[doc(hidden)] +#[macro_export] +macro_rules! kunit_assert { + ($test:expr, $cond:expr $(,)?) => {{ + if !$cond { + #[repr(transparent)] + struct Location($crate::bindings::kunit_loc); + + #[repr(transparent)] + struct UnaryAssert($crate::bindings::kunit_unary_assert); + + // SAFETY: There is only a static instance and in that one the pointer field + // points to an immutable C string. + unsafe impl Sync for Location {} + + // SAFETY: There is only a static instance and in that one the pointer field + // points to an immutable C string. + unsafe impl Sync for UnaryAssert {} + + static FILE: &'static $crate::str::CStr = $crate::c_str!(core::file!()); + static LOCATION: Location = Location($crate::bindings::kunit_loc { + file: FILE.as_char_ptr(), + line: core::line!() as i32, + }); + static CONDITION: &'static $crate::str::CStr = $crate::c_str!(stringify!($cond)); + static ASSERTION: UnaryAssert = UnaryAssert($crate::bindings::kunit_unary_assert { + assert: $crate::bindings::kunit_assert { + format: Some($crate::bindings::kunit_unary_assert_format), + }, + condition: CONDITION.as_char_ptr(), + expected_true: true, + }); + + // SAFETY: + // - FFI call. + // - The `test` pointer is valid because this hidden macro should only be called by + // the generated documentation tests which forward the test pointer given by KUnit. + // - The string pointers (`file` and `condition`) point to null-terminated ones. + // - The function pointer (`format`) points to the proper function. + // - The pointers passed will remain valid since they point to statics. + // - The format string is allowed to be null. + // - There are, however, problems with this: first of all, this will end up stopping + // the thread, without running destructors. While that is problematic in itself, + // it is considered UB to have what is effectively a forced foreign unwind + // with `extern "C"` ABI. One could observe the stack that is now gone from + // another thread.
We should avoid pinning stack variables to prevent library UB, + // too. For the moment, given test failures are reported immediately before the + // next test runs, that test failures should be fixed and that KUnit is explicitly + // documented as not suitable for production environments, we feel it is reasonable. + unsafe { + $crate::bindings::kunit_do_failed_assertion( + $test, + core::ptr::addr_of!(LOCATION.0), + $crate::bindings::kunit_assert_type_KUNIT_ASSERTION, + core::ptr::addr_of!(ASSERTION.0.assert), + core::ptr::null(), + ); + } + } + }}; +} + +/// Asserts that two expressions are equal to each other (using [`PartialEq`]). +/// +/// Public but hidden since it should only be used from generated tests. +/// +/// Unlike the one in `core`, this one does not panic; instead, it is mapped to the KUnit +/// facilities. See [`assert!`] for more details. +#[doc(hidden)] +#[macro_export] +macro_rules! kunit_assert_eq { + ($test:expr, $left:expr, $right:expr $(,)?) => {{ + // For the moment, we just forward to the expression assert because, + // for binary asserts, KUnit supports only a few types (e.g. integers). + $crate::kunit_assert!($test, $left == $right); + }}; +} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs new file mode 100644 index 00000000000000..3e01c30de6704a --- /dev/null +++ b/rust/kernel/lib.rs @@ -0,0 +1,261 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! The `kernel` crate. +//! +//! This crate contains the kernel APIs that have been ported or wrapped for +//! usage by Rust code in the kernel and is shared by all of them. +//! +//! In other words, all the rest of the Rust code in the kernel (e.g. kernel +//! modules written in Rust) depends on [`core`], [`alloc`] and this crate. +//! +//! If you need a kernel C API that is not ported or wrapped yet here, then +//! do so first instead of bypassing this crate. 
+ +#![no_std] +#![feature(allocator_api)] +#![feature(associated_type_defaults)] +#![feature(concat_idents)] +#![feature(const_fn_trait_bound)] +#![feature(const_mut_refs)] +#![feature(const_ptr_offset_from)] +#![feature(const_refs_to_cell)] +#![feature(const_trait_impl)] +#![feature(doc_cfg)] +#![feature(generic_associated_types)] +#![feature(ptr_metadata)] +#![feature(receiver_trait)] +#![feature(coerce_unsized)] +#![feature(dispatch_from_dyn)] +#![feature(unsize)] + +// Ensure conditional compilation based on the kernel configuration works; +// otherwise we may silently break things like initcall handling. +#[cfg(not(CONFIG_RUST))] +compile_error!("Missing kernel configuration for conditional compilation"); + +#[cfg(not(test))] +#[cfg(not(testlib))] +mod allocator; + +#[doc(hidden)] +pub mod bindings; + +#[cfg(CONFIG_ARM_AMBA)] +pub mod amba; +pub mod c_types; +pub mod chrdev; +#[cfg(CONFIG_COMMON_CLK)] +pub mod clk; +pub mod cred; +pub mod device; +pub mod driver; +pub mod error; +pub mod file; +pub mod gpio; +pub mod hwrng; +pub mod irq; +pub mod kasync; +pub mod miscdev; +pub mod mm; +#[cfg(CONFIG_NET)] +pub mod net; +pub mod pages; +pub mod power; +pub mod revocable; +pub mod security; +pub mod str; +pub mod task; + +pub mod linked_list; +mod raw_list; +pub mod rbtree; + +#[doc(hidden)] +pub mod module_param; + +mod build_assert; +pub mod prelude; +pub mod print; +pub mod random; +mod static_assert; +#[doc(hidden)] +pub mod std_vendor; +pub mod sync; + +#[cfg(any(CONFIG_SYSCTL, doc))] +#[doc(cfg(CONFIG_SYSCTL))] +pub mod sysctl; + +pub mod io_buffer; +#[cfg(CONFIG_HAS_IOMEM)] +pub mod io_mem; +pub mod iov_iter; +pub mod of; +pub mod platform; +mod types; +pub mod user_ptr; + +#[cfg(CONFIG_KUNIT)] +pub mod kunit; + +#[doc(hidden)] +pub use build_error::build_error; + +pub use crate::error::{to_result, Error, Result}; +pub use crate::types::{ + bit, bits_iter, ARef, AlwaysRefCounted, Bool, False, Mode, Opaque, ScopeGuard, True, +}; + +use 
core::marker::PhantomData; + +/// Page size defined in terms of the `PAGE_SHIFT` macro from C. +/// +/// [`PAGE_SHIFT`]: ../../../include/asm-generic/page.h +pub const PAGE_SIZE: usize = 1 << bindings::PAGE_SHIFT; + +/// Prefix to appear before log messages printed from within the kernel crate. +const __LOG_PREFIX: &[u8] = b"rust_kernel\0"; + +/// The top level entrypoint to implementing a kernel module. +/// +/// For any teardown or cleanup operations, your type may implement [`Drop`]. +pub trait Module: Sized + Sync { + /// Called at module initialization time. + /// + /// Use this method to perform whatever setup or registration your module + /// should do. + /// + /// Equivalent to the `module_init` macro in the C API. + fn init(name: &'static str::CStr, module: &'static ThisModule) -> Result; +} + +/// Equivalent to `THIS_MODULE` in the C API. +/// +/// C header: `include/linux/export.h` +pub struct ThisModule(*mut bindings::module); + +// SAFETY: `THIS_MODULE` may be used from all threads within a module. +unsafe impl Sync for ThisModule {} + +impl ThisModule { + /// Creates a [`ThisModule`] given the `THIS_MODULE` pointer. + /// + /// # Safety + /// + /// The pointer must be equal to the right `THIS_MODULE`. + pub const unsafe fn from_ptr(ptr: *mut bindings::module) -> ThisModule { + ThisModule(ptr) + } + + /// Locks the module parameters to access them. + /// + /// Returns a [`KParamGuard`] that will release the lock when dropped. + pub fn kernel_param_lock(&self) -> KParamGuard<'_> { + // SAFETY: `kernel_param_lock` will check if the pointer is null and + // use the built-in mutex in that case. + #[cfg(CONFIG_SYSFS)] + unsafe { + bindings::kernel_param_lock(self.0) + } + + KParamGuard { + #[cfg(CONFIG_SYSFS)] + this_module: self, + phantom: PhantomData, + } + } +} + +/// Scoped lock on the kernel parameters of [`ThisModule`]. +/// +/// Lock will be released when this struct is dropped. 
+pub struct KParamGuard<'a> { + #[cfg(CONFIG_SYSFS)] + this_module: &'a ThisModule, + phantom: PhantomData<&'a ()>, +} + +#[cfg(CONFIG_SYSFS)] +impl<'a> Drop for KParamGuard<'a> { + fn drop(&mut self) { + // SAFETY: `kernel_param_lock` will check if the pointer is null and + // use the built-in mutex in that case. The existence of `self` + // guarantees that the lock is held. + unsafe { bindings::kernel_param_unlock(self.this_module.0) } + } +} + +/// Calculates the offset of a field from the beginning of the struct it belongs to. +/// +/// # Example +/// +/// ``` +/// # use kernel::prelude::*; +/// # use kernel::offset_of; +/// struct Test { +/// a: u64, +/// b: u32, +/// } +/// +/// assert_eq!(offset_of!(Test, b), 8); +/// ``` +#[macro_export] +macro_rules! offset_of { + ($type:ty, $($f:tt)*) => {{ + let tmp = core::mem::MaybeUninit::<$type>::uninit(); + let outer = tmp.as_ptr(); + // To avoid warnings when nesting `unsafe` blocks. + #[allow(unused_unsafe)] + // SAFETY: The pointer is valid and aligned, just not initialised; `addr_of` ensures that + // we don't actually read from `outer` (which would be UB) nor create an intermediate + // reference. + let inner = unsafe { core::ptr::addr_of!((*outer).$($f)*) } as *const u8; + // To avoid warnings when nesting `unsafe` blocks. + #[allow(unused_unsafe)] + // SAFETY: The two pointers are within the same allocation block. + unsafe { inner.offset_from(outer as *const u8) } + }} +} + +/// Produces a pointer to an object from a pointer to one of its fields. +/// +/// # Safety +/// +/// Callers must ensure that the pointer to the field is in fact a pointer to the specified field, +/// as opposed to a pointer to another object of the same type. If this condition is not met, +/// any dereference of the resulting pointer is UB. 
+/// +/// # Example +/// +/// ``` +/// # use kernel::container_of; +/// struct Test { +/// a: u64, +/// b: u32, +/// } +/// +/// let test = Test { a: 10, b: 20 }; +/// let b_ptr = &test.b; +/// let test_alias = container_of!(b_ptr, Test, b); +/// assert!(core::ptr::eq(&test, test_alias)); +/// ``` +#[macro_export] +macro_rules! container_of { + ($ptr:expr, $type:ty, $($f:tt)*) => {{ + let ptr = $ptr as *const _ as *const u8; + let offset = $crate::offset_of!($type, $($f)*); + ptr.wrapping_offset(-offset) as *const $type + }} +} + +#[cfg(not(any(testlib, test)))] +#[panic_handler] +fn panic(info: &core::panic::PanicInfo<'_>) -> ! { + pr_emerg!("{}\n", info); + // SAFETY: FFI call. + unsafe { bindings::BUG() }; + // Bindgen currently does not recognize `__noreturn` so `BUG` returns `()` + // instead of `!`. + // https://github.com/rust-lang/rust-bindgen/issues/2094 + loop {} +} diff --git a/rust/kernel/linked_list.rs b/rust/kernel/linked_list.rs new file mode 100644 index 00000000000000..3330edcc7ca8d4 --- /dev/null +++ b/rust/kernel/linked_list.rs @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Linked lists. +//! +//! TODO: This module is a work in progress. + +use alloc::boxed::Box; +use core::ptr::NonNull; + +pub use crate::raw_list::{Cursor, GetLinks, Links}; +use crate::{raw_list, raw_list::RawList, sync::Ref}; + +// TODO: Use the one from `kernel::file_operations::PointerWrapper` instead. +/// Wraps an object to be inserted in a linked list. +pub trait Wrapper { + /// Converts the wrapped object into a pointer that represents it. + fn into_pointer(self) -> NonNull; + + /// Converts the object back from the pointer representation. + /// + /// # Safety + /// + /// The passed pointer must come from a previous call to [`Wrapper::into_pointer()`]. + unsafe fn from_pointer(ptr: NonNull) -> Self; + + /// Returns a reference to the wrapped object. 
+ fn as_ref(&self) -> &T; +} + +impl Wrapper for Box { + fn into_pointer(self) -> NonNull { + NonNull::new(Box::into_raw(self)).unwrap() + } + + unsafe fn from_pointer(ptr: NonNull) -> Self { + unsafe { Box::from_raw(ptr.as_ptr()) } + } + + fn as_ref(&self) -> &T { + AsRef::as_ref(self) + } +} + +impl Wrapper for Ref { + fn into_pointer(self) -> NonNull { + NonNull::new(Ref::into_raw(self) as _).unwrap() + } + + unsafe fn from_pointer(ptr: NonNull) -> Self { + // SAFETY: The safety requirements of `from_pointer` satisfy the ones from `Ref::from_raw`. + unsafe { Ref::from_raw(ptr.as_ptr() as _) } + } + + fn as_ref(&self) -> &T { + AsRef::as_ref(self) + } +} + +impl Wrapper for &T { + fn into_pointer(self) -> NonNull { + NonNull::from(self) + } + + unsafe fn from_pointer(ptr: NonNull) -> Self { + unsafe { &*ptr.as_ptr() } + } + + fn as_ref(&self) -> &T { + self + } +} + +/// A descriptor of wrapped list elements. +pub trait GetLinksWrapped: GetLinks { + /// Specifies which wrapper (e.g., `Box` and `Arc`) wraps the list entries. + type Wrapped: Wrapper; +} + +impl GetLinksWrapped for Box +where + Box: GetLinks, +{ + type Wrapped = Box< as GetLinks>::EntryType>; +} + +impl GetLinks for Box { + type EntryType = T::EntryType; + fn get_links(data: &Self::EntryType) -> &Links { + ::get_links(data) + } +} + +impl GetLinksWrapped for Ref +where + Ref: GetLinks, +{ + type Wrapped = Ref< as GetLinks>::EntryType>; +} + +impl GetLinks for Ref { + type EntryType = T::EntryType; + + fn get_links(data: &Self::EntryType) -> &Links { + ::get_links(data) + } +} + +/// A linked list. +/// +/// Elements in the list are wrapped and ownership is transferred to the list while the element is +/// in the list. +pub struct List { + list: RawList, +} + +impl List { + /// Constructs a new empty linked list. + pub fn new() -> Self { + Self { + list: RawList::new(), + } + } + + /// Returns whether the list is empty. 
+ pub fn is_empty(&self) -> bool { + self.list.is_empty() + } + + /// Adds the given object to the end (back) of the list. + /// + /// It is dropped if it's already on this (or another) list; this can happen for + /// reference-counted objects, so dropping means decrementing the reference count. + pub fn push_back(&mut self, data: G::Wrapped) { + let ptr = data.into_pointer(); + + // SAFETY: We took ownership of the entry, so it is safe to insert it. + if !unsafe { self.list.push_back(ptr.as_ref()) } { + // If insertion failed, rebuild object so that it can be freed. + // SAFETY: We just called `into_pointer` above. + unsafe { G::Wrapped::from_pointer(ptr) }; + } + } + + /// Inserts the given object after `existing`. + /// + /// It is dropped if it's already on this (or another) list; this can happen for + /// reference-counted objects, so dropping means decrementing the reference count. + /// + /// # Safety + /// + /// Callers must ensure that `existing` points to a valid entry that is on the list. + pub unsafe fn insert_after(&mut self, existing: NonNull, data: G::Wrapped) { + let ptr = data.into_pointer(); + let entry = unsafe { &*existing.as_ptr() }; + if unsafe { !self.list.insert_after(entry, ptr.as_ref()) } { + // If insertion failed, rebuild object so that it can be freed. + unsafe { G::Wrapped::from_pointer(ptr) }; + } + } + + /// Removes the given entry. + /// + /// # Safety + /// + /// Callers must ensure that `data` is either on this list or in no list. It being on another + /// list leads to memory unsafety. + pub unsafe fn remove(&mut self, data: &G::Wrapped) -> Option { + let entry_ref = Wrapper::as_ref(data); + if unsafe { self.list.remove(entry_ref) } { + Some(unsafe { G::Wrapped::from_pointer(NonNull::from(entry_ref)) }) + } else { + None + } + } + + /// Removes the element currently at the front of the list and returns it. + /// + /// Returns `None` if the list is empty. 
+ pub fn pop_front(&mut self) -> Option { + let front = self.list.pop_front()?; + // SAFETY: Elements on the list were inserted after a call to `into_pointer`. + Some(unsafe { G::Wrapped::from_pointer(front) }) + } + + /// Returns a cursor starting on the first (front) element of the list. + pub fn cursor_front(&self) -> Cursor<'_, G> { + self.list.cursor_front() + } + + /// Returns a mutable cursor starting on the first (front) element of the list. + pub fn cursor_front_mut(&mut self) -> CursorMut<'_, G> { + CursorMut::new(self.list.cursor_front_mut()) + } +} + +impl Default for List { + fn default() -> Self { + Self::new() + } +} + +impl Drop for List { + fn drop(&mut self) { + while self.pop_front().is_some() {} + } +} + +/// A list cursor that allows traversing a linked list and inspecting & mutating elements. +pub struct CursorMut<'a, G: GetLinksWrapped> { + cursor: raw_list::CursorMut<'a, G>, +} + +impl<'a, G: GetLinksWrapped> CursorMut<'a, G> { + fn new(cursor: raw_list::CursorMut<'a, G>) -> Self { + Self { cursor } + } + + /// Returns the element the cursor is currently positioned on. + pub fn current(&mut self) -> Option<&mut G::EntryType> { + self.cursor.current() + } + + /// Removes the element the cursor is currently positioned on. + /// + /// After removal, it advances the cursor to the next element. + pub fn remove_current(&mut self) -> Option { + let ptr = self.cursor.remove_current()?; + + // SAFETY: Elements on the list were inserted after a call to `into_pointer`. + Some(unsafe { G::Wrapped::from_pointer(ptr) }) + } + + /// Returns the element immediately after the one the cursor is positioned on. + pub fn peek_next(&mut self) -> Option<&mut G::EntryType> { + self.cursor.peek_next() + } + + /// Returns the element immediately before the one the cursor is positioned on. + pub fn peek_prev(&mut self) -> Option<&mut G::EntryType> { + self.cursor.peek_prev() + } + + /// Moves the cursor to the next element.
+ pub fn move_next(&mut self) { + self.cursor.move_next(); + } +} diff --git a/rust/kernel/miscdev.rs b/rust/kernel/miscdev.rs new file mode 100644 index 00000000000000..8b1110b0143c9c --- /dev/null +++ b/rust/kernel/miscdev.rs @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Miscellaneous devices. +//! +//! C header: [`include/linux/miscdevice.h`](../../../../include/linux/miscdevice.h) +//! +//! Reference: + +use crate::bindings; +use crate::error::{code::*, Error, Result}; +use crate::file; +use crate::{device, str::CStr, str::CString, ThisModule}; +use alloc::boxed::Box; +use core::marker::PhantomPinned; +use core::{fmt, mem::MaybeUninit, pin::Pin}; + +/// Options which can be used to configure how a misc device is registered. +/// +/// # Examples +/// +/// ``` +/// # use kernel::{c_str, device::RawDevice, file, miscdev, prelude::*}; +/// fn example( +/// reg: Pin<&mut miscdev::Registration>>, +/// parent: &dyn RawDevice, +/// ) -> Result { +/// miscdev::Options::new() +/// .mode(0o600) +/// .minor(10) +/// .parent(parent) +/// .register(reg, fmt!("sample"), ()) +/// } +/// ``` +#[derive(Default)] +pub struct Options<'a> { + minor: Option, + mode: Option, + parent: Option<&'a dyn device::RawDevice>, +} + +impl<'a> Options<'a> { + /// Creates new [`Options`] instance with the required fields. + pub const fn new() -> Self { + Self { + minor: None, + mode: None, + parent: None, + } + } + + /// Sets the minor device number. + pub const fn minor(&mut self, v: i32) -> &mut Self { + self.minor = Some(v); + self + } + + /// Sets the device mode. + /// + /// This is usually an octal number and describes who can perform read/write/execute operations + /// on the device. + pub const fn mode(&mut self, m: u16) -> &mut Self { + self.mode = Some(m); + self + } + + /// Sets the device parent. 
+ pub const fn parent(&mut self, p: &'a dyn device::RawDevice) -> &mut Self { + self.parent = Some(p); + self + } + + /// Registers a misc device using the configured options. + pub fn register( + &self, + reg: Pin<&mut Registration>, + name: fmt::Arguments<'_>, + open_data: T::OpenData, + ) -> Result { + reg.register_with_options(name, open_data, self) + } + + /// Allocates a new registration of a misc device and completes the registration with the + /// configured options. + pub fn register_new( + &self, + name: fmt::Arguments<'_>, + open_data: T::OpenData, + ) -> Result>>> { + let mut r = Pin::from(Box::try_new(Registration::new())?); + self.register(r.as_mut(), name, open_data)?; + Ok(r) + } +} + +/// A registration of a miscellaneous device. +/// +/// # Invariants +/// +/// `Context` is always initialised when `registered` is `true`, and not initialised otherwise. +pub struct Registration { + registered: bool, + mdev: bindings::miscdevice, + name: Option, + _pin: PhantomPinned, + + /// Context initialised on construction and made available to all file instances on + /// [`file::Operations::open`]. + open_data: MaybeUninit, +} + +impl Registration { + /// Creates a new [`Registration`] but does not register it yet. + /// + /// It is allowed to move. + pub fn new() -> Self { + // INVARIANT: `registered` is `false` and `open_data` is not initialised. + Self { + registered: false, + mdev: bindings::miscdevice::default(), + name: None, + _pin: PhantomPinned, + open_data: MaybeUninit::uninit(), + } + } + + /// Registers a miscellaneous device. + /// + /// Returns a pinned heap-allocated representation of the registration. + pub fn new_pinned(name: fmt::Arguments<'_>, open_data: T::OpenData) -> Result>> { + Options::new().register_new(name, open_data) + } + + /// Registers a miscellaneous device with the rest of the kernel. + /// + /// It must be pinned because the memory block that represents the registration is + /// self-referential. 
+ pub fn register( + self: Pin<&mut Self>, + name: fmt::Arguments<'_>, + open_data: T::OpenData, + ) -> Result { + Options::new().register(self, name, open_data) + } + + /// Registers a miscellaneous device with the rest of the kernel. Additional optional settings + /// are provided via the `opts` parameter. + /// + /// It must be pinned because the memory block that represents the registration is + /// self-referential. + pub fn register_with_options( + self: Pin<&mut Self>, + name: fmt::Arguments<'_>, + open_data: T::OpenData, + opts: &Options<'_>, + ) -> Result { + // SAFETY: We must ensure that we never move out of `this`. + let this = unsafe { self.get_unchecked_mut() }; + if this.registered { + // Already registered. + return Err(EINVAL); + } + + let name = CString::try_from_fmt(name)?; + + // SAFETY: The adapter is compatible with `misc_register`. + this.mdev.fops = unsafe { file::OperationsVtable::::build() }; + this.mdev.name = name.as_char_ptr(); + this.mdev.minor = opts.minor.unwrap_or(bindings::MISC_DYNAMIC_MINOR as i32); + this.mdev.mode = opts.mode.unwrap_or(0); + this.mdev.parent = opts + .parent + .map_or(core::ptr::null_mut(), |p| p.raw_device()); + + // We write to `open_data` here because as soon as `misc_register` succeeds, the file can be + // opened, so we need `open_data` configured ahead of time. + // + // INVARIANT: `registered` is set to `true`, but `open_data` is also initialised. + this.registered = true; + this.open_data.write(open_data); + + let ret = unsafe { bindings::misc_register(&mut this.mdev) }; + if ret < 0 { + // INVARIANT: `registered` is set back to `false` and the `open_data` is destructed. + this.registered = false; + // SAFETY: `open_data` was initialised a few lines above.
+ unsafe { this.open_data.assume_init_drop() }; + return Err(Error::from_kernel_errno(ret)); + } + + this.name = Some(name); + + Ok(()) + } +} + +impl Default for Registration { + fn default() -> Self { + Self::new() + } +} + +impl file::OpenAdapter for Registration { + unsafe fn convert( + _inode: *mut bindings::inode, + file: *mut bindings::file, + ) -> *const T::OpenData { + // SAFETY: The caller must guarantee that `file` is valid. + let reg = crate::container_of!(unsafe { (*file).private_data }, Self, mdev); + + // SAFETY: This function is only called while the misc device is still registered, so the + // registration must be valid. Additionally, the type invariants guarantee that while the + // miscdev is registered, `open_data` is initialised. + unsafe { (*reg).open_data.as_ptr() } + } +} + +// SAFETY: The only method is `register()`, which requires a (pinned) mutable `Registration`, so it +// is safe to pass `&Registration` to multiple threads because it offers no interior mutability. +unsafe impl Sync for Registration {} + +// SAFETY: All functions work from any thread. So as long as the `Registration::open_data` is +// `Send`, so is `Registration`. +unsafe impl Send for Registration where T::OpenData: Send {} + +impl Drop for Registration { + /// Removes the registration from the kernel if it has completed successfully before. + fn drop(&mut self) { + if self.registered { + // SAFETY: `registered` being `true` indicates that a previous call to `misc_register` + // succeeded. + unsafe { bindings::misc_deregister(&mut self.mdev) }; + + // SAFETY: The type invariant guarantees that `open_data` is initialised when + // `registered` is `true`. + unsafe { self.open_data.assume_init_drop() }; + } + } +} + +/// Kernel module that exposes a single miscdev device implemented by `T`. 
+pub struct Module> { + _dev: Pin>>, +} + +impl> crate::Module for Module { + fn init(name: &'static CStr, _module: &'static ThisModule) -> Result { + Ok(Self { + _dev: Registration::new_pinned(crate::fmt!("{name}"), ())?, + }) + } +} + +/// Declares a kernel module that exposes a single misc device. +/// +/// The `type` argument should be a type which implements the [`FileOpener`] trait. Also accepts +/// various forms of kernel metadata. +/// +/// C header: [`include/linux/moduleparam.h`](../../../include/linux/moduleparam.h) +/// +/// [`FileOpener`]: ../kernel/file_operations/trait.FileOpener.html +/// +/// # Examples +/// +/// ```ignore +/// use kernel::prelude::*; +/// +/// module_misc_device! { +/// type: MyFile, +/// name: b"my_miscdev_kernel_module", +/// author: b"Rust for Linux Contributors", +/// description: b"My very own misc device kernel module!", +/// license: b"GPL", +/// } +/// +/// #[derive(Default)] +/// struct MyFile; +/// +/// impl kernel::file::Operations for MyFile { +/// kernel::declare_file_operations!(); +/// } +/// ``` +#[macro_export] +macro_rules! module_misc_device { + (type: $type:ty, $($f:tt)*) => { + type ModuleType = kernel::miscdev::Module<$type>; + module! { + type: ModuleType, + $($f)* + } + } +} diff --git a/rust/kernel/mm.rs b/rust/kernel/mm.rs new file mode 100644 index 00000000000000..322f94f501e09b --- /dev/null +++ b/rust/kernel/mm.rs @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Memory management. +//! +//! C header: [`include/linux/mm.h`](../../../../include/linux/mm.h) + +use crate::{bindings, pages, to_result, Result}; + +/// Virtual memory. +pub mod virt { + use super::*; + + /// A wrapper for the kernel's `struct vm_area_struct`. + /// + /// It represents an area of virtual memory. + /// + /// # Invariants + /// + /// `vma` is always non-null and valid. + pub struct Area { + vma: *mut bindings::vm_area_struct, + } + + impl Area { + /// Creates a new instance of a virtual memory area. 
+ /// + /// # Safety + /// + /// Callers must ensure that `vma` is non-null and valid for the duration of the new area's + /// lifetime. + pub(crate) unsafe fn from_ptr(vma: *mut bindings::vm_area_struct) -> Self { + // INVARIANTS: The safety requirements guarantee the invariants. + Self { vma } + } + + /// Returns the flags associated with the virtual memory area. + /// + /// The possible flags are a combination of the constants in [`flags`]. + pub fn flags(&self) -> usize { + // SAFETY: `self.vma` is valid by the type invariants. + unsafe { (*self.vma).vm_flags as _ } + } + + /// Sets the flags associated with the virtual memory area. + /// + /// The possible flags are a combination of the constants in [`flags`]. + pub fn set_flags(&mut self, flags: usize) { + // SAFETY: `self.vma` is valid by the type invariants. + unsafe { (*self.vma).vm_flags = flags as _ }; + } + + /// Returns the start address of the virtual memory area. + pub fn start(&self) -> usize { + // SAFETY: `self.vma` is valid by the type invariants. + unsafe { (*self.vma).vm_start as _ } + } + + /// Returns the end address of the virtual memory area. + pub fn end(&self) -> usize { + // SAFETY: `self.vma` is valid by the type invariants. + unsafe { (*self.vma).vm_end as _ } + } + + /// Maps a single page at the given address within the virtual memory area. + pub fn insert_page(&mut self, address: usize, page: &pages::Pages<0>) -> Result { + // SAFETY: The page is guaranteed to be order 0 by the type system. The range of + // `address` is already checked by `vm_insert_page`. `self.vma` and `page.pages` are + // guaranteed by their respective type invariants to be valid. + to_result(|| unsafe { bindings::vm_insert_page(self.vma, address as _, page.pages) }) + } + } + + /// Container for [`Area`] flags. + pub mod flags { + use crate::bindings; + + /// No flags are set. + pub const NONE: usize = bindings::VM_NONE as _; + + /// Mapping allows reads.
+ pub const READ: usize = bindings::VM_READ as _; + + /// Mapping allows writes. + pub const WRITE: usize = bindings::VM_WRITE as _; + + /// Mapping allows execution. + pub const EXEC: usize = bindings::VM_EXEC as _; + + /// Mapping is shared. + pub const SHARED: usize = bindings::VM_SHARED as _; + + /// Mapping may be updated to allow reads. + pub const MAYREAD: usize = bindings::VM_MAYREAD as _; + + /// Mapping may be updated to allow writes. + pub const MAYWRITE: usize = bindings::VM_MAYWRITE as _; + + /// Mapping may be updated to allow execution. + pub const MAYEXEC: usize = bindings::VM_MAYEXEC as _; + + /// Mapping may be updated to be shared. + pub const MAYSHARE: usize = bindings::VM_MAYSHARE as _; + + /// Do not copy this vma on fork. + pub const DONTCOPY: usize = bindings::VM_DONTCOPY as _; + + /// Cannot expand with mremap(). + pub const DONTEXPAND: usize = bindings::VM_DONTEXPAND as _; + + /// Lock the pages covered when they are faulted in. + pub const LOCKONFAULT: usize = bindings::VM_LOCKONFAULT as _; + + /// Is a VM accounted object. + pub const ACCOUNT: usize = bindings::VM_ACCOUNT as _; + + /// Should the VM suppress accounting. + pub const NORESERVE: usize = bindings::VM_NORESERVE as _; + + /// Huge TLB Page VM. + pub const HUGETLB: usize = bindings::VM_HUGETLB as _; + + /// Synchronous page faults. + pub const SYNC: usize = bindings::VM_SYNC as _; + + /// Architecture-specific flag. + pub const ARCH_1: usize = bindings::VM_ARCH_1 as _; + + /// Wipe VMA contents in child. + pub const WIPEONFORK: usize = bindings::VM_WIPEONFORK as _; + + /// Do not include in the core dump. + pub const DONTDUMP: usize = bindings::VM_DONTDUMP as _; + + /// Not soft dirty clean area. + pub const SOFTDIRTY: usize = bindings::VM_SOFTDIRTY as _; + + /// Can contain "struct page" and pure PFN pages. + pub const MIXEDMAP: usize = bindings::VM_MIXEDMAP as _; + + /// MADV_HUGEPAGE marked this vma.
+ pub const HUGEPAGE: usize = bindings::VM_HUGEPAGE as _; + + /// MADV_NOHUGEPAGE marked this vma. + pub const NOHUGEPAGE: usize = bindings::VM_NOHUGEPAGE as _; + + /// KSM may merge identical pages. + pub const MERGEABLE: usize = bindings::VM_MERGEABLE as _; + } +} diff --git a/rust/kernel/module_param.rs b/rust/kernel/module_param.rs new file mode 100644 index 00000000000000..3aee16e5efc770 --- /dev/null +++ b/rust/kernel/module_param.rs @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Types for module parameters. +//! +//! C header: [`include/linux/moduleparam.h`](../../../include/linux/moduleparam.h) + +use crate::error::{code::*, from_kernel_result}; +use crate::str::{CStr, Formatter}; +use core::fmt::Write; + +/// Types that can be used for module parameters. +/// +/// Note that displaying the type in `sysfs` will fail if +/// [`alloc::string::ToString::to_string`] (as implemented through the +/// [`core::fmt::Display`] trait) writes more than [`PAGE_SIZE`] +/// bytes (including an additional null terminator). +/// +/// [`PAGE_SIZE`]: `crate::PAGE_SIZE` +pub trait ModuleParam: core::fmt::Display + core::marker::Sized { + /// The `ModuleParam` will be used by the kernel module through this type. + /// + /// This may differ from `Self` if, for example, `Self` needs to track + /// ownership without exposing it or allocate extra space for other possible + /// parameter values. See [`StringParam`] or [`ArrayParam`] for examples. + type Value: ?Sized; + + /// Whether the parameter is allowed to be set without an argument. + /// + /// Setting this to `true` allows the parameter to be passed without an + /// argument (e.g. just `module.param` instead of `module.param=foo`). + const NOARG_ALLOWED: bool; + + /// Convert a parameter argument into the parameter value. + /// + /// `None` should be returned when parsing of the argument fails. + /// `arg == None` indicates that the parameter was passed without an + /// argument. 
If `NOARG_ALLOWED` is set to `false` then `arg` is guaranteed + /// to always be `Some(_)`. + /// + /// Parameters passed at boot time will be set before [`kmalloc`] is + /// available (even if the module is loaded at a later time). However, in + /// this case, the argument buffer will be valid for the entire lifetime of + /// the kernel. So implementations of this method which need to allocate + /// should first check that the allocator is available (with + /// [`crate::bindings::slab_is_available`]) and when it is not available + /// provide an alternative implementation which doesn't allocate. In cases + /// where the allocator is not available it is safe to save references to + /// `arg` in `Self`, but in other cases a copy should be made. + /// + /// [`kmalloc`]: ../../../include/linux/slab.h + fn try_from_param_arg(arg: Option<&'static [u8]>) -> Option; + + /// Get the current value of the parameter for use in the kernel module. + /// + /// This function should not be used directly. Instead use the wrapper + /// `read` which will be generated by [`macros::module`]. + fn value(&self) -> &Self::Value; + + /// Set the module parameter from a string. + /// + /// Used to set the parameter value when loading the module or when set + /// through `sysfs`. + /// + /// # Safety + /// + /// If `val` is non-null then it must point to a valid null-terminated + /// string. The `arg` field of `param` must be an instance of `Self`. 
+ unsafe extern "C" fn set_param( + val: *const crate::c_types::c_char, + param: *const crate::bindings::kernel_param, + ) -> crate::c_types::c_int { + let arg = if val.is_null() { + None + } else { + Some(unsafe { CStr::from_char_ptr(val).as_bytes() }) + }; + match Self::try_from_param_arg(arg) { + Some(new_value) => { + let old_value = unsafe { (*param).__bindgen_anon_1.arg as *mut Self }; + let _ = unsafe { core::ptr::replace(old_value, new_value) }; + 0 + } + None => EINVAL.to_kernel_errno(), + } + } + + /// Write a string representation of the current parameter value to `buf`. + /// + /// Used for displaying the current parameter value in `sysfs`. + /// + /// # Safety + /// + /// `buf` must be a buffer of length at least `kernel::PAGE_SIZE` that is + /// writeable. The `arg` field of `param` must be an instance of `Self`. + unsafe extern "C" fn get_param( + buf: *mut crate::c_types::c_char, + param: *const crate::bindings::kernel_param, + ) -> crate::c_types::c_int { + from_kernel_result! { + // SAFETY: The C contracts guarantees that the buffer is at least `PAGE_SIZE` bytes. + let mut f = unsafe { Formatter::from_buffer(buf.cast(), crate::PAGE_SIZE) }; + unsafe { write!(f, "{}\0", *((*param).__bindgen_anon_1.arg as *mut Self)) }?; + Ok(f.bytes_written().try_into()?) + } + } + + /// Drop the parameter. + /// + /// Called when unloading a module. + /// + /// # Safety + /// + /// The `arg` field of `param` must be an instance of `Self`. + unsafe extern "C" fn free(arg: *mut crate::c_types::c_void) { + unsafe { core::ptr::drop_in_place(arg as *mut Self) }; + } +} + +/// Trait for parsing integers. +/// +/// Strings beginning with `0x`, `0o`, or `0b` are parsed as hex, octal, or +/// binary respectively. Strings beginning with `0` otherwise are parsed as +/// octal. Anything else is parsed as decimal. A leading `+` or `-` is also +/// permitted. Any string parsed by [`kstrtol()`] or [`kstrtoul()`] will be +/// successfully parsed. 
+///
+/// [`kstrtol()`]: https://www.kernel.org/doc/html/latest/core-api/kernel-api.html#c.kstrtol
+/// [`kstrtoul()`]: https://www.kernel.org/doc/html/latest/core-api/kernel-api.html#c.kstrtoul
+trait ParseInt: Sized {
+    fn from_str_radix(src: &str, radix: u32) -> Result<Self, core::num::ParseIntError>;
+    fn checked_neg(self) -> Option<Self>;
+
+    fn from_str_unsigned(src: &str) -> Result<Self, core::num::ParseIntError> {
+        let (radix, digits) = if let Some(n) = src.strip_prefix("0x") {
+            (16, n)
+        } else if let Some(n) = src.strip_prefix("0X") {
+            (16, n)
+        } else if let Some(n) = src.strip_prefix("0o") {
+            (8, n)
+        } else if let Some(n) = src.strip_prefix("0O") {
+            (8, n)
+        } else if let Some(n) = src.strip_prefix("0b") {
+            (2, n)
+        } else if let Some(n) = src.strip_prefix("0B") {
+            (2, n)
+        } else if src.starts_with('0') {
+            (8, src)
+        } else {
+            (10, src)
+        };
+        Self::from_str_radix(digits, radix)
+    }
+
+    fn from_str(src: &str) -> Option<Self> {
+        match src.bytes().next() {
+            None => None,
+            Some(b'-') => Self::from_str_unsigned(&src[1..]).ok()?.checked_neg(),
+            Some(b'+') => Some(Self::from_str_unsigned(&src[1..]).ok()?),
+            Some(_) => Some(Self::from_str_unsigned(src).ok()?),
+        }
+    }
+}
+
+macro_rules! impl_parse_int {
+    ($ty:ident) => {
+        impl ParseInt for $ty {
+            fn from_str_radix(src: &str, radix: u32) -> Result<Self, core::num::ParseIntError> {
+                $ty::from_str_radix(src, radix)
+            }
+
+            fn checked_neg(self) -> Option<Self> {
+                self.checked_neg()
+            }
+        }
+    };
+}
+
+impl_parse_int!(i8);
+impl_parse_int!(u8);
+impl_parse_int!(i16);
+impl_parse_int!(u16);
+impl_parse_int!(i32);
+impl_parse_int!(u32);
+impl_parse_int!(i64);
+impl_parse_int!(u64);
+impl_parse_int!(isize);
+impl_parse_int!(usize);
+
+macro_rules! impl_module_param {
+    ($ty:ident) => {
+        impl ModuleParam for $ty {
+            type Value = $ty;
+
+            const NOARG_ALLOWED: bool = false;
+
+            fn try_from_param_arg(arg: Option<&'static [u8]>) -> Option<Self> {
+                let bytes = arg?;
+                let utf8 = core::str::from_utf8(bytes).ok()?;
+                <$ty as crate::module_param::ParseInt>::from_str(utf8)
+            }
+
+            fn value(&self) -> &Self::Value {
+                self
+            }
+        }
+    };
+}
+
+#[doc(hidden)]
+#[macro_export]
+/// Generate a static [`kernel_param_ops`](../../../include/linux/moduleparam.h) struct.
+///
+/// # Example
+/// ```ignore
+/// make_param_ops!(
+///     /// Documentation for new param ops.
+///     PARAM_OPS_MYTYPE, // Name for the static.
+///     MyType // A type which implements [`ModuleParam`].
+/// );
+/// ```
+macro_rules! make_param_ops {
+    ($ops:ident, $ty:ty) => {
+        $crate::make_param_ops!(
+            #[doc=""]
+            $ops,
+            $ty
+        );
+    };
+    ($(#[$meta:meta])* $ops:ident, $ty:ty) => {
+        $(#[$meta])*
+        ///
+        /// Static [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+        /// struct generated by [`make_param_ops`].
+        pub static $ops: $crate::bindings::kernel_param_ops = $crate::bindings::kernel_param_ops {
+            flags: if <$ty as $crate::module_param::ModuleParam>::NOARG_ALLOWED {
+                $crate::bindings::KERNEL_PARAM_OPS_FL_NOARG
+            } else {
+                0
+            },
+            set: Some(<$ty as $crate::module_param::ModuleParam>::set_param),
+            get: Some(<$ty as $crate::module_param::ModuleParam>::get_param),
+            free: Some(<$ty as $crate::module_param::ModuleParam>::free),
+        };
+    };
+}
+
+impl_module_param!(i8);
+impl_module_param!(u8);
+impl_module_param!(i16);
+impl_module_param!(u16);
+impl_module_param!(i32);
+impl_module_param!(u32);
+impl_module_param!(i64);
+impl_module_param!(u64);
+impl_module_param!(isize);
+impl_module_param!(usize);
+
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`i8`].
+    PARAM_OPS_I8,
+    i8
+);
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`u8`].
+    PARAM_OPS_U8,
+    u8
+);
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`i16`].
+    PARAM_OPS_I16,
+    i16
+);
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`u16`].
+    PARAM_OPS_U16,
+    u16
+);
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`i32`].
+    PARAM_OPS_I32,
+    i32
+);
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`u32`].
+    PARAM_OPS_U32,
+    u32
+);
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`i64`].
+    PARAM_OPS_I64,
+    i64
+);
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`u64`].
+    PARAM_OPS_U64,
+    u64
+);
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`isize`].
+    PARAM_OPS_ISIZE,
+    isize
+);
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`usize`].
+    PARAM_OPS_USIZE,
+    usize
+);
+
+impl ModuleParam for bool {
+    type Value = bool;
+
+    const NOARG_ALLOWED: bool = true;
+
+    fn try_from_param_arg(arg: Option<&'static [u8]>) -> Option<Self> {
+        match arg {
+            None => Some(true),
+            Some(b"y") | Some(b"Y") | Some(b"1") | Some(b"true") => Some(true),
+            Some(b"n") | Some(b"N") | Some(b"0") | Some(b"false") => Some(false),
+            _ => None,
+        }
+    }
+
+    fn value(&self) -> &Self::Value {
+        self
+    }
+}
+
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`bool`].
+    PARAM_OPS_BOOL,
+    bool
+);
+
+/// An array of at __most__ `N` values.
+///
+/// # Invariant
+///
+/// The first `self.used` elements of `self.values` are initialized.
+pub struct ArrayParam<T, const N: usize> {
+    values: [core::mem::MaybeUninit<T>; N],
+    used: usize,
+}
+
+impl<T, const N: usize> ArrayParam<T, { N }> {
+    fn values(&self) -> &[T] {
+        // SAFETY: The invariant maintained by `ArrayParam` allows us to cast
+        // the first `self.used` elements to `T`.
+        unsafe {
+            &*(&self.values[0..self.used] as *const [core::mem::MaybeUninit<T>] as *const [T])
+        }
+    }
+}
+
+impl<T: Copy, const N: usize> ArrayParam<T, { N }> {
+    const fn new() -> Self {
+        // INVARIANT: The first `self.used` elements of `self.values` are
+        // initialized.
+        ArrayParam {
+            values: [core::mem::MaybeUninit::uninit(); N],
+            used: 0,
+        }
+    }
+
+    const fn push(&mut self, val: T) {
+        if self.used < N {
+            // INVARIANT: The first `self.used` elements of `self.values` are
+            // initialized.
+            self.values[self.used] = core::mem::MaybeUninit::new(val);
+            self.used += 1;
+        }
+    }
+
+    /// Create an instance of `ArrayParam` initialized with `vals`.
+    ///
+    /// This function is only meant to be used in the [`module::module`] macro.
+    pub const fn create(vals: &[T]) -> Self {
+        let mut result = ArrayParam::new();
+        let mut i = 0;
+        while i < vals.len() {
+            result.push(vals[i]);
+            i += 1;
+        }
+        result
+    }
+}
+
+impl<T: core::fmt::Display, const N: usize> core::fmt::Display for ArrayParam<T, { N }> {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        for val in self.values() {
+            write!(f, "{},", val)?;
+        }
+        Ok(())
+    }
+}
+
+impl<T: Copy + core::fmt::Display + ModuleParam, const N: usize> ModuleParam
+    for ArrayParam<T, { N }>
+{
+    type Value = [T];
+
+    const NOARG_ALLOWED: bool = false;
+
+    fn try_from_param_arg(arg: Option<&'static [u8]>) -> Option<Self> {
+        arg.and_then(|args| {
+            let mut result = Self::new();
+            for arg in args.split(|b| *b == b',') {
+                result.push(T::try_from_param_arg(Some(arg))?);
+            }
+            Some(result)
+        })
+    }
+
+    fn value(&self) -> &Self::Value {
+        self.values()
+    }
+}
+
+/// A C-style string parameter.
+///
+/// The Rust version of the [`charp`] parameter. This type is meant to be
+/// used by the [`macros::module`] macro, not handled directly. Instead use the
+/// `read` method generated by that macro.
+///
+/// [`charp`]: ../../../include/linux/moduleparam.h
+pub enum StringParam {
+    /// A borrowed parameter value.
+    ///
+    /// Either the default value (which is static in the module) or borrowed
+    /// from the original argument buffer used to set the value.
+    Ref(&'static [u8]),
+
+    /// A value that was allocated when the parameter was set.
+    ///
+    /// The value needs to be freed when the parameter is reset or the module is
+    /// unloaded.
+    Owned(alloc::vec::Vec<u8>),
+}
+
+impl StringParam {
+    fn bytes(&self) -> &[u8] {
+        match self {
+            StringParam::Ref(bytes) => *bytes,
+            StringParam::Owned(vec) => &vec[..],
+        }
+    }
+}
+
+impl core::fmt::Display for StringParam {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        let bytes = self.bytes();
+        match core::str::from_utf8(bytes) {
+            Ok(utf8) => write!(f, "{}", utf8),
+            Err(_) => write!(f, "{:?}", bytes),
+        }
+    }
+}
+
+impl ModuleParam for StringParam {
+    type Value = [u8];
+
+    const NOARG_ALLOWED: bool = false;
+
+    fn try_from_param_arg(arg: Option<&'static [u8]>) -> Option<Self> {
+        // SAFETY: It is always safe to call [`slab_is_available`](../../../include/linux/slab.h).
+        let slab_available = unsafe { crate::bindings::slab_is_available() };
+        arg.and_then(|arg| {
+            if slab_available {
+                let mut vec = alloc::vec::Vec::new();
+                vec.try_extend_from_slice(arg).ok()?;
+                Some(StringParam::Owned(vec))
+            } else {
+                Some(StringParam::Ref(arg))
+            }
+        })
+    }
+
+    fn value(&self) -> &Self::Value {
+        self.bytes()
+    }
+}
+
+make_param_ops!(
+    /// Rust implementation of [`kernel_param_ops`](../../../include/linux/moduleparam.h)
+    /// for [`StringParam`].
+    PARAM_OPS_STR,
+    StringParam
+);
diff --git a/rust/kernel/net.rs b/rust/kernel/net.rs
new file mode 100644
index 00000000000000..0495ab77814472
--- /dev/null
+++ b/rust/kernel/net.rs
@@ -0,0 +1,392 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Networking core.
+//!
+//! C headers: [`include/net/net_namespace.h`](../../../../include/net/net_namespace.h),
+//! [`include/linux/netdevice.h`](../../../../include/linux/netdevice.h),
+//! [`include/linux/skbuff.h`](../../../../include/linux/skbuff.h).
+
+use crate::{bindings, str::CStr, to_result, ARef, AlwaysRefCounted, Error, Result};
+use core::{cell::UnsafeCell, ptr::NonNull};
+
+#[cfg(CONFIG_NETFILTER)]
+pub mod filter;
+
+/// Wraps the kernel's `struct net_device`.
+#[repr(transparent)]
+pub struct Device(UnsafeCell<bindings::net_device>);
+
+// SAFETY: Instances of `Device` are created on the C side. They are always refcounted.
+unsafe impl AlwaysRefCounted for Device {
+    fn inc_ref(&self) {
+        // SAFETY: The existence of a shared reference means that the refcount is nonzero.
+        unsafe { bindings::dev_hold(self.0.get()) };
+    }
+
+    unsafe fn dec_ref(obj: core::ptr::NonNull<Self>) {
+        // SAFETY: The safety requirements guarantee that the refcount is nonzero.
+        unsafe { bindings::dev_put(obj.cast().as_ptr()) };
+    }
+}
+
+/// Wraps the kernel's `struct net`.
+#[repr(transparent)]
+pub struct Namespace(UnsafeCell<bindings::net>);
+
+impl Namespace {
+    /// Finds a network device with the given name in the namespace.
+    pub fn dev_get_by_name(&self, name: &CStr) -> Option<ARef<Device>> {
+        // SAFETY: The existence of a shared reference guarantees the refcount is nonzero.
+        let ptr =
+            NonNull::new(unsafe { bindings::dev_get_by_name(self.0.get(), name.as_char_ptr()) })?;
+        Some(unsafe { ARef::from_raw(ptr.cast()) })
+    }
+}
+
+// SAFETY: Instances of `Namespace` are created on the C side. They are always refcounted.
+unsafe impl AlwaysRefCounted for Namespace {
+    fn inc_ref(&self) {
+        // SAFETY: The existence of a shared reference means that the refcount is nonzero.
+        unsafe { bindings::get_net(self.0.get()) };
+    }
+
+    unsafe fn dec_ref(obj: core::ptr::NonNull<Self>) {
+        // SAFETY: The safety requirements guarantee that the refcount is nonzero.
+        unsafe { bindings::put_net(obj.cast().as_ptr()) };
+    }
+}
+
+/// Returns the network namespace for the `init` process.
+pub fn init_ns() -> &'static Namespace {
+    unsafe { &*core::ptr::addr_of!(bindings::init_net).cast() }
+}
+
+/// Wraps the kernel's `struct sk_buff`.
+#[repr(transparent)]
+pub struct SkBuff(UnsafeCell<bindings::sk_buff>);
+
+impl SkBuff {
+    /// Creates a reference to an [`SkBuff`] from a valid pointer.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure that `ptr` is valid and remains valid for the lifetime of the
+    /// returned [`SkBuff`] instance.
+    pub unsafe fn from_ptr<'a>(ptr: *const bindings::sk_buff) -> &'a SkBuff {
+        // SAFETY: The safety requirements guarantee the validity of the dereference, while the
+        // `SkBuff` type being transparent makes the cast ok.
+        unsafe { &*ptr.cast() }
+    }
+
+    /// Returns the remaining data in the buffer's first segment.
+    pub fn head_data(&self) -> &[u8] {
+        // SAFETY: The existence of a shared reference means that the refcount is nonzero.
+        let headlen = unsafe { bindings::skb_headlen(self.0.get()) };
+        let len = headlen.try_into().unwrap_or(usize::MAX);
+        // SAFETY: The existence of a shared reference means `self.0` is valid.
+        let data = unsafe { core::ptr::addr_of!((*self.0.get()).data).read() };
+        // SAFETY: The `struct sk_buff` conventions guarantee that at least `skb_headlen(skb)` bytes
+        // are valid from `skb->data`.
+        unsafe { core::slice::from_raw_parts(data, len) }
+    }
+
+    /// Returns the total length of the data (in all segments) in the skb.
+    #[allow(clippy::len_without_is_empty)]
+    pub fn len(&self) -> u32 {
+        // SAFETY: The existence of a shared reference means `self.0` is valid.
+        unsafe { core::ptr::addr_of!((*self.0.get()).len).read() }
+    }
+}
+
+// SAFETY: Instances of `SkBuff` are created on the C side. They are always refcounted.
+unsafe impl AlwaysRefCounted for SkBuff {
+    fn inc_ref(&self) {
+        // SAFETY: The existence of a shared reference means that the refcount is nonzero.
+        unsafe { bindings::skb_get(self.0.get()) };
+    }
+
+    unsafe fn dec_ref(obj: core::ptr::NonNull<Self>) {
+        // SAFETY: The safety requirements guarantee that the refcount is nonzero.
+        unsafe {
+            bindings::kfree_skb_reason(
+                obj.cast().as_ptr(),
+                bindings::skb_drop_reason_SKB_DROP_REASON_NOT_SPECIFIED,
+            )
+        };
+    }
+}
+
+/// An IPv4 address.
+///
+/// This is equivalent to C's `in_addr`.
+#[repr(transparent)]
+pub struct Ipv4Addr(bindings::in_addr);
+
+impl Ipv4Addr {
+    /// A wildcard IPv4 address.
+    ///
+    /// Binding to this address means binding to all IPv4 addresses.
+    pub const ANY: Self = Self::new(0, 0, 0, 0);
+
+    /// The IPv4 loopback address.
+    pub const LOOPBACK: Self = Self::new(127, 0, 0, 1);
+
+    /// The IPv4 broadcast address.
+    pub const BROADCAST: Self = Self::new(255, 255, 255, 255);
+
+    /// Creates a new IPv4 address with the given components.
+    pub const fn new(a: u8, b: u8, c: u8, d: u8) -> Self {
+        Self(bindings::in_addr {
+            s_addr: u32::from_be_bytes([a, b, c, d]).to_be(),
+        })
+    }
+}
+
+/// An IPv6 address.
+///
+/// This is equivalent to C's `in6_addr`.
+#[repr(transparent)]
+pub struct Ipv6Addr(bindings::in6_addr);
+
+impl Ipv6Addr {
+    /// A wildcard IPv6 address.
+    ///
+    /// Binding to this address means binding to all IPv6 addresses.
+    pub const ANY: Self = Self::new(0, 0, 0, 0, 0, 0, 0, 0);
+
+    /// The IPv6 loopback address.
+    pub const LOOPBACK: Self = Self::new(0, 0, 0, 0, 0, 0, 0, 1);
+
+    /// Creates a new IPv6 address with the given components.
+    #[allow(clippy::too_many_arguments)]
+    pub const fn new(a: u16, b: u16, c: u16, d: u16, e: u16, f: u16, g: u16, h: u16) -> Self {
+        Self(bindings::in6_addr {
+            in6_u: bindings::in6_addr__bindgen_ty_1 {
+                u6_addr16: [
+                    a.to_be(),
+                    b.to_be(),
+                    c.to_be(),
+                    d.to_be(),
+                    e.to_be(),
+                    f.to_be(),
+                    g.to_be(),
+                    h.to_be(),
+                ],
+            },
+        })
+    }
+}
+
+/// A socket address.
+///
+/// It's an enum with either an IPv4 or IPv6 socket address.
+pub enum SocketAddr {
+    /// An IPv4 socket address.
+    V4(SocketAddrV4),
+
+    /// An IPv6 socket address.
+    V6(SocketAddrV6),
+}
+
+/// An IPv4 socket address.
+///
+/// This is equivalent to C's `sockaddr_in`.
+#[repr(transparent)]
+pub struct SocketAddrV4(bindings::sockaddr_in);
+
+impl SocketAddrV4 {
+    /// Creates a new IPv4 socket address.
+    pub const fn new(addr: Ipv4Addr, port: u16) -> Self {
+        Self(bindings::sockaddr_in {
+            sin_family: bindings::AF_INET as _,
+            sin_port: port.to_be(),
+            sin_addr: addr.0,
+            __pad: [0; 8],
+        })
+    }
+}
+
+/// An IPv6 socket address.
+///
+/// This is equivalent to C's `sockaddr_in6`.
+#[repr(transparent)]
+pub struct SocketAddrV6(bindings::sockaddr_in6);
+
+impl SocketAddrV6 {
+    /// Creates a new IPv6 socket address.
+    pub const fn new(addr: Ipv6Addr, port: u16, flowinfo: u32, scopeid: u32) -> Self {
+        Self(bindings::sockaddr_in6 {
+            sin6_family: bindings::AF_INET6 as _,
+            sin6_port: port.to_be(),
+            sin6_addr: addr.0,
+            sin6_flowinfo: flowinfo,
+            sin6_scope_id: scopeid,
+        })
+    }
+}
+
+/// A socket listening on a TCP port.
+///
+/// # Invariants
+///
+/// The socket pointer is always non-null and valid.
+pub struct TcpListener {
+    pub(crate) sock: *mut bindings::socket,
+}
+
+// SAFETY: `TcpListener` is just a wrapper for a kernel socket, which can be used from any thread.
+unsafe impl Send for TcpListener {}
+
+// SAFETY: `TcpListener` is just a wrapper for a kernel socket, which can be used from any thread.
+unsafe impl Sync for TcpListener {}
+
+impl TcpListener {
+    /// Creates a new TCP listener.
+    ///
+    /// It is configured to listen on the given socket address for the given namespace.
+    pub fn try_new(ns: &Namespace, addr: &SocketAddr) -> Result<Self> {
+        let mut socket = core::ptr::null_mut();
+        let (pf, addr, addrlen) = match addr {
+            SocketAddr::V4(addr) => (
+                bindings::PF_INET,
+                addr as *const _ as _,
+                core::mem::size_of::<bindings::sockaddr_in>(),
+            ),
+            SocketAddr::V6(addr) => (
+                bindings::PF_INET6,
+                addr as *const _ as _,
+                core::mem::size_of::<bindings::sockaddr_in6>(),
+            ),
+        };
+
+        // SAFETY: The namespace is valid and the output socket pointer is valid for write.
+        to_result(|| unsafe {
+            bindings::sock_create_kern(
+                ns.0.get(),
+                pf as _,
+                bindings::sock_type_SOCK_STREAM as _,
+                bindings::IPPROTO_TCP as _,
+                &mut socket,
+            )
+        })?;
+
+        // INVARIANT: The socket was just created, so it is valid.
+        let listener = Self { sock: socket };
+
+        // SAFETY: The type invariant guarantees that the socket is valid, and `addr` and `addrlen`
+        // were initialised based on valid values provided in the address enum.
+        to_result(|| unsafe { bindings::kernel_bind(socket, addr, addrlen as _) })?;
+
+        // SAFETY: The socket is valid per the type invariant.
+        to_result(|| unsafe { bindings::kernel_listen(socket, bindings::SOMAXCONN as _) })?;
+
+        Ok(listener)
+    }
+
+    /// Accepts a new connection.
+    ///
+    /// On success, returns the newly-accepted socket stream.
+    ///
+    /// If no connection is available to be accepted, one of two behaviours will occur:
+    /// - If `block` is `false`, returns [`crate::error::code::EAGAIN`];
+    /// - If `block` is `true`, blocks until an error occurs or some connection can be accepted.
+    pub fn accept(&self, block: bool) -> Result<TcpStream> {
+        let mut new = core::ptr::null_mut();
+        let flags = if block { 0 } else { bindings::O_NONBLOCK };
+        // SAFETY: The type invariant guarantees that the socket is valid, and the output argument
+        // is also valid for write.
+        to_result(|| unsafe { bindings::kernel_accept(self.sock, &mut new, flags as _) })?;
+        Ok(TcpStream { sock: new })
+    }
+}
+
+impl Drop for TcpListener {
+    fn drop(&mut self) {
+        // SAFETY: The type invariant guarantees that the socket is valid.
+        unsafe { bindings::sock_release(self.sock) };
+    }
+}
+
+/// A connected TCP socket.
+///
+/// # Invariants
+///
+/// The socket pointer is always non-null and valid.
+pub struct TcpStream {
+    pub(crate) sock: *mut bindings::socket,
+}
+
+// SAFETY: `TcpStream` is just a wrapper for a kernel socket, which can be used from any thread.
+unsafe impl Send for TcpStream {}
+
+// SAFETY: `TcpStream` is just a wrapper for a kernel socket, which can be used from any thread.
+unsafe impl Sync for TcpStream {}
+
+impl TcpStream {
+    /// Reads data from a connected socket.
+    ///
+    /// On success, returns the number of bytes read, which will be zero if the connection is
+    /// closed.
+    ///
+    /// If no data is immediately available for reading, one of two behaviours will occur:
+    /// - If `block` is `false`, returns [`crate::error::code::EAGAIN`];
+    /// - If `block` is `true`, blocks until an error occurs, the connection is closed, or some
+    ///   data becomes readable.
+    pub fn read(&self, buf: &mut [u8], block: bool) -> Result<usize> {
+        let mut msg = bindings::msghdr::default();
+        let mut vec = bindings::kvec {
+            iov_base: buf.as_mut_ptr().cast(),
+            iov_len: buf.len(),
+        };
+        // SAFETY: The type invariant guarantees that the socket is valid, and `vec` was
+        // initialised with the output buffer.
+        let r = unsafe {
+            bindings::kernel_recvmsg(
+                self.sock,
+                &mut msg,
+                &mut vec,
+                1,
+                vec.iov_len,
+                if block { 0 } else { bindings::MSG_DONTWAIT } as _,
+            )
+        };
+        if r < 0 {
+            Err(Error::from_kernel_errno(r))
+        } else {
+            Ok(r as _)
+        }
+    }
+
+    /// Writes data to the connected socket.
+    ///
+    /// On success, returns the number of bytes written.
+    ///
+    /// If the send buffer of the socket is full, one of two behaviours will occur:
+    /// - If `block` is `false`, returns [`crate::error::code::EAGAIN`];
+    /// - If `block` is `true`, blocks until an error occurs or some data is written.
+    pub fn write(&self, buf: &[u8], block: bool) -> Result<usize> {
+        let mut msg = bindings::msghdr {
+            msg_flags: if block { 0 } else { bindings::MSG_DONTWAIT },
+            ..bindings::msghdr::default()
+        };
+        let mut vec = bindings::kvec {
+            iov_base: buf.as_ptr() as *mut u8 as _,
+            iov_len: buf.len(),
+        };
+        // SAFETY: The type invariant guarantees that the socket is valid, and `vec` was
+        // initialised with the input buffer.
+        let r = unsafe { bindings::kernel_sendmsg(self.sock, &mut msg, &mut vec, 1, vec.iov_len) };
+        if r < 0 {
+            Err(Error::from_kernel_errno(r))
+        } else {
+            Ok(r as _)
+        }
+    }
+}
+
+impl Drop for TcpStream {
+    fn drop(&mut self) {
+        // SAFETY: The type invariant guarantees that the socket is valid.
+        unsafe { bindings::sock_release(self.sock) };
+    }
+}
diff --git a/rust/kernel/net/filter.rs b/rust/kernel/net/filter.rs
new file mode 100644
index 00000000000000..3241100a1561d2
--- /dev/null
+++ b/rust/kernel/net/filter.rs
@@ -0,0 +1,447 @@
+// SPDX-License-Identifier: GPL-2.0
+
+//! Networking filters.
+//!
+//! C header: [`include/linux/netfilter.h`](../../../../../include/linux/netfilter.h)
+
+use crate::{
+    bindings, c_types,
+    error::{code::*, to_result},
+    net,
+    types::PointerWrapper,
+    ARef, AlwaysRefCounted, Result, ScopeGuard,
+};
+use alloc::boxed::Box;
+use core::{
+    marker::{PhantomData, PhantomPinned},
+    pin::Pin,
+};
+
+/// A network filter.
+pub trait Filter {
+    /// The type of the context data stored on registration and made available to the
+    /// [`Filter::filter`] function.
+    type Data: PointerWrapper + Sync = ();
+
+    /// Filters the packet stored in the given buffer.
+    ///
+    /// It dictates to the netfilter core what the fate of the packet should be.
+    fn filter(
+        _data: <Self::Data as PointerWrapper>::Borrowed<'_>,
+        _skb: &net::SkBuff,
+    ) -> Disposition;
+}
+
+/// Specifies the action to be taken by the netfilter core.
+pub enum Disposition {
+    /// Drop the packet.
+    Drop,
+
+    /// Accept the packet.
+    Accept,
+
+    /// The packet was stolen by the filter and must be treated as if it didn't exist.
+    Stolen,
+
+    /// Queue the packet to the given user-space queue.
+    Queue {
+        /// The identifier of the queue to which the packet should be added.
+        queue_id: u16,
+
+        /// Specifies the behaviour if a queue with the given identifier doesn't exist: if `true`,
+        /// the packet is accepted, otherwise it is rejected.
+        accept_if_queue_non_existent: bool,
+    },
+}
+
+/// The filter hook families.
+pub enum Family {
+    /// IPv4 and IPv6 packets.
+    INet(inet::Hook),
+
+    /// IPv4 packets.
+    Ipv4(ipv4::Hook, ipv4::PriorityBase),
+
+    /// All packets through a device.
+    ///
+    /// When this family is used, a device _must_ be specified.
+    NetDev(netdev::Hook),
+
+    /// IPv6 packets.
+    Ipv6(ipv6::Hook, ipv6::PriorityBase),
+
+    /// Address resolution protocol (ARP) packets.
+    Arp(arp::Hook),
+}
+
+/// A registration of a networking filter.
+///
+/// # Examples
+///
+/// The following is an example of a function that attaches an inbound filter (that always accepts
+/// all packets after printing their lengths) on the specified device (in the `init` ns).
+///
+/// ```
+/// use kernel::net::{self, filter as netfilter};
+///
+/// struct MyFilter;
+/// impl netfilter::Filter for MyFilter {
+///     fn filter(_data: (), skb: &net::SkBuff) -> netfilter::Disposition {
+///         pr_info!("Packet of length {}\n", skb.len());
+///         netfilter::Disposition::Accept
+///     }
+/// }
+///
+/// fn register(name: &CStr) -> Result<Pin<Box<netfilter::Registration<MyFilter>>>> {
+///     let ns = net::init_ns();
+///     let dev = ns.dev_get_by_name(name).ok_or(ENOENT)?;
+///     netfilter::Registration::new_pinned(
+///         netfilter::Family::NetDev(netfilter::netdev::Hook::Ingress),
+///         0,
+///         ns.into(),
+///         Some(dev),
+///         (),
+///     )
+/// }
+/// ```
+#[derive(Default)]
+pub struct Registration<T: Filter> {
+    hook: bindings::nf_hook_ops,
+    // When `ns` is `Some(_)`, the hook is registered.
+    ns: Option<ARef<net::Namespace>>,
+    dev: Option<ARef<net::Device>>,
+    _p: PhantomData<T>,
+    _pinned: PhantomPinned,
+}
+
+// SAFETY: `Registration` does not expose any of its state across threads.
+unsafe impl<T: Filter> Sync for Registration<T> {}
+
+impl<T: Filter> Registration<T> {
+    /// Creates a new [`Registration`] but does not register it yet.
+    ///
+    /// It is allowed to move.
+    pub fn new() -> Self {
+        Self {
+            hook: bindings::nf_hook_ops::default(),
+            dev: None,
+            ns: None,
+            _p: PhantomData,
+            _pinned: PhantomPinned,
+        }
+    }
+
+    /// Creates a new filter registration and registers it.
+    ///
+    /// Returns a pinned heap-allocated representation of the registration.
+    pub fn new_pinned(
+        family: Family,
+        priority: i32,
+        ns: ARef<net::Namespace>,
+        dev: Option<ARef<net::Device>>,
+        data: T::Data,
+    ) -> Result<Pin<Box<Self>>> {
+        let mut filter = Pin::from(Box::try_new(Self::new())?);
+        filter.as_mut().register(family, priority, ns, dev, data)?;
+        Ok(filter)
+    }
+
+    /// Registers a network filter.
+    ///
+    /// It must be pinned because the C portion of the kernel stores a pointer to it while it is
+    /// registered.
+    ///
+    /// The priority is relative to the family's base priority. For example, if the base priority
+    /// is `100` and `priority` is `-1`, the actual priority will be `99`. If a family doesn't
+    /// explicitly allow a base to be specified, `0` is assumed.
+    pub fn register(
+        self: Pin<&mut Self>,
+        family: Family,
+        priority: i32,
+        ns: ARef<net::Namespace>,
+        dev: Option<ARef<net::Device>>,
+        data: T::Data,
+    ) -> Result {
+        // SAFETY: We must ensure that we never move out of `this`.
+        let this = unsafe { self.get_unchecked_mut() };
+        if this.ns.is_some() {
+            // Already registered.
+            return Err(EINVAL);
+        }
+
+        let data_pointer = data.into_pointer();
+
+        // SAFETY: `data_pointer` comes from the call to `data.into_pointer()` above.
+        let guard = ScopeGuard::new(|| unsafe {
+            T::Data::from_pointer(data_pointer);
+        });
+
+        let mut pri_base = 0i32;
+        match family {
+            Family::INet(hook) => {
+                this.hook.pf = bindings::NFPROTO_INET as _;
+                this.hook.hooknum = hook as _;
+            }
+            Family::Ipv4(hook, pbase) => {
+                this.hook.pf = bindings::NFPROTO_IPV4 as _;
+                this.hook.hooknum = hook as _;
+                pri_base = pbase as _;
+            }
+            Family::Ipv6(hook, pbase) => {
+                this.hook.pf = bindings::NFPROTO_IPV6 as _;
+                this.hook.hooknum = hook as _;
+                pri_base = pbase as _;
+            }
+            Family::NetDev(hook) => {
+                this.hook.pf = bindings::NFPROTO_NETDEV as _;
+                this.hook.hooknum = hook as _;
+            }
+            Family::Arp(hook) => {
+                this.hook.pf = bindings::NFPROTO_ARP as _;
+                this.hook.hooknum = hook as _;
+            }
+        }
+
+        this.hook.priority = pri_base.saturating_add(priority);
+        this.hook.priv_ = data_pointer as _;
+        this.hook.hook = Some(Self::hook_callback);
+        crate::static_assert!(bindings::nf_hook_ops_type_NF_HOOK_OP_UNDEFINED == 0);
+
+        if let Some(ref device) = dev {
+            this.hook.dev = device.0.get();
+        }
+
+        // SAFETY: `ns` has a valid reference to the namespace, and `this.hook` was just
+        // initialised above, so they're both valid.
+        to_result(|| unsafe { bindings::nf_register_net_hook(ns.0.get(), &this.hook) })?;
+
+        this.dev = dev;
+        this.ns = Some(ns);
+        guard.dismiss();
+        Ok(())
+    }
+
+    unsafe extern "C" fn hook_callback(
+        priv_: *mut c_types::c_void,
+        skb: *mut bindings::sk_buff,
+        _state: *const bindings::nf_hook_state,
+    ) -> c_types::c_uint {
+        // SAFETY: `priv_` was initialised on registration by a value returned from
+        // `T::Data::into_pointer`, and it remains valid until the hook is unregistered.
+        let data = unsafe { T::Data::borrow(priv_) };
+
+        // SAFETY: The C contract guarantees that `skb` remains valid for the duration of this
+        // function call.
+        match T::filter(data, unsafe { net::SkBuff::from_ptr(skb) }) {
+            Disposition::Drop => bindings::NF_DROP,
+            Disposition::Accept => bindings::NF_ACCEPT,
+            Disposition::Stolen => {
+                // SAFETY: This function takes over ownership of `skb` when it returns `NF_STOLEN`,
+                // so we decrement the refcount here to avoid a leak.
+                unsafe { net::SkBuff::dec_ref(core::ptr::NonNull::new(skb).unwrap().cast()) };
+                bindings::NF_STOLEN
+            }
+            Disposition::Queue {
+                queue_id,
+                accept_if_queue_non_existent,
+            } => {
+                // SAFETY: Just an FFI call, no additional safety requirements.
+                let verdict = unsafe { bindings::NF_QUEUE_NR(queue_id as _) };
+                if accept_if_queue_non_existent {
+                    verdict | bindings::NF_VERDICT_FLAG_QUEUE_BYPASS
+                } else {
+                    verdict
+                }
+            }
+        }
+    }
+}
+
+impl<T: Filter> Drop for Registration<T> {
+    fn drop(&mut self) {
+        if let Some(ref ns) = self.ns {
+            // SAFETY: `self.ns` is `Some(_)` only when a previous call to `nf_register_net_hook`
+            // succeeded. And the arguments are the same.
+            unsafe { bindings::nf_unregister_net_hook(ns.0.get(), &self.hook) };
+
+            // `self.hook.priv_` was initialised during registration to a value returned from
+            // `T::Data::into_pointer`, so it is ok to convert back here.
+            unsafe { T::Data::from_pointer(self.hook.priv_) };
+        }
+    }
+}
+
+/// Definitions used when defining hooks for the [`Family::NetDev`] family.
+pub mod netdev {
+    use crate::bindings;
+
+    /// Hooks allowed in the [`super::Family::NetDev`] family.
+    #[repr(u32)]
+    pub enum Hook {
+        /// All inbound packets through the given device.
+        Ingress = bindings::nf_dev_hooks_NF_NETDEV_INGRESS,
+
+        /// All outbound packets through the given device.
+        Egress = bindings::nf_dev_hooks_NF_NETDEV_EGRESS,
+    }
+}
+
+/// Definitions used when defining hooks for the [`Family::Ipv4`] family.
+pub mod ipv4 {
+    use crate::bindings;
+
+    /// Hooks allowed in [`super::Family::Ipv4`] family.
+    pub type Hook = super::inet::Hook;
+
+    /// The base priority for [`super::Family::Ipv4`] hooks.
+    ///
+    /// The actual priority is the base priority plus the priority specified when registering.
+    #[repr(i32)]
+    pub enum PriorityBase {
+        /// Same as the `NF_IP_PRI_FIRST` C constant.
+        First = bindings::nf_ip_hook_priorities_NF_IP_PRI_FIRST,
+
+        /// Same as the `NF_IP_PRI_RAW_BEFORE_DEFRAG` C constant.
+        RawBeforeDefrag = bindings::nf_ip_hook_priorities_NF_IP_PRI_RAW_BEFORE_DEFRAG,
+
+        /// Same as the `NF_IP_PRI_CONNTRACK_DEFRAG` C constant.
+        ConnTrackDefrag = bindings::nf_ip_hook_priorities_NF_IP_PRI_CONNTRACK_DEFRAG,
+
+        /// Same as the `NF_IP_PRI_RAW` C constant.
+        Raw = bindings::nf_ip_hook_priorities_NF_IP_PRI_RAW,
+
+        /// Same as the `NF_IP_PRI_SELINUX_FIRST` C constant.
+        SeLinuxFirst = bindings::nf_ip_hook_priorities_NF_IP_PRI_SELINUX_FIRST,
+
+        /// Same as the `NF_IP_PRI_CONNTRACK` C constant.
+        ConnTrack = bindings::nf_ip_hook_priorities_NF_IP_PRI_CONNTRACK,
+
+        /// Same as the `NF_IP_PRI_MANGLE` C constant.
+        Mangle = bindings::nf_ip_hook_priorities_NF_IP_PRI_MANGLE,
+
+        /// Same as the `NF_IP_PRI_NAT_DST` C constant.
+        NatDst = bindings::nf_ip_hook_priorities_NF_IP_PRI_NAT_DST,
+
+        /// Same as the `NF_IP_PRI_FILTER` C constant.
+        Filter = bindings::nf_ip_hook_priorities_NF_IP_PRI_FILTER,
+
+        /// Same as the `NF_IP_PRI_SECURITY` C constant.
+        Security = bindings::nf_ip_hook_priorities_NF_IP_PRI_SECURITY,
+
+        /// Same as the `NF_IP_PRI_NAT_SRC` C constant.
+        NatSrc = bindings::nf_ip_hook_priorities_NF_IP_PRI_NAT_SRC,
+
+        /// Same as the `NF_IP_PRI_SELINUX_LAST` C constant.
+        SeLinuxLast = bindings::nf_ip_hook_priorities_NF_IP_PRI_SELINUX_LAST,
+
+        /// Same as the `NF_IP_PRI_CONNTRACK_HELPER` C constant.
+        ConnTrackHelper = bindings::nf_ip_hook_priorities_NF_IP_PRI_CONNTRACK_HELPER,
+
+        /// Same as the `NF_IP_PRI_LAST` and `NF_IP_PRI_CONNTRACK_CONFIRM` C constants.
+        Last = bindings::nf_ip_hook_priorities_NF_IP_PRI_LAST,
+    }
+}
+
+/// Definitions used when defining hooks for the [`Family::Ipv6`] family.
+pub mod ipv6 {
+    use crate::bindings;
+
+    /// Hooks allowed in [`super::Family::Ipv6`] family.
+    pub type Hook = super::inet::Hook;
+
+    /// The base priority for [`super::Family::Ipv6`] hooks.
+    ///
+    /// The actual priority is the base priority plus the priority specified when registering.
+    #[repr(i32)]
+    pub enum PriorityBase {
+        /// Same as the `NF_IP6_PRI_FIRST` C constant.
+        First = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_FIRST,
+
+        /// Same as the `NF_IP6_PRI_RAW_BEFORE_DEFRAG` C constant.
+        RawBeforeDefrag = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_RAW_BEFORE_DEFRAG,
+
+        /// Same as the `NF_IP6_PRI_CONNTRACK_DEFRAG` C constant.
+        ConnTrackDefrag = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_CONNTRACK_DEFRAG,
+
+        /// Same as the `NF_IP6_PRI_RAW` C constant.
+        Raw = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_RAW,
+
+        /// Same as the `NF_IP6_PRI_SELINUX_FIRST` C constant.
+        SeLinuxFirst = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_SELINUX_FIRST,
+
+        /// Same as the `NF_IP6_PRI_CONNTRACK` C constant.
+        ConnTrack = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_CONNTRACK,
+
+        /// Same as the `NF_IP6_PRI_MANGLE` C constant.
+        Mangle = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_MANGLE,
+
+        /// Same as the `NF_IP6_PRI_NAT_DST` C constant.
+        NatDst = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_NAT_DST,
+
+        /// Same as the `NF_IP6_PRI_FILTER` C constant.
+        Filter = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_FILTER,
+
+        /// Same as the `NF_IP6_PRI_SECURITY` C constant.
+        Security = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_SECURITY,
+
+        /// Same as the `NF_IP6_PRI_NAT_SRC` C constant.
+        NatSrc = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_NAT_SRC,
+
+        /// Same as the `NF_IP6_PRI_SELINUX_LAST` C constant.
+        SeLinuxLast = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_SELINUX_LAST,
+
+        /// Same as the `NF_IP6_PRI_CONNTRACK_HELPER` C constant.
+        ConnTrackHelper = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_CONNTRACK_HELPER,
+
+        /// Same as the `NF_IP6_PRI_LAST` C constant.
+        Last = bindings::nf_ip6_hook_priorities_NF_IP6_PRI_LAST,
+    }
+}
+
+/// Definitions used when defining hooks for the [`Family::Arp`] family.
+pub mod arp {
+    use crate::bindings;
+
+    /// Hooks allowed in the [`super::Family::Arp`] family.
+    #[repr(u32)]
+    pub enum Hook {
+        /// Inbound ARP packets.
+        In = bindings::NF_ARP_IN,
+
+        /// Outbound ARP packets.
+        Out = bindings::NF_ARP_OUT,
+
+        /// Forwarded ARP packets.
+        Forward = bindings::NF_ARP_FORWARD,
+    }
+}
+
+/// Definitions used when defining hooks for the [`Family::INet`] family.
+pub mod inet {
+    use crate::bindings;
+
+    /// Hooks allowed in the [`super::Family::INet`], [`super::Family::Ipv4`], and
+    /// [`super::Family::Ipv6`] families.
+    #[repr(u32)]
+    pub enum Hook {
+        /// Inbound packets before routing decisions are made (i.e., before it's determined if the
+        /// packet is to be delivered locally or forwarded to another host).
+        PreRouting = bindings::nf_inet_hooks_NF_INET_PRE_ROUTING as _,
+
+        /// Inbound packets that are meant to be delivered locally.
+        LocalIn = bindings::nf_inet_hooks_NF_INET_LOCAL_IN as _,
+
+        /// Inbound packets that are meant to be forwarded to another host.
+ Forward = bindings::nf_inet_hooks_NF_INET_FORWARD as _, + + /// Outbound packet created by the local networking stack. + LocalOut = bindings::nf_inet_hooks_NF_INET_LOCAL_OUT as _, + + /// All outbound packets (i.e., generated locally or being forwarded to another host). + PostRouting = bindings::nf_inet_hooks_NF_INET_POST_ROUTING as _, + + /// Equivalent to [`super::netdev::Hook::Ingress`], so a device must be specified. Packets + /// of all types (not just ipv4/ipv6) will be delivered to the filter. + Ingress = bindings::nf_inet_hooks_NF_INET_INGRESS as _, + } +} diff --git a/rust/kernel/of.rs b/rust/kernel/of.rs new file mode 100644 index 00000000000000..cdcd8324433769 --- /dev/null +++ b/rust/kernel/of.rs @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Devicetree and Open Firmware abstractions. +//! +//! C header: [`include/linux/of_*.h`](../../../../include/linux/of_*.h) + +use crate::{bindings, driver, str::BStr}; + +/// An open firmware device id. +#[derive(Clone, Copy)] +pub enum DeviceId { + /// An open firmware device id where only a compatible string is specified. + Compatible(&'static BStr), +} + +/// Defines a const open firmware device id table that also carries per-entry data/context/info. +/// +/// The name of the const is `OF_DEVICE_ID_TABLE`, which is what buses are expected to name their +/// open firmware tables. +/// +/// # Examples +/// +/// ``` +/// # use kernel::define_of_id_table; +/// use kernel::of; +/// +/// define_of_id_table! {u32, [ +/// (of::DeviceId::Compatible(b"test-device1,test-device2"), Some(0xff)), +/// (of::DeviceId::Compatible(b"test-device3"), None), +/// ]}; +/// ``` +#[macro_export] +macro_rules! define_of_id_table { + ($data_type:ty, $($t:tt)*) => { + $crate::define_id_table!(OF_DEVICE_ID_TABLE, $crate::of::DeviceId, $data_type, $($t)*); + }; +} + +// SAFETY: `ZERO` is all zeroed-out and `to_rawid` stores `offset` in `of_device_id::data`. 
+unsafe impl const driver::RawDeviceId for DeviceId { + type RawType = bindings::of_device_id; + const ZERO: Self::RawType = bindings::of_device_id { + name: [0; 32], + type_: [0; 32], + compatible: [0; 128], + data: core::ptr::null(), + }; + + fn to_rawid(&self, offset: isize) -> Self::RawType { + let DeviceId::Compatible(compatible) = self; + let mut id = Self::ZERO; + let mut i = 0; + while i < compatible.len() { + // If `compatible` does not fit in `id.compatible`, an "index out of bounds" build time + // error will be triggered. + id.compatible[i] = compatible[i] as _; + i += 1; + } + id.compatible[i] = b'\0' as _; + id.data = offset as _; + id + } +} diff --git a/rust/kernel/pages.rs b/rust/kernel/pages.rs new file mode 100644 index 00000000000000..91def8ed062a20 --- /dev/null +++ b/rust/kernel/pages.rs @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Kernel page allocation and management. +//! +//! TODO: This module is a work in progress. + +use crate::{ + bindings, c_types, error::code::*, io_buffer::IoBufferReader, user_ptr::UserSlicePtrReader, + Result, PAGE_SIZE, +}; +use core::{marker::PhantomData, ptr}; + +/// A set of physical pages. +/// +/// `Pages` holds a reference to a set of pages of order `ORDER`. Having the order as a generic +/// const allows the struct to have the same size as a pointer. +/// +/// # Invariants +/// +/// The pointer `Pages::pages` is valid and points to 2^ORDER pages. +pub struct Pages { + pub(crate) pages: *mut bindings::page, +} + +impl Pages { + /// Allocates a new set of contiguous pages. + pub fn new() -> Result { + // TODO: Consider whether we want to allow callers to specify flags. + // SAFETY: This only allocates pages. We check that it succeeds in the next statement. 
+ let pages = unsafe { + bindings::alloc_pages( + bindings::GFP_KERNEL | bindings::__GFP_ZERO | bindings::__GFP_HIGHMEM, + ORDER, + ) + }; + if pages.is_null() { + return Err(ENOMEM); + } + // INVARIANT: We checked that the allocation above succeeded. + Ok(Self { pages }) + } + + /// Copies data from the given [`UserSlicePtrReader`] into the pages. + pub fn copy_into_page( + &self, + reader: &mut UserSlicePtrReader, + offset: usize, + len: usize, + ) -> Result { + // TODO: For now this only works on the first page. + let end = offset.checked_add(len).ok_or(EINVAL)?; + if end > PAGE_SIZE { + return Err(EINVAL); + } + + let mapping = self.kmap(0).ok_or(EINVAL)?; + + // SAFETY: We ensured that the buffer was valid with the check above. + unsafe { reader.read_raw((mapping.ptr as usize + offset) as _, len) }?; + Ok(()) + } + + /// Maps the pages and reads from them into the given buffer. + /// + /// # Safety + /// + /// Callers must ensure that the destination buffer is valid for the given length. + /// Additionally, if the raw buffer is intended to be recast, they must ensure that the data + /// can be safely cast; [`crate::io_buffer::ReadableFromBytes`] has more details about it. + pub unsafe fn read(&self, dest: *mut u8, offset: usize, len: usize) -> Result { + // TODO: For now this only works on the first page. + let end = offset.checked_add(len).ok_or(EINVAL)?; + if end > PAGE_SIZE { + return Err(EINVAL); + } + + let mapping = self.kmap(0).ok_or(EINVAL)?; + unsafe { ptr::copy((mapping.ptr as *mut u8).add(offset), dest, len) }; + Ok(()) + } + + /// Maps the pages and writes into them from the given buffer. + /// + /// # Safety + /// + /// Callers must ensure that the buffer is valid for the given length. Additionally, if the + /// page is (or will be) mapped by userspace, they must ensure that no kernel data is leaked + /// through padding if it was cast from another type; [`crate::io_buffer::WritableToBytes`] has + /// more details about it. 
+ pub unsafe fn write(&self, src: *const u8, offset: usize, len: usize) -> Result { + // TODO: For now this only works on the first page. + let end = offset.checked_add(len).ok_or(EINVAL)?; + if end > PAGE_SIZE { + return Err(EINVAL); + } + + let mapping = self.kmap(0).ok_or(EINVAL)?; + unsafe { ptr::copy(src, (mapping.ptr as *mut u8).add(offset), len) }; + Ok(()) + } + + /// Maps the page at index `index`. + fn kmap(&self, index: usize) -> Option> { + if index >= 1usize << ORDER { + return None; + } + + // SAFETY: We checked above that `index` is within range. + let page = unsafe { self.pages.add(index) }; + + // SAFETY: `page` is valid based on the checks above. + let ptr = unsafe { bindings::kmap(page) }; + if ptr.is_null() { + return None; + } + + Some(PageMapping { + page, + ptr, + _phantom: PhantomData, + }) + } +} + +impl Drop for Pages { + fn drop(&mut self) { + // SAFETY: By the type invariants, we know the pages are allocated with the given order. + unsafe { bindings::__free_pages(self.pages, ORDER) }; + } +} + +struct PageMapping<'a> { + page: *mut bindings::page, + ptr: *mut c_types::c_void, + _phantom: PhantomData<&'a i32>, +} + +impl Drop for PageMapping<'_> { + fn drop(&mut self) { + // SAFETY: An instance of `PageMapping` is created only when `kmap` succeeded for the given + // page, so it is safe to unmap it here. + unsafe { bindings::kunmap(self.page) }; + } +} diff --git a/rust/kernel/platform.rs b/rust/kernel/platform.rs new file mode 100644 index 00000000000000..586cb8f27c3f40 --- /dev/null +++ b/rust/kernel/platform.rs @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Platform devices and drivers. +//! +//! Also called `platdev`, `pdev`. +//! +//! 
C header: [`include/linux/platform_device.h`](../../../../include/linux/platform_device.h) + +use crate::{ + bindings, c_types, + device::{self, RawDevice}, + driver, + error::{from_kernel_result, Result}, + of, + str::CStr, + to_result, + types::PointerWrapper, + ThisModule, +}; + +/// A registration of a platform driver. +pub type Registration = driver::Registration>; + +/// An adapter for the registration of platform drivers. +pub struct Adapter(T); + +impl driver::DriverOps for Adapter { + type RegType = bindings::platform_driver; + + unsafe fn register( + reg: *mut bindings::platform_driver, + name: &'static CStr, + module: &'static ThisModule, + ) -> Result { + // SAFETY: By the safety requirements of this function (defined in the trait definition), + // `reg` is non-null and valid. + let pdrv = unsafe { &mut *reg }; + + pdrv.driver.name = name.as_char_ptr(); + pdrv.probe = Some(Self::probe_callback); + pdrv.remove = Some(Self::remove_callback); + if let Some(t) = T::OF_DEVICE_ID_TABLE { + pdrv.driver.of_match_table = t.as_ref(); + } + // SAFETY: + // - `pdrv` lives at least until the call to `platform_driver_unregister()` returns. + // - `name` pointer has static lifetime. + // - `module.0` lives at least as long as the module. + // - `probe()` and `remove()` are static functions. + // - `of_match_table` is either a raw pointer with static lifetime, + // as guaranteed by the [`driver::IdTable`] type, or null. + to_result(|| unsafe { bindings::__platform_driver_register(reg, module.0) }) + } + + unsafe fn unregister(reg: *mut bindings::platform_driver) { + // SAFETY: By the safety requirements of this function (defined in the trait definition), + // `reg` was passed (and updated) by a previous successful call to + // `platform_driver_register`. 
+ unsafe { bindings::platform_driver_unregister(reg) }; + } +} + +impl Adapter { + fn get_id_info(dev: &Device) -> Option<&'static T::IdInfo> { + let table = T::OF_DEVICE_ID_TABLE?; + + // SAFETY: `table` has static lifetime, so it is valid for read. `dev` is guaranteed to be + // valid while it's alive, so is the raw device returned by it. + let id = unsafe { bindings::of_match_device(table.as_ref(), dev.raw_device()) }; + if id.is_null() { + return None; + } + + // SAFETY: `id` is a pointer within the static table, so it's always valid. + let offset = unsafe { (*id).data }; + if offset.is_null() { + return None; + } + + // SAFETY: The offset comes from a previous call to `offset_from` in `IdArray::new`, which + // guarantees that the resulting pointer is within the table. + let ptr = unsafe { + id.cast::() + .offset(offset as _) + .cast::>() + }; + + // SAFETY: The id table has a static lifetime, so `ptr` is guaranteed to be valid for read. + unsafe { (&*ptr).as_ref() } + } + + extern "C" fn probe_callback(pdev: *mut bindings::platform_device) -> c_types::c_int { + from_kernel_result! { + // SAFETY: `pdev` is valid by the contract with the C code. `dev` is alive only for the + // duration of this call, so it is guaranteed to remain alive for the lifetime of + // `pdev`. + let mut dev = unsafe { Device::from_ptr(pdev) }; + let info = Self::get_id_info(&dev); + let data = T::probe(&mut dev, info)?; + // SAFETY: `pdev` is guaranteed to be a valid, non-null pointer. + unsafe { bindings::platform_set_drvdata(pdev, data.into_pointer() as _) }; + Ok(0) + } + } + + extern "C" fn remove_callback(pdev: *mut bindings::platform_device) -> c_types::c_int { + from_kernel_result! { + // SAFETY: `pdev` is guaranteed to be a valid, non-null pointer. + let ptr = unsafe { bindings::platform_get_drvdata(pdev) }; + // SAFETY: + // - we allocated this pointer using `T::Data::into_pointer`, + // so it is safe to turn back into a `T::Data`. 
+ // - the allocation happened in `probe`, no-one freed the memory, + // `remove` is the canonical kernel location to free driver data. so OK + // to convert the pointer back to a Rust structure here. + let data = unsafe { T::Data::from_pointer(ptr) }; + let ret = T::remove(&data); + ::device_remove(&data); + ret?; + Ok(0) + } + } +} + +/// A platform driver. +pub trait Driver { + /// Data stored on device by driver. + /// + /// Corresponds to the data set or retrieved via the kernel's + /// `platform_{set,get}_drvdata()` functions. + /// + /// Require that `Data` implements `PointerWrapper`. We guarantee to + /// never move the underlying wrapped data structure. This allows + type Data: PointerWrapper + Send + Sync + driver::DeviceRemoval = (); + + /// The type holding information about each device id supported by the driver. + type IdInfo: 'static = (); + + /// The table of device ids supported by the driver. + const OF_DEVICE_ID_TABLE: Option> = None; + + /// Platform driver probe. + /// + /// Called when a new platform device is added or discovered. + /// Implementers should attempt to initialize the device here. + fn probe(dev: &mut Device, id_info: Option<&Self::IdInfo>) -> Result; + + /// Platform driver remove. + /// + /// Called when a platform device is removed. + /// Implementers should prepare the device for complete removal here. + fn remove(_data: &Self::Data) -> Result { + Ok(()) + } +} + +/// A platform device. +/// +/// # Invariants +/// +/// The field `ptr` is non-null and valid for the lifetime of the object. +pub struct Device { + ptr: *mut bindings::platform_device, +} + +impl Device { + /// Creates a new device from the given pointer. + /// + /// # Safety + /// + /// `ptr` must be non-null and valid. It must remain valid for the lifetime of the returned + /// instance. + unsafe fn from_ptr(ptr: *mut bindings::platform_device) -> Self { + // INVARIANT: The safety requirements of the function ensure the lifetime invariant. 
+ Self { ptr } + } + + /// Returns id of the platform device. + pub fn id(&self) -> i32 { + // SAFETY: By the type invariants, we know that `self.ptr` is non-null and valid. + unsafe { (*self.ptr).id } + } +} + +// SAFETY: The device returned by `raw_device` is the raw platform device. +unsafe impl device::RawDevice for Device { + fn raw_device(&self) -> *mut bindings::device { + // SAFETY: By the type invariants, we know that `self.ptr` is non-null and valid. + unsafe { &mut (*self.ptr).dev } + } +} + +/// Declares a kernel module that exposes a single platform driver. +/// +/// # Examples +/// +/// ```ignore +/// # use kernel::{platform, define_of_id_table, module_platform_driver}; +/// # +/// struct MyDriver; +/// impl platform::Driver for MyDriver { +/// // [...] +/// # fn probe(_dev: &mut platform::Device, _id_info: Option<&Self::IdInfo>) -> Result { +/// # Ok(()) +/// # } +/// # define_of_id_table! {(), [ +/// # (of::DeviceId::Compatible(b"brcm,bcm2835-rng"), None), +/// # ]} +/// } +/// +/// module_platform_driver! { +/// type: MyDriver, +/// name: b"module_name", +/// author: b"Author name", +/// license: b"GPL", +/// } +/// ``` +#[macro_export] +macro_rules! module_platform_driver { + ($($f:tt)*) => { + $crate::module_driver!(, $crate::platform::Adapter, { $($f)* }); + }; +} diff --git a/rust/kernel/power.rs b/rust/kernel/power.rs new file mode 100644 index 00000000000000..e318b5d9f0c0cc --- /dev/null +++ b/rust/kernel/power.rs @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Power management interfaces. +//! +//! C header: [`include/linux/pm.h`](../../../../include/linux/pm.h) + +#![allow(dead_code)] + +use crate::{bindings, c_types, error::from_kernel_result, types::PointerWrapper, Result}; +use core::marker::PhantomData; + +/// Corresponds to the kernel's `struct dev_pm_ops`. +/// +/// It is meant to be implemented by drivers that support power-management operations. 
+pub trait Operations { + /// The type of the context data stored by the driver on each device. + type Data: PointerWrapper + Sync + Send; + + /// Called before the system goes into a sleep state. + fn suspend(_data: ::Borrowed<'_>) -> Result { + Ok(()) + } + + /// Called after the system comes back from a sleep state. + fn resume(_data: ::Borrowed<'_>) -> Result { + Ok(()) + } + + /// Called before creating a hibernation image. + fn freeze(_data: ::Borrowed<'_>) -> Result { + Ok(()) + } + + /// Called after the system is restored from a hibernation image. + fn restore(_data: ::Borrowed<'_>) -> Result { + Ok(()) + } +} + +macro_rules! pm_callback { + ($callback:ident, $method:ident) => { + unsafe extern "C" fn $callback( + dev: *mut bindings::device, + ) -> c_types::c_int { + from_kernel_result! { + // SAFETY: `dev` is valid as it was passed in by the C portion. + let ptr = unsafe { bindings::dev_get_drvdata(dev) }; + // SAFETY: By the safety requirements of `OpsTable::build`, we know that `ptr` came + // from a previous call to `T::Data::into_pointer`. 
+ let data = unsafe { T::Data::borrow(ptr) }; + T::$method(data)?; + Ok(0) + } + } + }; +} + +pm_callback!(suspend_callback, suspend); +pm_callback!(resume_callback, resume); +pm_callback!(freeze_callback, freeze); +pm_callback!(restore_callback, restore); + +pub(crate) struct OpsTable(PhantomData<*const T>); + +impl OpsTable { + const VTABLE: bindings::dev_pm_ops = bindings::dev_pm_ops { + prepare: None, + complete: None, + suspend: Some(suspend_callback::), + resume: Some(resume_callback::), + freeze: Some(freeze_callback::), + thaw: None, + poweroff: None, + restore: Some(restore_callback::), + suspend_late: None, + resume_early: None, + freeze_late: None, + thaw_early: None, + poweroff_late: None, + restore_early: None, + suspend_noirq: None, + resume_noirq: None, + freeze_noirq: None, + thaw_noirq: None, + poweroff_noirq: None, + restore_noirq: None, + runtime_suspend: None, + runtime_resume: None, + runtime_idle: None, + }; + + /// Builds an instance of `struct dev_pm_ops`. + /// + /// # Safety + /// + /// The caller must ensure that `dev_get_drvdata` will result in a value returned by + /// [`T::Data::into_pointer`]. + pub(crate) const unsafe fn build() -> &'static bindings::dev_pm_ops { + &Self::VTABLE + } +} + +/// Implements the [`Operations`] trait as no-ops. +/// +/// This is useful when one doesn't want to provide the implementation of any power-manager related +/// operation. +pub struct NoOperations(PhantomData); + +impl Operations for NoOperations { + type Data = T; +} + +// SAFETY: `NoOperations` provides no functionality, it is safe to send a reference to it to +// different threads. +unsafe impl Sync for NoOperations {} + +// SAFETY: `NoOperations` provides no functionality, it is safe to send it to different threads. 
+unsafe impl Send for NoOperations {} diff --git a/rust/kernel/prelude.rs b/rust/kernel/prelude.rs new file mode 100644 index 00000000000000..a02b9a9d1937cc --- /dev/null +++ b/rust/kernel/prelude.rs @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! The `kernel` prelude. +//! +//! These are the most common items used by Rust code in the kernel, +//! intended to be imported by all Rust code, for convenience. +//! +//! # Examples +//! +//! ``` +//! use kernel::prelude::*; +//! ``` + +pub use core::pin::Pin; + +pub use alloc::{boxed::Box, string::String, vec::Vec}; + +pub use macros::module; + +pub use super::build_assert; + +pub use super::{ + dbg, dev_alert, dev_crit, dev_dbg, dev_emerg, dev_err, dev_info, dev_notice, dev_warn, fmt, + pr_alert, pr_crit, pr_debug, pr_emerg, pr_err, pr_info, pr_notice, pr_warn, +}; + +pub use super::module_misc_device; + +#[cfg(CONFIG_ARM_AMBA)] +pub use super::module_amba_driver; + +pub use super::static_assert; + +pub use super::{error::code::*, Error, Result}; + +pub use super::{str::CStr, ARef, ThisModule}; diff --git a/rust/kernel/print.rs b/rust/kernel/print.rs new file mode 100644 index 00000000000000..9846bd13eab69e --- /dev/null +++ b/rust/kernel/print.rs @@ -0,0 +1,405 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Printing facilities. +//! +//! C header: [`include/linux/printk.h`](../../../../include/linux/printk.h) +//! +//! Reference: + +use core::fmt; + +use crate::{ + c_types::{c_char, c_void}, + str::RawFormatter, +}; + +#[cfg(CONFIG_PRINTK)] +use crate::bindings; + +// Called from `vsprintf` with format specifier `%pA`. +#[no_mangle] +unsafe fn rust_fmt_argument(buf: *mut c_char, end: *mut c_char, ptr: *const c_void) -> *mut c_char { + use fmt::Write; + // SAFETY: The C contract guarantees that `buf` is valid if it's less than `end`. 
+ let mut w = unsafe { RawFormatter::from_ptrs(buf.cast(), end.cast()) }; + let _ = w.write_fmt(unsafe { *(ptr as *const fmt::Arguments<'_>) }); + w.pos().cast() +} + +/// Format strings. +/// +/// Public but hidden since it should only be used from public macros. +#[doc(hidden)] +pub mod format_strings { + use crate::bindings; + + /// The length we copy from the `KERN_*` kernel prefixes. + const LENGTH_PREFIX: usize = 2; + + /// The length of the fixed format strings. + pub const LENGTH: usize = 10; + + /// Generates a fixed format string for the kernel's [`_printk`]. + /// + /// The format string is always the same for a given level, i.e. for a + /// given `prefix`, which are the kernel's `KERN_*` constants. + /// + /// [`_printk`]: ../../../../include/linux/printk.h + const fn generate(is_cont: bool, prefix: &[u8; 3]) -> [u8; LENGTH] { + // Ensure the `KERN_*` macros are what we expect. + assert!(prefix[0] == b'\x01'); + if is_cont { + assert!(prefix[1] == b'c'); + } else { + assert!(prefix[1] >= b'0' && prefix[1] <= b'7'); + } + assert!(prefix[2] == b'\x00'); + + let suffix: &[u8; LENGTH - LENGTH_PREFIX] = if is_cont { + b"%pA\0\0\0\0\0" + } else { + b"%s: %pA\0" + }; + + [ + prefix[0], prefix[1], suffix[0], suffix[1], suffix[2], suffix[3], suffix[4], suffix[5], + suffix[6], suffix[7], + ] + } + + // Generate the format strings at compile-time. + // + // This avoids the compiler generating the contents on the fly in the stack. + // + // Furthermore, `static` instead of `const` is used to share the strings + // for all the kernel. 
+ pub static EMERG: [u8; LENGTH] = generate(false, bindings::KERN_EMERG); + pub static ALERT: [u8; LENGTH] = generate(false, bindings::KERN_ALERT); + pub static CRIT: [u8; LENGTH] = generate(false, bindings::KERN_CRIT); + pub static ERR: [u8; LENGTH] = generate(false, bindings::KERN_ERR); + pub static WARNING: [u8; LENGTH] = generate(false, bindings::KERN_WARNING); + pub static NOTICE: [u8; LENGTH] = generate(false, bindings::KERN_NOTICE); + pub static INFO: [u8; LENGTH] = generate(false, bindings::KERN_INFO); + pub static DEBUG: [u8; LENGTH] = generate(false, bindings::KERN_DEBUG); + pub static CONT: [u8; LENGTH] = generate(true, bindings::KERN_CONT); +} + +/// Prints a message via the kernel's [`_printk`]. +/// +/// Public but hidden since it should only be used from public macros. +/// +/// # Safety +/// +/// The format string must be one of the ones in [`format_strings`], and +/// the module name must be null-terminated. +/// +/// [`_printk`]: ../../../../include/linux/printk.h +#[doc(hidden)] +#[cfg_attr(not(CONFIG_PRINTK), allow(unused_variables))] +pub unsafe fn call_printk( + format_string: &[u8; format_strings::LENGTH], + module_name: &[u8], + args: fmt::Arguments<'_>, +) { + // `_printk` does not seem to fail in any path. + #[cfg(CONFIG_PRINTK)] + unsafe { + bindings::_printk( + format_string.as_ptr() as _, + module_name.as_ptr(), + &args as *const _ as *const c_void, + ); + } +} + +/// Prints a message via the kernel's [`_printk`] for the `CONT` level. +/// +/// Public but hidden since it should only be used from public macros. +/// +/// [`_printk`]: ../../../../include/linux/printk.h +#[doc(hidden)] +#[cfg_attr(not(CONFIG_PRINTK), allow(unused_variables))] +pub fn call_printk_cont(args: fmt::Arguments<'_>) { + // `_printk` does not seem to fail in any path. + // + // SAFETY: The format string is fixed. 
+ #[cfg(CONFIG_PRINTK)] + unsafe { + bindings::_printk( + format_strings::CONT.as_ptr() as _, + &args as *const _ as *const c_void, + ); + } +} + +/// Performs formatting and forwards the string to [`call_printk`]. +/// +/// Public but hidden since it should only be used from public macros. +#[doc(hidden)] +#[cfg(not(testlib))] +#[macro_export] +macro_rules! print_macro ( + // The non-continuation cases (most of them, e.g. `INFO`). + ($format_string:path, false, $($arg:tt)+) => ( + // SAFETY: This hidden macro should only be called by the documented + // printing macros which ensure the format string is one of the fixed + // ones. All `__LOG_PREFIX`s are null-terminated as they are generated + // by the `module!` proc macro or fixed values defined in a kernel + // crate. + unsafe { + $crate::print::call_printk( + &$format_string, + crate::__LOG_PREFIX, + format_args!($($arg)+), + ); + } + ); + + // The `CONT` case. + ($format_string:path, true, $($arg:tt)+) => ( + $crate::print::call_printk_cont( + format_args!($($arg)+), + ); + ); +); + +/// Stub for doctests +#[cfg(testlib)] +#[macro_export] +macro_rules! print_macro ( + ($format_string:path, $e:expr, $($arg:tt)+) => ( + () + ); +); + +// We could use a macro to generate these macros. However, doing so ends +// up being a bit ugly: it requires the dollar token trick to escape `$` as +// well as playing with the `doc` attribute. Furthermore, they cannot be easily +// imported in the prelude due to [1]. So, for the moment, we just write them +// manually, like in the C side; while keeping most of the logic in another +// macro, i.e. [`print_macro`]. +// +// [1]: https://github.com/rust-lang/rust/issues/52234 + +/// Prints an emergency-level message (level 0). +/// +/// Use this level if the system is unusable. +/// +/// Equivalent to the kernel's [`pr_emerg`] macro. +/// +/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and +/// [`alloc::format!`] for information about the formatting syntax. 
+/// +/// [`pr_emerg`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_emerg +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// pr_emerg!("hello {}\n", "there"); +/// ``` +#[macro_export] +macro_rules! pr_emerg ( + ($($arg:tt)*) => ( + $crate::print_macro!($crate::print::format_strings::EMERG, false, $($arg)*) + ) +); + +/// Prints an alert-level message (level 1). +/// +/// Use this level if action must be taken immediately. +/// +/// Equivalent to the kernel's [`pr_alert`] macro. +/// +/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and +/// [`alloc::format!`] for information about the formatting syntax. +/// +/// [`pr_alert`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_alert +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// pr_alert!("hello {}\n", "there"); +/// ``` +#[macro_export] +macro_rules! pr_alert ( + ($($arg:tt)*) => ( + $crate::print_macro!($crate::print::format_strings::ALERT, false, $($arg)*) + ) +); + +/// Prints a critical-level message (level 2). +/// +/// Use this level for critical conditions. +/// +/// Equivalent to the kernel's [`pr_crit`] macro. +/// +/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and +/// [`alloc::format!`] for information about the formatting syntax. +/// +/// [`pr_crit`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_crit +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// pr_crit!("hello {}\n", "there"); +/// ``` +#[macro_export] +macro_rules! pr_crit ( + ($($arg:tt)*) => ( + $crate::print_macro!($crate::print::format_strings::CRIT, false, $($arg)*) + ) +); + +/// Prints an error-level message (level 3). +/// +/// Use this level for error conditions. +/// +/// Equivalent to the kernel's [`pr_err`] macro. 
+/// +/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and +/// [`alloc::format!`] for information about the formatting syntax. +/// +/// [`pr_err`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_err +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// pr_err!("hello {}\n", "there"); +/// ``` +#[macro_export] +macro_rules! pr_err ( + ($($arg:tt)*) => ( + $crate::print_macro!($crate::print::format_strings::ERR, false, $($arg)*) + ) +); + +/// Prints a warning-level message (level 4). +/// +/// Use this level for warning conditions. +/// +/// Equivalent to the kernel's [`pr_warn`] macro. +/// +/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and +/// [`alloc::format!`] for information about the formatting syntax. +/// +/// [`pr_warn`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_warn +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// pr_warn!("hello {}\n", "there"); +/// ``` +#[macro_export] +macro_rules! pr_warn ( + ($($arg:tt)*) => ( + $crate::print_macro!($crate::print::format_strings::WARNING, false, $($arg)*) + ) +); + +/// Prints a notice-level message (level 5). +/// +/// Use this level for normal but significant conditions. +/// +/// Equivalent to the kernel's [`pr_notice`] macro. +/// +/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and +/// [`alloc::format!`] for information about the formatting syntax. +/// +/// [`pr_notice`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_notice +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// pr_notice!("hello {}\n", "there"); +/// ``` +#[macro_export] +macro_rules! pr_notice ( + ($($arg:tt)*) => ( + $crate::print_macro!($crate::print::format_strings::NOTICE, false, $($arg)*) + ) +); + +/// Prints an info-level message (level 6). 
+/// +/// Use this level for informational messages. +/// +/// Equivalent to the kernel's [`pr_info`] macro. +/// +/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and +/// [`alloc::format!`] for information about the formatting syntax. +/// +/// [`pr_info`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_info +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// pr_info!("hello {}\n", "there"); +/// ``` +#[macro_export] +#[doc(alias = "print")] +macro_rules! pr_info ( + ($($arg:tt)*) => ( + $crate::print_macro!($crate::print::format_strings::INFO, false, $($arg)*) + ) +); + +/// Prints a debug-level message (level 7). +/// +/// Use this level for debug messages. +/// +/// Equivalent to the kernel's [`pr_debug`] macro, except that it doesn't support dynamic debug +/// yet. +/// +/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and +/// [`alloc::format!`] for information about the formatting syntax. +/// +/// [`pr_debug`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_debug +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// pr_debug!("hello {}\n", "there"); +/// ``` +#[macro_export] +#[doc(alias = "print")] +macro_rules! pr_debug ( + ($($arg:tt)*) => ( + if cfg!(debug_assertions) { + $crate::print_macro!($crate::print::format_strings::DEBUG, false, $($arg)*) + } + ) +); + +/// Continues a previous log message in the same line. +/// +/// Use only when continuing a previous `pr_*!` macro (e.g. [`pr_info!`]). +/// +/// Equivalent to the kernel's [`pr_cont`] macro. +/// +/// Mimics the interface of [`std::print!`]. See [`core::fmt`] and +/// [`alloc::format!`] for information about the formatting syntax. 
+/// +/// [`pr_cont`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html#c.pr_cont +/// [`std::print!`]: https://doc.rust-lang.org/std/macro.print.html +/// +/// # Examples +/// +/// ``` +/// # use kernel::pr_cont; +/// pr_info!("hello"); +/// pr_cont!(" {}\n", "there"); +/// ``` +#[macro_export] +macro_rules! pr_cont ( + ($($arg:tt)*) => ( + $crate::print_macro!($crate::print::format_strings::CONT, true, $($arg)*) + ) +); diff --git a/rust/kernel/random.rs b/rust/kernel/random.rs new file mode 100644 index 00000000000000..a0926cb68a7562 --- /dev/null +++ b/rust/kernel/random.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Random numbers. +//! +//! C header: [`include/linux/random.h`](../../../../include/linux/random.h) + +use crate::{bindings, c_types, error::code::*, Error, Result}; + +/// Fills a byte slice with random bytes generated from the kernel's CSPRNG. +/// +/// Ensures that the CSPRNG has been seeded before generating any random bytes, +/// and will block until it is ready. +pub fn getrandom(dest: &mut [u8]) -> Result { + let res = unsafe { bindings::wait_for_random_bytes() }; + if res != 0 { + return Err(Error::from_kernel_errno(res)); + } + + unsafe { + bindings::get_random_bytes(dest.as_mut_ptr() as *mut c_types::c_void, dest.len()); + } + Ok(()) +} + +/// Fills a byte slice with random bytes generated from the kernel's CSPRNG. +/// +/// If the CSPRNG is not yet seeded, returns an `Err(EAGAIN)` immediately. +pub fn getrandom_nonblock(dest: &mut [u8]) -> Result { + if !unsafe { bindings::rng_is_initialized() } { + return Err(EAGAIN); + } + getrandom(dest) +} + +/// Contributes the contents of a byte slice to the kernel's entropy pool. +/// +/// Does *not* credit the kernel entropy counter though. 
+pub fn add_randomness(data: &[u8]) { + unsafe { + bindings::add_device_randomness(data.as_ptr() as *const c_types::c_void, data.len()); + } +} diff --git a/rust/kernel/raw_list.rs b/rust/kernel/raw_list.rs new file mode 100644 index 00000000000000..267b21709c296a --- /dev/null +++ b/rust/kernel/raw_list.rs @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Raw lists. +//! +//! TODO: This module is a work in progress. + +use core::{ + cell::UnsafeCell, + ptr, + ptr::NonNull, + sync::atomic::{AtomicBool, Ordering}, +}; + +/// A descriptor of list elements. +/// +/// It describes the type of list elements and provides a function to determine how to get the +/// links to be used on a list. +/// +/// A type that may be in multiple lists simultaneously needs to implement one of these for each +/// simultaneous list. +pub trait GetLinks { + /// The type of the entries in the list. + type EntryType: ?Sized; + + /// Returns the links to be used when linking an entry within a list. + fn get_links(data: &Self::EntryType) -> &Links; +} + +/// The links used to link an object on a linked list. +/// +/// Instances of this type are usually embedded in structures and returned in calls to +/// [`GetLinks::get_links`]. +pub struct Links { + inserted: AtomicBool, + entry: UnsafeCell>, +} + +impl Links { + /// Constructs a new [`Links`] instance that isn't inserted on any lists yet. 
+ pub fn new() -> Self { + Self { + inserted: AtomicBool::new(false), + entry: UnsafeCell::new(ListEntry::new()), + } + } + + fn acquire_for_insertion(&self) -> bool { + self.inserted + .compare_exchange(false, true, Ordering::Acquire, Ordering::Relaxed) + .is_ok() + } + + fn release_after_removal(&self) { + self.inserted.store(false, Ordering::Release); + } +} + +impl Default for Links { + fn default() -> Self { + Self::new() + } +} + +struct ListEntry { + next: Option>, + prev: Option>, +} + +impl ListEntry { + fn new() -> Self { + Self { + next: None, + prev: None, + } + } +} + +/// A linked list. +/// +/// # Invariants +/// +/// The links of objects added to a list are owned by the list. +pub(crate) struct RawList { + head: Option>, +} + +impl RawList { + pub(crate) fn new() -> Self { + Self { head: None } + } + + pub(crate) fn is_empty(&self) -> bool { + self.head.is_none() + } + + fn insert_after_priv( + &mut self, + existing: &G::EntryType, + new_entry: &mut ListEntry, + new_ptr: Option>, + ) { + { + // SAFETY: It's safe to get the previous entry of `existing` because the list cannot + // change. + let existing_links = unsafe { &mut *G::get_links(existing).entry.get() }; + new_entry.next = existing_links.next; + existing_links.next = new_ptr; + } + + new_entry.prev = Some(NonNull::from(existing)); + + // SAFETY: It's safe to get the next entry of `existing` because the list cannot change. + let next_links = + unsafe { &mut *G::get_links(new_entry.next.unwrap().as_ref()).entry.get() }; + next_links.prev = new_ptr; + } + + /// Inserts the given object after `existing`. + /// + /// # Safety + /// + /// Callers must ensure that `existing` points to a valid entry that is on the list. + pub(crate) unsafe fn insert_after( + &mut self, + existing: &G::EntryType, + new: &G::EntryType, + ) -> bool { + let links = G::get_links(new); + if !links.acquire_for_insertion() { + // Nothing to do if already inserted. 
+ return false; + } + + // SAFETY: The links are now owned by the list, so it is safe to get a mutable reference. + let new_entry = unsafe { &mut *links.entry.get() }; + self.insert_after_priv(existing, new_entry, Some(NonNull::from(new))); + true + } + + fn push_back_internal(&mut self, new: &G::EntryType) -> bool { + let links = G::get_links(new); + if !links.acquire_for_insertion() { + // Nothing to do if already inserted. + return false; + } + + // SAFETY: The links are now owned by the list, so it is safe to get a mutable reference. + let new_entry = unsafe { &mut *links.entry.get() }; + let new_ptr = Some(NonNull::from(new)); + match self.back() { + // SAFETY: `back` is valid as the list cannot change. + Some(back) => self.insert_after_priv(unsafe { back.as_ref() }, new_entry, new_ptr), + None => { + self.head = new_ptr; + new_entry.next = new_ptr; + new_entry.prev = new_ptr; + } + } + true + } + + pub(crate) unsafe fn push_back(&mut self, new: &G::EntryType) -> bool { + self.push_back_internal(new) + } + + fn remove_internal(&mut self, data: &G::EntryType) -> bool { + let links = G::get_links(data); + + // SAFETY: The links are now owned by the list, so it is safe to get a mutable reference. + let entry = unsafe { &mut *links.entry.get() }; + let next = if let Some(next) = entry.next { + next + } else { + // Nothing to do if the entry is not on the list. + return false; + }; + + if ptr::eq(data, next.as_ptr()) { + // We're removing the only element. + self.head = None + } else { + // Update the head if we're removing it. + if let Some(raw_head) = self.head { + if ptr::eq(data, raw_head.as_ptr()) { + self.head = Some(next); + } + } + + // SAFETY: It's safe to get the previous entry because the list cannot change. + unsafe { &mut *G::get_links(entry.prev.unwrap().as_ref()).entry.get() }.next = + entry.next; + + // SAFETY: It's safe to get the next entry because the list cannot change. 
+ unsafe { &mut *G::get_links(next.as_ref()).entry.get() }.prev = entry.prev; + } + + // Reset the links of the element we're removing so that we know it's not on any list. + entry.next = None; + entry.prev = None; + links.release_after_removal(); + true + } + + /// Removes the given entry. + /// + /// # Safety + /// + /// Callers must ensure that `data` is either on this list or in no list. It being on another + /// list leads to memory unsafety. + pub(crate) unsafe fn remove(&mut self, data: &G::EntryType) -> bool { + self.remove_internal(data) + } + + fn pop_front_internal(&mut self) -> Option> { + let head = self.head?; + // SAFETY: The head is on the list as we just got it from there and it cannot change. + unsafe { self.remove(head.as_ref()) }; + Some(head) + } + + pub(crate) fn pop_front(&mut self) -> Option> { + self.pop_front_internal() + } + + pub(crate) fn front(&self) -> Option> { + self.head + } + + pub(crate) fn back(&self) -> Option> { + // SAFETY: The links of head are owned by the list, so it is safe to get a reference. + unsafe { &*G::get_links(self.head?.as_ref()).entry.get() }.prev + } + + pub(crate) fn cursor_front(&self) -> Cursor<'_, G> { + Cursor::new(self, self.front()) + } + + pub(crate) fn cursor_front_mut(&mut self) -> CursorMut<'_, G> { + CursorMut::new(self, self.front()) + } +} + +struct CommonCursor { + cur: Option>, +} + +impl CommonCursor { + fn new(cur: Option>) -> Self { + Self { cur } + } + + fn move_next(&mut self, list: &RawList) { + match self.cur.take() { + None => self.cur = list.head, + Some(cur) => { + if let Some(head) = list.head { + // SAFETY: We have a shared ref to the linked list, so the links can't change. 
+ let links = unsafe { &*G::get_links(cur.as_ref()).entry.get() }; + if links.next.unwrap() != head { + self.cur = links.next; + } + } + } + } + } + + fn move_prev(&mut self, list: &RawList) { + match list.head { + None => self.cur = None, + Some(head) => { + let next = match self.cur.take() { + None => head, + Some(cur) => { + if cur == head { + return; + } + cur + } + }; + // SAFETY: There's a shared ref to the list, so the links can't change. + let links = unsafe { &*G::get_links(next.as_ref()).entry.get() }; + self.cur = links.prev; + } + } + } +} + +/// A list cursor that allows traversing a linked list and inspecting elements. +pub struct Cursor<'a, G: GetLinks> { + cursor: CommonCursor, + list: &'a RawList, +} + +impl<'a, G: GetLinks> Cursor<'a, G> { + fn new(list: &'a RawList, cur: Option>) -> Self { + Self { + list, + cursor: CommonCursor::new(cur), + } + } + + /// Returns the element the cursor is currently positioned on. + pub fn current(&self) -> Option<&'a G::EntryType> { + let cur = self.cursor.cur?; + // SAFETY: Objects must be kept alive while on the list. + Some(unsafe { &*cur.as_ptr() }) + } + + /// Moves the cursor to the next element. + pub fn move_next(&mut self) { + self.cursor.move_next(self.list); + } +} + +pub(crate) struct CursorMut<'a, G: GetLinks> { + cursor: CommonCursor, + list: &'a mut RawList, +} + +impl<'a, G: GetLinks> CursorMut<'a, G> { + fn new(list: &'a mut RawList, cur: Option>) -> Self { + Self { + list, + cursor: CommonCursor::new(cur), + } + } + + pub(crate) fn current(&mut self) -> Option<&mut G::EntryType> { + let cur = self.cursor.cur?; + // SAFETY: Objects must be kept alive while on the list. + Some(unsafe { &mut *cur.as_ptr() }) + } + + /// Removes the entry the cursor is pointing to and advances the cursor to the next entry. It + /// returns a raw pointer to the removed element (if one is removed). 
+ pub(crate) fn remove_current(&mut self) -> Option> { + let entry = self.cursor.cur?; + self.cursor.move_next(self.list); + // SAFETY: The entry is on the list as we just got it from there and it cannot change. + unsafe { self.list.remove(entry.as_ref()) }; + Some(entry) + } + + pub(crate) fn peek_next(&mut self) -> Option<&mut G::EntryType> { + let mut new = CommonCursor::new(self.cursor.cur); + new.move_next(self.list); + // SAFETY: Objects must be kept alive while on the list. + Some(unsafe { &mut *new.cur?.as_ptr() }) + } + + pub(crate) fn peek_prev(&mut self) -> Option<&mut G::EntryType> { + let mut new = CommonCursor::new(self.cursor.cur); + new.move_prev(self.list); + // SAFETY: Objects must be kept alive while on the list. + Some(unsafe { &mut *new.cur?.as_ptr() }) + } + + pub(crate) fn move_next(&mut self) { + self.cursor.move_next(self.list); + } +} diff --git a/rust/kernel/rbtree.rs b/rust/kernel/rbtree.rs new file mode 100644 index 00000000000000..a30739cc683953 --- /dev/null +++ b/rust/kernel/rbtree.rs @@ -0,0 +1,563 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Red-black trees. +//! +//! C header: [`include/linux/rbtree.h`](../../../../include/linux/rbtree.h) +//! +//! Reference: + +use crate::{bindings, Result}; +use alloc::boxed::Box; +use core::{ + cmp::{Ord, Ordering}, + iter::{IntoIterator, Iterator}, + marker::PhantomData, + mem::MaybeUninit, + ptr::{addr_of_mut, NonNull}, +}; + +struct Node { + links: bindings::rb_node, + key: K, + value: V, +} + +/// A red-black tree with owned nodes. +/// +/// It is backed by the kernel C red-black trees. +/// +/// # Invariants +/// +/// Non-null parent/children pointers stored in instances of the `rb_node` C struct are always +/// valid, and pointing to a field of our internal representation of a node. +/// +/// # Examples +/// +/// In the example below we do several operations on a tree. We note that insertions may fail if +/// the system is out of memory. 
+/// +/// ``` +/// use kernel::rbtree::RBTree; +/// +/// # fn test() -> Result { +/// // Create a new tree. +/// let mut tree = RBTree::new(); +/// +/// // Insert three elements. +/// tree.try_insert(20, 200)?; +/// tree.try_insert(10, 100)?; +/// tree.try_insert(30, 300)?; +/// +/// // Check the nodes we just inserted. +/// { +/// let mut iter = tree.iter(); +/// assert_eq!(iter.next().unwrap(), (&10, &100)); +/// assert_eq!(iter.next().unwrap(), (&20, &200)); +/// assert_eq!(iter.next().unwrap(), (&30, &300)); +/// assert!(iter.next().is_none()); +/// } +/// +/// // Print all elements. +/// for (key, value) in &tree { +/// pr_info!("{} = {}\n", key, value); +/// } +/// +/// // Replace one of the elements. +/// tree.try_insert(10, 1000)?; +/// +/// // Check that the tree reflects the replacement. +/// { +/// let mut iter = tree.iter(); +/// assert_eq!(iter.next().unwrap(), (&10, &1000)); +/// assert_eq!(iter.next().unwrap(), (&20, &200)); +/// assert_eq!(iter.next().unwrap(), (&30, &300)); +/// assert!(iter.next().is_none()); +/// } +/// +/// // Change the value of one of the elements. +/// *tree.get_mut(&30).unwrap() = 3000; +/// +/// // Check that the tree reflects the update. +/// { +/// let mut iter = tree.iter(); +/// assert_eq!(iter.next().unwrap(), (&10, &1000)); +/// assert_eq!(iter.next().unwrap(), (&20, &200)); +/// assert_eq!(iter.next().unwrap(), (&30, &3000)); +/// assert!(iter.next().is_none()); +/// } +/// +/// // Remove an element. +/// tree.remove(&10); +/// +/// // Check that the tree reflects the removal. +/// { +/// let mut iter = tree.iter(); +/// assert_eq!(iter.next().unwrap(), (&20, &200)); +/// assert_eq!(iter.next().unwrap(), (&30, &3000)); +/// assert!(iter.next().is_none()); +/// } +/// +/// // Update all values. +/// for value in tree.values_mut() { +/// *value *= 10; +/// } +/// +/// // Check that the tree reflects the changes to values. 
+/// { +/// let mut iter = tree.iter(); +/// assert_eq!(iter.next().unwrap(), (&20, &2000)); +/// assert_eq!(iter.next().unwrap(), (&30, &30000)); +/// assert!(iter.next().is_none()); +/// } +/// +/// # Ok(()) +/// # } +/// # +/// # assert_eq!(test(), Ok(())); +/// ``` +/// +/// In the example below, we first allocate a node, acquire a spinlock, then insert the node into +/// the tree. This is useful when the insertion context does not allow sleeping, for example, when +/// holding a spinlock. +/// +/// ``` +/// use kernel::{rbtree::RBTree, sync::SpinLock}; +/// +/// fn insert_test(tree: &SpinLock>) -> Result { +/// // Pre-allocate node. This may fail (as it allocates memory). +/// let node = RBTree::try_allocate_node(10, 100)?; +/// +/// // Insert node while holding the lock. It is guaranteed to succeed with no allocation +/// // attempts. +/// let mut guard = tree.lock(); +/// guard.insert(node); +/// Ok(()) +/// } +/// ``` +/// +/// In the example below, we reuse an existing node allocation from an element we removed. +/// +/// ``` +/// use kernel::rbtree::RBTree; +/// +/// # fn test() -> Result { +/// // Create a new tree. +/// let mut tree = RBTree::new(); +/// +/// // Insert three elements. +/// tree.try_insert(20, 200)?; +/// tree.try_insert(10, 100)?; +/// tree.try_insert(30, 300)?; +/// +/// // Check the nodes we just inserted. +/// { +/// let mut iter = tree.iter(); +/// assert_eq!(iter.next().unwrap(), (&10, &100)); +/// assert_eq!(iter.next().unwrap(), (&20, &200)); +/// assert_eq!(iter.next().unwrap(), (&30, &300)); +/// assert!(iter.next().is_none()); +/// } +/// +/// // Remove a node, getting back ownership of it. +/// let existing = tree.remove_node(&30).unwrap(); +/// +/// // Check that the tree reflects the removal. 
+/// { +/// let mut iter = tree.iter(); +/// assert_eq!(iter.next().unwrap(), (&10, &100)); +/// assert_eq!(iter.next().unwrap(), (&20, &200)); +/// assert!(iter.next().is_none()); +/// } +/// +/// // Turn the node into a reservation so that we can reuse it with a different key/value. +/// let reservation = existing.into_reservation(); +/// +/// // Insert a new node into the tree, reusing the previous allocation. This is guaranteed to +/// // succeed (no memory allocations). +/// tree.insert(reservation.into_node(15, 150)); +/// +/// // Check that the tree reflects the new insertion. +/// { +/// let mut iter = tree.iter(); +/// assert_eq!(iter.next().unwrap(), (&10, &100)); +/// assert_eq!(iter.next().unwrap(), (&15, &150)); +/// assert_eq!(iter.next().unwrap(), (&20, &200)); +/// assert!(iter.next().is_none()); +/// } +/// +/// # Ok(()) +/// # } +/// # +/// # assert_eq!(test(), Ok(())); +/// ``` +pub struct RBTree { + root: bindings::rb_root, + _p: PhantomData>, +} + +impl RBTree { + /// Creates a new and empty tree. + pub fn new() -> Self { + Self { + // INVARIANT: There are no nodes in the tree, so the invariant holds vacuously. + root: bindings::rb_root::default(), + _p: PhantomData, + } + } + + /// Tries to insert a new value into the tree. + /// + /// It overwrites a node if one already exists with the same key and returns it (containing the + /// key/value pair). Returns [`None`] if a node with the same key didn't already exist. + /// + /// Returns an error if it cannot allocate memory for the new node. + pub fn try_insert(&mut self, key: K, value: V) -> Result>> + where + K: Ord, + { + Ok(self.insert(Self::try_allocate_node(key, value)?)) + } + + /// Allocates memory for a node to be eventually initialised and inserted into the tree via a + /// call to [`RBTree::insert`].
+ pub fn try_reserve_node() -> Result> { + Ok(RBTreeNodeReservation { + node: Box::try_new(MaybeUninit::uninit())?, + }) + } + + /// Allocates and initialises a node that can be inserted into the tree via + /// [`RBTree::insert`]. + pub fn try_allocate_node(key: K, value: V) -> Result> { + Ok(Self::try_reserve_node()?.into_node(key, value)) + } + + /// Inserts a new node into the tree. + /// + /// It overwrites a node if one already exists with the same key and returns it (containing the + /// key/value pair). Returns [`None`] if a node with the same key didn't already exist. + /// + /// This function always succeeds. + pub fn insert(&mut self, node: RBTreeNode) -> Option> + where + K: Ord, + { + let RBTreeNode { node } = node; + let node = Box::into_raw(node); + // SAFETY: `node` is valid at least until we call `Box::from_raw`, which only happens when + // the node is removed or replaced. + let node_links = unsafe { addr_of_mut!((*node).links) }; + let mut new_link: &mut *mut bindings::rb_node = &mut self.root.rb_node; + let mut parent = core::ptr::null_mut(); + while !new_link.is_null() { + let this = crate::container_of!(*new_link, Node, links); + + parent = *new_link; + + // SAFETY: `this` is a non-null node so it is valid by the type invariants. `node` is + // valid until the node is removed. + match unsafe { (*node).key.cmp(&(*this).key) } { + // SAFETY: `parent` is a non-null node so it is valid by the type invariants. + Ordering::Less => new_link = unsafe { &mut (*parent).rb_left }, + // SAFETY: `parent` is a non-null node so it is valid by the type invariants. + Ordering::Greater => new_link = unsafe { &mut (*parent).rb_right }, + Ordering::Equal => { + // INVARIANT: We are replacing an existing node with a new one, which is valid. + // It remains valid because we "forgot" it with `Box::into_raw`. + // SAFETY: All pointers are non-null and valid (parent, despite the name, really + // is the node we're replacing).
+ unsafe { bindings::rb_replace_node(parent, node_links, &mut self.root) }; + + // INVARIANT: The node is being returned and the caller may free it, however, + // it was removed from the tree. So the invariants still hold. + return Some(RBTreeNode { + // SAFETY: `this` was a node in the tree, so it is valid. + node: unsafe { Box::from_raw(this as _) }, + }); + } + } + } + + // INVARIANT: We are linking in a new node, which is valid. It remains valid because we + // "forgot" it with `Box::into_raw`. + // SAFETY: All pointers are non-null and valid (`*new_link` is null, but `new_link` is a + // mutable reference). + unsafe { bindings::rb_link_node(node_links, parent, new_link) }; + + // SAFETY: All pointers are valid. `node` has just been inserted into the tree. + unsafe { bindings::rb_insert_color(node_links, &mut self.root) }; + None + } + + /// Returns a node with the given key, if one exists. + fn find(&self, key: &K) -> Option>> + where + K: Ord, + { + let mut node = self.root.rb_node; + while !node.is_null() { + let this = crate::container_of!(node, Node, links); + // SAFETY: `this` is a non-null node so it is valid by the type invariants. + node = match key.cmp(unsafe { &(*this).key }) { + // SAFETY: `node` is a non-null node so it is valid by the type invariants. + Ordering::Less => unsafe { (*node).rb_left }, + // SAFETY: `node` is a non-null node so it is valid by the type invariants. + Ordering::Greater => unsafe { (*node).rb_right }, + Ordering::Equal => return NonNull::new(this as _), + } + } + None + } + + /// Returns a reference to the value corresponding to the key. + pub fn get(&self, key: &K) -> Option<&V> + where + K: Ord, + { + // SAFETY: The `find` return value is a node in the tree, so it is valid. + self.find(key).map(|node| unsafe { &node.as_ref().value }) + } + + /// Returns a mutable reference to the value corresponding to the key. 
+ pub fn get_mut(&mut self, key: &K) -> Option<&mut V> + where + K: Ord, + { + // SAFETY: The `find` return value is a node in the tree, so it is valid. + self.find(key) + .map(|mut node| unsafe { &mut node.as_mut().value }) + } + + /// Removes the node with the given key from the tree. + /// + /// It returns the node that was removed if one exists, or [`None`] otherwise. + pub fn remove_node(&mut self, key: &K) -> Option> + where + K: Ord, + { + let mut node = self.find(key)?; + + // SAFETY: The `find` return value is a node in the tree, so it is valid. + unsafe { bindings::rb_erase(&mut node.as_mut().links, &mut self.root) }; + + // INVARIANT: The node is being returned and the caller may free it, however, it was + // removed from the tree. So the invariants still hold. + Some(RBTreeNode { + // SAFETY: The `find` return value was a node in the tree, so it is valid. + node: unsafe { Box::from_raw(node.as_ptr()) }, + }) + } + + /// Removes the node with the given key from the tree. + /// + /// It returns the value that was removed if one exists, or [`None`] otherwise. + pub fn remove(&mut self, key: &K) -> Option + where + K: Ord, + { + let node = self.remove_node(key)?; + let RBTreeNode { node } = node; + let Node { + links: _, + key: _, + value, + } = *node; + Some(value) + } + + /// Returns an iterator over the tree nodes, sorted by key. + pub fn iter(&self) -> RBTreeIterator<'_, K, V> { + RBTreeIterator { + _tree: PhantomData, + // SAFETY: `root` is valid as it's embedded in `self` and we have a valid `self`. + next: unsafe { bindings::rb_first(&self.root) }, + } + } + + /// Returns a mutable iterator over the tree nodes, sorted by key. + pub fn iter_mut(&mut self) -> RBTreeIteratorMut<'_, K, V> { + RBTreeIteratorMut { + _tree: PhantomData, + // SAFETY: `root` is valid as it's embedded in `self` and we have a valid `self`. 
+ next: unsafe { bindings::rb_first(&self.root) }, + } + } + + /// Returns an iterator over the keys of the nodes in the tree, in sorted order. + pub fn keys(&self) -> impl Iterator { + self.iter().map(|(k, _)| k) + } + + /// Returns an iterator over the values of the nodes in the tree, sorted by key. + pub fn values(&self) -> impl Iterator { + self.iter().map(|(_, v)| v) + } + + /// Returns a mutable iterator over the values of the nodes in the tree, sorted by key. + pub fn values_mut(&mut self) -> impl Iterator { + self.iter_mut().map(|(_, v)| v) + } +} + +impl Default for RBTree { + fn default() -> Self { + Self::new() + } +} + +impl Drop for RBTree { + fn drop(&mut self) { + // SAFETY: `root` is valid as it's embedded in `self` and we have a valid `self`. + let mut next = unsafe { bindings::rb_first_postorder(&self.root) }; + + // INVARIANT: The loop invariant is that all tree nodes from `next` in postorder are valid. + while !next.is_null() { + let this = crate::container_of!(next, Node, links); + + // Find out what the next node is before disposing of the current one. + // SAFETY: `next` and all nodes in postorder are still valid. + next = unsafe { bindings::rb_next_postorder(next) }; + + // INVARIANT: This is the destructor, so we break the type invariant during clean-up, + // but it is not observable. The loop invariant is still maintained. + // SAFETY: `this` is valid per the loop invariant. + unsafe { Box::from_raw(this as *mut Node) }; + } + } +} + +impl<'a, K, V> IntoIterator for &'a RBTree { + type Item = (&'a K, &'a V); + type IntoIter = RBTreeIterator<'a, K, V>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +/// An iterator over the nodes of a [`RBTree`]. +/// +/// Instances are created by calling [`RBTree::iter`]. 
+pub struct RBTreeIterator<'a, K, V> { + _tree: PhantomData<&'a RBTree>, + next: *mut bindings::rb_node, +} + +impl<'a, K, V> Iterator for RBTreeIterator<'a, K, V> { + type Item = (&'a K, &'a V); + + fn next(&mut self) -> Option { + if self.next.is_null() { + return None; + } + + let cur = crate::container_of!(self.next, Node, links); + + // SAFETY: The reference to the tree used to create the iterator outlives the iterator, so + // the tree cannot change. By the tree invariant, all nodes are valid. + self.next = unsafe { bindings::rb_next(self.next) }; + + // SAFETY: By the same reasoning above, it is safe to dereference the node. Additionally, + // it is ok to return a reference to members because the iterator must outlive it. + Some(unsafe { (&(*cur).key, &(*cur).value) }) + } +} + +impl<'a, K, V> IntoIterator for &'a mut RBTree { + type Item = (&'a K, &'a mut V); + type IntoIter = RBTreeIteratorMut<'a, K, V>; + + fn into_iter(self) -> Self::IntoIter { + self.iter_mut() + } +} + +/// A mutable iterator over the nodes of a [`RBTree`]. +/// +/// Instances are created by calling [`RBTree::iter_mut`]. +pub struct RBTreeIteratorMut<'a, K, V> { + _tree: PhantomData<&'a RBTree>, + next: *mut bindings::rb_node, +} + +impl<'a, K, V> Iterator for RBTreeIteratorMut<'a, K, V> { + type Item = (&'a K, &'a mut V); + + fn next(&mut self) -> Option { + if self.next.is_null() { + return None; + } + + let cur = crate::container_of!(self.next, Node, links) as *mut Node; + + // SAFETY: The reference to the tree used to create the iterator outlives the iterator, so + // the tree cannot change (except for the value of previous nodes, but those don't affect + // the iteration process). By the tree invariant, all nodes are valid. + self.next = unsafe { bindings::rb_next(self.next) }; + + // SAFETY: By the same reasoning above, it is safe to dereference the node. Additionally, + // it is ok to return a reference to members because the iterator must outlive it. 
+ Some(unsafe { (&(*cur).key, &mut (*cur).value) }) + } +} + +/// A memory reservation for a red-black tree node. +/// +/// It contains the memory needed to hold a node that can be inserted into a red-black tree. One +/// can be obtained by directly allocating it ([`RBTree::try_reserve_node`]) or by "uninitialising" +/// ([`RBTreeNode::into_reservation`]) an actual node (usually returned by some operation like +/// removal from a tree). +pub struct RBTreeNodeReservation { + node: Box>>, +} + +impl RBTreeNodeReservation { + /// Initialises a node reservation. + /// + /// It then becomes an [`RBTreeNode`] that can be inserted into a tree. + pub fn into_node(mut self, key: K, value: V) -> RBTreeNode { + let node_ptr = self.node.as_mut_ptr(); + // SAFETY: `node_ptr` is valid, and so are its fields. + unsafe { addr_of_mut!((*node_ptr).links).write(bindings::rb_node::default()) }; + // SAFETY: `node_ptr` is valid, and so are its fields. + unsafe { addr_of_mut!((*node_ptr).key).write(key) }; + // SAFETY: `node_ptr` is valid, and so are its fields. + unsafe { addr_of_mut!((*node_ptr).value).write(value) }; + let raw = Box::into_raw(self.node); + RBTreeNode { + // SAFETY: The pointer came from a `MaybeUninit` whose fields have all been + // initialised. Additionally, it has the same layout as `Node`. + node: unsafe { Box::from_raw(raw as _) }, + } + } +} + +/// A red-black tree node. +/// +/// The node is fully initialised (with key and value) and can be inserted into a tree without any +/// extra allocations or failure paths. +pub struct RBTreeNode { + node: Box>, +} + +impl RBTreeNode { + /// "Uninitialises" a node. + /// + /// It then becomes a reservation that can be re-initialised into a different node (i.e., with + /// a different key and/or value). + /// + /// The existing key and value are dropped in-place as part of this operation, that is, memory + /// may be freed (but only for the key/value; memory for the node itself is kept for reuse). 
+ pub fn into_reservation(self) -> RBTreeNodeReservation { + let raw = Box::into_raw(self.node); + let mut ret = RBTreeNodeReservation { + // SAFETY: The pointer came from a valid `Node`, which has the same layout as + // `MaybeUninit`. + node: unsafe { Box::from_raw(raw as _) }, + }; + // SAFETY: Although the type is `MaybeUninit`, we know it has been initialised + // because it came from a `Node`. So it is safe to drop it. + unsafe { core::ptr::drop_in_place(ret.node.as_mut_ptr()) }; + ret + } +} diff --git a/rust/kernel/revocable.rs b/rust/kernel/revocable.rs new file mode 100644 index 00000000000000..cc49ccaa7a6da1 --- /dev/null +++ b/rust/kernel/revocable.rs @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Revocable objects. +//! +//! The [`Revocable`] type wraps other types and allows access to them to be revoked. The existence +//! of a [`RevocableGuard`] ensures that objects remain valid. + +use crate::bindings; +use core::{ + cell::UnsafeCell, + marker::PhantomData, + mem::ManuallyDrop, + ops::Deref, + ptr::drop_in_place, + sync::atomic::{AtomicBool, Ordering}, +}; + +/// An object that can become inaccessible at runtime. +/// +/// Once access is revoked and all concurrent users complete (i.e., all existing instances of +/// [`RevocableGuard`] are dropped), the wrapped object is also dropped. +/// +/// # Examples +/// +/// ``` +/// # use kernel::revocable::Revocable; +/// +/// struct Example { +/// a: u32, +/// b: u32, +/// } +/// +/// fn add_two(v: &Revocable) -> Option { +/// let guard = v.try_access()?; +/// Some(guard.a + guard.b) +/// } +/// +/// let v = Revocable::new(Example { a: 10, b: 20 }); +/// assert_eq!(add_two(&v), Some(30)); +/// v.revoke(); +/// assert_eq!(add_two(&v), None); +/// ``` +pub struct Revocable { + is_available: AtomicBool, + data: ManuallyDrop>, +} + +// SAFETY: `Revocable` is `Send` if the wrapped object is also `Send`. 
This is because while the +// functionality exposed by `Revocable` can be accessed from any thread/CPU, it is possible that +// this isn't supported by the wrapped object. +unsafe impl Send for Revocable {} + +// SAFETY: `Revocable` is `Sync` if the wrapped object is both `Send` and `Sync`. We require `Send` +// from the wrapped object as well because of `Revocable::revoke`, which can trigger the `Drop` +// implementation of the wrapped object from an arbitrary thread. +unsafe impl Sync for Revocable {} + +impl Revocable { + /// Creates a new revocable instance of the given data. + pub fn new(data: T) -> Self { + Self { + is_available: AtomicBool::new(true), + data: ManuallyDrop::new(UnsafeCell::new(data)), + } + } +} + +impl Revocable { + /// Tries to access the \[revocable\] wrapped object. + /// + /// Returns `None` if the object has been revoked and is therefore no longer accessible. + /// + /// Returns a guard that gives access to the object otherwise; the object is guaranteed to + /// remain accessible while the guard is alive. In such cases, callers are not allowed to sleep + /// because another CPU may be waiting to complete the revocation of this object. + pub fn try_access(&self) -> Option> { + let guard = RevocableGuard::new(self.data.get()); + if self.is_available.load(Ordering::Relaxed) { + Some(guard) + } else { + None + } + } + + /// Revokes access to and drops the wrapped object. + /// + /// Access to the object is revoked immediately to new callers of [`Revocable::try_access`]. If + /// there are concurrent users of the object (i.e., ones that called [`Revocable::try_access`] + /// beforehand and still haven't dropped the returned guard), this function waits for the + /// concurrent access to complete before dropping the wrapped object. + pub fn revoke(&self) { + if self + .is_available + .compare_exchange(true, false, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + // SAFETY: Just an FFI call, there are no further requirements. 
+ unsafe { bindings::synchronize_rcu() }; + + // SAFETY: We know `self.data` is valid because only one CPU can succeed the + // `compare_exchange` above that takes `is_available` from `true` to `false`. + unsafe { drop_in_place(self.data.get()) }; + } + } +} + +impl Drop for Revocable { + fn drop(&mut self) { + // Drop only if the data hasn't been revoked yet (in which case it has already been + // dropped). + if *self.is_available.get_mut() { + // SAFETY: We know `self.data` is valid because no other CPU has changed + // `is_available` to `false` yet, and no other CPU can do it anymore because this CPU + // holds the only reference (mutable) to `self` now. + unsafe { drop_in_place(self.data.get()) }; + } + } +} + +/// A guard that allows access to a revocable object and keeps it alive. +/// +/// CPUs may not sleep while holding on to [`RevocableGuard`] because it's in atomic context +/// holding the RCU read-side lock. +/// +/// # Invariants +/// +/// The RCU read-side lock is held while the guard is alive. +pub struct RevocableGuard<'a, T: ?Sized> { + data_ref: *const T, + _p: PhantomData<&'a ()>, +} + +impl RevocableGuard<'_, T> { + fn new(data_ref: *const T) -> Self { + // SAFETY: Just an FFI call, there are no further requirements. + unsafe { bindings::rcu_read_lock() }; + + // INVARIANTS: The RCU read-side lock was just acquired. + Self { + data_ref, + _p: PhantomData, + } + } +} + +impl Drop for RevocableGuard<'_, T> { + fn drop(&mut self) { + // SAFETY: By the type invariants, we know that we hold the RCU read-side lock. + unsafe { bindings::rcu_read_unlock() }; + } +} + +impl Deref for RevocableGuard<'_, T> { + type Target = T; + + fn deref(&self) -> &Self::Target { + // SAFETY: By the type invariants, we hold the rcu read-side lock, so the object is + // guaranteed to remain valid. 
+ unsafe { &*self.data_ref } + } +} diff --git a/rust/kernel/security.rs b/rust/kernel/security.rs new file mode 100644 index 00000000000000..eecf6dbf785116 --- /dev/null +++ b/rust/kernel/security.rs @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Linux Security Modules (LSM). +//! +//! C header: [`include/linux/security.h`](../../../../include/linux/security.h). + +use crate::{bindings, cred::Credential, file::File, to_result, Result}; + +/// Calls the security modules to determine if the given task can become the manager of a binder +/// context. +pub fn binder_set_context_mgr(mgr: &Credential) -> Result { + // SAFETY: `mgr.0` is valid because the shared reference guarantees a nonzero refcount. + to_result(|| unsafe { bindings::security_binder_set_context_mgr(mgr.0.get()) }) +} + +/// Calls the security modules to determine if binder transactions are allowed from task `from` to +/// task `to`. +pub fn binder_transaction(from: &Credential, to: &Credential) -> Result { + // SAFETY: `from` and `to` are valid because the shared references guarantee nonzero refcounts. + to_result(|| unsafe { bindings::security_binder_transaction(from.0.get(), to.0.get()) }) +} + +/// Calls the security modules to determine if task `from` is allowed to send binder objects +/// (owned by itself or other processes) to task `to` through a binder transaction. +pub fn binder_transfer_binder(from: &Credential, to: &Credential) -> Result { + // SAFETY: `from` and `to` are valid because the shared references guarantee nonzero refcounts. + to_result(|| unsafe { bindings::security_binder_transfer_binder(from.0.get(), to.0.get()) }) +} + +/// Calls the security modules to determine if task `from` is allowed to send the given file to +/// task `to` (which would get its own file descriptor) through a binder transaction. 
+pub fn binder_transfer_file(from: &Credential, to: &Credential, file: &File) -> Result { + // SAFETY: `from`, `to` and `file` are valid because the shared references guarantee nonzero + // refcounts. + to_result(|| unsafe { + bindings::security_binder_transfer_file(from.0.get(), to.0.get(), file.0.get()) + }) +} diff --git a/rust/kernel/static_assert.rs b/rust/kernel/static_assert.rs new file mode 100644 index 00000000000000..c4424218b0ce4a --- /dev/null +++ b/rust/kernel/static_assert.rs @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Static assert. + +/// Static assert (i.e. compile-time assert). +/// +/// Similar to C11 [`_Static_assert`] and C++11 [`static_assert`]. +/// +/// The feature may be added to Rust in the future: see [RFC 2790]. +/// +/// [`_Static_assert`]: https://en.cppreference.com/w/c/language/_Static_assert +/// [`static_assert`]: https://en.cppreference.com/w/cpp/language/static_assert +/// [RFC 2790]: https://github.com/rust-lang/rfcs/issues/2790 +/// +/// # Examples +/// +/// ``` +/// static_assert!(42 > 24); +/// static_assert!(core::mem::size_of::() == 1); +/// +/// const X: &[u8] = b"bar"; +/// static_assert!(X[1] == b'a'); +/// +/// const fn f(x: i32) -> i32 { +/// x + 2 +/// } +/// static_assert!(f(40) == 42); +/// ``` +#[macro_export] +macro_rules! static_assert { + ($condition:expr) => { + // Based on the latest one in `rustc`'s one before it was [removed]. + // + // [removed]: https://github.com/rust-lang/rust/commit/c2dad1c6b9f9636198d7c561b47a2974f5103f6d + #[allow(dead_code)] + const _: () = [()][!($condition) as usize]; + }; +} diff --git a/rust/kernel/std_vendor.rs b/rust/kernel/std_vendor.rs new file mode 100644 index 00000000000000..d64f30ce78dc3b --- /dev/null +++ b/rust/kernel/std_vendor.rs @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! The contents of this file come from the Rust standard library, hosted in the +//! repository. For copyright details, see +//! . 
+ +/// [`std::dbg`], but using [`pr_info`] instead of [`eprintln`]. +/// +/// Prints and returns the value of a given expression for quick and dirty +/// debugging. +/// +/// An example: +/// +/// ```rust +/// let a = 2; +/// # #[allow(clippy::dbg_macro)] +/// let b = dbg!(a * 2) + 1; +/// // ^-- prints: [src/main.rs:2] a * 2 = 4 +/// assert_eq!(b, 5); +/// ``` +/// +/// The macro works by using the `Debug` implementation of the type of +/// the given expression to print the value with [`printk`] along with the +/// source location of the macro invocation as well as the source code +/// of the expression. +/// +/// Invoking the macro on an expression moves and takes ownership of it +/// before returning the evaluated expression unchanged. If the type +/// of the expression does not implement `Copy` and you don't want +/// to give up ownership, you can instead borrow with `dbg!(&expr)` +/// for some expression `expr`. +/// +/// The `dbg!` macro works exactly the same in release builds. +/// This is useful when debugging issues that only occur in release +/// builds or when debugging in release mode is significantly faster. +/// +/// Note that the macro is intended as a debugging tool and therefore you +/// should avoid having uses of it in version control for long periods +/// (other than in tests and similar). +/// +/// # Stability +/// +/// The exact output printed by this macro should not be relied upon +/// and is subject to future changes. +/// +/// # Further examples +/// +/// With a method call: +/// +/// ```rust +/// # #[allow(clippy::dbg_macro)] +/// fn foo(n: usize) { +/// if dbg!(n.checked_sub(4)).is_some() { +/// // ... 
+/// } +/// } +/// +/// foo(3) +/// ``` +/// +/// This prints to the kernel log: +/// +/// ```text,ignore +/// [src/main.rs:4] n.checked_sub(4) = None +/// ``` +/// +/// Naive factorial implementation: +/// +/// ```rust +/// # #[allow(clippy::dbg_macro)] +/// # { +/// fn factorial(n: u32) -> u32 { +/// if dbg!(n <= 1) { +/// dbg!(1) +/// } else { +/// dbg!(n * factorial(n - 1)) +/// } +/// } +/// +/// dbg!(factorial(4)); +/// # } +/// ``` +/// +/// This prints to the kernel log: +/// +/// ```text,ignore +/// [src/main.rs:3] n <= 1 = false +/// [src/main.rs:3] n <= 1 = false +/// [src/main.rs:3] n <= 1 = false +/// [src/main.rs:3] n <= 1 = true +/// [src/main.rs:4] 1 = 1 +/// [src/main.rs:5] n * factorial(n - 1) = 2 +/// [src/main.rs:5] n * factorial(n - 1) = 6 +/// [src/main.rs:5] n * factorial(n - 1) = 24 +/// [src/main.rs:11] factorial(4) = 24 +/// ``` +/// +/// The `dbg!(..)` macro moves the input: +/// +// TODO: Could be `compile_fail` when supported. +/// ```ignore +/// /// A wrapper around `usize` which importantly is not Copyable. +/// #[derive(Debug)] +/// struct NoCopy(usize); +/// +/// let a = NoCopy(42); +/// let _ = dbg!(a); // <-- `a` is moved here. +/// let _ = dbg!(a); // <-- `a` is moved again; error! +/// ``` +/// +/// You can also use `dbg!()` without a value to just print the +/// file and line whenever it's reached. +/// +/// Finally, if you want to `dbg!(..)` multiple values, it will treat them as +/// a tuple (and return it, too): +/// +/// ``` +/// # #[allow(clippy::dbg_macro)] +/// assert_eq!(dbg!(1usize, 2u32), (1, 2)); +/// ``` +/// +/// However, a single argument with a trailing comma will still not be treated +/// as a tuple, following the convention of ignoring trailing commas in macro +/// invocations. 
You can use a 1-tuple directly if you need one: +/// +/// ``` +/// # #[allow(clippy::dbg_macro)] +/// # { +/// assert_eq!(1, dbg!(1u32,)); // trailing comma ignored +/// assert_eq!((1,), dbg!((1u32,))); // 1-tuple +/// # } +/// ``` +/// +/// [`std::dbg`]: https://doc.rust-lang.org/std/macro.dbg.html +/// [`eprintln`]: https://doc.rust-lang.org/std/macro.eprintln.html +/// [`printk`]: https://www.kernel.org/doc/html/latest/core-api/printk-basics.html +#[macro_export] +macro_rules! dbg { + // NOTE: We cannot use `concat!` to make a static string as a format argument + // of `pr_info!` because `file!` could contain a `{` or + // `$val` expression could be a block (`{ .. }`), in which case the `pr_info!` + // will be malformed. + () => { + $crate::pr_info!("[{}:{}]\n", ::core::file!(), ::core::line!()) + }; + ($val:expr $(,)?) => { + // Use of `match` here is intentional because it affects the lifetimes + // of temporaries - https://stackoverflow.com/a/48732525/1063961 + match $val { + tmp => { + $crate::pr_info!("[{}:{}] {} = {:#?}\n", + ::core::file!(), ::core::line!(), ::core::stringify!($val), &tmp); + tmp + } + } + }; + ($($val:expr),+ $(,)?) => { + ($($crate::dbg!($val)),+,) + }; +} diff --git a/rust/kernel/str.rs b/rust/kernel/str.rs new file mode 100644 index 00000000000000..1a72e2f0206d35 --- /dev/null +++ b/rust/kernel/str.rs @@ -0,0 +1,597 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! String representations. + +use alloc::vec::Vec; +use core::fmt::{self, Write}; +use core::ops::{self, Deref, Index}; + +use crate::{bindings, c_types, error::code::*, Error}; + +/// Byte string without UTF-8 validity guarantee. +/// +/// `BStr` is simply an alias to `[u8]`, but has a more evident semantical meaning. +pub type BStr = [u8]; + +/// Creates a new [`BStr`] from a string literal. +/// +/// `b_str!` converts the supplied string literal to byte string, so non-ASCII +/// characters can be included. 
+/// +/// # Examples +/// +/// ``` +/// # use kernel::b_str; +/// # use kernel::str::BStr; +/// const MY_BSTR: &BStr = b_str!("My awesome BStr!"); +/// ``` +#[macro_export] +macro_rules! b_str { + ($str:literal) => {{ + const S: &'static str = $str; + const C: &'static $crate::str::BStr = S.as_bytes(); + C + }}; +} + +/// Possible errors when using conversion functions in [`CStr`]. +#[derive(Debug, Clone, Copy)] +pub enum CStrConvertError { + /// Supplied bytes contain an interior `NUL`. + InteriorNul, + + /// Supplied bytes are not terminated by `NUL`. + NotNulTerminated, +} + +impl From for Error { + #[inline] + fn from(_: CStrConvertError) -> Error { + EINVAL + } +} + +/// A string that is guaranteed to have exactly one `NUL` byte, which is at the +/// end. +/// +/// Used for interoperability with kernel APIs that take C strings. +#[repr(transparent)] +pub struct CStr([u8]); + +impl CStr { + /// Returns the length of this string excluding `NUL`. + #[inline] + pub const fn len(&self) -> usize { + self.len_with_nul() - 1 + } + + /// Returns the length of this string with `NUL`. + #[inline] + pub const fn len_with_nul(&self) -> usize { + // SAFETY: This is one of the invariants of `CStr`. + // We add an `unreachable_unchecked` here to hint the optimizer that + // the value returned from this function is non-zero. + if self.0.is_empty() { + unsafe { core::hint::unreachable_unchecked() }; + } + self.0.len() + } + + /// Returns `true` if the string only includes `NUL`. + #[inline] + pub const fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Wraps a raw C string pointer. + /// + /// # Safety + /// + /// `ptr` must be a valid pointer to a `NUL`-terminated C string, and it must + /// last at least `'a`. When `CStr` is alive, the memory pointed by `ptr` + /// must not be mutated. 
+ #[inline] + pub unsafe fn from_char_ptr<'a>(ptr: *const c_types::c_char) -> &'a Self { + // SAFETY: The safety precondition guarantees `ptr` is a valid pointer + // to a `NUL`-terminated C string. + let len = unsafe { bindings::strlen(ptr) } + 1; + // SAFETY: Lifetime guaranteed by the safety precondition. + let bytes = unsafe { core::slice::from_raw_parts(ptr as _, len as _) }; + // SAFETY: As `len` is returned by `strlen`, `bytes` does not contain interior `NUL`. + // As we have added 1 to `len`, the last byte is known to be `NUL`. + unsafe { Self::from_bytes_with_nul_unchecked(bytes) } + } + + /// Creates a [`CStr`] from a `[u8]`. + /// + /// The provided slice must be `NUL`-terminated and must not contain any + /// interior `NUL` bytes. + pub const fn from_bytes_with_nul(bytes: &[u8]) -> Result<&Self, CStrConvertError> { + if bytes.is_empty() { + return Err(CStrConvertError::NotNulTerminated); + } + if bytes[bytes.len() - 1] != 0 { + return Err(CStrConvertError::NotNulTerminated); + } + let mut i = 0; + // `i + 1 < bytes.len()` allows LLVM to optimize away bounds checking, + // while it couldn't optimize away bounds checks for `i < bytes.len() - 1`. + while i + 1 < bytes.len() { + if bytes[i] == 0 { + return Err(CStrConvertError::InteriorNul); + } + i += 1; + } + // SAFETY: We just checked that all properties hold. + Ok(unsafe { Self::from_bytes_with_nul_unchecked(bytes) }) + } + + /// Creates a [`CStr`] from a `[u8]`, panicking if the input is not valid. + /// + /// This function is only meant to be used by `c_str!` macro, so + /// crates using `c_str!` macro don't have to enable `const_panic` feature. + #[doc(hidden)] + pub const fn from_bytes_with_nul_unwrap(bytes: &[u8]) -> &Self { + match Self::from_bytes_with_nul(bytes) { + Ok(v) => v, + Err(_) => panic!("string contains interior NUL"), + } + } + + /// Creates a [`CStr`] from a `[u8]` without performing any additional + /// checks. 
+ /// + /// # Safety + /// + /// `bytes` *must* end with a `NUL` byte, and should only have a single + /// `NUL` byte (or the string will be truncated). + #[inline] + pub const unsafe fn from_bytes_with_nul_unchecked(bytes: &[u8]) -> &CStr { + // SAFETY: Properties of `bytes` guaranteed by the safety precondition. + unsafe { core::mem::transmute(bytes) } + } + + /// Returns a C pointer to the string. + #[inline] + pub const fn as_char_ptr(&self) -> *const c_types::c_char { + self.0.as_ptr() as _ + } + + /// Convert the string to a byte slice without the trailing 0 byte. + #[inline] + pub fn as_bytes(&self) -> &[u8] { + &self.0[..self.len()] + } + + /// Convert the string to a byte slice containing the trailing 0 byte. + #[inline] + pub const fn as_bytes_with_nul(&self) -> &[u8] { + &self.0 + } + + /// Yields a [`&str`] slice if the [`CStr`] contains valid UTF-8. + /// + /// If the contents of the [`CStr`] are valid UTF-8 data, this + /// function will return the corresponding [`&str`] slice. Otherwise, + /// it will return an error with details of where UTF-8 validation failed. + /// + /// # Examples + /// + /// ``` + /// # use kernel::str::CStr; + /// let cstr = CStr::from_bytes_with_nul(b"foo\0").unwrap(); + /// assert_eq!(cstr.to_str(), Ok("foo")); + /// ``` + #[inline] + pub fn to_str(&self) -> Result<&str, core::str::Utf8Error> { + core::str::from_utf8(self.as_bytes()) + } + + /// Unsafely convert this [`CStr`] into a [`&str`], without checking for + /// valid UTF-8. + /// + /// # Safety + /// + /// The contents must be valid UTF-8. + /// + /// # Examples + /// + /// ``` + /// # use kernel::c_str; + /// # use kernel::str::CStr; + /// // SAFETY: String literals are guaranteed to be valid UTF-8 + /// // by the Rust compiler. 
+ /// let bar = c_str!("ツ"); + /// assert_eq!(unsafe { bar.as_str_unchecked() }, "ツ"); + /// ``` + #[inline] + pub unsafe fn as_str_unchecked(&self) -> &str { + unsafe { core::str::from_utf8_unchecked(self.as_bytes()) } + } +} + +impl fmt::Display for CStr { + /// Formats printable ASCII characters, escaping the rest. + /// + /// ``` + /// # use kernel::c_str; + /// # use kernel::str::CStr; + /// # use kernel::str::CString; + /// let penguin = c_str!("🐧"); + /// let s = CString::try_from_fmt(fmt!("{}", penguin)).unwrap(); + /// assert_eq!(s.as_bytes_with_nul(), "\\xf0\\x9f\\x90\\xa7\0".as_bytes()); + /// + /// let ascii = c_str!("so \"cool\""); + /// let s = CString::try_from_fmt(fmt!("{}", ascii)).unwrap(); + /// assert_eq!(s.as_bytes_with_nul(), "so \"cool\"\0".as_bytes()); + /// ``` + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + for &c in self.as_bytes() { + if (0x20..0x7f).contains(&c) { + // Printable character + f.write_char(c as char)?; + } else { + write!(f, "\\x{:02x}", c)?; + } + } + Ok(()) + } +} + +impl fmt::Debug for CStr { + /// Formats printable ASCII characters with a double quote on either end, escaping the rest. + /// + /// ``` + /// # use kernel::c_str; + /// # use kernel::str::CStr; + /// # use kernel::str::CString; + /// let penguin = c_str!("🐧"); + /// let s = CString::try_from_fmt(fmt!("{:?}", penguin)).unwrap(); + /// assert_eq!(s.as_bytes_with_nul(), "\"\\xf0\\x9f\\x90\\xa7\"\0".as_bytes()); + /// + /// // Embedded double quotes are escaped. 
+ /// let ascii = c_str!("so \"cool\""); + /// let s = CString::try_from_fmt(fmt!("{:?}", ascii)).unwrap(); + /// assert_eq!(s.as_bytes_with_nul(), "\"so \\\"cool\\\"\"\0".as_bytes()); + /// ``` + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("\"")?; + for &c in self.as_bytes() { + match c { + // Printable characters + b'\"' => f.write_str("\\\"")?, + 0x20..=0x7e => f.write_char(c as char)?, + _ => write!(f, "\\x{:02x}", c)?, + } + } + f.write_str("\"") + } +} + +impl AsRef for CStr { + #[inline] + fn as_ref(&self) -> &BStr { + self.as_bytes() + } +} + +impl Deref for CStr { + type Target = BStr; + + #[inline] + fn deref(&self) -> &Self::Target { + self.as_bytes() + } +} + +impl Index> for CStr { + type Output = CStr; + + #[inline] + fn index(&self, index: ops::RangeFrom) -> &Self::Output { + // Delegate bounds checking to slice. + // Assign to _ to mute clippy's unnecessary operation warning. + let _ = &self.as_bytes()[index.start..]; + // SAFETY: We just checked the bounds. + unsafe { Self::from_bytes_with_nul_unchecked(&self.0[index.start..]) } + } +} + +impl Index for CStr { + type Output = CStr; + + #[inline] + fn index(&self, _index: ops::RangeFull) -> &Self::Output { + self + } +} + +mod private { + use core::ops; + + // Marker trait for index types that can be forward to `BStr`. + pub trait CStrIndex {} + + impl CStrIndex for usize {} + impl CStrIndex for ops::Range {} + impl CStrIndex for ops::RangeInclusive {} + impl CStrIndex for ops::RangeToInclusive {} +} + +impl Index for CStr +where + Idx: private::CStrIndex, + BStr: Index, +{ + type Output = >::Output; + + #[inline] + fn index(&self, index: Idx) -> &Self::Output { + &self.as_bytes()[index] + } +} + +/// Creates a new [`CStr`] from a string literal. +/// +/// The string literal should not contain any `NUL` bytes. 
+/// +/// # Examples +/// +/// ``` +/// # use kernel::c_str; +/// # use kernel::str::CStr; +/// const MY_CSTR: &CStr = c_str!("My awesome CStr!"); +/// ``` +#[macro_export] +macro_rules! c_str { + ($str:expr) => {{ + const S: &str = concat!($str, "\0"); + const C: &$crate::str::CStr = $crate::str::CStr::from_bytes_with_nul_unwrap(S.as_bytes()); + C + }}; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cstr_to_str() { + let good_bytes = b"\xf0\x9f\xa6\x80\0"; + let checked_cstr = CStr::from_bytes_with_nul(good_bytes).unwrap(); + let checked_str = checked_cstr.to_str().unwrap(); + assert_eq!(checked_str, "🦀"); + } + + #[test] + #[should_panic] + fn test_cstr_to_str_panic() { + let bad_bytes = b"\xc3\x28\0"; + let checked_cstr = CStr::from_bytes_with_nul(bad_bytes).unwrap(); + checked_cstr.to_str().unwrap(); + } + + #[test] + fn test_cstr_as_str_unchecked() { + let good_bytes = b"\xf0\x9f\x90\xA7\0"; + let checked_cstr = CStr::from_bytes_with_nul(good_bytes).unwrap(); + let unchecked_str = unsafe { checked_cstr.as_str_unchecked() }; + assert_eq!(unchecked_str, "🐧"); + } +} + +/// Allows formatting of [`fmt::Arguments`] into a raw buffer. +/// +/// It does not fail if callers write past the end of the buffer so that they can calculate the +/// size required to fit everything. +/// +/// # Invariants +/// +/// The memory region between `pos` (inclusive) and `end` (exclusive) is valid for writes if `pos` +/// is less than `end`. +pub(crate) struct RawFormatter { + // Use `usize` to use `saturating_*` functions. + beg: usize, + pos: usize, + end: usize, +} + +impl RawFormatter { + /// Creates a new instance of [`RawFormatter`] with an empty buffer. + fn new() -> Self { + // INVARIANT: The buffer is empty, so the region that needs to be writable is empty. + Self { + beg: 0, + pos: 0, + end: 0, + } + } + + /// Creates a new instance of [`RawFormatter`] with the given buffer pointers. 
+ /// + /// # Safety + /// + /// If `pos` is less than `end`, then the region between `pos` (inclusive) and `end` + /// (exclusive) must be valid for writes for the lifetime of the returned [`RawFormatter`]. + pub(crate) unsafe fn from_ptrs(pos: *mut u8, end: *mut u8) -> Self { + // INVARIANT: The safety requirements guarantee the type invariants. + Self { + beg: pos as _, + pos: pos as _, + end: end as _, + } + } + + /// Creates a new instance of [`RawFormatter`] with the given buffer. + /// + /// # Safety + /// + /// The memory region starting at `buf` and extending for `len` bytes must be valid for writes + /// for the lifetime of the returned [`RawFormatter`]. + pub(crate) unsafe fn from_buffer(buf: *mut u8, len: usize) -> Self { + let pos = buf as usize; + // INVARIANT: We ensure that `end` is never less than `buf`, and the safety requirements + // guarantee that the memory region is valid for writes. + Self { + pos, + beg: pos, + end: pos.saturating_add(len), + } + } + + /// Returns the current insert position. + /// + /// N.B. It may point to invalid memory. + pub(crate) fn pos(&self) -> *mut u8 { + self.pos as _ + } + + /// Return the number of bytes written to the formatter. + pub(crate) fn bytes_written(&self) -> usize { + self.pos - self.beg + } +} + +impl fmt::Write for RawFormatter { + fn write_str(&mut self, s: &str) -> fmt::Result { + // `pos` value after writing `len` bytes. This does not have to be bounded by `end`, but we + // don't want it to wrap around to 0. + let pos_new = self.pos.saturating_add(s.len()); + + // Amount that we can copy. `saturating_sub` ensures we get 0 if `pos` goes past `end`. + let len_to_copy = core::cmp::min(pos_new, self.end).saturating_sub(self.pos); + + if len_to_copy > 0 { + // SAFETY: If `len_to_copy` is non-zero, then we know `pos` has not gone past `end` + // yet, so it is valid for write per the type invariants. 
+ unsafe { + core::ptr::copy_nonoverlapping( + s.as_bytes().as_ptr(), + self.pos as *mut u8, + len_to_copy, + ) + }; + } + + self.pos = pos_new; + Ok(()) + } +} + +/// Allows formatting of [`fmt::Arguments`] into a raw buffer. +/// +/// Fails if callers attempt to write more than will fit in the buffer. +pub(crate) struct Formatter(RawFormatter); + +impl Formatter { + /// Creates a new instance of [`Formatter`] with the given buffer. + /// + /// # Safety + /// + /// The memory region starting at `buf` and extending for `len` bytes must be valid for writes + /// for the lifetime of the returned [`Formatter`]. + pub(crate) unsafe fn from_buffer(buf: *mut u8, len: usize) -> Self { + // SAFETY: The safety requirements of this function satisfy those of the callee. + Self(unsafe { RawFormatter::from_buffer(buf, len) }) + } +} + +impl Deref for Formatter { + type Target = RawFormatter; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl fmt::Write for Formatter { + fn write_str(&mut self, s: &str) -> fmt::Result { + self.0.write_str(s)?; + + // Fail the request if we go past the end of the buffer. + if self.0.pos > self.0.end { + Err(fmt::Error) + } else { + Ok(()) + } + } +} + +/// An owned string that is guaranteed to have exactly one `NUL` byte, which is at the end. +/// +/// Used for interoperability with kernel APIs that take C strings. +/// +/// # Invariants +/// +/// The string is always `NUL`-terminated and contains no other `NUL` bytes. +/// +/// # Examples +/// +/// ``` +/// use kernel::str::CString; +/// +/// let s = CString::try_from_fmt(fmt!("{}{}{}", "abc", 10, 20)).unwrap(); +/// assert_eq!(s.as_bytes_with_nul(), "abc1020\0".as_bytes()); +/// +/// let tmp = "testing"; +/// let s = CString::try_from_fmt(fmt!("{tmp}{}", 123)).unwrap(); +/// assert_eq!(s.as_bytes_with_nul(), "testing123\0".as_bytes()); +/// +/// // This fails because it has an embedded `NUL` byte. 
+/// let s = CString::try_from_fmt(fmt!("a\0b{}", 123)); +/// assert_eq!(s.is_ok(), false); +/// ``` +pub struct CString { + buf: Vec, +} + +impl CString { + /// Creates an instance of [`CString`] from the given formatted arguments. + pub fn try_from_fmt(args: fmt::Arguments<'_>) -> Result { + // Calculate the size needed (formatted string plus `NUL` terminator). + let mut f = RawFormatter::new(); + f.write_fmt(args)?; + f.write_str("\0")?; + let size = f.bytes_written(); + + // Allocate a vector with the required number of bytes, and write to it. + let mut buf = Vec::try_with_capacity(size)?; + // SAFETY: The buffer stored in `buf` is at least of size `size` and is valid for writes. + let mut f = unsafe { Formatter::from_buffer(buf.as_mut_ptr(), size) }; + f.write_fmt(args)?; + f.write_str("\0")?; + + // SAFETY: The number of bytes that can be written to `f` is bounded by `size`, which is + // `buf`'s capacity. The contents of the buffer have been initialised by writes to `f`. + unsafe { buf.set_len(f.bytes_written()) }; + + // Check that there are no `NUL` bytes before the end. + // SAFETY: The buffer is valid for read because `f.bytes_written()` is bounded by `size` + // (which is the minimum buffer size) and is non-zero (we wrote at least the `NUL` terminator) + // so `f.bytes_written() - 1` doesn't underflow. + let ptr = unsafe { bindings::memchr(buf.as_ptr().cast(), 0, (f.bytes_written() - 1) as _) }; + if !ptr.is_null() { + return Err(EINVAL); + } + + // INVARIANT: We wrote the `NUL` terminator and checked above that no other `NUL` bytes + // exist in the buffer. + Ok(Self { buf }) + } +} + +impl Deref for CString { + type Target = CStr; + + fn deref(&self) -> &Self::Target { + // SAFETY: The type invariants guarantee that the string is `NUL`-terminated and that no + // other `NUL` bytes exist. + unsafe { CStr::from_bytes_with_nul_unchecked(self.buf.as_slice()) } + } +} + +/// A convenience alias for [`core::format_args`]. +#[macro_export] +macro_rules! 
fmt { + ($($f:tt)*) => ( core::format_args!($($f)*) ) +} diff --git a/rust/kernel/sysctl.rs b/rust/kernel/sysctl.rs new file mode 100644 index 00000000000000..63bf76d03d93b7 --- /dev/null +++ b/rust/kernel/sysctl.rs @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! System control. +//! +//! C header: [`include/linux/sysctl.h`](../../../../include/linux/sysctl.h) +//! +//! Reference: + +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::mem; +use core::ptr; +use core::sync::atomic; + +use crate::{ + bindings, c_types, + error::code::*, + io_buffer::IoBufferWriter, + str::CStr, + types, + user_ptr::{UserSlicePtr, UserSlicePtrWriter}, + Result, +}; + +/// Sysctl storage. +pub trait SysctlStorage: Sync { + /// Writes a byte slice. + fn store_value(&self, data: &[u8]) -> (usize, Result); + + /// Reads via a [`UserSlicePtrWriter`]. + fn read_value(&self, data: &mut UserSlicePtrWriter) -> (usize, Result); +} + +fn trim_whitespace(mut data: &[u8]) -> &[u8] { + while !data.is_empty() && (data[0] == b' ' || data[0] == b'\t' || data[0] == b'\n') { + data = &data[1..]; + } + while !data.is_empty() + && (data[data.len() - 1] == b' ' + || data[data.len() - 1] == b'\t' + || data[data.len() - 1] == b'\n') + { + data = &data[..data.len() - 1]; + } + data +} + +impl SysctlStorage for &T +where + T: SysctlStorage, +{ + fn store_value(&self, data: &[u8]) -> (usize, Result) { + (*self).store_value(data) + } + + fn read_value(&self, data: &mut UserSlicePtrWriter) -> (usize, Result) { + (*self).read_value(data) + } +} + +impl SysctlStorage for atomic::AtomicBool { + fn store_value(&self, data: &[u8]) -> (usize, Result) { + let result = match trim_whitespace(data) { + b"0" => { + self.store(false, atomic::Ordering::Relaxed); + Ok(()) + } + b"1" => { + self.store(true, atomic::Ordering::Relaxed); + Ok(()) + } + _ => Err(EINVAL), + }; + (data.len(), result) + } + + fn read_value(&self, data: &mut UserSlicePtrWriter) -> (usize, Result) { + let value = if 
self.load(atomic::Ordering::Relaxed) { + b"1\n" + } else { + b"0\n" + }; + (value.len(), data.write_slice(value)) + } +} + +/// Holds a single `sysctl` entry (and its table). +pub struct Sysctl { + inner: Box, + // Responsible for keeping the `ctl_table` alive. + _table: Box<[bindings::ctl_table]>, + header: *mut bindings::ctl_table_header, +} + +// SAFETY: The only public method we have is `get()`, which returns `&T`, and +// `T: Sync`. Any new methods must adhere to this requirement. +unsafe impl Sync for Sysctl {} + +unsafe extern "C" fn proc_handler( + ctl: *mut bindings::ctl_table, + write: c_types::c_int, + buffer: *mut c_types::c_void, + len: *mut usize, + ppos: *mut bindings::loff_t, +) -> c_types::c_int { + // If we are reading from some offset other than the beginning of the file, + // return an empty read to signal EOF. + if unsafe { *ppos } != 0 && write == 0 { + unsafe { *len = 0 }; + return 0; + } + + let data = unsafe { UserSlicePtr::new(buffer, *len) }; + let storage = unsafe { &*((*ctl).data as *const T) }; + let (bytes_processed, result) = if write != 0 { + let data = match data.read_all() { + Ok(r) => r, + Err(e) => return e.to_kernel_errno(), + }; + storage.store_value(&data) + } else { + let mut writer = data.writer(); + storage.read_value(&mut writer) + }; + unsafe { *len = bytes_processed }; + unsafe { *ppos += *len as bindings::loff_t }; + match result { + Ok(()) => 0, + Err(e) => e.to_kernel_errno(), + } +} + +impl Sysctl { + /// Registers a single entry in `sysctl`. 
+ pub fn register( + path: &'static CStr, + name: &'static CStr, + storage: T, + mode: types::Mode, + ) -> Result> { + if name.contains(&b'/') { + return Err(EINVAL); + } + + let storage = Box::try_new(storage)?; + let mut table = Vec::try_with_capacity(2)?; + table.try_push(bindings::ctl_table { + procname: name.as_char_ptr(), + mode: mode.as_int(), + data: &*storage as *const T as *mut c_types::c_void, + proc_handler: Some(proc_handler::), + + maxlen: 0, + child: ptr::null_mut(), + poll: ptr::null_mut(), + extra1: ptr::null_mut(), + extra2: ptr::null_mut(), + })?; + table.try_push(unsafe { mem::zeroed() })?; + let mut table = table.try_into_boxed_slice()?; + + let result = unsafe { bindings::register_sysctl(path.as_char_ptr(), table.as_mut_ptr()) }; + if result.is_null() { + return Err(ENOMEM); + } + + Ok(Sysctl { + inner: storage, + _table: table, + header: result, + }) + } + + /// Gets the storage. + pub fn get(&self) -> &T { + &self.inner + } +} + +impl Drop for Sysctl { + fn drop(&mut self) { + unsafe { + bindings::unregister_sysctl_table(self.header); + } + self.header = ptr::null_mut(); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_trim_whitespace() { + assert_eq!(trim_whitespace(b"foo "), b"foo"); + assert_eq!(trim_whitespace(b" foo"), b"foo"); + assert_eq!(trim_whitespace(b" foo "), b"foo"); + } +} diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs new file mode 100644 index 00000000000000..52dfc8db3d35e3 --- /dev/null +++ b/rust/kernel/task.rs @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Tasks (threads and processes). +//! +//! C header: [`include/linux/sched.h`](../../../../include/linux/sched.h). + +use crate::bindings; +use core::{marker::PhantomData, mem::ManuallyDrop, ops::Deref}; + +/// Wraps the kernel's `struct task_struct`. +/// +/// # Invariants +/// +/// The pointer `Task::ptr` is non-null and valid. Its reference count is also non-zero. 
+/// +/// # Examples +/// +/// The following is an example of getting the PID of the current thread with zero additional cost +/// when compared to the C version: +/// +/// ``` +/// use kernel::task::Task; +/// +/// let pid = Task::current().pid(); +/// ``` +/// +/// Getting the PID of the current process, also zero additional cost: +/// +/// ``` +/// use kernel::task::Task; +/// +/// let pid = Task::current().group_leader().pid(); +/// ``` +/// +/// Getting the current task and storing it in some struct. The reference count is automatically +/// incremented when creating `State` and decremented when it is dropped: +/// +/// ``` +/// use kernel::task::Task; +/// +/// struct State { +/// creator: Task, +/// index: u32, +/// } +/// +/// impl State { +/// fn new() -> Self { +/// Self { +/// creator: Task::current().clone(), +/// index: 0, +/// } +/// } +/// } +/// ``` +pub struct Task { + pub(crate) ptr: *mut bindings::task_struct, +} + +// SAFETY: Given that the task is referenced, it is OK to send it to another thread. +unsafe impl Send for Task {} + +// SAFETY: It's OK to access `Task` through references from other threads because we're either +// accessing properties that don't change (e.g., `pid`, `group_leader`) or that are properly +// synchronised by C code (e.g., `signal_pending`). +unsafe impl Sync for Task {} + +/// The type of process identifiers (PIDs). +type Pid = bindings::pid_t; + +impl Task { + /// Returns a task reference for the currently executing task/thread. + pub fn current<'a>() -> TaskRef<'a> { + // SAFETY: Just an FFI call. + let ptr = unsafe { bindings::get_current() }; + + // SAFETY: If the current thread is still running, the current task is valid. Given + // that `TaskRef` is not `Send`, we know it cannot be transferred to another thread (where + // it could potentially outlive the caller). + unsafe { TaskRef::from_ptr(ptr) } + } + + /// Returns the group leader of the given task. 
+ pub fn group_leader(&self) -> TaskRef<'_> { + // SAFETY: By the type invariant, we know that `self.ptr` is non-null and valid. + let ptr = unsafe { (*self.ptr).group_leader }; + + // SAFETY: The lifetime of the returned task reference is tied to the lifetime of `self`, + // and given that a task has a reference to its group leader, we know it must be valid for + // the lifetime of the returned task reference. + unsafe { TaskRef::from_ptr(ptr) } + } + + /// Returns the PID of the given task. + pub fn pid(&self) -> Pid { + // SAFETY: By the type invariant, we know that `self.ptr` is non-null and valid. + unsafe { (*self.ptr).pid } + } + + /// Determines whether the given task has pending signals. + pub fn signal_pending(&self) -> bool { + // SAFETY: By the type invariant, we know that `self.ptr` is non-null and valid. + unsafe { bindings::signal_pending(self.ptr) != 0 } + } +} + +impl PartialEq for Task { + fn eq(&self, other: &Self) -> bool { + self.ptr == other.ptr + } +} + +impl Eq for Task {} + +impl Clone for Task { + fn clone(&self) -> Self { + // SAFETY: The type invariants guarantee that `self.ptr` has a non-zero reference count. + unsafe { bindings::get_task_struct(self.ptr) }; + + // INVARIANT: We incremented the reference count to account for the new `Task` being + // created. + Self { ptr: self.ptr } + } +} + +impl Drop for Task { + fn drop(&mut self) { + // INVARIANT: We may decrement the refcount to zero, but the `Task` is being dropped, so + // this is not observable. + // SAFETY: The type invariants guarantee that `Task::ptr` has a non-zero reference count. + unsafe { bindings::put_task_struct(self.ptr) }; + } +} + +/// A wrapper for [`Task`] that doesn't automatically decrement the refcount when dropped. +/// +/// We need the wrapper because [`ManuallyDrop`] alone would allow callers to call +/// [`ManuallyDrop::into_inner`]. 
This would allow an unsafe sequence to be triggered without +/// `unsafe` blocks because it would trigger an unbalanced call to `put_task_struct`. +/// +/// We make this explicitly not [`Send`] so that we can use it to represent the current thread +/// without having to increment/decrement its reference count. +/// +/// # Invariants +/// +/// The wrapped [`Task`] remains valid for the lifetime of the object. +pub struct TaskRef<'a> { + task: ManuallyDrop, + _not_send: PhantomData<(&'a (), *mut ())>, +} + +impl TaskRef<'_> { + /// Constructs a new `struct task_struct` wrapper that doesn't change its reference count. + /// + /// # Safety + /// + /// The pointer `ptr` must be non-null and valid for the lifetime of the object. + pub(crate) unsafe fn from_ptr(ptr: *mut bindings::task_struct) -> Self { + Self { + task: ManuallyDrop::new(Task { ptr }), + _not_send: PhantomData, + } + } +} + +// SAFETY: It is OK to share a reference to the current thread with another thread because we know +// the owner cannot go away while the shared reference exists (and `Task` itself is `Sync`). +unsafe impl Sync for TaskRef<'_> {} + +impl Deref for TaskRef<'_> { + type Target = Task; + + fn deref(&self) -> &Self::Target { + self.task.deref() + } +} diff --git a/rust/kernel/types.rs b/rust/kernel/types.rs new file mode 100644 index 00000000000000..42a83f4390d30c --- /dev/null +++ b/rust/kernel/types.rs @@ -0,0 +1,679 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Kernel types. +//! +//! C header: [`include/linux/types.h`](../../../../include/linux/types.h) + +use crate::{ + bindings, c_types, + sync::{Ref, RefBorrow}, +}; +use alloc::boxed::Box; +use core::{ + cell::UnsafeCell, + marker::PhantomData, + mem::MaybeUninit, + ops::{self, Deref, DerefMut}, + pin::Pin, + ptr::NonNull, +}; + +/// Permissions. 
+/// +/// C header: [`include/uapi/linux/stat.h`](../../../../include/uapi/linux/stat.h) +/// +/// C header: [`include/linux/stat.h`](../../../../include/linux/stat.h) +pub struct Mode(bindings::umode_t); + +impl Mode { + /// Creates a [`Mode`] from an integer. + pub fn from_int(m: u16) -> Mode { + Mode(m) + } + + /// Returns the mode as an integer. + pub fn as_int(&self) -> u16 { + self.0 + } +} + +/// Used to convert an object into a raw pointer that represents it. +/// +/// It can eventually be converted back into the object. This is used to store objects as pointers +/// in kernel data structures, for example, an implementation of [`FileOperations`] in `struct +/// file::private_data`. +pub trait PointerWrapper { + /// Type of values borrowed between calls to [`PointerWrapper::into_pointer`] and + /// [`PointerWrapper::from_pointer`]. + type Borrowed<'a>; + + /// Returns the raw pointer. + fn into_pointer(self) -> *const c_types::c_void; + + /// Returns a borrowed value. + /// + /// # Safety + /// + /// `ptr` must have been returned by a previous call to [`PointerWrapper::into_pointer`]. + /// Additionally, [`PointerWrapper::from_pointer`] can only be called after *all* values + /// returned by [`PointerWrapper::borrow`] have been dropped. + unsafe fn borrow<'a>(ptr: *const c_types::c_void) -> Self::Borrowed<'a>; + + /// Returns the instance back from the raw pointer. + /// + /// # Safety + /// + /// The passed pointer must come from a previous call to [`PointerWrapper::into_pointer()`]. + unsafe fn from_pointer(ptr: *const c_types::c_void) -> Self; +} + +impl PointerWrapper for Box { + type Borrowed<'a> = &'a T; + + fn into_pointer(self) -> *const c_types::c_void { + Box::into_raw(self) as _ + } + + unsafe fn borrow<'a>(ptr: *const c_types::c_void) -> &'a T { + // SAFETY: The safety requirements for this function ensure that the object is still alive, + // so it is safe to dereference the raw pointer. 
+ // The safety requirements also ensure that the object remains alive for the lifetime of + // the returned value. + unsafe { &*ptr.cast() } + } + + unsafe fn from_pointer(ptr: *const c_types::c_void) -> Self { + // SAFETY: The passed pointer comes from a previous call to [`Self::into_pointer()`]. + unsafe { Box::from_raw(ptr as _) } + } +} + +impl PointerWrapper for Ref { + type Borrowed<'a> = RefBorrow<'a, T>; + + fn into_pointer(self) -> *const c_types::c_void { + Ref::into_usize(self) as _ + } + + unsafe fn borrow<'a>(ptr: *const c_types::c_void) -> RefBorrow<'a, T> { + // SAFETY: The safety requirements for this function ensure that the underlying object + // remains valid for the lifetime of the returned value. + unsafe { Ref::borrow_usize(ptr as _) } + } + + unsafe fn from_pointer(ptr: *const c_types::c_void) -> Self { + // SAFETY: The passed pointer comes from a previous call to [`Self::into_pointer()`]. + unsafe { Ref::from_usize(ptr as _) } + } +} + +impl PointerWrapper for Pin { + type Borrowed<'a> = T::Borrowed<'a>; + + fn into_pointer(self) -> *const c_types::c_void { + // SAFETY: We continue to treat the pointer as pinned by returning just a pointer to it to + // the caller. + let inner = unsafe { Pin::into_inner_unchecked(self) }; + inner.into_pointer() + } + + unsafe fn borrow<'a>(ptr: *const c_types::c_void) -> Self::Borrowed<'a> { + // SAFETY: The safety requirements for this function are the same as the ones for + // `T::borrow`. + unsafe { T::borrow(ptr) } + } + + unsafe fn from_pointer(p: *const c_types::c_void) -> Self { + // SAFETY: The object was originally pinned. + // The passed pointer comes from a previous call to `inner::into_pointer()`. 
+ unsafe { Pin::new_unchecked(T::from_pointer(p)) } + } +} + +impl PointerWrapper for *mut T { + type Borrowed<'a> = *mut T; + + fn into_pointer(self) -> *const c_types::c_void { + self as _ + } + + unsafe fn borrow<'a>(ptr: *const c_types::c_void) -> Self::Borrowed<'a> { + ptr as _ + } + + unsafe fn from_pointer(ptr: *const c_types::c_void) -> Self { + ptr as _ + } +} + +impl PointerWrapper for () { + type Borrowed<'a> = (); + + fn into_pointer(self) -> *const c_types::c_void { + // We use 1 to be different from a null pointer. + 1usize as _ + } + + unsafe fn borrow<'a>(_: *const c_types::c_void) -> Self::Borrowed<'a> {} + + unsafe fn from_pointer(_: *const c_types::c_void) -> Self {} +} + +/// Runs a cleanup function/closure when dropped. +/// +/// The [`ScopeGuard::dismiss`] function prevents the cleanup function from running. +/// +/// # Examples +/// +/// In the example below, we have multiple exit paths and we want to log regardless of which one is +/// taken: +/// ``` +/// # use kernel::ScopeGuard; +/// fn example1(arg: bool) { +/// let _log = ScopeGuard::new(|| pr_info!("example1 completed\n")); +/// +/// if arg { +/// return; +/// } +/// +/// pr_info!("Do something...\n"); +/// } +/// +/// # example1(false); +/// # example1(true); +/// ``` +/// +/// In the example below, we want to log the same message on all early exits but a different one on +/// the main exit path: +/// ``` +/// # use kernel::ScopeGuard; +/// fn example2(arg: bool) { +/// let log = ScopeGuard::new(|| pr_info!("example2 returned early\n")); +/// +/// if arg { +/// return; +/// } +/// +/// // (Other early returns...) 
+/// +/// log.dismiss(); +/// pr_info!("example2 no early return\n"); +/// } +/// +/// # example2(false); +/// # example2(true); +/// ``` +/// +/// In the example below, we need a mutable object (the vector) to be accessible within the log +/// function, so we wrap it in the [`ScopeGuard`]: +/// ``` +/// # use kernel::ScopeGuard; +/// fn example3(arg: bool) -> Result { +/// let mut vec = +/// ScopeGuard::new_with_data(Vec::new(), |v| pr_info!("vec had {} elements\n", v.len())); +/// +/// vec.try_push(10u8)?; +/// if arg { +/// return Ok(()); +/// } +/// vec.try_push(20u8)?; +/// Ok(()) +/// } +/// +/// # assert_eq!(example3(false), Ok(())); +/// # assert_eq!(example3(true), Ok(())); +/// ``` +/// +/// # Invariants +/// +/// The value stored in the struct is nearly always `Some(_)`, except between +/// [`ScopeGuard::dismiss`] and [`ScopeGuard::drop`]: in this case, it will be `None` as the value +/// will have been returned to the caller. Since [`ScopeGuard::dismiss`] consumes the guard, +/// callers won't be able to use it anymore. +pub struct ScopeGuard(Option<(T, F)>); + +impl ScopeGuard { + /// Creates a new guarded object wrapping the given data and with the given cleanup function. + pub fn new_with_data(data: T, cleanup_func: F) -> Self { + // INVARIANT: The struct is being initialised with `Some(_)`. + Self(Some((data, cleanup_func))) + } + + /// Prevents the cleanup function from running and returns the guarded data. + pub fn dismiss(mut self) -> T { + // INVARIANT: This is the exception case in the invariant; it is not visible to callers + // because this function consumes `self`. + self.0.take().unwrap().0 + } +} + +impl ScopeGuard<(), Box> { + /// Creates a new guarded object with the given cleanup function. 
+ pub fn new(cleanup: impl FnOnce()) -> ScopeGuard<(), impl FnOnce(())> { + ScopeGuard::new_with_data((), move |_| cleanup()) + } +} + +impl Deref for ScopeGuard { + type Target = T; + + fn deref(&self) -> &T { + // The type invariants guarantee that `unwrap` will succeed. + &self.0.as_ref().unwrap().0 + } +} + +impl DerefMut for ScopeGuard { + fn deref_mut(&mut self) -> &mut T { + // The type invariants guarantee that `unwrap` will succeed. + &mut self.0.as_mut().unwrap().0 + } +} + +impl Drop for ScopeGuard { + fn drop(&mut self) { + // Run the cleanup function if one is still present. + if let Some((data, cleanup)) = self.0.take() { + cleanup(data) + } + } +} + +/// Stores an opaque value. +/// +/// This is meant to be used with FFI objects that are never interpreted by Rust code. +pub struct Opaque(MaybeUninit>); + +impl Opaque { + /// Creates a new opaque value. + pub fn new(value: T) -> Self { + Self(MaybeUninit::new(UnsafeCell::new(value))) + } + + /// Creates an uninitialised value. + pub const fn uninit() -> Self { + Self(MaybeUninit::uninit()) + } + + /// Returns a raw pointer to the opaque data. + pub fn get(&self) -> *mut T { + UnsafeCell::raw_get(self.0.as_ptr()) + } +} + +/// A bitmask. +/// +/// It has a restriction that all bits must be the same, except one. For example, `0b1110111` and +/// `0b1000` are acceptable masks. +#[derive(Clone, Copy)] +pub struct Bit { + index: T, + inverted: bool, +} + +/// Creates a bit mask with a single bit set. 
+/// +/// # Examples +/// +/// ``` +/// # use kernel::bit; +/// let mut x = 0xfeu32; +/// +/// assert_eq!(x & bit(0), 0); +/// assert_eq!(x & bit(1), 2); +/// assert_eq!(x & bit(2), 4); +/// assert_eq!(x & bit(3), 8); +/// +/// x |= bit(0); +/// assert_eq!(x, 0xff); +/// +/// x &= !bit(1); +/// assert_eq!(x, 0xfd); +/// +/// x &= !bit(7); +/// assert_eq!(x, 0x7d); +/// +/// let y: u64 = bit(34).into(); +/// assert_eq!(y, 0x400000000); +/// +/// assert_eq!(y | bit(35), 0xc00000000); +/// ``` +pub fn bit(index: T) -> Bit { + Bit { + index, + inverted: false, + } +} + +impl ops::Not for Bit { + type Output = Self; + fn not(self) -> Self { + Self { + index: self.index, + inverted: !self.inverted, + } + } +} + +/// Implemented by integer types that allow counting the number of trailing zeroes. +pub trait TrailingZeros { + /// Returns the number of trailing zeroes in the binary representation of `self`. + fn trailing_zeros(&self) -> u32; +} + +macro_rules! define_unsigned_number_traits { + ($type_name:ty) => { + impl TrailingZeros for $type_name { + fn trailing_zeros(&self) -> u32 { + <$type_name>::trailing_zeros(*self) + } + } + + impl core::convert::From> for $type_name + where + Self: ops::Shl + core::convert::From + ops::Not, + { + fn from(v: Bit) -> Self { + let c = Self::from(1u8) << v.index; + if v.inverted { + !c + } else { + c + } + } + } + + impl ops::BitAnd> for $type_name + where + Self: ops::Shl + core::convert::From, + { + type Output = Self; + fn bitand(self, rhs: Bit) -> Self::Output { + self & Self::from(rhs) + } + } + + impl ops::BitOr> for $type_name + where + Self: ops::Shl + core::convert::From, + { + type Output = Self; + fn bitor(self, rhs: Bit) -> Self::Output { + self | Self::from(rhs) + } + } + + impl ops::BitAndAssign> for $type_name + where + Self: ops::Shl + core::convert::From, + { + fn bitand_assign(&mut self, rhs: Bit) { + *self &= Self::from(rhs) + } + } + + impl ops::BitOrAssign> for $type_name + where + Self: ops::Shl + 
core::convert::From, + { + fn bitor_assign(&mut self, rhs: Bit) { + *self |= Self::from(rhs) + } + } + }; +} + +define_unsigned_number_traits!(u8); +define_unsigned_number_traits!(u16); +define_unsigned_number_traits!(u32); +define_unsigned_number_traits!(u64); +define_unsigned_number_traits!(usize); + +/// Returns an iterator over the set bits of `value`. +/// +/// # Examples +/// +/// ``` +/// use kernel::bits_iter; +/// +/// let mut iter = bits_iter(5usize); +/// assert_eq!(iter.next().unwrap(), 0); +/// assert_eq!(iter.next().unwrap(), 2); +/// assert!(iter.next().is_none()); +/// ``` +/// +/// ``` +/// use kernel::bits_iter; +/// +/// fn print_bits(x: usize) { +/// for bit in bits_iter(x) { +/// pr_info!("{}\n", bit); +/// } +/// } +/// +/// # print_bits(42); +/// ``` +#[inline] +pub fn bits_iter(value: T) -> impl Iterator +where + T: core::cmp::PartialEq + + From + + ops::Shl + + ops::Not + + ops::BitAndAssign + + TrailingZeros, +{ + struct BitIterator { + value: U, + } + + impl Iterator for BitIterator + where + U: core::cmp::PartialEq + + From + + ops::Shl + + ops::Not + + ops::BitAndAssign + + TrailingZeros, + { + type Item = u32; + + #[inline] + fn next(&mut self) -> Option { + if self.value == U::from(0u8) { + return None; + } + let ret = self.value.trailing_zeros(); + self.value &= !(U::from(1u8) << ret); + Some(ret) + } + } + + BitIterator { value } +} + +/// A trait for boolean types. +/// +/// This is meant to be used in type states to allow boolean constraints in implementation blocks. +/// In the example below, the implementation containing `MyType::set_value` could _not_ be +/// constrained to type states containing `Writable = true` if `Writable` were a constant instead +/// of a type. +/// +/// # Safety +/// +/// No additional implementations of [`Bool`] should be provided, as [`True`] and [`False`] are +/// already provided. 
+/// +/// # Examples +/// +/// ``` +/// # use kernel::{Bool, False, True}; +/// use core::marker::PhantomData; +/// +/// // Type state specifies whether the type is writable. +/// trait MyTypeState { +/// type Writable: Bool; +/// } +/// +/// // In state S1, the type is writable. +/// struct S1; +/// impl MyTypeState for S1 { +/// type Writable = True; +/// } +/// +/// // In state S2, the type is not writable. +/// struct S2; +/// impl MyTypeState for S2 { +/// type Writable = False; +/// } +/// +/// struct MyType<T: MyTypeState> { +/// value: u32, +/// _p: PhantomData<T>, +/// } +/// +/// impl<T: MyTypeState> MyType<T> { +/// fn new(value: u32) -> Self { +/// Self { +/// value, +/// _p: PhantomData, +/// } +/// } +/// } +/// +/// // This implementation block only applies if the type state is writable. +/// impl<T> MyType<T> +/// where +/// T: MyTypeState<Writable = True>, +/// { +/// fn set_value(&mut self, v: u32) { +/// self.value = v; +/// } +/// } +/// +/// let mut x = MyType::<S1>::new(10); +/// let mut y = MyType::<S2>::new(20); +/// +/// x.set_value(30); +/// +/// // The code below fails to compile because `S2` is not writable. +/// // y.set_value(40); +/// ``` +pub unsafe trait Bool {} + +/// Represents the `true` value for types with [`Bool`] bound. +pub struct True; + +// SAFETY: This is one of the only two implementations of `Bool`. +unsafe impl Bool for True {} + +/// Represents the `false` value for types with [`Bool`] bound. +pub struct False; + +// SAFETY: This is one of the only two implementations of `Bool`. +unsafe impl Bool for False {} + +/// Types that are _always_ reference counted. +/// +/// It allows such types to define their own custom ref increment and decrement functions. +/// Additionally, it allows users to convert from a shared reference `&T` to an owned reference +/// [`ARef`]. +/// +/// This is usually implemented by wrappers to existing structures on the C side of the code. For +/// Rust code, the recommendation is to use [`Ref`] to create reference-counted instances of a +/// type.
+/// +/// # Safety +/// +/// Implementers must ensure that increments to the reference count keep the object alive in +/// memory at least until a matching decrement is performed. +/// +/// Implementers must also ensure that all instances are reference-counted. (Otherwise they +/// won't be able to honour the requirement that [`AlwaysRefCounted::inc_ref`] keep the object +/// alive.) +pub unsafe trait AlwaysRefCounted { + /// Increments the reference count on the object. + fn inc_ref(&self); + + /// Decrements the reference count on the object. + /// + /// Frees the object when the count reaches zero. + /// + /// # Safety + /// + /// Callers must ensure that there was a previous matching increment to the reference count, + /// and that the object is no longer used after its reference count is decremented (as it may + /// result in the object being freed), unless the caller owns another increment on the refcount + /// (e.g., it calls [`AlwaysRefCounted::inc_ref`] twice, then calls + /// [`AlwaysRefCounted::dec_ref`] once). + unsafe fn dec_ref(obj: NonNull); +} + +/// An owned reference to an always-reference-counted object. +/// +/// The object's reference count is automatically decremented when an instance of [`ARef`] is +/// dropped. It is also automatically incremented when a new instance is created via +/// [`ARef::clone`]. +/// +/// # Invariants +/// +/// The pointer stored in `ptr` is non-null and valid for the lifetime of the [`ARef`] instance. In +/// particular, the [`ARef`] instance owns an increment on underlying object's reference count. +pub struct ARef { + ptr: NonNull, + _p: PhantomData, +} + +impl ARef { + /// Creates a new instance of [`ARef`]. + /// + /// It takes over an increment of the reference count on the underlying object. + /// + /// # Safety + /// + /// Callers must ensure that the reference count was incremented at least once, and that they + /// are properly relinquishing one increment.
That is, if there is only one increment, callers + /// must not use the underlying object anymore -- it is only safe to do so via the newly + /// created [`ARef`]. + pub unsafe fn from_raw(ptr: NonNull) -> Self { + // INVARIANT: The safety requirements guarantee that the new instance now owns the + // increment on the refcount. + Self { + ptr, + _p: PhantomData, + } + } +} + +impl Clone for ARef { + fn clone(&self) -> Self { + self.inc_ref(); + // SAFETY: We just incremented the refcount above. + unsafe { Self::from_raw(self.ptr) } + } +} + +impl Deref for ARef { + type Target = T; + + fn deref(&self) -> &Self::Target { + // SAFETY: The type invariants guarantee that the object is valid. + unsafe { self.ptr.as_ref() } + } +} + +impl From<&T> for ARef { + fn from(b: &T) -> Self { + b.inc_ref(); + // SAFETY: We just incremented the refcount above. + unsafe { Self::from_raw(NonNull::from(b)) } + } +} + +impl Drop for ARef { + fn drop(&mut self) { + // SAFETY: The type invariants guarantee that the `ARef` owns the reference we're about to + // decrement. + unsafe { T::dec_ref(self.ptr) }; + } +} diff --git a/rust/kernel/user_ptr.rs b/rust/kernel/user_ptr.rs new file mode 100644 index 00000000000000..8489e80923c779 --- /dev/null +++ b/rust/kernel/user_ptr.rs @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! User pointers. +//! +//! C header: [`include/linux/uaccess.h`](../../../../include/linux/uaccess.h) + +use crate::{ + bindings, c_types, + error::code::*, + io_buffer::{IoBufferReader, IoBufferWriter}, + Result, +}; +use alloc::vec::Vec; + +/// A reference to an area in userspace memory, which can be either +/// read-only or read-write. +/// +/// All methods on this struct are safe: invalid pointers return +/// `EFAULT`. 
Concurrent access, *including data races to/from userspace +/// memory*, is permitted, because fundamentally another userspace +/// thread/process could always be modifying memory at the same time +/// (in the same way that userspace Rust's [`std::io`] permits data races +/// with the contents of files on disk). In the presence of a race, the +/// exact byte values read/written are unspecified but the operation is +/// well-defined. Kernelspace code should validate its copy of data +/// after completing a read, and not expect that multiple reads of the +/// same address will return the same value. +/// +/// All APIs enforce the invariant that a given byte of memory from userspace +/// may only be read once. By preventing double-fetches we avoid TOCTOU +/// vulnerabilities. This is accomplished by taking `self` by value to prevent +/// obtaining multiple readers on a given [`UserSlicePtr`], and the readers +/// only permitting forward reads. +/// +/// Constructing a [`UserSlicePtr`] performs no checks on the provided +/// address and length, it can safely be constructed inside a kernel thread +/// with no current userspace process. Reads and writes wrap the kernel APIs +/// `copy_from_user` and `copy_to_user`, which check the memory map of the +/// current process and enforce that the address range is within the user +/// range (no additional calls to `access_ok` are needed). +/// +/// [`std::io`]: https://doc.rust-lang.org/std/io/index.html +pub struct UserSlicePtr(*mut c_types::c_void, usize); + +impl UserSlicePtr { + /// Constructs a user slice from a raw pointer and a length in bytes. + /// + /// # Safety + /// + /// Callers must be careful to avoid time-of-check-time-of-use + /// (TOCTOU) issues. The simplest way is to create a single instance of + /// [`UserSlicePtr`] per user memory block as it reads each byte at + /// most once. 
+ pub unsafe fn new(ptr: *mut c_types::c_void, length: usize) -> Self { + UserSlicePtr(ptr, length) + } + + /// Reads the entirety of the user slice. + /// + /// Returns `EFAULT` if the address does not currently point to + /// mapped, readable memory. + pub fn read_all(self) -> Result> { + self.reader().read_all() + } + + /// Constructs a [`UserSlicePtrReader`]. + pub fn reader(self) -> UserSlicePtrReader { + UserSlicePtrReader(self.0, self.1) + } + + /// Writes the provided slice into the user slice. + /// + /// Returns `EFAULT` if the address does not currently point to + /// mapped, writable memory (in which case some data from before the + /// fault may be written), or `data` is larger than the user slice + /// (in which case no data is written). + pub fn write_all(self, data: &[u8]) -> Result { + self.writer().write_slice(data) + } + + /// Constructs a [`UserSlicePtrWriter`]. + pub fn writer(self) -> UserSlicePtrWriter { + UserSlicePtrWriter(self.0, self.1) + } + + /// Constructs both a [`UserSlicePtrReader`] and a [`UserSlicePtrWriter`]. + pub fn reader_writer(self) -> (UserSlicePtrReader, UserSlicePtrWriter) { + ( + UserSlicePtrReader(self.0, self.1), + UserSlicePtrWriter(self.0, self.1), + ) + } +} + +/// A reader for [`UserSlicePtr`]. +/// +/// Used to incrementally read from the user slice. +pub struct UserSlicePtrReader(*mut c_types::c_void, usize); + +impl IoBufferReader for UserSlicePtrReader { + /// Returns the number of bytes left to be read from this. + /// + /// Note that even reading less than this number of bytes may fail. + fn len(&self) -> usize { + self.1 + } + + /// Reads raw data from the user slice into a raw kernel buffer. + /// + /// # Safety + /// + /// The output buffer must be valid. 
+ unsafe fn read_raw(&mut self, out: *mut u8, len: usize) -> Result { + if len > self.1 || len > u32::MAX as usize { + return Err(EFAULT); + } + let res = unsafe { bindings::copy_from_user(out as _, self.0, len as _) }; + if res != 0 { + return Err(EFAULT); + } + // Since this is not a pointer to a valid object in our program, + // we cannot use `add`, which has C-style rules for defined + // behavior. + self.0 = self.0.wrapping_add(len); + self.1 -= len; + Ok(()) + } +} + +/// A writer for [`UserSlicePtr`]. +/// +/// Used to incrementally write into the user slice. +pub struct UserSlicePtrWriter(*mut c_types::c_void, usize); + +impl IoBufferWriter for UserSlicePtrWriter { + fn len(&self) -> usize { + self.1 + } + + fn clear(&mut self, mut len: usize) -> Result { + let mut ret = Ok(()); + if len > self.1 { + ret = Err(EFAULT); + len = self.1; + } + + // SAFETY: The buffer will be validated by `clear_user`. We ensure that `len` is within + // bounds in the check above. + let left = unsafe { bindings::clear_user(self.0, len as _) } as usize; + if left != 0 { + ret = Err(EFAULT); + len -= left; + } + + self.0 = self.0.wrapping_add(len); + self.1 -= len; + ret + } + + unsafe fn write_raw(&mut self, data: *const u8, len: usize) -> Result { + if len > self.1 || len > u32::MAX as usize { + return Err(EFAULT); + } + let res = unsafe { bindings::copy_to_user(self.0, data as _, len as _) }; + if res != 0 { + return Err(EFAULT); + } + // Since this is not a pointer to a valid object in our program, + // we cannot use `add`, which has C-style rules for defined + // behavior. + self.0 = self.0.wrapping_add(len); + self.1 -= len; + Ok(()) + } +} From b11cd2a2f93b0c23be4b846a56c7642a3a8252b0 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 17:18:23 +0200 Subject: [PATCH 0027/1250] rust: export generated symbols All symbols are reexported reusing the `EXPORT_SYMBOL_GPL` macro from C. The lists of symbols are generated on the fly. 
There are three main sets of symbols to distinguish: - The ones from the `core` and `alloc` crates (from the Rust standard library). The code is licensed as Apache/MIT. - The ones from our abstractions in the `kernel` crate. - The helpers (already exported since they are not generated). We export everything as GPL. This ensures we do not mistakenly expose GPL kernel symbols/features as non-GPL, even indirectly. Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Signed-off-by: Miguel Ojeda --- rust/exports.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 rust/exports.c diff --git a/rust/exports.c b/rust/exports.c new file mode 100644 index 00000000000000..fe3dcfdd6fbf6e --- /dev/null +++ b/rust/exports.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A hack to export Rust symbols for loadable modules without having to redo + * the entire `include/linux/export.h` logic in Rust. + * + * This requires the Rust's new/future `v0` mangling scheme because the default + * one ("legacy") uses invalid characters for C identifiers (thus we cannot use + * the `EXPORT_SYMBOL_*` macros). + * + * All symbols are exported as GPL-only to guarantee no GPL-only feature is + * accidentally exposed. + */ + +#include <linux/module.h> + +#define EXPORT_SYMBOL_RUST_GPL(sym) extern int sym; EXPORT_SYMBOL_GPL(sym); + +#include "exports_core_generated.h" +#include "exports_alloc_generated.h" +#include "exports_kernel_generated.h" From 2fbf5241a56115202991cd1fb8b30a4d5f524c56 Mon Sep 17 00:00:00 2001 From: Gary Guo Date: Sat, 3 Jul 2021 17:38:57 +0200 Subject: [PATCH 0028/1250] vsprintf: add new `%pA` format specifier This patch adds a format specifier `%pA` to `vsprintf` which formats a pointer as `core::fmt::Arguments`.
Doing so allows us to directly format to the internal buffer of `printf`, so we do not have to use a temporary buffer on the stack to pre-assemble the message on the Rust side. This specifier is intended only to be used from Rust and not for C, so `checkpatch.pl` is intentionally unchanged to catch any misuse. Reviewed-by: Kees Cook Acked-by: Petr Mladek Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Signed-off-by: Gary Guo Co-developed-by: Miguel Ojeda Signed-off-by: Miguel Ojeda --- Documentation/core-api/printk-formats.rst | 10 ++++++++++ lib/vsprintf.c | 13 +++++++++++++ 2 files changed, 23 insertions(+) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 5e89497ba314e7..dbe1aacc79d0fc 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -625,6 +625,16 @@ Examples:: %p4cc Y10 little-endian (0x20303159) %p4cc NV12 big-endian (0xb231564e) +Rust +---- + +:: + + %pA + +Only intended to be used from Rust code to format ``core::fmt::Arguments``. +Do *not* use it from C. + Thanks ====== diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 40d26a07a13319..00f71f91d991d0 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2270,6 +2270,9 @@ int __init no_hash_pointers_enable(char *str) } early_param("no_hash_pointers", no_hash_pointers_enable); +/* Used for Rust formatting ('%pA'). */ +char *rust_fmt_argument(char *buf, char *end, void *ptr); + /* * Show a '%p' thing. A kernel extension is that the '%p' is followed * by an extra set of alphanumeric characters that are extended format @@ -2396,6 +2399,10 @@ early_param("no_hash_pointers", no_hash_pointers_enable); * * Note: The default behaviour (unadorned %p) is to hash the address, * rendering it useful as a unique identifier. 
+ * + * There is also a '%pA' format specifier, but it is only intended to be used + * from Rust code to format core::fmt::Arguments. Do *not* use it from C. + * See rust/kernel/print.rs for details. */ static noinline_for_stack char *pointer(const char *fmt, char *buf, char *end, void *ptr, @@ -2468,6 +2475,12 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return device_node_string(buf, end, ptr, spec, fmt + 1); case 'f': return fwnode_string(buf, end, ptr, spec, fmt + 1); + case 'A': + if (!IS_ENABLED(CONFIG_RUST)) { + WARN_ONCE(1, "Please remove %%pA from non-Rust code\n"); + return error_string(buf, end, "(%pA?)", spec); + } + return rust_fmt_argument(buf, end, ptr); case 'x': return pointer_string(buf, end, ptr, spec); case 'e': From 59cda43d682e9903b2949164c966b0dbfa3f9345 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 22 May 2022 03:11:08 +0200 Subject: [PATCH 0029/1250] scripts: checkpatch: diagnose uses of `%pA` in the C side The `%pA` format specifier is only intended to be used from Rust. `checkpatch.pl` already gives a warning for invalid specifiers: WARNING: Invalid vsprintf pointer extension '%pA' With this change, we introduce an error message with further explanation: ERROR: '%pA' is only intended to be used from Rust code Suggested-by: Kees Cook Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Signed-off-by: Miguel Ojeda --- scripts/checkpatch.pl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 577e029987011e..457d544b0b9dab 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -6784,6 +6784,10 @@ sub process { my $stat_real = get_stat_real($linenr, $lc); my $ext_type = "Invalid"; my $use = ""; + if ($bad_specifier =~ /pA/) { + ERROR("VSPRINTF_RUST", + "'\%pA' is only intended to be used from Rust code\n" .
"$here\n$stat_real\n"); + } if ($bad_specifier =~ /p[Ff]/) { $use = " - use %pS instead"; $use =~ s/pS/ps/ if ($bad_specifier =~ /pf/); From b04a582e3cc0ae29dcf8182f3c3006af4169d9ea Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 22 May 2022 17:22:58 +0200 Subject: [PATCH 0030/1250] scripts: checkpatch: enable language-independent checks for Rust Include Rust in the "source code files" category, so that the language-independent tests are checked for Rust too, and teach `checkpatch` about the comment style for Rust files. This enables the malformed SPDX check, the misplaced SPDX license tag check, the long line checks, the lines without a newline check and the embedded filename check. Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Signed-off-by: Miguel Ojeda --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 457d544b0b9dab..923e82467fe5a6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3615,7 +3615,7 @@ sub process { my $comment = ""; if ($realfile =~ /\.(h|s|S)$/) { $comment = '/*'; - } elsif ($realfile =~ /\.(c|dts|dtsi)$/) { + } elsif ($realfile =~ /\.(c|rs|dts|dtsi)$/) { $comment = '//'; } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc|yaml)$/) { $comment = '#'; @@ -3663,7 +3663,7 @@ sub process { } # check we are in a valid source file if not then ignore this hunk - next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/); + next if ($realfile !~ /\.(h|c|rs|s|S|sh|dtsi|dts)$/); # check for using SPDX-License-Identifier on the wrong line number if ($realline != $checklicenseline && From 56bf83de2cd1350017a6570fed570e2321c1b541 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Fri, 6 May 2022 16:37:18 +0200 Subject: [PATCH 0031/1250] scripts: add `rustdoc_test_{builder,gen}.py` scripts Rust documentation tests are typically examples of usage of 
any item (e.g. function, struct, module...). They are very convenient because they are just written alongside the documentation, e.g.: /// Sums two numbers. /// /// # Examples /// /// ``` /// assert_eq!(mymod::f(10, 20), 30); /// ``` pub fn f(a: i32, b: i32) -> i32 { a + b } These scripts are used to transform Rust documentation tests into KUnit tests, so that they can be run in-kernel. In turn, this allows us to run tests that use kernel APIs. In particular, the test builder receives `rustdoc`-generated tests, parses them and stores the result. Then, the test generator takes the saved results and generates a KUnit suite where each original documentation test is a test case. For the moment, this is only done for the `kernel` crate, but the plan is to generalize it for other crates and modules. Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Signed-off-by: Miguel Ojeda --- scripts/rustdoc_test_builder.py | 59 ++++++++++++ scripts/rustdoc_test_gen.py | 164 ++++++++++++++++++++++++++++++++ 2 files changed, 223 insertions(+) create mode 100755 scripts/rustdoc_test_builder.py create mode 100755 scripts/rustdoc_test_gen.py diff --git a/scripts/rustdoc_test_builder.py b/scripts/rustdoc_test_builder.py new file mode 100755 index 00000000000000..d9b47a5c54fc98 --- /dev/null +++ b/scripts/rustdoc_test_builder.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +"""rustdoc_test_builder - Test builder for `rustdoc`-generated tests. +""" + +import json +import pathlib +import re +import sys + +RUST_DIR = pathlib.Path("rust") +TESTS_DIR = RUST_DIR / "test" / "doctests" / "kernel" + +# `[^\s]*` removes the prefix (e.g. `_doctest_main_`) plus any +# leading path (for `O=` builds). 
+MAIN_RE = re.compile( + r"^" + r"fn main\(\) { " + r"#\[allow\(non_snake_case\)\] " + r"fn ([^\s]*rust_kernel_([a-zA-Z0-9_]+))\(\) {" + r"$" +) + +def main(): + found_main = False + test_header = "" + test_body = "" + for line in sys.stdin.readlines(): + main_match = MAIN_RE.match(line) + if main_match: + if found_main: + raise Exception("More than one `main` line found.") + found_main = True + function_name = main_match.group(1) + test_name = f"rust_kernel_doctest_{main_match.group(2)}" + continue + + if found_main: + test_body += line + else: + test_header += line + + if not found_main: + raise Exception("No `main` line found.") + + call_line = f"}} {function_name}() }}" + if not test_body.endswith(call_line): + raise Exception("Unexpected end of test body.") + test_body = test_body[:-len(call_line)] + + with open(TESTS_DIR / f"{test_name}.json", "w") as fd: + json.dump({ + "name": test_name, + "header": test_header, + "body": test_body, + }, fd, sort_keys=True, indent=4) + +if __name__ == "__main__": + main() diff --git a/scripts/rustdoc_test_gen.py b/scripts/rustdoc_test_gen.py new file mode 100755 index 00000000000000..ad9a94293ab595 --- /dev/null +++ b/scripts/rustdoc_test_gen.py @@ -0,0 +1,164 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +"""rustdoc_test_gen - Generates KUnit tests from saved `rustdoc`-generated tests. +""" + +import json +import os +import pathlib + +RUST_DIR = pathlib.Path("rust") +TESTS_DIR = RUST_DIR / "test" / "doctests" / "kernel" + +RUST_FILE = RUST_DIR / "doctests_kernel_generated.rs" +C_FILE = RUST_DIR / "doctests_kernel_generated_kunit.c" + +RUST_TEMPLATE_TEST = """ +/// Generated `{test_name}` KUnit test case from a Rust documentation test. +#[no_mangle] +pub fn {test_name}(__kunit_test: *mut kernel::bindings::kunit) {{ + /// Provides mutual exclusion (see `# Implementation` notes). 
+ static __KUNIT_TEST_MUTEX: kernel::sync::smutex::Mutex<()> = + kernel::sync::smutex::Mutex::new(()); + + /// Saved argument (see `# Implementation` notes). + static __KUNIT_TEST: core::sync::atomic::AtomicPtr = + core::sync::atomic::AtomicPtr::new(core::ptr::null_mut()); + + let __kunit_test_mutex_guard = __KUNIT_TEST_MUTEX.lock(); + __KUNIT_TEST.store(__kunit_test, core::sync::atomic::Ordering::SeqCst); + + /// Overrides the usual [`assert!`] macro with one that calls KUnit instead. + macro_rules! assert {{ + ($cond:expr $(,)?) => {{{{ + kernel::kunit_assert!( + __KUNIT_TEST.load(core::sync::atomic::Ordering::SeqCst), + $cond + ); + }}}} + }} + + /// Overrides the usual [`assert_eq!`] macro with one that calls KUnit instead. + macro_rules! assert_eq {{ + ($left:expr, $right:expr $(,)?) => {{{{ + kernel::kunit_assert_eq!( + __KUNIT_TEST.load(core::sync::atomic::Ordering::SeqCst), + $left, + $right + ); + }}}} + }} + + // Many tests need the prelude, so provide it by default. + use kernel::prelude::*; + + {test_body} +}} +""" +RUST_TEMPLATE = """// SPDX-License-Identifier: GPL-2.0 + +//! `kernel` crate documentation tests. + +// # Implementation +// +// KUnit gives us a context in the form of the `kunit_test` parameter that one +// needs to pass back to other KUnit functions and macros. +// +// However, we want to keep this as an implementation detail because: +// +// - Test code should not care about the implementation. +// +// - Documentation looks worse if it needs to carry extra details unrelated +// to the piece being described. +// +// - Test code should be able to define functions and call them, without +// having to carry the context (since functions cannot capture dynamic +// environment). +// +// - Later on, we may want to be able to test non-kernel code (e.g. `core`, +// `alloc` or external crates) which likely use the standard library +// `assert*!` macros. 
+// +// For this reason, `static`s are used in the generated code to save the +// argument which then gets read by the asserting macros. These macros then +// call back into KUnit, instead of panicking. +// +// To avoid depending on whether KUnit allows to run tests concurrently and/or +// reentrantly, we ensure mutual exclusion on our end. To ensure a single test +// being killed does not trigger failure of every other test (timing out), +// we provide different `static`s per test (which also allow for concurrent +// execution, though KUnit runs them sequentially). +// +// Furthermore, since test code may create threads and assert from them, we use +// an `AtomicPtr` to hold the context (though each test only writes once before +// threads may be created). + +{rust_header} + +const __LOG_PREFIX: &[u8] = b"rust_kernel_doctests\\0"; + +{rust_tests} +""" + +C_TEMPLATE_TEST_DECLARATION = "void {test_name}(struct kunit *);\n" +C_TEMPLATE_TEST_CASE = " KUNIT_CASE({test_name}),\n" +C_TEMPLATE = """// SPDX-License-Identifier: GPL-2.0 +/* + * `kernel` crate documentation tests. 
+ */ + +#include + +{c_test_declarations} + +static struct kunit_case test_cases[] = {{ + {c_test_cases} + {{ }} +}}; + +static struct kunit_suite test_suite = {{ + .name = "rust_kernel_doctests", + .test_cases = test_cases, +}}; + +kunit_test_suite(test_suite); + +MODULE_LICENSE("GPL"); +""" + +def main(): + rust_header = set() + rust_tests = "" + c_test_declarations = "" + c_test_cases = "" + for filename in sorted(os.listdir(TESTS_DIR)): + with open(TESTS_DIR / filename, "r") as fd: + test = json.load(fd) + for line in test["header"].strip().split("\n"): + rust_header.add(line) + rust_tests += RUST_TEMPLATE_TEST.format( + test_name = test["name"], + test_body = test["body"] + ) + c_test_declarations += C_TEMPLATE_TEST_DECLARATION.format( + test_name = test["name"] + ) + c_test_cases += C_TEMPLATE_TEST_CASE.format( + test_name = test["name"] + ) + rust_header = sorted(rust_header) + + with open(RUST_FILE, "w") as fd: + fd.write(RUST_TEMPLATE.format( + rust_header = "\n".join(rust_header).strip(), + rust_tests = rust_tests.strip(), + )) + + with open(C_FILE, "w") as fd: + fd.write(C_TEMPLATE.format( + c_test_declarations=c_test_declarations.strip(), + c_test_cases=c_test_cases.strip(), + )) + +if __name__ == "__main__": + main() From 16ea7610e30b7a8d1ac9d9e21b640362ad8ac0a4 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 17:26:15 +0200 Subject: [PATCH 0032/1250] scripts: add `generate_rust_analyzer.py` scripts The `generate_rust_analyzer.py` script generates the configuration file (`rust-project.json`) for rust-analyzer. rust-analyzer is a modular compiler frontend for the Rust language. It provides an LSP server which can be used in editors such as VS Code, Emacs or Vim. 
Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Finn Behrens Signed-off-by: Finn Behrens Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boris-Chengbiao Zhou Signed-off-by: Boris-Chengbiao Zhou Signed-off-by: Miguel Ojeda --- scripts/generate_rust_analyzer.py | 134 ++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100755 scripts/generate_rust_analyzer.py diff --git a/scripts/generate_rust_analyzer.py b/scripts/generate_rust_analyzer.py new file mode 100755 index 00000000000000..37c049fb18f257 --- /dev/null +++ b/scripts/generate_rust_analyzer.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 +"""generate_rust_analyzer - Generates the `rust-project.json` file for `rust-analyzer`. +""" + +import argparse +import json +import logging +import pathlib +import sys + +def generate_crates(srctree, objtree, sysroot_src): + # Generate the configuration list. + cfg = [] + with open(objtree / "include" / "generated" / "rustc_cfg") as fd: + for line in fd: + line = line.replace("--cfg=", "") + line = line.replace("\n", "") + cfg.append(line) + + # Now fill the crates list -- dependencies need to come first. + # + # Avoid O(n^2) iterations by keeping a map of indexes. + crates = [] + crates_indexes = {} + + def append_crate(display_name, root_module, deps, cfg=[], is_workspace_member=True, is_proc_macro=False): + crates_indexes[display_name] = len(crates) + crates.append({ + "display_name": display_name, + "root_module": str(root_module), + "is_workspace_member": is_workspace_member, + "is_proc_macro": is_proc_macro, + "deps": [{"crate": crates_indexes[dep], "name": dep} for dep in deps], + "cfg": cfg, + "edition": "2021", + "env": { + "RUST_MODFILE": "This is only for rust-analyzer" + } + }) + + # First, the ones in `rust/` since they are a bit special. 
+ append_crate( + "core", + sysroot_src / "core" / "src" / "lib.rs", + [], + is_workspace_member=False, + ) + + append_crate( + "compiler_builtins", + srctree / "rust" / "compiler_builtins.rs", + [], + ) + + append_crate( + "alloc", + srctree / "rust" / "alloc" / "lib.rs", + ["core", "compiler_builtins"], + ) + + append_crate( + "macros", + srctree / "rust" / "macros" / "lib.rs", + [], + is_proc_macro=True, + ) + crates[-1]["proc_macro_dylib_path"] = "rust/libmacros.so" + + append_crate( + "build_error", + srctree / "rust" / "build_error.rs", + ["core", "compiler_builtins"], + ) + + append_crate( + "kernel", + srctree / "rust" / "kernel" / "lib.rs", + ["core", "alloc", "macros", "build_error"], + cfg=cfg, + ) + crates[-1]["env"]["OBJTREE"] = str(objtree.resolve(True)) + crates[-1]["source"] = { + "include_dirs": [ + str(srctree / "rust" / "kernel"), + str(objtree / "rust") + ], + "exclude_dirs": [], + } + + # Then, the rest outside of `rust/`. + # + # We explicitly mention the top-level folders we want to cover. + for folder in ("samples", "drivers"): + for path in (srctree / folder).rglob("*.rs"): + logging.info("Checking %s", path) + name = path.name.replace(".rs", "") + + # Skip those that are not crate roots. 
+ if f"{name}.o" not in open(path.parent / "Makefile").read(): + continue + + logging.info("Adding %s", name) + append_crate( + name, + path, + ["core", "alloc", "kernel"], + cfg=cfg, + ) + + return crates + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--verbose', '-v', action='store_true') + parser.add_argument("srctree", type=pathlib.Path) + parser.add_argument("objtree", type=pathlib.Path) + parser.add_argument("sysroot_src", type=pathlib.Path) + args = parser.parse_args() + + logging.basicConfig( + format="[%(asctime)s] [%(levelname)s] %(message)s", + level=logging.INFO if args.verbose else logging.WARNING + ) + + rust_project = { + "crates": generate_crates(args.srctree, args.objtree, args.sysroot_src), + "sysroot_src": str(args.sysroot_src), + } + + json.dump(rust_project, sys.stdout, sort_keys=True, indent=4) + +if __name__ == "__main__": + main() From 54203052bf8ef43c5e4952707a08ed27628fcfca Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 5 Dec 2021 19:00:43 +0100 Subject: [PATCH 0033/1250] scripts: decode_stacktrace: demangle Rust symbols Recent versions of both Binutils (`c++filt`) and LLVM (`llvm-cxxfilt`) provide Rust v0 mangling support. 
Reviewed-by: Kees Cook Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Signed-off-by: Miguel Ojeda --- scripts/decode_stacktrace.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 5fbad61fe490e0..f3c7b506d44071 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh @@ -8,6 +8,14 @@ usage() { echo " $0 -r | [|auto] []" } +# Try to find a Rust demangler +if type llvm-cxxfilt >/dev/null 2>&1 ; then + cppfilt=llvm-cxxfilt +elif type c++filt >/dev/null 2>&1 ; then + cppfilt=c++filt + cppfilt_opts=-i +fi + if [[ $1 == "-r" ]] ; then vmlinux="" basepath="auto" @@ -169,6 +177,12 @@ parse_symbol() { # In the case of inlines, move everything to same line code=${code//$'\n'/' '} + # Demangle if the name looks like a Rust symbol and if + # we got a Rust demangler + if [[ $name =~ ^_R && $cppfilt != "" ]] ; then + name=$("$cppfilt" "$cppfilt_opts" "$name") + fi + # Replace old address with pretty line numbers symbol="$segment$name ($code)" } From 6fcfd3e119fc06fc0d15e80aa3c40c93684ea01f Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 17:23:16 +0200 Subject: [PATCH 0034/1250] docs: add Rust documentation Most of the documentation for Rust is written within the source code itself, as it is idiomatic for Rust projects. This applies to both the shared infrastructure at `rust/` as well as any other Rust module (e.g. drivers) written across the kernel. However, these documents contain general information that does not fit particularly well in the source code, like the Quick Start guide. It also contains an asset (SVG logo) used for the `rustdoc` target and a few other small changes elsewhere in the documentation folder. 
Reviewed-by: Kees Cook Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Finn Behrens Signed-off-by: Finn Behrens Co-developed-by: Adam Bratschi-Kaye Signed-off-by: Adam Bratschi-Kaye Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Michael Ellerman Signed-off-by: Michael Ellerman Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Wu XiangCheng Signed-off-by: Wu XiangCheng Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boris-Chengbiao Zhou Signed-off-by: Boris-Chengbiao Zhou Co-developed-by: Yuki Okushi Signed-off-by: Yuki Okushi Co-developed-by: Wei Liu Signed-off-by: Wei Liu Co-developed-by: Daniel Xu Signed-off-by: Daniel Xu Co-developed-by: Julian Merkle Signed-off-by: Julian Merkle Signed-off-by: Miguel Ojeda --- Documentation/doc-guide/kernel-doc.rst | 3 + Documentation/index.rst | 1 + Documentation/kbuild/kbuild.rst | 17 ++ Documentation/kbuild/makefiles.rst | 50 ++++- Documentation/process/changes.rst | 41 ++++ Documentation/rust/arch-support.rst | 25 +++ Documentation/rust/coding-guidelines.rst | 216 +++++++++++++++++++ Documentation/rust/general-information.rst | 79 +++++++ Documentation/rust/index.rst | 22 ++ Documentation/rust/quick-start.rst | 232 +++++++++++++++++++++ 10 files changed, 682 insertions(+), 4 deletions(-) create mode 100644 Documentation/rust/arch-support.rst create mode 100644 Documentation/rust/coding-guidelines.rst create mode 100644 Documentation/rust/general-information.rst create mode 100644 Documentation/rust/index.rst create mode 100644 Documentation/rust/quick-start.rst diff --git a/Documentation/doc-guide/kernel-doc.rst b/Documentation/doc-guide/kernel-doc.rst index 79aaa55d6bcf2b..cde7aa67e76fa0 100644 --- a/Documentation/doc-guide/kernel-doc.rst +++ b/Documentation/doc-guide/kernel-doc.rst @@ -11,6 +11,9 @@ when it is embedded in source files. reasons. 
The kernel source contains tens of thousands of kernel-doc comments. Please stick to the style described here. +.. note:: kernel-doc does not cover Rust code: please see + Documentation/rust/general-information.rst instead. + The kernel-doc structure is extracted from the comments, and proper `Sphinx C Domain`_ function and type descriptions with anchors are generated from them. The descriptions are filtered for special kernel-doc diff --git a/Documentation/index.rst b/Documentation/index.rst index 1988c19d9daf48..ee639a500278da 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -82,6 +82,7 @@ merged much easier. maintainer/index fault-injection/index livepatch/index + rust/index Kernel API documentation diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst index ef19b9c1352362..08f575e6236c71 100644 --- a/Documentation/kbuild/kbuild.rst +++ b/Documentation/kbuild/kbuild.rst @@ -48,6 +48,10 @@ KCFLAGS ------- Additional options to the C compiler (for built-in and modules). +KRUSTFLAGS +---------- +Additional options to the Rust compiler (for built-in and modules). + CFLAGS_KERNEL ------------- Additional options for $(CC) when used to compile @@ -57,6 +61,15 @@ CFLAGS_MODULE ------------- Additional module specific options to use for $(CC). +RUSTFLAGS_KERNEL +---------------- +Additional options for $(RUSTC) when used to compile +code that is compiled as built-in. + +RUSTFLAGS_MODULE +---------------- +Additional module specific options to use for $(RUSTC). + LDFLAGS_MODULE -------------- Additional options used for $(LD) when linking modules. @@ -69,6 +82,10 @@ HOSTCXXFLAGS ------------ Additional flags to be passed to $(HOSTCXX) when building host programs. +HOSTRUSTFLAGS +------------- +Additional flags to be passed to $(HOSTRUSTC) when building host programs. + HOSTLDFLAGS ----------- Additional flags to be passed when linking host programs. 
diff --git a/Documentation/kbuild/makefiles.rst b/Documentation/kbuild/makefiles.rst index 11a296e52d6800..5ea1e72d89c8b6 100644 --- a/Documentation/kbuild/makefiles.rst +++ b/Documentation/kbuild/makefiles.rst @@ -29,8 +29,9 @@ This document describes the Linux kernel Makefiles. --- 4.1 Simple Host Program --- 4.2 Composite Host Programs --- 4.3 Using C++ for host programs - --- 4.4 Controlling compiler options for host programs - --- 4.5 When host programs are actually built + --- 4.4 Using Rust for host programs + --- 4.5 Controlling compiler options for host programs + --- 4.6 When host programs are actually built === 5 Userspace Program support --- 5.1 Simple Userspace Program @@ -835,7 +836,24 @@ Both possibilities are described in the following. qconf-cxxobjs := qconf.o qconf-objs := check.o -4.4 Controlling compiler options for host programs +4.4 Using Rust for host programs +-------------------------------- + + Kbuild offers support for host programs written in Rust. However, + since a Rust toolchain is not mandatory for kernel compilation, + it may only be used in scenarios where Rust is required to be + available (e.g. when ``CONFIG_RUST`` is enabled). + + Example:: + + hostprogs := target + target-rust := y + + Kbuild will compile ``target`` using ``target.rs`` as the crate root, + located in the same directory as the ``Makefile``. The crate may + consist of several source files (see ``samples/rust/hostprogs``). + +4.5 Controlling compiler options for host programs -------------------------------------------------- When compiling host programs, it is possible to set specific flags. @@ -867,7 +885,7 @@ Both possibilities are described in the following. When linking qconf, it will be passed the extra option "-L$(QTDIR)/lib". 
-4.5 When host programs are actually built +4.6 When host programs are actually built ----------------------------------------- Kbuild will only build host-programs when they are referenced @@ -1181,6 +1199,17 @@ When kbuild executes, the following steps are followed (roughly): The first example utilises the trick that a config option expands to 'y' when selected. + KBUILD_RUSTFLAGS + $(RUSTC) compiler flags + + Default value - see top level Makefile + Append or modify as required per architecture. + + Often, the KBUILD_RUSTFLAGS variable depends on the configuration. + + Note that target specification file generation (for ``--target``) + is handled in ``scripts/generate_rust_target.rs``. + KBUILD_AFLAGS_KERNEL Assembler options specific for built-in @@ -1208,6 +1237,19 @@ When kbuild executes, the following steps are followed (roughly): are used for $(CC). From commandline CFLAGS_MODULE shall be used (see kbuild.rst). + KBUILD_RUSTFLAGS_KERNEL + $(RUSTC) options specific for built-in + + $(KBUILD_RUSTFLAGS_KERNEL) contains extra Rust compiler flags used to + compile resident kernel code. + + KBUILD_RUSTFLAGS_MODULE + Options for $(RUSTC) when building modules + + $(KBUILD_RUSTFLAGS_MODULE) is used to add arch-specific options that + are used for $(RUSTC). + From commandline RUSTFLAGS_MODULE shall be used (see kbuild.rst). + KBUILD_LDFLAGS_MODULE Options for $(LD) when linking modules diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index a337e8eabfe11f..a886ac497266b5 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -31,6 +31,8 @@ you probably needn't concern yourself with pcmciautils. 
====================== =============== ======================================== GNU C 5.1 gcc --version Clang/LLVM (optional) 11.0.0 clang --version +Rust (optional) 1.60.0 rustc --version +bindgen (optional) 0.56.0 bindgen --version GNU make 3.81 make --version binutils 2.23 ld -v flex 2.5.35 flex --version @@ -78,6 +80,29 @@ kernels. Older releases aren't guaranteed to work, and we may drop workarounds from the kernel that were used to support older versions. Please see additional docs on :ref:`Building Linux with Clang/LLVM `. +Rust (optional) +--------------- + +A particular version of the Rust toolchain is required. Newer versions may or +may not work because the kernel depends on some unstable Rust features, for +the moment. + +Each Rust toolchain comes with several "components", some of which are required +(like ``rustc``) and some that are optional. The ``rust-src`` component (which +is optional) needs to be installed to build the kernel. Other components are +useful for developing. + +Please see Documentation/rust/quick-start.rst for instructions on how to +satisfy the build requirements of Rust support. In particular, the ``Makefile`` +target ``rustavailable`` is useful to check why the Rust toolchain may not +be detected. + +bindgen (optional) +------------------ + +``bindgen`` is used to generate the Rust bindings to the C side of the kernel. +It depends on ``libclang``. + Make ---- @@ -340,6 +365,12 @@ Sphinx Please see :ref:`sphinx_install` in :ref:`Documentation/doc-guide/sphinx.rst ` for details about Sphinx requirements. +rustdoc +------- + +``rustdoc`` is used to generate the documentation for Rust code. Please see +Documentation/rust/general-information.rst for more information. + Getting updated software ======================== @@ -356,6 +387,16 @@ Clang/LLVM - :ref:`Getting LLVM `. +Rust +---- + +- Documentation/rust/quick-start.rst. + +bindgen +------- + +- Documentation/rust/quick-start.rst. 
+ Make ---- diff --git a/Documentation/rust/arch-support.rst b/Documentation/rust/arch-support.rst new file mode 100644 index 00000000000000..79ebad889720ab --- /dev/null +++ b/Documentation/rust/arch-support.rst @@ -0,0 +1,25 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Arch Support +============ + +Currently, the Rust compiler (``rustc``) uses LLVM for code generation, +which limits the supported architectures that can be targeted. In addition, +support for building the kernel with LLVM/Clang varies (please see +Documentation/kbuild/llvm.rst). This support is needed for ``bindgen`` +which uses ``libclang``. + +Below is a general summary of architectures that currently work. Level of +support corresponds to ``S`` values in the ``MAINTAINERS`` file. + +============ ================ ============================================== +Architecture Level of support Constraints +============ ================ ============================================== +``arm`` Maintained ``armv6`` and compatible only, + ``RUST_OPT_LEVEL >= 2``. +``arm64`` Maintained None. +``powerpc`` Maintained ``ppc64le`` only, ``RUST_OPT_LEVEL < 2`` + requires ``CONFIG_THREAD_SHIFT=15``. +``riscv`` Maintained ``riscv64`` only. +``x86`` Maintained ``x86_64`` only. +============ ================ ============================================== diff --git a/Documentation/rust/coding-guidelines.rst b/Documentation/rust/coding-guidelines.rst new file mode 100644 index 00000000000000..aa8ed082613e12 --- /dev/null +++ b/Documentation/rust/coding-guidelines.rst @@ -0,0 +1,216 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Coding Guidelines +================= + +This document describes how to write Rust code in the kernel. + + +Style & formatting +------------------ + +The code should be formatted using ``rustfmt``. In this way, a person +contributing from time to time to the kernel does not need to learn and +remember one more style guide. 
More importantly, reviewers and maintainers +do not need to spend time pointing out style issues anymore, and thus +fewer patch roundtrips may be needed to land a change. + +.. note:: Conventions on comments and documentation are not checked by + ``rustfmt``. Thus, those still need to be taken care of. + +The default settings of ``rustfmt`` are used. This means the idiomatic Rust +style is followed. For instance, 4 spaces are used for indentation rather +than tabs. + +It is convenient to instruct editors/IDEs to format while typing, +when saving or at commit time. However, if for some reason reformatting +the entire kernel Rust sources is needed at some point, the following can be +run:: + + make LLVM=1 rustfmt + +It is also possible to check if everything is formatted (printing a diff +otherwise), for instance for a CI, with:: + + make LLVM=1 rustfmtcheck + +Like ``clang-format`` for the rest of the kernel, ``rustfmt`` works on +individual files, and does not require a kernel configuration. Sometimes it may +even work with broken code. + + +Comments +-------- + +"Normal" comments (i.e. ``//``, rather than code documentation which starts +with ``///`` or ``//!``) are written in Markdown the same way as documentation +comments are, even though they will not be rendered. This improves consistency, +simplifies the rules and allows moving content between the two kinds of +comments more easily. For instance: + +.. code-block:: rust + + // `object` is ready to be handled now. + f(object); + +Furthermore, just like documentation, comments are capitalized at the beginning +of a sentence and ended with a period (even if it is a single sentence). This +includes ``// SAFETY:``, ``// TODO:`` and other "tagged" comments, e.g.: + +.. code-block:: rust + + // FIXME: The error should be handled properly. + +Comments should not be used for documentation purposes: comments are intended +for implementation details, not users. 
This distinction is useful even if the +reader of the source file is both an implementor and a user of an API. In fact, +sometimes it is useful to use both comments and documentation at the same time. +For instance, for a ``TODO`` list or to comment on the documentation itself. +For the latter case, comments can be inserted in the middle; that is, closer to +the line of documentation to be commented on. For any other case, comments are +written after the documentation, e.g.: + +.. code-block:: rust + + /// Returns a new [`Foo`]. + /// + /// # Examples + /// + // TODO: Find a better example. + /// ``` + /// let foo = f(42); + /// ``` + // FIXME: Use fallible approach. + pub fn f(x: i32) -> Foo { + // ... + } + +One special kind of comments are the ``// SAFETY:`` comments. These must appear +before every ``unsafe`` block, and they explain why the code inside the block is +correct/sound, i.e. why it cannot trigger undefined behavior in any case, e.g.: + +.. code-block:: rust + + // SAFETY: `p` is valid by the safety requirements. + unsafe { *p = 0; } + +``// SAFETY:`` comments are not to be confused with the ``# Safety`` sections +in code documentation. ``# Safety`` sections specify the contract that callers +(for functions) or implementors (for traits) need to abide by. ``// SAFETY:`` +comments show why a call (for functions) or implementation (for traits) actually +respects the preconditions stated in a ``# Safety`` section or the language +reference. + + +Code documentation +------------------ + +Rust kernel code is not documented like C kernel code (i.e. via kernel-doc). +Instead, the usual system for documenting Rust code is used: the ``rustdoc`` +tool, which uses Markdown (a lightweight markup language). + +To learn Markdown, there are many guides available out there. For instance, +the one at: + + https://commonmark.org/help/ + +This is what a well-documented Rust function may look like: + +.. 
code-block:: rust + + /// Returns the contained [`Some`] value, consuming the `self` value, + /// without checking that the value is not [`None`]. + /// + /// # Safety + /// + /// Calling this method on [`None`] is *[undefined behavior]*. + /// + /// [undefined behavior]: https://doc.rust-lang.org/reference/behavior-considered-undefined.html + /// + /// # Examples + /// + /// ``` + /// let x = Some("air"); + /// assert_eq!(unsafe { x.unwrap_unchecked() }, "air"); + /// ``` + pub unsafe fn unwrap_unchecked(self) -> T { + match self { + Some(val) => val, + + // SAFETY: The safety contract must be upheld by the caller. + None => unsafe { hint::unreachable_unchecked() }, + } + } + +This example showcases a few ``rustdoc`` features and some conventions followed +in the kernel: + + - The first paragraph must be a single sentence briefly describing what + the documented item does. Further explanations must go in extra paragraphs. + + - Unsafe functions must document their safety preconditions under + a ``# Safety`` section. + + - While not shown here, if a function may panic, the conditions under which + that happens must be described under a ``# Panics`` section. + + Please note that panicking should be very rare and used only with a good + reason. In almost all cases, a fallible approach should be used, typically + returning a ``Result``. + + - If providing examples of usage would help readers, they must be written in + a section called ``# Examples``. + + - Rust items (functions, types, constants...) must be linked appropriately + (``rustdoc`` will create a link automatically). + + - Any ``unsafe`` block must be preceded by a ``// SAFETY:`` comment + describing why the code inside is sound. + + While sometimes the reason might look trivial and therefore unneeded, + writing these comments is not just a good way of documenting what has been + taken into account, but most importantly, it provides a way to know that + there are no *extra* implicit constraints. 
+ +To learn more about how to write documentation for Rust and extra features, +please take a look at the ``rustdoc`` book at: + + https://doc.rust-lang.org/rustdoc/how-to-write-documentation.html + + +Naming +------ + +Rust kernel code follows the usual Rust naming conventions: + + https://rust-lang.github.io/api-guidelines/naming.html + +When existing C concepts (e.g. macros, functions, objects...) are wrapped into +a Rust abstraction, a name as close as reasonably possible to the C side should +be used in order to avoid confusion and to improve readability when switching +back and forth between the C and Rust sides. For instance, macros such as +``pr_info`` from C are named the same in the Rust side. + +Having said that, casing should be adjusted to follow the Rust naming +conventions, and namespacing introduced by modules and types should not be +repeated in the item names. For instance, when wrapping constants like: + +.. code-block:: c + + #define GPIO_LINE_DIRECTION_IN 0 + #define GPIO_LINE_DIRECTION_OUT 1 + +The equivalent in Rust may look like (ignoring documentation): + +.. code-block:: rust + + pub mod gpio { + pub enum LineDirection { + In = bindings::GPIO_LINE_DIRECTION_IN as _, + Out = bindings::GPIO_LINE_DIRECTION_OUT as _, + } + } + +That is, the equivalent of ``GPIO_LINE_DIRECTION_IN`` would be referred to as +``gpio::LineDirection::In``. In particular, it should not be named +``gpio::gpio_line_direction::GPIO_LINE_DIRECTION_IN``. diff --git a/Documentation/rust/general-information.rst b/Documentation/rust/general-information.rst new file mode 100644 index 00000000000000..49029ee82e559e --- /dev/null +++ b/Documentation/rust/general-information.rst @@ -0,0 +1,79 @@ +.. SPDX-License-Identifier: GPL-2.0 + +General Information +=================== + +This document contains useful information to know when working with +the Rust support in the kernel. 
+ + +Code documentation +------------------ + +Rust kernel code is documented using ``rustdoc``, its built-in documentation +generator. + +The generated HTML docs include integrated search, linked items (e.g. types, +functions, constants), source code, etc. They may be read at (TODO: link when +in mainline and generated alongside the rest of the documentation): + + http://kernel.org/ + +The docs can also be easily generated and read locally. This is quite fast +(same order as compiling the code itself) and no special tools or environment +are needed. This has the added advantage that they will be tailored to +the particular kernel configuration used. To generate them, use the ``rustdoc`` +target with the same invocation used for compilation, e.g.:: + + make LLVM=1 rustdoc + +To read the docs locally in your web browser, run e.g.:: + + xdg-open rust/doc/kernel/index.html + +To learn about how to write the documentation, please see coding-guidelines.rst. + + +Extra lints +----------- + +While ``rustc`` is a very helpful compiler, some extra lints and analyses are +available via ``clippy``, a Rust linter. To enable it, pass ``CLIPPY=1`` to +the same invocation used for compilation, e.g.:: + + make LLVM=1 CLIPPY=1 + +Please note that Clippy may change code generation, thus it should not be +enabled while building a production kernel. + + +Abstractions vs. bindings +------------------------- + +Abstractions are Rust code wrapping kernel functionality from the C side. + +In order to use functions and types from the C side, bindings are created. +Bindings are the declarations for Rust of those functions and types from +the C side. + +For instance, one may write a ``Mutex`` abstraction in Rust which wraps +a ``struct mutex`` from the C side and calls its functions through the bindings. + +Abstractions are not available for all the kernel internal APIs and concepts, +but it is intended that coverage is expanded as time goes on. "Leaf" modules +(e.g. 
drivers) should not use the C bindings directly. Instead, subsystems +should provide as-safe-as-possible abstractions as needed. + + +Conditional compilation +----------------------- + +Rust code has access to conditional compilation based on the kernel +configuration: + +.. code-block:: rust + + #[cfg(CONFIG_X)] // Enabled (`y` or `m`) + #[cfg(CONFIG_X="y")] // Enabled as a built-in (`y`) + #[cfg(CONFIG_X="m")] // Enabled as a module (`m`) + #[cfg(not(CONFIG_X))] // Disabled diff --git a/Documentation/rust/index.rst b/Documentation/rust/index.rst new file mode 100644 index 00000000000000..4ae8c66b94faf9 --- /dev/null +++ b/Documentation/rust/index.rst @@ -0,0 +1,22 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Rust +==== + +Documentation related to Rust within the kernel. To start using Rust +in the kernel, please read the quick-start.rst guide. + +.. toctree:: + :maxdepth: 1 + + quick-start + general-information + coding-guidelines + arch-support + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/rust/quick-start.rst b/Documentation/rust/quick-start.rst new file mode 100644 index 00000000000000..13b7744b1e2753 --- /dev/null +++ b/Documentation/rust/quick-start.rst @@ -0,0 +1,232 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Quick Start +=========== + +This document describes how to get started with kernel development in Rust. + + +Requirements: Building +---------------------- + +This section explains how to fetch the tools needed for building. + +Some of these requirements might be available from Linux distributions +under names like ``rustc``, ``rust-src``, ``rust-bindgen``, etc. However, +at the time of writing, they are likely not to be recent enough unless +the distribution tracks the latest releases. 
+ +To easily check whether the requirements are met, the following target +can be used:: + + make LLVM=1 rustavailable + +This triggers the same logic used by Kconfig to determine whether +``RUST_IS_AVAILABLE`` should be enabled; but it also explains why not +if that is the case. + + +rustc +***** + +A particular version of the Rust compiler is required. Newer versions may or +may not work because, for the moment, the kernel depends on some unstable +Rust features. + +If ``rustup`` is being used, enter the checked out source code directory +and run:: + + rustup override set $(scripts/min-tool-version.sh rustc) + +Otherwise, fetch a standalone installer or install ``rustup`` from: + + https://www.rust-lang.org + + +Rust standard library source +**************************** + +The Rust standard library source is required because the build system will +cross-compile ``core`` and ``alloc``. + +If ``rustup`` is being used, run:: + + rustup component add rust-src + +The components are installed per toolchain, thus upgrading the Rust compiler +version later on requires re-adding the component. + +Otherwise, if a standalone installer is used, the Rust repository may be cloned +into the installation folder of the toolchain:: + + git clone --recurse-submodules \ + --branch $(scripts/min-tool-version.sh rustc) \ + https://github.com/rust-lang/rust \ + $(rustc --print sysroot)/lib/rustlib/src/rust + +In this case, upgrading the Rust compiler version later on requires manually +updating this clone. + + +libclang +******** + +``libclang`` (part of LLVM) is used by ``bindgen`` to understand the C code +in the kernel, which means LLVM needs to be installed; like when the kernel +is compiled with ``CC=clang`` or ``LLVM=1``. + +Linux distributions are likely to have a suitable one available, so it is +best to check that first. 
+ +There are also some binaries for several systems and architectures uploaded at: + + https://releases.llvm.org/download.html + +Otherwise, building LLVM takes quite a while, but it is not a complex process: + + https://llvm.org/docs/GettingStarted.html#getting-the-source-code-and-building-llvm + +Please see Documentation/kbuild/llvm.rst for more information and further ways +to fetch pre-built releases and distribution packages. + + +bindgen +******* + +The bindings to the C side of the kernel are generated at build time using +the ``bindgen`` tool. A particular version is required. + +Install it via (note that this will download and build the tool from source):: + + cargo install --locked --version $(scripts/min-tool-version.sh bindgen) bindgen + + +Requirements: Developing +------------------------ + +This section explains how to fetch the tools needed for developing. That is, +they are not needed when just building the kernel. + + +rustfmt +******* + +The ``rustfmt`` tool is used to automatically format all the Rust kernel code, +including the generated C bindings (for details, please see +coding-guidelines.rst). + +If ``rustup`` is being used, its ``default`` profile already installs the tool, +thus nothing needs to be done. If another profile is being used, the component +can be installed manually:: + + rustup component add rustfmt + +The standalone installers also come with ``rustfmt``. + + +clippy +****** + +``clippy`` is a Rust linter. Running it provides extra warnings for Rust code. +It can be run by passing ``CLIPPY=1`` to ``make`` (for details, please see +general-information.rst). + +If ``rustup`` is being used, its ``default`` profile already installs the tool, +thus nothing needs to be done. If another profile is being used, the component +can be installed manually:: + + rustup component add clippy + +The standalone installers also come with ``clippy``. + + +cargo +***** + +``cargo`` is the Rust native build system. 
It is currently required to run +the tests since it is used to build a custom standard library that contains +the facilities provided by the custom ``alloc`` in the kernel. The tests can +be run using the ``rusttest`` Make target. + +If ``rustup`` is being used, all the profiles already install the tool, +thus nothing needs to be done. + +The standalone installers also come with ``cargo``. + + +rustdoc +******* + +``rustdoc`` is the documentation tool for Rust. It generates pretty HTML +documentation for Rust code (for details, please see +general-information.rst). + +``rustdoc`` is also used to test the examples provided in documented Rust code +(called doctests or documentation tests). The ``rusttest`` Make target uses +this feature. + +If ``rustup`` is being used, all the profiles already install the tool, +thus nothing needs to be done. + +The standalone installers also come with ``rustdoc``. + + +rust-analyzer +************* + +The `rust-analyzer <https://rust-analyzer.github.io/>`_ language server can +be used with many editors to enable syntax highlighting, completion, go to +definition, and other features. + +``rust-analyzer`` needs a configuration file, ``rust-project.json``, which +can be generated by the ``rust-analyzer`` Make target. + + +Configuration +------------- + +``Rust support`` (``CONFIG_RUST``) needs to be enabled in the ``General setup`` +menu. The option is only shown if a suitable Rust toolchain is found (see +above), as long as the other requirements are met. In turn, this will make +visible the rest of options that depend on Rust. + +Afterwards, go to:: + + Kernel hacking + -> Sample kernel code + -> Rust samples + +And enable some sample modules either as built-in or as loadable. + + +Building +-------- + +Building a kernel with a complete LLVM toolchain is the best supported setup +at the moment. 
That is:: + + make LLVM=1 + +For architectures that do not support a full LLVM toolchain, use:: + + make CC=clang + +Using GCC also works for some configurations, but it is very experimental at +the moment. + + +Hacking +------- + +To dive deeper, take a look at the source code of the samples +at ``samples/rust/``, the Rust support code under ``rust/`` and +the ``Rust hacking`` menu under ``Kernel hacking``. + +If GDB/Binutils is used and Rust symbols are not getting demangled, the reason +is the toolchain does not support Rust's new v0 mangling scheme yet. +There are a few ways out: + + - Install a newer release (GDB >= 10.2, Binutils >= 2.36). + + - Some versions of GDB (e.g. vanilla GDB 10.1) are able to use + the pre-demangled names embedded in the debug info (``CONFIG_DEBUG_INFO``). From 0ea4b9a1bece486345562862676c8f88638cc2bd Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 16:42:57 +0200 Subject: [PATCH 0035/1250] Kbuild: add Rust support Having all the new files in place, we now enable Rust support in the build system, including `Kconfig` entries related to Rust, the Rust configuration printer, the target specification generation script, the version detection script and a few other bits. 
Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Finn Behrens Signed-off-by: Finn Behrens Co-developed-by: Adam Bratschi-Kaye Signed-off-by: Adam Bratschi-Kaye Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Michael Ellerman Signed-off-by: Michael Ellerman Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boris-Chengbiao Zhou Signed-off-by: Boris-Chengbiao Zhou Co-developed-by: Boqun Feng Signed-off-by: Boqun Feng Co-developed-by: Douglas Su Signed-off-by: Douglas Su Co-developed-by: Dariusz Sosnowski Signed-off-by: Dariusz Sosnowski Co-developed-by: Antonio Terceiro Signed-off-by: Antonio Terceiro Co-developed-by: Daniel Xu Signed-off-by: Daniel Xu Co-developed-by: Miguel Cano Signed-off-by: Miguel Cano Co-developed-by: David Gow Signed-off-by: David Gow Signed-off-by: Miguel Ojeda --- .gitignore | 5 + .rustfmt.toml | 12 + Makefile | 177 ++++++++- arch/Kconfig | 6 + arch/arm/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/riscv/Kconfig | 1 + arch/riscv/Makefile | 5 + arch/um/Kconfig | 1 + arch/x86/Kconfig | 1 + arch/x86/Makefile | 14 + init/Kconfig | 45 ++- lib/Kconfig.debug | 155 ++++++++ rust/.gitignore | 10 + rust/Makefile | 398 +++++++++++++++++++ rust/bindgen_parameters | 17 + scripts/.gitignore | 1 + scripts/Kconfig.include | 6 +- scripts/Makefile | 3 + scripts/Makefile.build | 60 +++ scripts/Makefile.debug | 10 + scripts/Makefile.host | 34 +- scripts/Makefile.lib | 12 + scripts/Makefile.modfinal | 8 +- scripts/cc-version.sh | 12 +- scripts/generate_rust_target.rs | 227 +++++++++++ scripts/is_rust_module.sh | 13 + scripts/kconfig/confdata.c | 75 ++++ scripts/min-tool-version.sh | 6 + scripts/rust-is-available-bindgen-libclang.h | 2 + scripts/rust-is-available.sh | 158 ++++++++ 32 files changed, 1453 insertions(+), 24 deletions(-) create mode 100644 .rustfmt.toml create mode 100644 
rust/.gitignore create mode 100644 rust/Makefile create mode 100644 rust/bindgen_parameters create mode 100644 scripts/generate_rust_target.rs create mode 100755 scripts/is_rust_module.sh create mode 100644 scripts/rust-is-available-bindgen-libclang.h create mode 100755 scripts/rust-is-available.sh diff --git a/.gitignore b/.gitignore index 7afd412dadd2c1..48c68948f476d3 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,7 @@ *.o *.o.* *.patch +*.rmeta *.s *.so *.so.dbg @@ -96,6 +97,7 @@ modules.order !.gitattributes !.gitignore !.mailmap +!.rustfmt.toml # # Generated include files @@ -161,3 +163,6 @@ x509.genkey # Documentation toolchain sphinx_*/ + +# Rust analyzer configuration +/rust-project.json diff --git a/.rustfmt.toml b/.rustfmt.toml new file mode 100644 index 00000000000000..3de5cc497465c2 --- /dev/null +++ b/.rustfmt.toml @@ -0,0 +1,12 @@ +edition = "2021" +newline_style = "Unix" + +# Unstable options that help catching some mistakes in formatting and that we may want to enable +# when they become stable. +# +# They are kept here since they are useful to run from time to time. +#format_code_in_doc_comments = true +#reorder_impl_items = true +#comment_width = 100 +#wrap_comments = true +#normalize_comments = true diff --git a/Makefile b/Makefile index 7d5b0bfe79602d..ce17ec71f89b17 100644 --- a/Makefile +++ b/Makefile @@ -120,6 +120,15 @@ endif export KBUILD_CHECKSRC +# Enable "clippy" (a linter) as part of the Rust compilation. +# +# Use 'make CLIPPY=1' to enable it. +ifeq ("$(origin CLIPPY)", "command line") + KBUILD_CLIPPY := $(CLIPPY) +endif + +export KBUILD_CLIPPY + # Use make M=dir or set the environment variable KBUILD_EXTMOD to specify the # directory of external module to build. Setting M= takes precedence. 
ifeq ("$(origin M)", "command line") @@ -267,7 +276,7 @@ no-dot-config-targets := $(clean-targets) \ cscope gtags TAGS tags help% %docs check% coccicheck \ $(version_h) headers headers_% archheaders archscripts \ %asm-generic kernelversion %src-pkg dt_binding_check \ - outputmakefile + outputmakefile rustavailable rustfmt rustfmtcheck # Installation targets should not require compiler. Unfortunately, vdso_install # is an exception where build artifacts may be updated. This must be fixed. no-compiler-targets := $(no-dot-config-targets) install dtbs_install \ @@ -436,6 +445,7 @@ else HOSTCC = gcc HOSTCXX = g++ endif +HOSTRUSTC = rustc KBUILD_USERHOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \ -O2 -fomit-frame-pointer -std=gnu11 \ @@ -443,8 +453,26 @@ KBUILD_USERHOSTCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \ KBUILD_USERCFLAGS := $(KBUILD_USERHOSTCFLAGS) $(USERCFLAGS) KBUILD_USERLDFLAGS := $(USERLDFLAGS) +# These flags apply to all Rust code in the tree, including the kernel and +# host programs. 
+export rust_common_flags := --edition=2021 \ + -Zbinary_dep_depinfo=y \ + -Dunsafe_op_in_unsafe_fn -Drust_2018_idioms \ + -Dunreachable_pub -Dnon_ascii_idents \ + -Wmissing_docs \ + -Drustdoc::missing_crate_level_docs \ + -Dclippy::correctness -Dclippy::style \ + -Dclippy::suspicious -Dclippy::complexity \ + -Dclippy::perf \ + -Dclippy::let_unit_value -Dclippy::mut_mut \ + -Dclippy::needless_bitwise_bool \ + -Dclippy::needless_continue \ + -Wclippy::dbg_macro + KBUILD_HOSTCFLAGS := $(KBUILD_USERHOSTCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS) KBUILD_HOSTCXXFLAGS := -Wall -O2 $(HOST_LFS_CFLAGS) $(HOSTCXXFLAGS) +KBUILD_HOSTRUSTFLAGS := $(rust_common_flags) -O -Cstrip=debuginfo \ + -Zallow-features= $(HOSTRUSTFLAGS) KBUILD_HOSTLDFLAGS := $(HOST_LFS_LDFLAGS) $(HOSTLDFLAGS) KBUILD_HOSTLDLIBS := $(HOST_LFS_LIBS) $(HOSTLDLIBS) @@ -469,6 +497,12 @@ OBJDUMP = $(CROSS_COMPILE)objdump READELF = $(CROSS_COMPILE)readelf STRIP = $(CROSS_COMPILE)strip endif +RUSTC = rustc +RUSTDOC = rustdoc +RUSTFMT = rustfmt +CLIPPY_DRIVER = clippy-driver +BINDGEN = bindgen +CARGO = cargo PAHOLE = pahole RESOLVE_BTFIDS = $(objtree)/tools/bpf/resolve_btfids/resolve_btfids LEX = flex @@ -494,9 +528,11 @@ CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ -Wbitwise -Wno-return-void -Wno-unknown-attribute $(CF) NOSTDINC_FLAGS := CFLAGS_MODULE = +RUSTFLAGS_MODULE = AFLAGS_MODULE = LDFLAGS_MODULE = CFLAGS_KERNEL = +RUSTFLAGS_KERNEL = AFLAGS_KERNEL = LDFLAGS_vmlinux = @@ -525,15 +561,42 @@ KBUILD_CFLAGS := -Wall -Wundef -Werror=strict-prototypes -Wno-trigraphs \ -Werror=return-type -Wno-format-security \ -std=gnu11 KBUILD_CPPFLAGS := -D__KERNEL__ +KBUILD_RUSTFLAGS := $(rust_common_flags) \ + --target=$(objtree)/rust/target.json \ + -Cpanic=abort -Cembed-bitcode=n -Clto=n \ + -Cforce-unwind-tables=n -Ccodegen-units=1 \ + -Csymbol-mangling-version=v0 \ + -Crelocation-model=static \ + -Zfunction-sections=n \ + -Dclippy::float_arithmetic + KBUILD_AFLAGS_KERNEL := KBUILD_CFLAGS_KERNEL := 
+KBUILD_RUSTFLAGS_KERNEL := KBUILD_AFLAGS_MODULE := -DMODULE KBUILD_CFLAGS_MODULE := -DMODULE +KBUILD_RUSTFLAGS_MODULE := --cfg MODULE KBUILD_LDFLAGS_MODULE := KBUILD_LDFLAGS := CLANG_FLAGS := +ifeq ($(KBUILD_CLIPPY),1) + RUSTC_OR_CLIPPY_QUIET := CLIPPY + RUSTC_OR_CLIPPY = $(CLIPPY_DRIVER) +else + RUSTC_OR_CLIPPY_QUIET := RUSTC + RUSTC_OR_CLIPPY = $(RUSTC) +endif + +ifdef RUST_LIB_SRC + export RUST_LIB_SRC +endif + +export RUSTC_BOOTSTRAP := 1 + export ARCH SRCARCH CONFIG_SHELL BASH HOSTCC KBUILD_HOSTCFLAGS CROSS_COMPILE LD CC +export RUSTC RUSTDOC RUSTFMT RUSTC_OR_CLIPPY_QUIET RUSTC_OR_CLIPPY BINDGEN CARGO +export HOSTRUSTC KBUILD_HOSTRUSTFLAGS export CPP AR NM STRIP OBJCOPY OBJDUMP READELF PAHOLE RESOLVE_BTFIDS LEX YACC AWK INSTALLKERNEL export PERL PYTHON3 CHECK CHECKFLAGS MAKE UTS_MACHINE HOSTCXX export KGZIP KBZIP2 KLZOP LZMA LZ4 XZ ZSTD @@ -542,9 +605,10 @@ export KBUILD_USERCFLAGS KBUILD_USERLDFLAGS export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS KBUILD_LDFLAGS export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE +export KBUILD_RUSTFLAGS RUSTFLAGS_KERNEL RUSTFLAGS_MODULE export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE -export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE -export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL +export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_RUSTFLAGS_MODULE KBUILD_LDFLAGS_MODULE +export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL KBUILD_RUSTFLAGS_KERNEL export PAHOLE_FLAGS # Files to ignore in find ... statements @@ -725,7 +789,7 @@ $(KCONFIG_CONFIG): # # Do not use $(call cmd,...) here. That would suppress prompts from syncconfig, # so you cannot notice that Kconfig is waiting for the user input. 
-%/config/auto.conf %/config/auto.conf.cmd %/generated/autoconf.h: $(KCONFIG_CONFIG) +%/config/auto.conf %/config/auto.conf.cmd %/generated/autoconf.h %/generated/rustc_cfg: $(KCONFIG_CONFIG) $(Q)$(kecho) " SYNC $@" $(Q)$(MAKE) -f $(srctree)/Makefile syncconfig else # !may-sync-config @@ -754,11 +818,27 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 +KBUILD_RUSTFLAGS_OPT_LEVEL_MAP := 2 else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 KBUILD_CFLAGS += -O3 +KBUILD_RUSTFLAGS_OPT_LEVEL_MAP := 3 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os -endif +KBUILD_RUSTFLAGS_OPT_LEVEL_MAP := s +endif + +# Always set `debug-assertions` and `overflow-checks` because their default +# depends on `opt-level` and `debug-assertions`, respectively. +KBUILD_RUSTFLAGS += -Cdebug-assertions=$(if $(CONFIG_RUST_DEBUG_ASSERTIONS),y,n) +KBUILD_RUSTFLAGS += -Coverflow-checks=$(if $(CONFIG_RUST_OVERFLOW_CHECKS),y,n) +KBUILD_RUSTFLAGS += -Copt-level=$\ + $(if $(CONFIG_RUST_OPT_LEVEL_SIMILAR_AS_CHOSEN_FOR_C),$(KBUILD_RUSTFLAGS_OPT_LEVEL_MAP))$\ + $(if $(CONFIG_RUST_OPT_LEVEL_0),0)$\ + $(if $(CONFIG_RUST_OPT_LEVEL_1),1)$\ + $(if $(CONFIG_RUST_OPT_LEVEL_2),2)$\ + $(if $(CONFIG_RUST_OPT_LEVEL_3),3)$\ + $(if $(CONFIG_RUST_OPT_LEVEL_S),s)$\ + $(if $(CONFIG_RUST_OPT_LEVEL_Z),z) # Tell gcc to never replace conditional load with a non-conditional one ifdef CONFIG_CC_IS_GCC @@ -789,6 +869,9 @@ KBUILD_CFLAGS += $(stackp-flags-y) KBUILD_CFLAGS-$(CONFIG_WERROR) += -Werror KBUILD_CFLAGS += $(KBUILD_CFLAGS-y) $(CONFIG_CC_IMPLICIT_FALLTHROUGH) +KBUILD_RUSTFLAGS-$(CONFIG_WERROR) += -Dwarnings +KBUILD_RUSTFLAGS += $(KBUILD_RUSTFLAGS-y) + ifdef CONFIG_CC_IS_CLANG KBUILD_CPPFLAGS += -Qunused-arguments # The kernel builds with '-std=gnu11' so use of GNU extensions is acceptable. 
@@ -806,12 +889,15 @@ KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable) ifdef CONFIG_FRAME_POINTER KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls +KBUILD_RUSTFLAGS += -Cforce-frame-pointers=y else # Some targets (ARM with Thumb2, for example), can't be built with frame # pointers. For those, we don't have FUNCTION_TRACER automatically # select FRAME_POINTER. However, FUNCTION_TRACER adds -pg, and this is # incompatible with -fomit-frame-pointer with current GCC, so we don't use # -fomit-frame-pointer with FUNCTION_TRACER. +# In the Rust target specification, "frame-pointer" is set explicitly +# to "may-omit". ifndef CONFIG_FUNCTION_TRACER KBUILD_CFLAGS += -fomit-frame-pointer endif @@ -876,8 +962,10 @@ ifdef CONFIG_DEBUG_SECTION_MISMATCH KBUILD_CFLAGS += -fno-inline-functions-called-once endif +# `rustc`'s `-Zfunction-sections` applies to data too (as of 1.59.0). ifdef CONFIG_LD_DEAD_CODE_DATA_ELIMINATION KBUILD_CFLAGS_KERNEL += -ffunction-sections -fdata-sections +KBUILD_RUSTFLAGS_KERNEL += -Zfunction-sections=y LDFLAGS_vmlinux += --gc-sections endif @@ -1019,10 +1107,11 @@ include $(addprefix $(srctree)/, $(include-y)) # Do not add $(call cc-option,...) below this line. When you build the kernel # from the clean source tree, the GCC plugins do not exist at this point. 
-# Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments +# Add user supplied CPPFLAGS, AFLAGS, CFLAGS and RUSTFLAGS as the last assignments KBUILD_CPPFLAGS += $(KCPPFLAGS) KBUILD_AFLAGS += $(KAFLAGS) KBUILD_CFLAGS += $(KCFLAGS) +KBUILD_RUSTFLAGS += $(KRUSTFLAGS) KBUILD_LDFLAGS_MODULE += --build-id=sha1 LDFLAGS_vmlinux += --build-id=sha1 @@ -1091,6 +1180,7 @@ export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps ifeq ($(KBUILD_EXTMOD),) core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ core-$(CONFIG_BLOCK) += block/ +core-$(CONFIG_RUST) += rust/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ @@ -1195,6 +1285,10 @@ prepare0: archprepare # All the preparing.. prepare: prepare0 +ifdef CONFIG_RUST + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust-is-available.sh -v + $(Q)$(MAKE) $(build)=rust +endif PHONY += remove-stale-files remove-stale-files: @@ -1483,7 +1577,7 @@ endif # CONFIG_MODULES # Directories & files removed with 'make clean' CLEAN_FILES += include/ksym vmlinux.symvers modules-only.symvers \ modules.builtin modules.builtin.modinfo modules.nsdeps \ - compile_commands.json .thinlto-cache + compile_commands.json .thinlto-cache rust/test rust/doc # Directories & files removed with 'make mrproper' MRPROPER_FILES += include/config include/generated \ @@ -1494,7 +1588,8 @@ MRPROPER_FILES += include/config include/generated \ certs/signing_key.pem \ certs/x509.genkey \ vmlinux-gdb.py \ - *.spec + *.spec \ + rust/target.json rust/libmacros.so # clean - Delete most, but leave enough to build external modules # @@ -1519,6 +1614,9 @@ $(mrproper-dirs): mrproper: clean $(mrproper-dirs) $(call cmd,rmfiles) + @find . $(RCS_FIND_IGNORE) \ + \( -name '*.rmeta' \) \ + -type f -print | xargs rm -f # distclean # @@ -1606,6 +1704,23 @@ help: @echo ' kselftest-merge - Merge all the config dependencies of' @echo ' kselftest to existing .config.' 
@echo '' + @echo 'Rust targets:' + @echo ' rustavailable - Checks whether the Rust toolchain is' + @echo ' available and, if not, explains why.' + @echo ' rustfmt - Reformat all the Rust code in the kernel' + @echo ' rustfmtcheck - Checks if all the Rust code in the kernel' + @echo ' is formatted, printing a diff otherwise.' + @echo ' rustdoc - Generate Rust documentation' + @echo ' (requires kernel .config)' + @echo ' rusttest - Runs the Rust tests' + @echo ' (requires kernel .config; downloads external repos)' + @echo ' rust-analyzer - Generate rust-project.json rust-analyzer support file' + @echo ' (requires kernel .config)' + @echo ' dir/file.[os] - Build specified target only' + @echo ' dir/file.i - Build macro expanded source, similar to C preprocessing' + @echo ' (run with RUSTFMT=n to skip reformatting if needed)' + @echo ' dir/file.ll - Build the LLVM assembly file' + @echo '' @$(if $(dtstree), \ echo 'Devicetree:'; \ echo '* dtbs - Build device tree blobs for enabled boards'; \ @@ -1677,6 +1792,52 @@ PHONY += $(DOC_TARGETS) $(DOC_TARGETS): $(Q)$(MAKE) $(build)=Documentation $@ + +# Rust targets +# --------------------------------------------------------------------------- + +# "Is Rust available?" target +PHONY += rustavailable +rustavailable: + $(Q)$(CONFIG_SHELL) $(srctree)/scripts/rust-is-available.sh -v && echo >&2 "Rust is available!" + +# Documentation target +# +# Using the singular to avoid running afoul of `no-dot-config-targets`. +PHONY += rustdoc +rustdoc: prepare + $(Q)$(MAKE) $(build)=rust $@ + +# Testing target +PHONY += rusttest +rusttest: prepare + $(Q)$(MAKE) $(build)=rust $@ + +# Formatting targets +PHONY += rustfmt rustfmtcheck + +# We skip `rust/alloc` since we want to minimize the diff w.r.t. upstream. +# +# We match using absolute paths since `find` does not resolve them +# when matching, which is a problem when e.g. `srctree` is `..`. +# We `grep` afterwards in order to remove the directory entry itself. 
+rustfmt: + $(Q)find $(abs_srctree) -type f -name '*.rs' \ + -o -path $(abs_srctree)/rust/alloc -prune \ + -o -path $(abs_objtree)/rust/test -prune \ + | grep -Fv $(abs_srctree)/rust/alloc \ + | grep -Fv $(abs_objtree)/rust/test \ + | grep -Fv generated \ + | xargs $(RUSTFMT) $(rustfmt_flags) + +rustfmtcheck: rustfmt_flags = --check +rustfmtcheck: rustfmt + +# IDE support targets +PHONY += rust-analyzer +rust-analyzer: + $(Q)$(MAKE) $(build)=rust $@ + # Misc # --------------------------------------------------------------------------- diff --git a/arch/Kconfig b/arch/Kconfig index 31c4fdc4a4baaa..89d27b2a86ddc0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -344,6 +344,12 @@ config HAVE_RSEQ This symbol should be selected by an architecture if it supports an implementation of restartable sequences. +config HAVE_RUST + bool + help + This symbol should be selected by an architecture if it + supports Rust. + config HAVE_FUNCTION_ARG_ACCESS_API bool help diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2e8091e2d8a86d..1d0005080aebd6 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -116,6 +116,7 @@ config ARM select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ + select HAVE_RUST if CPU_32v6 || CPU_32v6K select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HAVE_UID16 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 20ea89d9ac2fa7..308cff85f5cb89 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -198,6 +198,7 @@ config ARM64 select HAVE_FUNCTION_ARG_ACCESS_API select MMU_GATHER_RCU_TABLE_FREE select HAVE_RSEQ + select HAVE_RUST select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS select HAVE_KPROBES diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 174edabb74fa11..ffbad38204b98b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -239,6 +239,7 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select 
HAVE_RSEQ + select HAVE_RUST if PPC64 && CPU_LITTLE_ENDIAN select HAVE_SETUP_PER_CPU_AREA if PPC64 select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 00fd9c548f2631..63f7258984f340 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -101,6 +101,7 @@ config RISCV select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RUST if 64BIT select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 7d81102cffd48e..663ae53b55973a 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -26,6 +26,8 @@ ifeq ($(CONFIG_ARCH_RV64I),y) KBUILD_CFLAGS += -mabi=lp64 KBUILD_AFLAGS += -mabi=lp64 + KBUILD_RUSTFLAGS += -Ctarget-cpu=generic-rv64 + KBUILD_LDFLAGS += -melf64lriscv else BITS := 32 @@ -33,6 +35,9 @@ else KBUILD_CFLAGS += -mabi=ilp32 KBUILD_AFLAGS += -mabi=ilp32 + + KBUILD_RUSTFLAGS += -Ctarget-cpu=generic-rv32 + KBUILD_LDFLAGS += -melf32lriscv endif diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 4d398b80aea8c2..c9e76243c0b943 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -24,6 +24,7 @@ config UML select TRACE_IRQFLAGS_SUPPORT select TTY # Needed for line.c select HAVE_ARCH_VMAP_STACK + select HAVE_RUST if X86_64 config MMU bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4bed3abf444d13..8d4e30f07a7dff 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -252,6 +252,7 @@ config X86 select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION select HAVE_PREEMPT_DYNAMIC_CALL select HAVE_RSEQ + select HAVE_RUST if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 63d50f65b82834..5ac9b324751d1a 100644 --- a/arch/x86/Makefile +++ 
b/arch/x86/Makefile @@ -21,6 +21,8 @@ ifdef CONFIG_CC_IS_CLANG RETPOLINE_CFLAGS := -mretpoline-external-thunk RETPOLINE_VDSO_CFLAGS := -mretpoline endif +RETPOLINE_RUSTFLAGS := -Ctarget-feature=+retpoline-external-thunk + export RETPOLINE_CFLAGS export RETPOLINE_VDSO_CFLAGS @@ -61,6 +63,8 @@ export BITS # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 # KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +KBUILD_RUSTFLAGS += -Ctarget-feature=-mmx,-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2 +KBUILD_RUSTFLAGS += -Ctarget-feature=-3dnow,-3dnowa,-avx,-avx2,+soft-float ifeq ($(CONFIG_X86_KERNEL_IBT),y) # @@ -148,8 +152,17 @@ else cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic KBUILD_CFLAGS += $(cflags-y) + rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 + rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona + rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 + rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom + rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic + KBUILD_RUSTFLAGS += $(rustflags-y) + KBUILD_CFLAGS += -mno-red-zone KBUILD_CFLAGS += -mcmodel=kernel + KBUILD_RUSTFLAGS += -Cno-redzone=y + KBUILD_RUSTFLAGS += -Ccode-model=kernel endif # @@ -185,6 +198,7 @@ ifdef CONFIG_RETPOLINE ifndef CONFIG_CC_IS_CLANG KBUILD_CFLAGS += -fno-jump-tables endif + KBUILD_RUSTFLAGS += $(RETPOLINE_RUSTFLAGS) endif ifdef CONFIG_SLS diff --git a/init/Kconfig b/init/Kconfig index ddcbefe535e9e7..3457cf596588f5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -60,6 +60,17 @@ config LLD_VERSION default $(ld-version) if LD_IS_LLD default 0 +config RUST_IS_AVAILABLE + def_bool $(success,$(srctree)/scripts/rust-is-available.sh) + help + This shows whether a suitable Rust toolchain is available (found). + + Please see Documentation/rust/quick-start.rst for instructions on how + to satisfy the build requirements of Rust support. + + In particular, the Makefile target 'rustavailable' is useful to check + why the Rust toolchain is not being detected.
+ config CC_CAN_LINK bool default $(success,$(srctree)/scripts/cc-can-link.sh $(CC) $(CLANG_FLAGS) $(USERCFLAGS) $(USERLDFLAGS) $(m64-flag)) if 64BIT @@ -146,7 +157,8 @@ config WERROR default COMPILE_TEST help A kernel build should not cause any compiler warnings, and this - enables the '-Werror' flag to enforce that rule by default. + enables the '-Werror' (for C) and '-Dwarnings' (for Rust) flags + to enforce that rule by default. However, if you have a new (or very old) compiler with odd and unusual warnings, or you have some architecture with problems, @@ -2045,6 +2057,37 @@ config PROFILING Say Y here to enable the extended profiling support mechanisms used by profilers. +config RUST + bool "Rust support" + depends on HAVE_RUST + depends on RUST_IS_AVAILABLE + depends on !MODVERSIONS + depends on !GCC_PLUGINS + depends on !DEBUG_INFO_BTF + select CONSTRUCTORS + help + Enables Rust support in the kernel. + + This allows other Rust-related options, like drivers written in Rust, + to be selected. + + It is also required to be able to load external kernel modules + written in Rust. + + See Documentation/rust/ for more information. + + If unsure, say N. + +config RUSTC_VERSION_TEXT + string + depends on RUST + default $(shell,command -v $(RUSTC) >/dev/null 2>&1 && $(RUSTC) --version || echo n) + +config BINDGEN_VERSION_TEXT + string + depends on RUST + default $(shell,command -v $(BINDGEN) >/dev/null 2>&1 && $(BINDGEN) --version || echo n) + # # Place an empty function call at each tracepoint site. Can be # dynamically changed for a probe function. diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 075cd25363ac38..bfc28f52b6037d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2760,6 +2760,161 @@ config HYPERV_TESTING endmenu # "Kernel Testing and Coverage" +menu "Rust hacking" + +config RUST_DEBUG_ASSERTIONS + bool "Debug assertions" + depends on RUST + help + Enables rustc's `-Cdebug-assertions` codegen option. 
+ + This flag lets you turn `cfg(debug_assertions)` conditional + compilation on or off. This can be used to enable extra debugging + code in development but not in production. For example, it controls + the behavior of the standard library's `debug_assert!` macro. + + Note that this will apply to all Rust code, including `core`. + + If unsure, say N. + +config RUST_OVERFLOW_CHECKS + bool "Overflow checks" + default y + depends on RUST + help + Enables rustc's `-Coverflow-checks` codegen option. + + This flag allows you to control the behavior of runtime integer + overflow. When overflow-checks are enabled, a Rust panic will occur + on overflow. + + Note that this will apply to all Rust code, including `core`. + + If unsure, say Y. + +choice + prompt "Optimization level" + default RUST_OPT_LEVEL_SIMILAR_AS_CHOSEN_FOR_C + depends on RUST + help + Controls rustc's `-Copt-level` codegen option. + + This flag controls the optimization level. + + If unsure, say "Similar as chosen for C". + +config RUST_OPT_LEVEL_SIMILAR_AS_CHOSEN_FOR_C + bool "Similar as chosen for C" + help + This choice will pick a similar optimization level as chosen in + the "Compiler optimization level" for C: + + -O2 is currently mapped to -Copt-level=2 + -O3 is currently mapped to -Copt-level=3 + -Os is currently mapped to -Copt-level=s + + The mapping may change over time to follow the intended semantics + of the choice for C as sensibly as possible. + + This is the default. + +config RUST_OPT_LEVEL_0 + bool "No optimizations (-Copt-level=0)" + help + Not recommended for most purposes. It may come in handy for debugging + suspected optimizer bugs, unexpected undefined behavior, etc. + + Note that this level will *not* enable debug assertions nor overflow + checks on its own (like it happens when interacting with rustc + directly). Use the corresponding configuration options to control + that instead, orthogonally. 
+ + Note this level may cause excessive stack usage, which can lead to stack + overflow and subsequent crashes. + +config RUST_OPT_LEVEL_1 + bool "Basic optimizations (-Copt-level=1)" + help + Useful for debugging without getting too lost, but without + the overhead and boilerplate of no optimizations at all. + + Note this level may cause excessive stack usage, which can lead to stack + overflow and subsequent crashes. + +config RUST_OPT_LEVEL_2 + bool "Some optimizations (-Copt-level=2)" + help + The sensible choice in most cases. + +config RUST_OPT_LEVEL_3 + bool "All optimizations (-Copt-level=3)" + help + Yet more performance (hopefully). + +config RUST_OPT_LEVEL_S + bool "Optimize for size (-Copt-level=s)" + help + Smaller kernel, ideally without too much performance loss. + +config RUST_OPT_LEVEL_Z + bool "Optimize for size, no loop vectorization (-Copt-level=z)" + help + Like the previous level, but also turn off loop vectorization. + +endchoice + +choice + prompt "Build-time assertions" + default RUST_BUILD_ASSERT_ALLOW if RUST_OPT_LEVEL_0 + default RUST_BUILD_ASSERT_DENY if !RUST_OPT_LEVEL_0 + depends on RUST + help + Controls how `build_error!` and `build_assert!` are handled during build. + + If calls to them exist in the binary, it may indicate a violated invariant + or that the optimizer failed to verify the invariant during compilation. + You can choose to abort compilation or ignore them during build and let the + check be carried to runtime. + + If optimizations are turned off, you cannot select "Deny". + + If unsure, say "Deny". + +config RUST_BUILD_ASSERT_ALLOW + bool "Allow" + help + Unoptimized calls to `build_error!` will be converted to `panic!` + and checked at runtime. + +config RUST_BUILD_ASSERT_WARN + bool "Warn" + help + Unoptimized calls to `build_error!` will be converted to `panic!` + and checked at runtime, but warnings will be generated when building.
+ +config RUST_BUILD_ASSERT_DENY + bool "Deny" + depends on !RUST_OPT_LEVEL_0 + help + Unoptimized calls to `build_error!` will abort compilation. + +endchoice + +config RUST_KERNEL_KUNIT_TEST + bool "KUnit test for the `kernel` crate" if !KUNIT_ALL_TESTS + depends on RUST && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the documentation tests of the `kernel` crate + as KUnit tests. + + For more information on KUnit and unit tests in general, + please refer to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + +endmenu # "Rust" + source "Documentation/Kconfig" endmenu # Kernel hacking diff --git a/rust/.gitignore b/rust/.gitignore new file mode 100644 index 00000000000000..89b602d9110907 --- /dev/null +++ b/rust/.gitignore @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 + +target.json +bindings_generated.rs +bindings_helpers_generated.rs +exports_*_generated.h +doctests_kernel_generated.rs +doctests_kernel_generated_kunit.c +doc/ +test/ diff --git a/rust/Makefile b/rust/Makefile new file mode 100644 index 00000000000000..a7dc37027fe22e --- /dev/null +++ b/rust/Makefile @@ -0,0 +1,398 @@ +# SPDX-License-Identifier: GPL-2.0 + +always-$(CONFIG_RUST) += target.json +no-clean-files += target.json + +obj-$(CONFIG_RUST) += core.o compiler_builtins.o +always-$(CONFIG_RUST) += exports_core_generated.h + +# Missing prototypes are expected in the helpers since these are exported +# for Rust only, thus there is no header nor prototypes. 
+obj-$(CONFIG_RUST) += helpers.o +CFLAGS_REMOVE_helpers.o = -Wmissing-prototypes -Wmissing-declarations + +always-$(CONFIG_RUST) += libmacros.so +no-clean-files += libmacros.so + +always-$(CONFIG_RUST) += bindings_generated.rs bindings_helpers_generated.rs +obj-$(CONFIG_RUST) += alloc.o kernel.o +always-$(CONFIG_RUST) += exports_alloc_generated.h exports_kernel_generated.h + +ifdef CONFIG_RUST_BUILD_ASSERT_DENY +always-$(CONFIG_RUST) += build_error.o +else +obj-$(CONFIG_RUST) += build_error.o +endif + +obj-$(CONFIG_RUST) += exports.o + +obj-$(CONFIG_RUST_KERNEL_KUNIT_TEST) += doctests_kernel_generated.o +obj-$(CONFIG_RUST_KERNEL_KUNIT_TEST) += doctests_kernel_generated_kunit.o + +# Avoids running `$(RUSTC)` for the sysroot when it may not be available. +ifdef CONFIG_RUST + +# `$(rust_flags)` is passed in case the user added `--sysroot`. +rustc_sysroot := $(shell $(RUSTC) $(rust_flags) --print sysroot) +rustc_host_target := $(shell $(RUSTC) --version --verbose | grep -F 'host: ' | cut -d' ' -f2) +RUST_LIB_SRC ?= $(rustc_sysroot)/lib/rustlib/src/rust/library + +ifeq ($(quiet),silent_) +cargo_quiet=-q +rust_test_quiet=-q +rustdoc_test_quiet=--test-args -q +rustdoc_test_kernel_quiet=>/dev/null +else ifeq ($(quiet),quiet_) +rust_test_quiet=-q +rustdoc_test_quiet=--test-args -q +rustdoc_test_kernel_quiet=>/dev/null +else +cargo_quiet=--verbose +endif + +core-cfgs = \ + --cfg no_fp_fmt_parse + +alloc-cfgs = \ + --cfg no_global_oom_handling \ + --cfg no_rc \ + --cfg no_sync + +quiet_cmd_rustdoc = RUSTDOC $(if $(rustdoc_host),H, ) $< + cmd_rustdoc = \ + OBJTREE=$(abspath $(objtree)) \ + $(RUSTDOC) $(if $(rustdoc_host),$(rust_common_flags),$(rust_flags)) \ + $(rustc_target_flags) -L$(objtree)/$(obj) \ + --output $(objtree)/$(obj)/doc \ + --crate-name $(subst rustdoc-,,$@) \ + @$(objtree)/include/generated/rustc_cfg $< + +# The `html_logo_url` and `html_favicon_url` forms of the `doc` attribute +# can be used to specify a custom logo. 
However: +# - The given value is used as-is, thus it cannot be relative or a local file +# (unlike the non-custom case) since the generated docs have subfolders. +# - It requires adding it to every crate. +# - It requires changing `core` which comes from the sysroot. +# +# Using `-Zcrate-attr` would solve the last two points, but not the first. +# The https://github.com/rust-lang/rfcs/pull/3226 RFC suggests two new +# command-line flags to solve the issue. Meanwhile, we use the non-custom case +# and then retouch the generated files. +rustdoc: rustdoc-core rustdoc-macros rustdoc-compiler_builtins \ + rustdoc-alloc rustdoc-kernel + $(Q)cp $(srctree)/Documentation/logo.gif $(objtree)/$(obj)/doc + $(Q)find $(objtree)/$(obj)/doc -name '*.html' -type f -print0 | xargs -0 sed -Ei \ + -e 's:rust-logo\.svg:logo.gif:g' \ + -e 's:rust-logo\.png:logo.gif:g' \ + -e 's:favicon\.svg:logo.gif:g' \ + -e 's:::g' + +rustdoc-macros: private rustdoc_host = yes +rustdoc-macros: private rustc_target_flags = --crate-type proc-macro \ + --extern proc_macro +rustdoc-macros: $(src)/macros/lib.rs FORCE + $(call if_changed,rustdoc) + +rustdoc-core: private rustc_target_flags = $(core-cfgs) +rustdoc-core: $(RUST_LIB_SRC)/core/src/lib.rs FORCE + $(call if_changed,rustdoc) + +rustdoc-compiler_builtins: $(src)/compiler_builtins.rs rustdoc-core FORCE + $(call if_changed,rustdoc) + +# We need to allow `rustdoc::broken_intra_doc_links` because some +# `no_global_oom_handling` functions refer to non-`no_global_oom_handling` +# functions. Ideally `rustdoc` would have a way to distinguish broken links +# due to things that are "configured out" vs. entirely non-existing ones.
+rustdoc-alloc: private rustc_target_flags = $(alloc-cfgs) \ + -Arustdoc::broken_intra_doc_links +rustdoc-alloc: $(src)/alloc/lib.rs rustdoc-core rustdoc-compiler_builtins FORCE + $(call if_changed,rustdoc) + +rustdoc-kernel: private rustc_target_flags = --extern alloc \ + --extern build_error --extern macros=$(objtree)/$(obj)/libmacros.so +rustdoc-kernel: $(src)/kernel/lib.rs rustdoc-core rustdoc-macros \ + rustdoc-compiler_builtins rustdoc-alloc $(obj)/libmacros.so \ + $(obj)/bindings_generated.rs $(obj)/bindings_helpers_generated.rs FORCE + $(call if_changed,rustdoc) + +quiet_cmd_rustc_test_library = RUSTC TL $< + cmd_rustc_test_library = \ + OBJTREE=$(abspath $(objtree)) \ + $(RUSTC) $(rust_common_flags) \ + @$(objtree)/include/generated/rustc_cfg $(rustc_target_flags) \ + --crate-type $(if $(rustc_test_library_proc),proc-macro,rlib) \ + --out-dir $(objtree)/$(obj)/test --cfg testlib \ + --sysroot $(objtree)/$(obj)/test/sysroot \ + -L$(objtree)/$(obj)/test \ + --crate-name $(subst rusttest-,,$(subst rusttestlib-,,$@)) $< + +rusttestlib-build_error: $(src)/build_error.rs rusttest-prepare FORCE + $(call if_changed,rustc_test_library) + +rusttestlib-macros: private rustc_target_flags = --extern proc_macro +rusttestlib-macros: private rustc_test_library_proc = yes +rusttestlib-macros: $(src)/macros/lib.rs rusttest-prepare FORCE + $(call if_changed,rustc_test_library) + +quiet_cmd_rustdoc_test = RUSTDOC T $< + cmd_rustdoc_test = \ + OBJTREE=$(abspath $(objtree)) \ + $(RUSTDOC) --test $(rust_common_flags) \ + @$(objtree)/include/generated/rustc_cfg \ + $(rustc_target_flags) $(rustdoc_test_target_flags) \ + --sysroot $(objtree)/$(obj)/test/sysroot $(rustdoc_test_quiet) \ + -L$(objtree)/$(obj)/test --output $(objtree)/$(obj)/doc \ + --crate-name $(subst rusttest-,,$@) $< + +quiet_cmd_rustdoc_test_kernel = RUSTDOC TK $< + cmd_rustdoc_test_kernel = \ + rm -rf $(objtree)/$(obj)/test/doctests/kernel; \ + mkdir -p $(objtree)/$(obj)/test/doctests/kernel; \ + 
OBJTREE=$(abspath $(objtree)) \ + $(RUSTDOC) --test $(rust_flags) \ + @$(objtree)/include/generated/rustc_cfg \ + -L$(objtree)/$(obj) --extern alloc --extern kernel \ + --extern build_error --extern macros \ + --no-run --crate-name kernel -Zunstable-options \ + --test-builder $(srctree)/scripts/rustdoc_test_builder.py \ + $< $(rustdoc_test_kernel_quiet); \ + $(srctree)/scripts/rustdoc_test_gen.py + +%/doctests_kernel_generated.rs %/doctests_kernel_generated_kunit.c: $(src)/kernel/lib.rs $(obj)/kernel.o FORCE + $(call if_changed,rustdoc_test_kernel) + +# We cannot use `-Zpanic-abort-tests` because some tests are dynamic, +# so for the moment we skip `-Cpanic=abort`. +quiet_cmd_rustc_test = RUSTC T $< + cmd_rustc_test = \ + OBJTREE=$(abspath $(objtree)) \ + $(RUSTC) --test $(rust_common_flags) \ + @$(objtree)/include/generated/rustc_cfg \ + $(rustc_target_flags) --out-dir $(objtree)/$(obj)/test \ + --sysroot $(objtree)/$(obj)/test/sysroot \ + -L$(objtree)/$(obj)/test \ + --crate-name $(subst rusttest-,,$@) $<; \ + $(objtree)/$(obj)/test/$(subst rusttest-,,$@) $(rust_test_quiet) \ + $(rustc_test_run_flags) + +rusttest: rusttest-macros rusttest-kernel + +# This prepares a custom sysroot with our custom `alloc` instead of +# the standard one. +# +# This requires several hacks: +# - Unlike `core` and `alloc`, `std` depends on more than a dozen crates, +# including third-party crates that need to be downloaded, plus custom +# `build.rs` steps. Thus hardcoding things here is not maintainable. +# - `cargo` knows how to build the standard library, but it is an unstable +# feature so far (`-Zbuild-std`). +# - `cargo` only considers the use case of building the standard library +# to use it in a given package. Thus we need to create a dummy package +# and pick the generated libraries from there. +# - Since we only keep a subset of upstream `alloc` in-tree, we need +# to recreate it on the fly by putting our sources on top. 
+# - The usual ways of modifying the dependency graph in `cargo` do not seem +# to apply for the `-Zbuild-std` steps, thus we have to mislead it +# by modifying the sources in the sysroot. +# - To avoid messing with the user's Rust installation, we create a clone +# of the sysroot. However, `cargo` ignores `RUSTFLAGS` in the `-Zbuild-std` +# steps, thus we use a wrapper binary passed via `RUSTC` to pass the flag. +# +# In the future, we hope to avoid the whole ordeal by either: +# - Making the `test` crate not depend on `std` (either improving upstream +# or having our own custom crate). +# - Making the tests run in kernel space (requires the previous point). +# - Making `std` and friends be more like a "normal" crate, so that +# `-Zbuild-std` and related hacks are not needed. +quiet_cmd_rustsysroot = RUSTSYSROOT + cmd_rustsysroot = \ + rm -rf $(objtree)/$(obj)/test; \ + mkdir -p $(objtree)/$(obj)/test; \ + cp -a $(rustc_sysroot) $(objtree)/$(obj)/test/sysroot; \ + cp -r $(srctree)/$(src)/alloc/* \ + $(objtree)/$(obj)/test/sysroot/lib/rustlib/src/rust/library/alloc/src; \ + echo '\#!/bin/sh' > $(objtree)/$(obj)/test/rustc_sysroot; \ + echo "$(RUSTC) --sysroot=$(abspath $(objtree)/$(obj)/test/sysroot) \"\$$@\"" \ + >> $(objtree)/$(obj)/test/rustc_sysroot; \ + chmod u+x $(objtree)/$(obj)/test/rustc_sysroot; \ + $(CARGO) -q new $(objtree)/$(obj)/test/dummy; \ + RUSTC=$(objtree)/$(obj)/test/rustc_sysroot $(CARGO) $(cargo_quiet) \ + test -Zbuild-std --target $(rustc_host_target) \ + --manifest-path $(objtree)/$(obj)/test/dummy/Cargo.toml; \ + rm $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib/*; \ + cp $(objtree)/$(obj)/test/dummy/target/$(rustc_host_target)/debug/deps/* \ + $(objtree)/$(obj)/test/sysroot/lib/rustlib/$(rustc_host_target)/lib + +rusttest-prepare: FORCE + $(call if_changed,rustsysroot) + +rusttest-macros: private rustc_target_flags = --extern proc_macro +rusttest-macros: private rustdoc_test_target_flags = --crate-type proc-macro 
+rusttest-macros: $(src)/macros/lib.rs rusttest-prepare FORCE + $(call if_changed,rustc_test) + $(call if_changed,rustdoc_test) + +rusttest-kernel: private rustc_target_flags = --extern alloc \ + --extern build_error --extern macros +rusttest-kernel: private rustc_test_run_flags = --skip bindgen_test_layout_ +rusttest-kernel: $(src)/kernel/lib.rs rusttest-prepare \ + rusttestlib-build_error rusttestlib-macros FORCE + $(call if_changed,rustc_test) + $(call if_changed,rustc_test_library) + +filechk_rust_target = $(objtree)/scripts/generate_rust_target < $< + +$(obj)/target.json: $(objtree)/include/config/auto.conf FORCE + $(call filechk,rust_target) + +ifdef CONFIG_CC_IS_CLANG +bindgen_c_flags = $(c_flags) +else +# bindgen relies on libclang to parse C. Ideally, bindgen would support a GCC +# plugin backend and/or the Clang driver would be perfectly compatible with GCC. +# +# For the moment, here we are tweaking the flags on the fly. This is a hack, +# and some kernel configurations may not work (e.g. `GCC_PLUGIN_RANDSTRUCT` +# if we end up using one of those structs). +bindgen_skip_c_flags := -mno-fp-ret-in-387 -mpreferred-stack-boundary=% \ + -mskip-rax-setup -mgeneral-regs-only -msign-return-address=% \ + -mindirect-branch=thunk-extern -mindirect-branch-register \ + -mrecord-mcount -mabi=lp64 -mstack-protector-guard% -mtraceback=no \ + -mno-pointers-to-nested-functions -mno-string -mno-strict-align \ + -mstrict-align \ + -fconserve-stack -falign-jumps=% -falign-loops=% \ + -femit-struct-debug-baseonly -fno-ipa-cp-clone -fno-ipa-sra \ + -fno-partial-inlining -fplugin-arg-arm_ssp_per_task_plugin-% \ + -fno-reorder-blocks -fno-allow-store-data-races -fasan-shadow-offset=% \ + -fzero-call-used-regs=% -fno-stack-clash-protection \ + -fno-inline-functions-called-once \ + --param=% --param asan-% + +# Derived from `scripts/Makefile.clang`. 
+BINDGEN_TARGET_arm := arm-linux-gnueabi +BINDGEN_TARGET_arm64 := aarch64-linux-gnu +BINDGEN_TARGET_powerpc := powerpc64le-linux-gnu +BINDGEN_TARGET_riscv := riscv64-linux-gnu +BINDGEN_TARGET_x86 := x86_64-linux-gnu +BINDGEN_TARGET := $(BINDGEN_TARGET_$(SRCARCH)) + +# All warnings are inhibited since GCC builds are very experimental, +# many GCC warnings are not supported by Clang, they may only appear in +# some configurations, with new GCC versions, etc. +bindgen_extra_c_flags = -w --target=$(BINDGEN_TARGET) + +bindgen_c_flags = $(filter-out $(bindgen_skip_c_flags), $(c_flags)) \ + $(bindgen_extra_c_flags) +endif + +ifdef CONFIG_LTO +bindgen_c_flags_lto = $(filter-out $(CC_FLAGS_LTO), $(bindgen_c_flags)) +else +bindgen_c_flags_lto = $(bindgen_c_flags) +endif + +bindgen_c_flags_final = $(bindgen_c_flags_lto) + +quiet_cmd_bindgen = BINDGEN $@ + cmd_bindgen = \ + $(BINDGEN) $< $(bindgen_target_flags) \ + --use-core --with-derive-default --ctypes-prefix c_types \ + --no-debug '.*' \ + --size_t-is-usize -o $@ -- $(bindgen_c_flags_final) -DMODULE \ + $(bindgen_target_cflags) $(bindgen_target_extra) + +$(obj)/bindings_generated.rs: private bindgen_target_flags = \ + $(shell grep -v '^\#\|^$$' $(srctree)/$(src)/bindgen_parameters) +$(obj)/bindings_generated.rs: $(src)/kernel/bindings_helper.h \ + $(src)/bindgen_parameters FORCE + $(call if_changed_dep,bindgen) + +# See `CFLAGS_REMOVE_helpers.o` above. In addition, Clang on C does not warn +# with `-Wmissing-declarations` (unlike GCC), so it is not strictly needed here +# given it is `libclang`; but for consistency, future Clang changes and/or +# a potential future GCC backend for `bindgen`, we disable it too. 
+$(obj)/bindings_helpers_generated.rs: private bindgen_target_flags = \ + --blacklist-type '.*' --whitelist-var '' \ + --whitelist-function 'rust_helper_.*' +$(obj)/bindings_helpers_generated.rs: private bindgen_target_cflags = \ + -I$(objtree)/$(obj) -Wno-missing-prototypes -Wno-missing-declarations +$(obj)/bindings_helpers_generated.rs: private bindgen_target_extra = ; \ + sed -Ei 's/pub fn rust_helper_([a-zA-Z0-9_]*)/#[link_name="rust_helper_\1"]\n pub fn \1/g' $@ +$(obj)/bindings_helpers_generated.rs: $(src)/helpers.c FORCE + $(call if_changed_dep,bindgen) + +quiet_cmd_exports = EXPORTS $@ + cmd_exports = \ + $(NM) -p --defined-only $< \ + | grep -E ' (T|R|D) ' | cut -d ' ' -f 3 \ + | xargs -Isymbol \ + echo 'EXPORT_SYMBOL_RUST_GPL(symbol);' > $@ + +$(obj)/exports_core_generated.h: $(obj)/core.o FORCE + $(call if_changed,exports) + +$(obj)/exports_alloc_generated.h: $(obj)/alloc.o FORCE + $(call if_changed,exports) + +$(obj)/exports_kernel_generated.h: $(obj)/kernel.o FORCE + $(call if_changed,exports) + +quiet_cmd_rustc_procmacro = $(RUSTC_OR_CLIPPY_QUIET) P $@ + cmd_rustc_procmacro = \ + $(RUSTC_OR_CLIPPY) $(rust_common_flags) \ + --emit=dep-info,link --extern proc_macro \ + --crate-type proc-macro --out-dir $(objtree)/$(obj) \ + --crate-name $(patsubst lib%.so,%,$(notdir $@)) $<; \ + mv $(objtree)/$(obj)/$(patsubst lib%.so,%,$(notdir $@)).d $(depfile); \ + sed -i '/^\#/d' $(depfile) + +# Procedural macros can only be used with the `rustc` that compiled it. +# Therefore, to get `libmacros.so` automatically recompiled when the compiler +# version changes, we add `core.o` as a dependency (even if it is not needed). 
+$(obj)/libmacros.so: $(src)/macros/lib.rs $(obj)/core.o FORCE + $(call if_changed_dep,rustc_procmacro) + +quiet_cmd_rustc_library = $(if $(skip_clippy),RUSTC,$(RUSTC_OR_CLIPPY_QUIET)) L $@ + cmd_rustc_library = \ + OBJTREE=$(abspath $(objtree)) \ + $(if $(skip_clippy),$(RUSTC),$(RUSTC_OR_CLIPPY)) \ + $(filter-out $(skip_flags),$(rust_flags) $(rustc_target_flags)) \ + --emit=dep-info,obj,metadata --crate-type rlib \ + --out-dir $(objtree)/$(obj) -L$(objtree)/$(obj) \ + --crate-name $(patsubst %.o,%,$(notdir $@)) $<; \ + mv $(objtree)/$(obj)/$(patsubst %.o,%,$(notdir $@)).d $(depfile); \ + sed -i '/^\#/d' $(depfile) \ + $(if $(rustc_objcopy),;$(OBJCOPY) $(rustc_objcopy) $@) + +rust-analyzer: + $(Q)$(srctree)/scripts/generate_rust_analyzer.py $(srctree) $(objtree) \ + $(RUST_LIB_SRC) > $(objtree)/rust-project.json + +$(obj)/core.o: private skip_clippy = 1 +$(obj)/core.o: private skip_flags = -Dunreachable_pub +$(obj)/core.o: private rustc_target_flags = $(core-cfgs) +$(obj)/core.o: $(RUST_LIB_SRC)/core/src/lib.rs $(obj)/target.json FORCE + $(call if_changed_dep,rustc_library) + +$(obj)/compiler_builtins.o: private rustc_objcopy = -w -W '__*' +$(obj)/compiler_builtins.o: $(src)/compiler_builtins.rs $(obj)/core.o FORCE + $(call if_changed_dep,rustc_library) + +$(obj)/alloc.o: private skip_clippy = 1 +$(obj)/alloc.o: private skip_flags = -Dunreachable_pub +$(obj)/alloc.o: private rustc_target_flags = $(alloc-cfgs) +$(obj)/alloc.o: $(src)/alloc/lib.rs $(obj)/compiler_builtins.o FORCE + $(call if_changed_dep,rustc_library) + +$(obj)/build_error.o: $(src)/build_error.rs $(obj)/compiler_builtins.o FORCE + $(call if_changed_dep,rustc_library) + +$(obj)/kernel.o: private rustc_target_flags = --extern alloc \ + --extern build_error --extern macros +$(obj)/kernel.o: $(src)/kernel/lib.rs $(obj)/alloc.o $(obj)/build_error.o \ + $(obj)/libmacros.so $(obj)/bindings_generated.rs \ + $(obj)/bindings_helpers_generated.rs FORCE + $(call if_changed_dep,rustc_library) + +endif # 
CONFIG_RUST diff --git a/rust/bindgen_parameters b/rust/bindgen_parameters new file mode 100644 index 00000000000000..6c77865e834545 --- /dev/null +++ b/rust/bindgen_parameters @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 + +--opaque-type xregs_state +--opaque-type desc_struct +--opaque-type arch_lbr_state +--opaque-type local_apic + +# `try` is a reserved keyword since Rust 2018; solved in `bindgen` v0.59.2, +# commit 2aed6b021680 ("context: Escape the try keyword properly"). +--opaque-type kunit_try_catch + +# If SMP is disabled, `arch_spinlock_t` is defined as a ZST which triggers a Rust +# warning. We don't need to peek into it anyway. +--opaque-type spinlock + +# `seccomp`'s comment gets understood as a doctest +--no-doc-comments diff --git a/scripts/.gitignore b/scripts/.gitignore index eed308bef604a1..b7aec8eb1bd443 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only /asn1_compiler /bin2c +/generate_rust_target /insert-sys-cert /kallsyms /module.lds diff --git a/scripts/Kconfig.include b/scripts/Kconfig.include index 0496efd6e11794..83e850321eb6a7 100644 --- a/scripts/Kconfig.include +++ b/scripts/Kconfig.include @@ -36,12 +36,12 @@ ld-option = $(success,$(LD) -v $(1)) as-instr = $(success,printf "%b\n" "$(1)" | $(CC) $(CLANG_FLAGS) -c -x assembler -o /dev/null -) # check if $(CC) and $(LD) exist -$(error-if,$(failure,command -v $(CC)),compiler '$(CC)' not found) +$(error-if,$(failure,command -v $(CC)),C compiler '$(CC)' not found) $(error-if,$(failure,command -v $(LD)),linker '$(LD)' not found) -# Get the compiler name, version, and error out if it is not supported. +# Get the C compiler name, version, and error out if it is not supported. cc-info := $(shell,$(srctree)/scripts/cc-version.sh $(CC)) -$(error-if,$(success,test -z "$(cc-info)"),Sorry$(comma) this compiler is not supported.) +$(error-if,$(success,test -z "$(cc-info)"),Sorry$(comma) this C compiler is not supported.) 
cc-name := $(shell,set -- $(cc-info) && echo $1) cc-version := $(shell,set -- $(cc-info) && echo $2) diff --git a/scripts/Makefile b/scripts/Makefile index ce5aa9030b740a..a278345e7820af 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -10,6 +10,9 @@ hostprogs-always-$(CONFIG_BUILDTIME_TABLE_SORT) += sorttable hostprogs-always-$(CONFIG_ASN1) += asn1_compiler hostprogs-always-$(CONFIG_MODULE_SIG_FORMAT) += sign-file hostprogs-always-$(CONFIG_SYSTEM_EXTRA_CERTIFICATE) += insert-sys-cert +hostprogs-always-$(CONFIG_RUST) += generate_rust_target + +generate_rust_target-rust := y HOSTCFLAGS_sorttable.o = -I$(srctree)/tools/include HOSTLDLIBS_sorttable = -lpthread diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 33c1ed58152294..533631753b1613 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -26,6 +26,7 @@ EXTRA_CPPFLAGS := EXTRA_LDFLAGS := asflags-y := ccflags-y := +rustflags-y := cppflags-y := ldflags-y := @@ -324,6 +325,65 @@ quiet_cmd_cc_lst_c = MKLST $@ $(obj)/%.lst: $(src)/%.c FORCE $(call if_changed_dep,cc_lst_c) +# Compile Rust sources (.rs) +# --------------------------------------------------------------------------- + +rust_allowed_features := allocator_api,bench_black_box,concat_idents,generic_associated_types + +rust_common_cmd = \ + RUST_MODFILE=$(modfile) $(RUSTC_OR_CLIPPY) $(rust_flags) \ + -Zallow-features=$(rust_allowed_features) \ + -Zcrate-attr=no_std \ + -Zcrate-attr='feature($(rust_allowed_features))' \ + --extern alloc --extern kernel \ + --crate-type rlib --out-dir $(obj) -L $(objtree)/rust/ \ + --crate-name $(basename $(notdir $@)) + +rust_handle_depfile = \ + mv $(obj)/$(basename $(notdir $@)).d $(depfile); \ + sed -i '/^\#/d' $(depfile) + +# `--emit=obj`, `--emit=asm` and `--emit=llvm-ir` imply a single codegen unit +# will be used. We explicitly request `-Ccodegen-units=1` in any case, and +# the compiler shows a warning if it is not 1. 
However, if we ever stop +# requesting it explicitly and we start using some other `--emit` that does not +# imply it (and for which codegen is performed), then we would be out of sync, +# i.e. the outputs we would get for the different single targets (e.g. `.ll`) +# would not match each other. + +quiet_cmd_rustc_o_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@ + cmd_rustc_o_rs = \ + $(rust_common_cmd) --emit=dep-info,obj $<; \ + $(rust_handle_depfile) + +$(obj)/%.o: $(src)/%.rs FORCE + $(call if_changed_dep,rustc_o_rs) + +quiet_cmd_rustc_i_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@ + cmd_rustc_i_rs = \ + $(rust_common_cmd) --emit=dep-info -Zunpretty=expanded $< >$@; \ + command -v $(RUSTFMT) >/dev/null && $(RUSTFMT) $@; \ + $(rust_handle_depfile) + +$(obj)/%.i: $(src)/%.rs FORCE + $(call if_changed_dep,rustc_i_rs) + +quiet_cmd_rustc_s_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@ + cmd_rustc_s_rs = \ + $(rust_common_cmd) --emit=dep-info,asm $<; \ + $(rust_handle_depfile) + +$(obj)/%.s: $(src)/%.rs FORCE + $(call if_changed_dep,rustc_s_rs) + +quiet_cmd_rustc_ll_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@ + cmd_rustc_ll_rs = \ + $(rust_common_cmd) --emit=dep-info,llvm-ir $<; \ + $(rust_handle_depfile) + +$(obj)/%.ll: $(src)/%.rs FORCE + $(call if_changed_dep,rustc_ll_rs) + # Compile assembler sources (.S) # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.debug b/scripts/Makefile.debug index 9f39b0130551f3..fe87389d52c071 100644 --- a/scripts/Makefile.debug +++ b/scripts/Makefile.debug @@ -1,4 +1,5 @@ DEBUG_CFLAGS := +DEBUG_RUSTFLAGS := ifdef CONFIG_DEBUG_INFO_SPLIT DEBUG_CFLAGS += -gsplit-dwarf @@ -10,6 +11,12 @@ ifndef CONFIG_AS_IS_LLVM KBUILD_AFLAGS += -Wa,-gdwarf-2 endif +ifdef CONFIG_DEBUG_INFO_REDUCED +DEBUG_RUSTFLAGS += -Cdebuginfo=1 +else +DEBUG_RUSTFLAGS += -Cdebuginfo=2 +endif + ifndef CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT dwarf-version-$(CONFIG_DEBUG_INFO_DWARF4) := 4 
dwarf-version-$(CONFIG_DEBUG_INFO_DWARF5) := 5 @@ -31,3 +38,6 @@ endif KBUILD_CFLAGS += $(DEBUG_CFLAGS) export DEBUG_CFLAGS + +KBUILD_RUSTFLAGS += $(DEBUG_RUSTFLAGS) +export DEBUG_RUSTFLAGS diff --git a/scripts/Makefile.host b/scripts/Makefile.host index 278b4d6ac9454d..da133780b7518d 100644 --- a/scripts/Makefile.host +++ b/scripts/Makefile.host @@ -22,6 +22,8 @@ $(obj)/%.tab.c $(obj)/%.tab.h: $(src)/%.y FORCE # to preprocess a data file. # # Both C and C++ are supported, but preferred language is C for such utilities. +# Rust is also supported, but it may only be used in scenarios where a Rust +# toolchain is required to be available (e.g. when `CONFIG_RUST` is enabled). # # Sample syntax (see Documentation/kbuild/makefiles.rst for reference) # hostprogs := bin2hex @@ -37,15 +39,20 @@ $(obj)/%.tab.c $(obj)/%.tab.h: $(src)/%.y FORCE # qconf-objs := menu.o # Will compile qconf as a C++ program, and menu as a C program. # They are linked as C++ code to the executable qconf +# +# hostprogs := target +# target-rust := y +# Will compile `target` as a Rust program, using `target.rs` as the crate root. +# The crate may consist of several source files. 
# C code # Executables compiled from a single .c file host-csingle := $(foreach m,$(hostprogs), \ - $(if $($(m)-objs)$($(m)-cxxobjs),,$(m))) + $(if $($(m)-objs)$($(m)-cxxobjs)$($(m)-rust),,$(m))) # C executables linked based on several .o files host-cmulti := $(foreach m,$(hostprogs),\ - $(if $($(m)-cxxobjs),,$(if $($(m)-objs),$(m)))) + $(if $($(m)-cxxobjs)$($(m)-rust),,$(if $($(m)-objs),$(m)))) # Object (.o) files compiled from .c files host-cobjs := $(sort $(foreach m,$(hostprogs),$($(m)-objs))) @@ -58,11 +65,17 @@ host-cxxmulti := $(foreach m,$(hostprogs),$(if $($(m)-cxxobjs),$(m))) # C++ Object (.o) files compiled from .cc files host-cxxobjs := $(sort $(foreach m,$(host-cxxmulti),$($(m)-cxxobjs))) +# Rust code +# Executables compiled from a single Rust crate (which may consist of +# one or more .rs files) +host-rust := $(foreach m,$(hostprogs),$(if $($(m)-rust),$(m))) + host-csingle := $(addprefix $(obj)/,$(host-csingle)) host-cmulti := $(addprefix $(obj)/,$(host-cmulti)) host-cobjs := $(addprefix $(obj)/,$(host-cobjs)) host-cxxmulti := $(addprefix $(obj)/,$(host-cxxmulti)) host-cxxobjs := $(addprefix $(obj)/,$(host-cxxobjs)) +host-rust := $(addprefix $(obj)/,$(host-rust)) ##### # Handle options to gcc. 
Support building with separate output directory @@ -71,6 +84,8 @@ _hostc_flags = $(KBUILD_HOSTCFLAGS) $(HOST_EXTRACFLAGS) \ $(HOSTCFLAGS_$(target-stem).o) _hostcxx_flags = $(KBUILD_HOSTCXXFLAGS) $(HOST_EXTRACXXFLAGS) \ $(HOSTCXXFLAGS_$(target-stem).o) +_hostrust_flags = $(KBUILD_HOSTRUSTFLAGS) $(HOST_EXTRARUSTFLAGS) \ + $(HOSTRUSTFLAGS_$(target-stem)) # $(objtree)/$(obj) for including generated headers from checkin source files ifeq ($(KBUILD_EXTMOD),) @@ -82,6 +97,7 @@ endif hostc_flags = -Wp,-MMD,$(depfile) $(_hostc_flags) hostcxx_flags = -Wp,-MMD,$(depfile) $(_hostcxx_flags) +hostrust_flags = $(_hostrust_flags) ##### # Compile programs on the host @@ -128,5 +144,17 @@ quiet_cmd_host-cxxobjs = HOSTCXX $@ $(host-cxxobjs): $(obj)/%.o: $(src)/%.cc FORCE $(call if_changed_dep,host-cxxobjs) +# Create executable from a single Rust crate (which may consist of +# one or more `.rs` files) +# host-rust -> Executable +quiet_cmd_host-rust = HOSTRUSTC $@ + cmd_host-rust = \ + $(HOSTRUSTC) $(hostrust_flags) --emit=dep-info,link \ + --out-dir=$(obj)/ $<; \ + mv $(obj)/$(target-stem).d $(depfile); \ + sed -i '/^\#/d' $(depfile) +$(host-rust): $(obj)/%: $(src)/%.rs FORCE + $(call if_changed_dep,host-rust) + targets += $(host-csingle) $(host-cmulti) $(host-cobjs) \ - $(host-cxxmulti) $(host-cxxobjs) + $(host-cxxmulti) $(host-cxxobjs) $(host-rust) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 9f69ecdd7977a3..f3e623f242dfde 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -8,6 +8,7 @@ ldflags-y += $(EXTRA_LDFLAGS) # flags that take effect in current and sub directories KBUILD_AFLAGS += $(subdir-asflags-y) KBUILD_CFLAGS += $(subdir-ccflags-y) +KBUILD_RUSTFLAGS += $(subdir-rustflags-y) # Figure out what we need to build from the various variables # =========================================================================== @@ -128,6 +129,10 @@ _c_flags = $(filter-out $(CFLAGS_REMOVE_$(target-stem).o), \ $(filter-out $(ccflags-remove-y), \ 
$(KBUILD_CPPFLAGS) $(KBUILD_CFLAGS) $(ccflags-y)) \ $(CFLAGS_$(target-stem).o)) +_rust_flags = $(filter-out $(RUSTFLAGS_REMOVE_$(target-stem).o), \ + $(filter-out $(rustflags-remove-y), \ + $(KBUILD_RUSTFLAGS) $(rustflags-y)) \ + $(RUSTFLAGS_$(target-stem).o)) _a_flags = $(filter-out $(AFLAGS_REMOVE_$(target-stem).o), \ $(filter-out $(asflags-remove-y), \ $(KBUILD_CPPFLAGS) $(KBUILD_AFLAGS) $(asflags-y)) \ @@ -202,6 +207,11 @@ modkern_cflags = \ $(KBUILD_CFLAGS_MODULE) $(CFLAGS_MODULE), \ $(KBUILD_CFLAGS_KERNEL) $(CFLAGS_KERNEL) $(modfile_flags)) +modkern_rustflags = \ + $(if $(part-of-module), \ + $(KBUILD_RUSTFLAGS_MODULE) $(RUSTFLAGS_MODULE), \ + $(KBUILD_RUSTFLAGS_KERNEL) $(RUSTFLAGS_KERNEL)) + modkern_aflags = $(if $(part-of-module), \ $(KBUILD_AFLAGS_MODULE) $(AFLAGS_MODULE), \ $(KBUILD_AFLAGS_KERNEL) $(AFLAGS_KERNEL)) @@ -211,6 +221,8 @@ c_flags = -Wp,-MMD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ $(_c_flags) $(modkern_cflags) \ $(basename_flags) $(modname_flags) +rust_flags = $(_rust_flags) $(modkern_rustflags) @$(objtree)/include/generated/rustc_cfg + a_flags = -Wp,-MMD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \ $(_a_flags) $(modkern_aflags) diff --git a/scripts/Makefile.modfinal b/scripts/Makefile.modfinal index 7f39599e9faedd..670d7997a38b6f 100644 --- a/scripts/Makefile.modfinal +++ b/scripts/Makefile.modfinal @@ -39,11 +39,13 @@ quiet_cmd_ld_ko_o = LD [M] $@ quiet_cmd_btf_ko = BTF [M] $@ cmd_btf_ko = \ - if [ -f vmlinux ]; then \ + if [ ! 
-f vmlinux ]; then \ + printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \ + elif [ -n "$(CONFIG_RUST)" ] && $(srctree)/scripts/is_rust_module.sh $@; then \ + printf "Skipping BTF generation for %s because it's a Rust module\n" $@ 1>&2; \ + else \ LLVM_OBJCOPY="$(OBJCOPY)" $(PAHOLE) -J $(PAHOLE_FLAGS) --btf_base vmlinux $@; \ $(RESOLVE_BTFIDS) -b vmlinux $@; \ - else \ - printf "Skipping BTF generation for %s due to unavailability of vmlinux\n" $@ 1>&2; \ fi; # Same as newer-prereqs, but allows to exclude specified extra dependencies diff --git a/scripts/cc-version.sh b/scripts/cc-version.sh index f1952c52246624..2401c86fcf5331 100755 --- a/scripts/cc-version.sh +++ b/scripts/cc-version.sh @@ -1,13 +1,13 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # -# Print the compiler name and its version in a 5 or 6-digit form. +# Print the C compiler name and its version in a 5 or 6-digit form. # Also, perform the minimum version check. set -e -# Print the compiler name and some version components. -get_compiler_info() +# Print the C compiler name and some version components. +get_c_compiler_info() { cat <<- EOF | "$@" -E -P -x c - 2>/dev/null #if defined(__clang__) @@ -32,7 +32,7 @@ get_canonical_version() # $@ instead of $1 because multiple words might be given, e.g. CC="ccache gcc". orig_args="$@" -set -- $(get_compiler_info "$@") +set -- $(get_c_compiler_info "$@") name=$1 @@ -52,7 +52,7 @@ ICC) min_version=$($min_tool_version icc) ;; *) - echo "$orig_args: unknown compiler" >&2 + echo "$orig_args: unknown C compiler" >&2 exit 1 ;; esac @@ -62,7 +62,7 @@ min_cversion=$(get_canonical_version $min_version) if [ "$cversion" -lt "$min_cversion" ]; then echo >&2 "***" - echo >&2 "*** Compiler is too old." + echo >&2 "*** C compiler is too old." 
echo >&2 "*** Your $name version: $version" echo >&2 "*** Minimum $name version: $min_version" echo >&2 "***" diff --git a/scripts/generate_rust_target.rs b/scripts/generate_rust_target.rs new file mode 100644 index 00000000000000..c146a3407183b6 --- /dev/null +++ b/scripts/generate_rust_target.rs @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! The custom target specification file generator for `rustc`. +//! +//! To configure a target from scratch, a JSON-encoded file has to be passed +//! to `rustc` (introduced in [RFC 131]). These options and the file itself are +//! unstable. Eventually, `rustc` should provide a way to do this in a stable +//! manner. For instance, via command-line arguments. Therefore, this file +//! should avoid using keys which can be set via `-C` or `-Z` options. +//! +//! [RFC 131]: https://rust-lang.github.io/rfcs/0131-target-specification.html + +use std::{ + collections::HashMap, + fmt::{Display, Formatter, Result}, + io::BufRead, +}; + +enum Value { + Boolean(bool), + Number(i32), + String(String), + Object(Object), +} + +type Object = Vec<(String, Value)>; + +/// Minimal "almost JSON" generator (e.g. no `null`s, no arrays, no escaping), +/// enough for this purpose. +impl Display for Value { + fn fmt(&self, formatter: &mut Formatter<'_>) -> Result { + match self { + Value::Boolean(boolean) => write!(formatter, "{}", boolean), + Value::Number(number) => write!(formatter, "{}", number), + Value::String(string) => write!(formatter, "\"{}\"", string), + Value::Object(object) => { + formatter.write_str("{")?; + if let [ref rest @ .., ref last] = object[..] 
{ + for (key, value) in rest { + write!(formatter, "\"{}\": {},", key, value)?; + } + write!(formatter, "\"{}\": {}", last.0, last.1)?; + } + formatter.write_str("}") + } + } + } +} + +struct TargetSpec(Object); + +impl TargetSpec { + fn new() -> TargetSpec { + TargetSpec(Vec::new()) + } +} + +trait Push<T> { + fn push(&mut self, key: &str, value: T); +} + +impl Push<bool> for TargetSpec { + fn push(&mut self, key: &str, value: bool) { + self.0.push((key.to_string(), Value::Boolean(value))); + } +} + +impl Push<i32> for TargetSpec { + fn push(&mut self, key: &str, value: i32) { + self.0.push((key.to_string(), Value::Number(value))); + } +} + +impl Push<String> for TargetSpec { + fn push(&mut self, key: &str, value: String) { + self.0.push((key.to_string(), Value::String(value))); + } +} + +impl Push<&str> for TargetSpec { + fn push(&mut self, key: &str, value: &str) { + self.push(key, value.to_string()); + } +} + +impl Push<Object> for TargetSpec { + fn push(&mut self, key: &str, value: Object) { + self.0.push((key.to_string(), Value::Object(value))); + } +} + +impl Display for TargetSpec { + fn fmt(&self, formatter: &mut Formatter<'_>) -> Result { + // We add some newlines for clarity. + formatter.write_str("{\n")?; + if let [ref rest @ .., ref last] = self.0[..] { + for (key, value) in rest { + write!(formatter, " \"{}\": {},\n", key, value)?; + } + write!(formatter, " \"{}\": {}\n", last.0, last.1)?; + } + formatter.write_str("}") + } +} + +struct KernelConfig(HashMap<String, String>); + +impl KernelConfig { + /// Parses `include/config/auto.conf` from `stdin`.
+ fn from_stdin() -> KernelConfig { + let mut result = HashMap::new(); + + let stdin = std::io::stdin(); + let mut handle = stdin.lock(); + let mut line = String::new(); + + loop { + line.clear(); + + if handle.read_line(&mut line).unwrap() == 0 { + break; + } + + if line.starts_with('#') { + continue; + } + + let (key, value) = line.split_once('=').expect("Missing `=` in line."); + result.insert(key.to_string(), value.trim_end_matches('\n').to_string()); + } + + KernelConfig(result) + } + + /// Does the option exist in the configuration (any value)? + /// + /// The argument must be passed without the `CONFIG_` prefix. + /// This avoids repetition and it also avoids `fixdep` making us + /// depend on it. + fn has(&self, option: &str) -> bool { + let option = "CONFIG_".to_owned() + option; + self.0.contains_key(&option) + } +} + +fn main() { + let cfg = KernelConfig::from_stdin(); + let mut ts = TargetSpec::new(); + + // `llvm-target`s are taken from `scripts/Makefile.clang`. + if cfg.has("ARM") { + ts.push("arch", "arm"); + ts.push( + "data-layout", + "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64", + ); + ts.push("features", "+strict-align,+v6"); + ts.push("llvm-target", "arm-linux-gnueabi"); + ts.push("max-atomic-width", 64); + ts.push("target-mcount", "\\u0001__gnu_mcount_nc"); + ts.push("target-pointer-width", "32"); + } else if cfg.has("ARM64") { + ts.push("arch", "aarch64"); + ts.push( + "data-layout", + "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", + ); + ts.push("disable-redzone", true); + ts.push("features", "+strict-align,-neon,-fp-armv8"); + ts.push("llvm-target", "aarch64-linux-gnu"); + ts.push("max-atomic-width", 128); + ts.push("target-pointer-width", "64"); + } else if cfg.has("PPC") { + ts.push("arch", "powerpc64"); + ts.push("code-model", "large"); + ts.push("data-layout", "e-m:e-i64:64-n32:64"); + ts.push("features", "-altivec,-vsx,-hard-float"); + ts.push("llvm-target", "powerpc64le-linux-gnu"); + ts.push("max-atomic-width", 64); + 
ts.push("target-mcount", "_mcount"); + ts.push("target-pointer-width", "64"); + } else if cfg.has("RISCV") { + if cfg.has("64BIT") { + ts.push("arch", "riscv64"); + ts.push("data-layout", "e-m:e-p:64:64-i64:64-i128:128-n64-S128"); + ts.push("llvm-target", "riscv64-linux-gnu"); + ts.push("target-pointer-width", "64"); + } else { + ts.push("arch", "riscv32"); + ts.push("data-layout", "e-m:e-p:32:32-i64:64-n32-S128"); + ts.push("llvm-target", "riscv32-linux-gnu"); + ts.push("target-pointer-width", "32"); + } + ts.push("code-model", "medium"); + ts.push("disable-redzone", true); + let mut features = "+m,+a".to_string(); + if cfg.has("RISCV_ISA_C") { + features += ",+c"; + } + ts.push("features", features); + } else if cfg.has("X86_64") { + ts.push("arch", "x86_64"); + ts.push( + "data-layout", + "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128", + ); + ts.push("llvm-target", "x86_64-linux-gnu"); + ts.push("target-pointer-width", "64"); + } else { + panic!("Unsupported architecture"); + } + + ts.push("emit-debug-gdb-scripts", false); + ts.push("frame-pointer", "may-omit"); + ts.push( + "stack-probes", + vec![("kind".to_string(), Value::String("none".to_string()))], + ); + + // Everything else is LE, whether `CPU_LITTLE_ENDIAN` is declared or not + // (e.g. x86). It is also `rustc`'s default. + if cfg.has("CPU_BIG_ENDIAN") { + ts.push("target-endian", "big"); + } + + println!("{}", ts); +} diff --git a/scripts/is_rust_module.sh b/scripts/is_rust_module.sh new file mode 100755 index 00000000000000..277a64d07f22c1 --- /dev/null +++ b/scripts/is_rust_module.sh @@ -0,0 +1,13 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# is_rust_module.sh module.ko +# +# Returns `0` if `module.ko` is a Rust module, `1` otherwise. + +set -e + +# Using the `16_` prefix ensures other symbols with the same substring +# are not picked up (even if it would be unlikely). The last part is +# used just in case LLVM decides to use the `.` suffix. 
+${NM} "$*" | grep -qE '^[0-9a-fA-F]+ r _R[^[:space:]]+16___IS_RUST_MODULE[^[:space:]]*$' diff --git a/scripts/kconfig/confdata.c b/scripts/kconfig/confdata.c index c4340c90e172f8..b7c9f1dd5e4229 100644 --- a/scripts/kconfig/confdata.c +++ b/scripts/kconfig/confdata.c @@ -216,6 +216,13 @@ static const char *conf_get_autoheader_name(void) return name ? name : "include/generated/autoconf.h"; } +static const char *conf_get_rustccfg_name(void) +{ + char *name = getenv("KCONFIG_RUSTCCFG"); + + return name ? name : "include/generated/rustc_cfg"; +} + static int conf_set_sym_val(struct symbol *sym, int def, int def_flags, char *p) { char *p2; @@ -605,6 +612,9 @@ static const struct comment_style comment_style_c = { static void conf_write_heading(FILE *fp, const struct comment_style *cs) { + if (!cs) + return; + fprintf(fp, "%s\n", cs->prefix); fprintf(fp, "%s Automatically generated file; DO NOT EDIT.\n", @@ -745,6 +755,65 @@ static void print_symbol_for_c(FILE *fp, struct symbol *sym) free(escaped); } +static void print_symbol_for_rustccfg(FILE *fp, struct symbol *sym) +{ + const char *val; + const char *val_prefix = ""; + char *val_prefixed = NULL; + size_t val_prefixed_len; + char *escaped = NULL; + + if (sym->type == S_UNKNOWN) + return; + + val = sym_get_string_value(sym); + + switch (sym->type) { + case S_BOOLEAN: + case S_TRISTATE: + /* + * We do not care about disabled ones, i.e. no need for + * what otherwise are "comments" in other printers. + */ + if (*val == 'n') + return; + + /* + * To have similar functionality to the C macro `IS_ENABLED()` + * we provide an empty `--cfg CONFIG_X` here in both `y` + * and `m` cases. + * + * Then, the common `fprintf()` below will also give us + * a `--cfg CONFIG_X="y"` or `--cfg CONFIG_X="m"`, which can + * be used as the equivalent of `IS_BUILTIN()`/`IS_MODULE()`. 
+ */ + fprintf(fp, "--cfg=%s%s\n", CONFIG_, sym->name); + break; + case S_HEX: + if (val[0] != '0' || (val[1] != 'x' && val[1] != 'X')) + val_prefix = "0x"; + break; + default: + break; + } + + if (strlen(val_prefix) > 0) { + val_prefixed_len = strlen(val) + strlen(val_prefix) + 1; + val_prefixed = xmalloc(val_prefixed_len); + snprintf(val_prefixed, val_prefixed_len, "%s%s", val_prefix, val); + val = val_prefixed; + } + + /* All values get escaped: the `--cfg` option only takes strings */ + escaped = escape_string_value(val); + val = escaped; + + fprintf(fp, "--cfg=%s%s=%s\n", CONFIG_, sym->name, val); + + free(escaped); + free(val_prefixed); +} + /* * Write out a minimal config. * All values that has default values are skipped as this is redundant. @@ -1132,6 +1201,12 @@ int conf_write_autoconf(int overwrite) if (ret) return ret; + ret = __conf_write_autoconf(conf_get_rustccfg_name(), + print_symbol_for_rustccfg, + NULL); + if (ret) + return ret; + /* * Create include/config/auto.conf. 
This must be the last step because * Kbuild has a dependency on auto.conf and this marks the successful diff --git a/scripts/min-tool-version.sh b/scripts/min-tool-version.sh index 7c20252a90c68d..53fe64856015ba 100755 --- a/scripts/min-tool-version.sh +++ b/scripts/min-tool-version.sh @@ -31,6 +31,12 @@ llvm) echo 11.0.0 fi ;; +rustc) + echo 1.60.0 + ;; +bindgen) + echo 0.56.0 + ;; *) echo "$1: unknown tool" >&2 exit 1 diff --git a/scripts/rust-is-available-bindgen-libclang.h b/scripts/rust-is-available-bindgen-libclang.h new file mode 100644 index 00000000000000..0ef6db10d67413 --- /dev/null +++ b/scripts/rust-is-available-bindgen-libclang.h @@ -0,0 +1,2 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#pragma message("clang version " __clang_version__) diff --git a/scripts/rust-is-available.sh b/scripts/rust-is-available.sh new file mode 100755 index 00000000000000..6bd395167d0f10 --- /dev/null +++ b/scripts/rust-is-available.sh @@ -0,0 +1,158 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Tests whether a suitable Rust toolchain is available. +# +# Pass `-v` for human output and more checks (as warnings). + +set -e + +min_tool_version=$(dirname $0)/min-tool-version.sh + +# Convert the version string x.y.z to a canonical up-to-7-digits form. +# +# Note that this function uses one more digit (compared to other +# instances in other version scripts) to give a bit more space to +# `rustc` since it will reach 1.100.0 in late 2026. +get_canonical_version() +{ + IFS=. + set -- $1 + echo $((100000 * $1 + 100 * $2 + $3)) +} + +# Check that the Rust compiler exists. +if ! command -v "$RUSTC" >/dev/null; then + if [ "$1" = -v ]; then + echo >&2 "***" + echo >&2 "*** Rust compiler '$RUSTC' could not be found." + echo >&2 "***" + fi + exit 1 +fi + +# Check that the Rust bindings generator exists. +if ! command -v "$BINDGEN" >/dev/null; then + if [ "$1" = -v ]; then + echo >&2 "***" + echo >&2 "*** Rust bindings generator '$BINDGEN' could not be found." 
+ echo >&2 "***" + fi + exit 1 +fi + +# Check that the Rust compiler version is suitable. +# +# Non-stable and distributions' versions may have a version suffix, e.g. `-dev`. +rust_compiler_version=$( \ + LC_ALL=C "$RUSTC" --version 2>/dev/null \ + | head -n 1 \ + | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' \ +) +rust_compiler_min_version=$($min_tool_version rustc) +rust_compiler_cversion=$(get_canonical_version $rust_compiler_version) +rust_compiler_min_cversion=$(get_canonical_version $rust_compiler_min_version) +if [ "$rust_compiler_cversion" -lt "$rust_compiler_min_cversion" ]; then + if [ "$1" = -v ]; then + echo >&2 "***" + echo >&2 "*** Rust compiler '$RUSTC' is too old." + echo >&2 "*** Your version: $rust_compiler_version" + echo >&2 "*** Minimum version: $rust_compiler_min_version" + echo >&2 "***" + fi + exit 1 +fi +if [ "$1" = -v ] && [ "$rust_compiler_cversion" -gt "$rust_compiler_min_cversion" ]; then + echo >&2 "***" + echo >&2 "*** Rust compiler '$RUSTC' is too new. This may or may not work." + echo >&2 "*** Your version: $rust_compiler_version" + echo >&2 "*** Expected version: $rust_compiler_min_version" + echo >&2 "***" +fi + +# Check that the Rust bindings generator is suitable. +# +# Non-stable and distributions' versions may have a version suffix, e.g. `-dev`. +rust_bindings_generator_version=$( \ + LC_ALL=C "$BINDGEN" --version 2>/dev/null \ + | head -n 1 \ + | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' \ +) +rust_bindings_generator_min_version=$($min_tool_version bindgen) +rust_bindings_generator_cversion=$(get_canonical_version $rust_bindings_generator_version) +rust_bindings_generator_min_cversion=$(get_canonical_version $rust_bindings_generator_min_version) +if [ "$rust_bindings_generator_cversion" -lt "$rust_bindings_generator_min_cversion" ]; then + if [ "$1" = -v ]; then + echo >&2 "***" + echo >&2 "*** Rust bindings generator '$BINDGEN' is too old." 
+ echo >&2 "*** Your version: $rust_bindings_generator_version" + echo >&2 "*** Minimum version: $rust_bindings_generator_min_version" + echo >&2 "***" + fi + exit 1 +fi +if [ "$1" = -v ] && [ "$rust_bindings_generator_cversion" -gt "$rust_bindings_generator_min_cversion" ]; then + echo >&2 "***" + echo >&2 "*** Rust bindings generator '$BINDGEN' is too new. This may or may not work." + echo >&2 "*** Your version: $rust_bindings_generator_version" + echo >&2 "*** Expected version: $rust_bindings_generator_min_version" + echo >&2 "***" +fi + +# Check that the `libclang` used by the Rust bindings generator is suitable. +bindgen_libclang_version=$( \ + LC_ALL=C "$BINDGEN" $(dirname $0)/rust-is-available-bindgen-libclang.h 2>&1 >/dev/null \ + | grep -F 'clang version ' \ + | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' \ +) +bindgen_libclang_min_version=$($min_tool_version llvm) +bindgen_libclang_cversion=$(get_canonical_version $bindgen_libclang_version) +bindgen_libclang_min_cversion=$(get_canonical_version $bindgen_libclang_min_version) +if [ "$bindgen_libclang_cversion" -lt "$bindgen_libclang_min_cversion" ]; then + if [ "$1" = -v ]; then + echo >&2 "***" + echo >&2 "*** libclang (used by the Rust bindings generator '$BINDGEN') is too old." + echo >&2 "*** Your version: $bindgen_libclang_version" + echo >&2 "*** Minimum version: $bindgen_libclang_min_version" + echo >&2 "***" + fi + exit 1 +fi + +# If the C compiler is Clang, then we can also check whether its version +# matches the `libclang` version used by the Rust bindings generator. +# +# In the future, we might be able to perform a full version check, see +# https://github.com/rust-lang/rust-bindgen/issues/2138. 
+if [ "$1" = -v ]; then + cc_name=$($(dirname $0)/cc-version.sh "$CC" | cut -f1 -d' ') + if [ "$cc_name" = Clang ]; then + clang_version=$( \ + LC_ALL=C "$CC" --version 2>/dev/null \ + | sed -nE '1s:.*version ([0-9]+\.[0-9]+\.[0-9]+).*:\1:p' + ) + if [ "$clang_version" != "$bindgen_libclang_version" ]; then + echo >&2 "***" + echo >&2 "*** libclang (used by the Rust bindings generator '$BINDGEN') version does not match Clang's. This may be a problem." + echo >&2 "*** libclang version: $bindgen_libclang_version" + echo >&2 "*** Clang version: $clang_version" + echo >&2 "***" + fi + fi +fi + +# Check that the source code for the `core` standard library exists. +# +# `$KRUSTFLAGS` is passed in case the user added `--sysroot`. +rustc_sysroot=$("$RUSTC" $KRUSTFLAGS --print sysroot) +rustc_src=${RUST_LIB_SRC:-"$rustc_sysroot/lib/rustlib/src/rust/library"} +rustc_src_core="$rustc_src/core/src/lib.rs" +if [ ! -e "$rustc_src_core" ]; then + if [ "$1" = -v ]; then + echo >&2 "***" + echo >&2 "*** Source code for the 'core' standard library could not be found" + echo >&2 "*** at '$rustc_src_core'." + echo >&2 "***" + fi + exit 1 +fi From 91fb0182d4db1e0b0b2a1438f6bf699975fca96c Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 17:21:12 +0200 Subject: [PATCH 0036/1250] samples: add Rust examples A set of Rust modules that showcase how Rust modules look like and how to use the abstracted kernel features, as well as an example of a Rust host program with several modules. These samples also double as tests in the CI. The semaphore sample comes with a C version for comparison. 
Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Finn Behrens Signed-off-by: Finn Behrens Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Co-developed-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Boris-Chengbiao Zhou Signed-off-by: Boris-Chengbiao Zhou Co-developed-by: Ayaan Zaidi Signed-off-by: Ayaan Zaidi Co-developed-by: Milan Landaverde Signed-off-by: Milan Landaverde Signed-off-by: Miguel Ojeda --- samples/Kconfig | 2 + samples/Makefile | 1 + samples/rust/Kconfig | 140 ++++++++++++++++ samples/rust/Makefile | 16 ++ samples/rust/hostprogs/.gitignore | 3 + samples/rust/hostprogs/Makefile | 5 + samples/rust/hostprogs/a.rs | 7 + samples/rust/hostprogs/b.rs | 5 + samples/rust/hostprogs/single.rs | 12 ++ samples/rust/rust_chrdev.rs | 50 ++++++ samples/rust/rust_minimal.rs | 35 ++++ samples/rust/rust_miscdev.rs | 143 +++++++++++++++++ samples/rust/rust_module_parameters.rs | 69 ++++++++ samples/rust/rust_netfilter.rs | 54 +++++++ samples/rust/rust_platform.rs | 22 +++ samples/rust/rust_print.rs | 54 +++++++ samples/rust/rust_random.rs | 60 +++++++ samples/rust/rust_semaphore.rs | 171 ++++++++++++++++++++ samples/rust/rust_semaphore_c.c | 212 +++++++++++++++++++++++++ samples/rust/rust_stack_probing.rs | 36 +++++ samples/rust/rust_sync.rs | 93 +++++++++++ 21 files changed, 1190 insertions(+) create mode 100644 samples/rust/Kconfig create mode 100644 samples/rust/Makefile create mode 100644 samples/rust/hostprogs/.gitignore create mode 100644 samples/rust/hostprogs/Makefile create mode 100644 samples/rust/hostprogs/a.rs create mode 100644 samples/rust/hostprogs/b.rs create mode 100644 samples/rust/hostprogs/single.rs create mode 100644 samples/rust/rust_chrdev.rs create mode 100644 samples/rust/rust_minimal.rs create mode 100644 samples/rust/rust_miscdev.rs create mode 100644 
samples/rust/rust_module_parameters.rs create mode 100644 samples/rust/rust_netfilter.rs create mode 100644 samples/rust/rust_platform.rs create mode 100644 samples/rust/rust_print.rs create mode 100644 samples/rust/rust_random.rs create mode 100644 samples/rust/rust_semaphore.rs create mode 100644 samples/rust/rust_semaphore_c.c create mode 100644 samples/rust/rust_stack_probing.rs create mode 100644 samples/rust/rust_sync.rs diff --git a/samples/Kconfig b/samples/Kconfig index 470ee3baf2e16d..0d81c00289ee36 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -263,6 +263,8 @@ config SAMPLE_CORESIGHT_SYSCFG This demonstrates how a user may create their own CoreSight configurations and easily load them into the system at runtime. +source "samples/rust/Kconfig" + endif # SAMPLES config HAVE_SAMPLE_FTRACE_DIRECT diff --git a/samples/Makefile b/samples/Makefile index 701e912ab5afee..9832ef3f8fcbaf 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -35,3 +35,4 @@ subdir-$(CONFIG_SAMPLE_WATCH_QUEUE) += watch_queue obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak/ obj-$(CONFIG_SAMPLE_CORESIGHT_SYSCFG) += coresight/ obj-$(CONFIG_SAMPLE_FPROBE) += fprobe/ +obj-$(CONFIG_SAMPLES_RUST) += rust/ diff --git a/samples/rust/Kconfig b/samples/rust/Kconfig new file mode 100644 index 00000000000000..4f90f8d6935188 --- /dev/null +++ b/samples/rust/Kconfig @@ -0,0 +1,140 @@ +# SPDX-License-Identifier: GPL-2.0 + +menuconfig SAMPLES_RUST + bool "Rust samples" + depends on RUST + help + You can build sample Rust kernel code here. + + If unsure, say N. + +if SAMPLES_RUST + +config SAMPLE_RUST_MINIMAL + tristate "Minimal" + help + This option builds the Rust minimal module sample. + + To compile this as a module, choose M here: + the module will be called rust_minimal. + + If unsure, say N. + +config SAMPLE_RUST_PRINT + tristate "Printing macros" + help + This option builds the Rust printing macros sample. 
+ + To compile this as a module, choose M here: + the module will be called rust_print. + + If unsure, say N. + +config SAMPLE_RUST_MODULE_PARAMETERS + tristate "Module parameters" + help + This option builds the Rust module parameters sample. + + To compile this as a module, choose M here: + the module will be called rust_module_parameters. + + If unsure, say N. + +config SAMPLE_RUST_SYNC + tristate "Synchronisation primitives" + help + This option builds the Rust synchronisation primitives sample. + + To compile this as a module, choose M here: + the module will be called rust_sync. + + If unsure, say N. + +config SAMPLE_RUST_CHRDEV + tristate "Character device" + help + This option builds the Rust character device sample. + + To compile this as a module, choose M here: + the module will be called rust_chrdev. + + If unsure, say N. + +config SAMPLE_RUST_MISCDEV + tristate "Miscellaneous device" + help + This option builds the Rust miscellaneous device sample. + + To compile this as a module, choose M here: + the module will be called rust_miscdev. + + If unsure, say N. + +config SAMPLE_RUST_STACK_PROBING + tristate "Stack probing" + help + This option builds the Rust stack probing sample. + + To compile this as a module, choose M here: + the module will be called rust_stack_probing. + + If unsure, say N. + +config SAMPLE_RUST_SEMAPHORE + tristate "Semaphore" + help + This option builds the Rust semaphore sample. + + To compile this as a module, choose M here: + the module will be called rust_semaphore. + + If unsure, say N. + +config SAMPLE_RUST_SEMAPHORE_C + tristate "Semaphore (in C, for comparison)" + help + This option builds the Rust semaphore sample (in C, for comparison). + + To compile this as a module, choose M here: + the module will be called rust_semaphore_c. + + If unsure, say N. + +config SAMPLE_RUST_RANDOM + tristate "Random" + help + This option builds the Rust random sample. 
+ + To compile this as a module, choose M here: + the module will be called rust_random. + + If unsure, say N. + +config SAMPLE_RUST_PLATFORM + tristate "Platform device driver" + help + This option builds the Rust platform device driver sample. + + To compile this as a module, choose M here: + the module will be called rust_platform. + + If unsure, say N. + +config SAMPLE_RUST_NETFILTER + tristate "Network filter module" + help + This option builds the Rust netfilter module sample. + + To compile this as a module, choose M here: + the module will be called rust_netfilter. + + If unsure, say N. + +config SAMPLE_RUST_HOSTPROGS + bool "Host programs" + help + This option builds the Rust host program samples. + + If unsure, say N. + +endif # SAMPLES_RUST diff --git a/samples/rust/Makefile b/samples/rust/Makefile new file mode 100644 index 00000000000000..fb5a205ebb8cf8 --- /dev/null +++ b/samples/rust/Makefile @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0 + +obj-$(CONFIG_SAMPLE_RUST_MINIMAL) += rust_minimal.o +obj-$(CONFIG_SAMPLE_RUST_PRINT) += rust_print.o +obj-$(CONFIG_SAMPLE_RUST_MODULE_PARAMETERS) += rust_module_parameters.o +obj-$(CONFIG_SAMPLE_RUST_SYNC) += rust_sync.o +obj-$(CONFIG_SAMPLE_RUST_CHRDEV) += rust_chrdev.o +obj-$(CONFIG_SAMPLE_RUST_MISCDEV) += rust_miscdev.o +obj-$(CONFIG_SAMPLE_RUST_STACK_PROBING) += rust_stack_probing.o +obj-$(CONFIG_SAMPLE_RUST_SEMAPHORE) += rust_semaphore.o +obj-$(CONFIG_SAMPLE_RUST_SEMAPHORE_C) += rust_semaphore_c.o +obj-$(CONFIG_SAMPLE_RUST_RANDOM) += rust_random.o +obj-$(CONFIG_SAMPLE_RUST_PLATFORM) += rust_platform.o +obj-$(CONFIG_SAMPLE_RUST_NETFILTER) += rust_netfilter.o + +subdir-$(CONFIG_SAMPLE_RUST_HOSTPROGS) += hostprogs diff --git a/samples/rust/hostprogs/.gitignore b/samples/rust/hostprogs/.gitignore new file mode 100644 index 00000000000000..a6c173da5048dc --- /dev/null +++ b/samples/rust/hostprogs/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 + +single diff --git 
a/samples/rust/hostprogs/Makefile b/samples/rust/hostprogs/Makefile new file mode 100644 index 00000000000000..8ddcbd7416db5d --- /dev/null +++ b/samples/rust/hostprogs/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +hostprogs-always-y := single + +single-rust := y diff --git a/samples/rust/hostprogs/a.rs b/samples/rust/hostprogs/a.rs new file mode 100644 index 00000000000000..f7a4a3d0f4e0b5 --- /dev/null +++ b/samples/rust/hostprogs/a.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust single host program sample: module `a`. + +pub(crate) fn f(x: i32) { + println!("The number is {}.", x); +} diff --git a/samples/rust/hostprogs/b.rs b/samples/rust/hostprogs/b.rs new file mode 100644 index 00000000000000..c1675890648fd9 --- /dev/null +++ b/samples/rust/hostprogs/b.rs @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust single host program sample: module `b`. + +pub(crate) const CONSTANT: i32 = 42; diff --git a/samples/rust/hostprogs/single.rs b/samples/rust/hostprogs/single.rs new file mode 100644 index 00000000000000..8c48a119339a88 --- /dev/null +++ b/samples/rust/hostprogs/single.rs @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust single host program sample. + +mod a; +mod b; + +fn main() { + println!("Hello world!"); + + a::f(b::CONSTANT); +} diff --git a/samples/rust/rust_chrdev.rs b/samples/rust/rust_chrdev.rs new file mode 100644 index 00000000000000..9f5d564671eaca --- /dev/null +++ b/samples/rust/rust_chrdev.rs @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust character device sample. + +use kernel::prelude::*; +use kernel::{chrdev, file}; + +module! 
{ + type: RustChrdev, + name: b"rust_chrdev", + author: b"Rust for Linux Contributors", + description: b"Rust character device sample", + license: b"GPL", +} + +struct RustFile; + +impl file::Operations for RustFile { + kernel::declare_file_operations!(); + + fn open(_shared: &(), _file: &file::File) -> Result { + Ok(()) + } +} + +struct RustChrdev { + _dev: Pin>>, +} + +impl kernel::Module for RustChrdev { + fn init(name: &'static CStr, module: &'static ThisModule) -> Result { + pr_info!("Rust character device sample (init)\n"); + + let mut chrdev_reg = chrdev::Registration::new_pinned(name, 0, module)?; + + // Register the same kind of device twice, we're just demonstrating + // that you can use multiple minors. There are two minors in this case + // because its type is `chrdev::Registration<2>` + chrdev_reg.as_mut().register::()?; + chrdev_reg.as_mut().register::()?; + + Ok(RustChrdev { _dev: chrdev_reg }) + } +} + +impl Drop for RustChrdev { + fn drop(&mut self) { + pr_info!("Rust character device sample (exit)\n"); + } +} diff --git a/samples/rust/rust_minimal.rs b/samples/rust/rust_minimal.rs new file mode 100644 index 00000000000000..6e1a926c6f62e0 --- /dev/null +++ b/samples/rust/rust_minimal.rs @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust minimal sample. + +use kernel::prelude::*; + +module! { + type: RustMinimal, + name: b"rust_minimal", + author: b"Rust for Linux Contributors", + description: b"Rust minimal sample", + license: b"GPL", +} + +struct RustMinimal { + message: String, +} + +impl kernel::Module for RustMinimal { + fn init(_name: &'static CStr, _module: &'static ThisModule) -> Result { + pr_info!("Rust minimal sample (init)\n"); + pr_info!("Am I built-in? 
{}\n", !cfg!(MODULE)); + + Ok(RustMinimal { + message: "on the heap!".try_to_owned()?, + }) + } +} + +impl Drop for RustMinimal { + fn drop(&mut self) { + pr_info!("My message is {}\n", self.message); + pr_info!("Rust minimal sample (exit)\n"); + } +} diff --git a/samples/rust/rust_miscdev.rs b/samples/rust/rust_miscdev.rs new file mode 100644 index 00000000000000..d1bf3c61f5cee8 --- /dev/null +++ b/samples/rust/rust_miscdev.rs @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust miscellaneous device sample. + +use kernel::prelude::*; +use kernel::{ + file::{self, File}, + io_buffer::{IoBufferReader, IoBufferWriter}, + miscdev, + sync::{CondVar, Mutex, Ref, RefBorrow, UniqueRef}, +}; + +module! { + type: RustMiscdev, + name: b"rust_miscdev", + author: b"Rust for Linux Contributors", + description: b"Rust miscellaneous device sample", + license: b"GPL", +} + +const MAX_TOKENS: usize = 3; + +struct SharedStateInner { + token_count: usize, +} + +struct SharedState { + state_changed: CondVar, + inner: Mutex, +} + +impl SharedState { + fn try_new() -> Result> { + let mut state = Pin::from(UniqueRef::try_new(Self { + // SAFETY: `condvar_init!` is called below. + state_changed: unsafe { CondVar::new() }, + // SAFETY: `mutex_init!` is called below. + inner: unsafe { Mutex::new(SharedStateInner { token_count: 0 }) }, + })?); + + // SAFETY: `state_changed` is pinned when `state` is. + let pinned = unsafe { state.as_mut().map_unchecked_mut(|s| &mut s.state_changed) }; + kernel::condvar_init!(pinned, "SharedState::state_changed"); + + // SAFETY: `inner` is pinned when `state` is. 
+ let pinned = unsafe { state.as_mut().map_unchecked_mut(|s| &mut s.inner) }; + kernel::mutex_init!(pinned, "SharedState::inner"); + + Ok(state.into()) + } +} + +struct Token; +impl file::Operations for Token { + type Data = Ref; + type OpenData = Ref; + + kernel::declare_file_operations!(read, write); + + fn open(shared: &Ref, _file: &File) -> Result { + Ok(shared.clone()) + } + + fn read( + shared: RefBorrow<'_, SharedState>, + _: &File, + data: &mut impl IoBufferWriter, + offset: u64, + ) -> Result { + // Succeed if the caller doesn't provide a buffer or if not at the start. + if data.is_empty() || offset != 0 { + return Ok(0); + } + + { + let mut inner = shared.inner.lock(); + + // Wait until we are allowed to decrement the token count or a signal arrives. + while inner.token_count == 0 { + if shared.state_changed.wait(&mut inner) { + return Err(EINTR); + } + } + + // Consume a token. + inner.token_count -= 1; + } + + // Notify a possible writer waiting. + shared.state_changed.notify_all(); + + // Write a one-byte 1 to the reader. + data.write_slice(&[1u8; 1])?; + Ok(1) + } + + fn write( + shared: RefBorrow<'_, SharedState>, + _: &File, + data: &mut impl IoBufferReader, + _offset: u64, + ) -> Result { + { + let mut inner = shared.inner.lock(); + + // Wait until we are allowed to increment the token count or a signal arrives. + while inner.token_count == MAX_TOKENS { + if shared.state_changed.wait(&mut inner) { + return Err(EINTR); + } + } + + // Increment the number of token so that a reader can be released. + inner.token_count += 1; + } + + // Notify a possible reader waiting. 
+ shared.state_changed.notify_all(); + Ok(data.len()) + } +} + +struct RustMiscdev { + _dev: Pin>>, +} + +impl kernel::Module for RustMiscdev { + fn init(name: &'static CStr, _module: &'static ThisModule) -> Result { + pr_info!("Rust miscellaneous device sample (init)\n"); + + let state = SharedState::try_new()?; + + Ok(RustMiscdev { + _dev: miscdev::Registration::new_pinned(fmt!("{name}"), state)?, + }) + } +} + +impl Drop for RustMiscdev { + fn drop(&mut self) { + pr_info!("Rust miscellaneous device sample (exit)\n"); + } +} diff --git a/samples/rust/rust_module_parameters.rs b/samples/rust/rust_module_parameters.rs new file mode 100644 index 00000000000000..12fe5e738e8315 --- /dev/null +++ b/samples/rust/rust_module_parameters.rs @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust module parameters sample. + +use kernel::prelude::*; + +module! { + type: RustModuleParameters, + name: b"rust_module_parameters", + author: b"Rust for Linux Contributors", + description: b"Rust module parameters sample", + license: b"GPL", + params: { + my_bool: bool { + default: true, + permissions: 0, + description: b"Example of bool", + }, + my_i32: i32 { + default: 42, + permissions: 0o644, + description: b"Example of i32", + }, + my_str: str { + default: b"default str val", + permissions: 0o644, + description: b"Example of a string param", + }, + my_usize: usize { + default: 42, + permissions: 0o644, + description: b"Example of usize", + }, + my_array: ArrayParam { + default: [0, 1], + permissions: 0, + description: b"Example of array", + }, + }, +} + +struct RustModuleParameters; + +impl kernel::Module for RustModuleParameters { + fn init(_name: &'static CStr, module: &'static ThisModule) -> Result { + pr_info!("Rust module parameters sample (init)\n"); + + { + let lock = module.kernel_param_lock(); + pr_info!("Parameters:\n"); + pr_info!(" my_bool: {}\n", my_bool.read()); + pr_info!(" my_i32: {}\n", my_i32.read(&lock)); + pr_info!( + " my_str: {}\n", + 
core::str::from_utf8(my_str.read(&lock))? + ); + pr_info!(" my_usize: {}\n", my_usize.read(&lock)); + pr_info!(" my_array: {:?}\n", my_array.read()); + } + + Ok(RustModuleParameters) + } +} + +impl Drop for RustModuleParameters { + fn drop(&mut self) { + pr_info!("Rust module parameters sample (exit)\n"); + } +} diff --git a/samples/rust/rust_netfilter.rs b/samples/rust/rust_netfilter.rs new file mode 100644 index 00000000000000..4bd5c07fee8c72 --- /dev/null +++ b/samples/rust/rust_netfilter.rs @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust netfilter sample. + +use kernel::net; +use kernel::net::filter::{self as netfilter, inet, Disposition, Family}; +use kernel::prelude::*; + +module! { + type: RustNetfilter, + name: b"rust_netfilter", + author: b"Rust for Linux Contributors", + description: b"Rust netfilter sample", + license: b"GPL", +} + +struct RustNetfilter { + _in: Pin>>, + _out: Pin>>, +} + +impl netfilter::Filter for RustNetfilter { + fn filter(_: (), skb: &net::SkBuff) -> Disposition { + let data = skb.head_data(); + pr_info!( + "packet headlen={}, len={}, first bytes={:02x?}\n", + data.len(), + skb.len(), + &data[..core::cmp::min(10, data.len())] + ); + Disposition::Accept + } +} + +impl kernel::Module for RustNetfilter { + fn init(_name: &'static CStr, _module: &'static ThisModule) -> Result { + Ok(Self { + _in: netfilter::Registration::new_pinned( + Family::INet(inet::Hook::PreRouting), + 0, + net::init_ns().into(), + None, + (), + )?, + _out: netfilter::Registration::new_pinned( + Family::INet(inet::Hook::PostRouting), + 0, + net::init_ns().into(), + None, + (), + )?, + }) + } +} diff --git a/samples/rust/rust_platform.rs b/samples/rust/rust_platform.rs new file mode 100644 index 00000000000000..f62784676919da --- /dev/null +++ b/samples/rust/rust_platform.rs @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust platform device driver sample. 
+ +use kernel::{module_platform_driver, of, platform, prelude::*}; + +module_platform_driver! { + type: Driver, + name: b"rust_platform", + license: b"GPL", +} + +struct Driver; +impl platform::Driver for Driver { + kernel::define_of_id_table! {(), [ + (of::DeviceId::Compatible(b"rust,sample"), None), + ]} + + fn probe(_dev: &mut platform::Device, _id_info: Option<&Self::IdInfo>) -> Result { + Ok(()) + } +} diff --git a/samples/rust/rust_print.rs b/samples/rust/rust_print.rs new file mode 100644 index 00000000000000..30d96e025d898f --- /dev/null +++ b/samples/rust/rust_print.rs @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust printing macros sample. + +use kernel::prelude::*; +use kernel::{pr_cont, str::CStr, ThisModule}; + +module! { + type: RustPrint, + name: b"rust_print", + author: b"Rust for Linux Contributors", + description: b"Rust printing macros sample", + license: b"GPL", +} + +struct RustPrint; + +impl kernel::Module for RustPrint { + fn init(_name: &'static CStr, _module: &'static ThisModule) -> Result { + pr_info!("Rust printing macros sample (init)\n"); + + pr_emerg!("Emergency message (level 0) without args\n"); + pr_alert!("Alert message (level 1) without args\n"); + pr_crit!("Critical message (level 2) without args\n"); + pr_err!("Error message (level 3) without args\n"); + pr_warn!("Warning message (level 4) without args\n"); + pr_notice!("Notice message (level 5) without args\n"); + pr_info!("Info message (level 6) without args\n"); + + pr_info!("A line that"); + pr_cont!(" is continued"); + pr_cont!(" without args\n"); + + pr_emerg!("{} message (level {}) with args\n", "Emergency", 0); + pr_alert!("{} message (level {}) with args\n", "Alert", 1); + pr_crit!("{} message (level {}) with args\n", "Critical", 2); + pr_err!("{} message (level {}) with args\n", "Error", 3); + pr_warn!("{} message (level {}) with args\n", "Warning", 4); + pr_notice!("{} message (level {}) with args\n", "Notice", 5); + pr_info!("{} message (level {}) 
with args\n", "Info", 6); + + pr_info!("A {} that", "line"); + pr_cont!(" is {}", "continued"); + pr_cont!(" with {}\n", "args"); + + Ok(RustPrint) + } +} + +impl Drop for RustPrint { + fn drop(&mut self) { + pr_info!("Rust printing macros sample (exit)\n"); + } +} diff --git a/samples/rust/rust_random.rs b/samples/rust/rust_random.rs new file mode 100644 index 00000000000000..8ec87119aa9bb8 --- /dev/null +++ b/samples/rust/rust_random.rs @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust random device. +//! +//! Adapted from Alex Gaynor's original available at +//! . + +use kernel::{ + file::{self, File}, + io_buffer::{IoBufferReader, IoBufferWriter}, + prelude::*, +}; + +module_misc_device! { + type: RandomFile, + name: b"rust_random", + author: b"Rust for Linux Contributors", + description: b"Just use /dev/urandom: Now with early-boot safety", + license: b"GPL", +} + +struct RandomFile; + +impl file::Operations for RandomFile { + kernel::declare_file_operations!(read, write, read_iter, write_iter); + + fn open(_data: &(), _file: &File) -> Result { + Ok(()) + } + + fn read(_this: (), file: &File, buf: &mut impl IoBufferWriter, _: u64) -> Result { + let total_len = buf.len(); + let mut chunkbuf = [0; 256]; + + while !buf.is_empty() { + let len = chunkbuf.len().min(buf.len()); + let chunk = &mut chunkbuf[0..len]; + + if file.is_blocking() { + kernel::random::getrandom(chunk)?; + } else { + kernel::random::getrandom_nonblock(chunk)?; + } + buf.write_slice(chunk)?; + } + Ok(total_len) + } + + fn write(_this: (), _file: &File, buf: &mut impl IoBufferReader, _: u64) -> Result { + let total_len = buf.len(); + let mut chunkbuf = [0; 256]; + while !buf.is_empty() { + let len = chunkbuf.len().min(buf.len()); + let chunk = &mut chunkbuf[0..len]; + buf.read_slice(chunk)?; + kernel::random::add_randomness(chunk); + } + Ok(total_len) + } +} diff --git a/samples/rust/rust_semaphore.rs b/samples/rust/rust_semaphore.rs new file mode 100644 index 
00000000000000..702ac1fcb48a8c --- /dev/null +++ b/samples/rust/rust_semaphore.rs @@ -0,0 +1,171 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust semaphore sample. +//! +//! A counting semaphore that can be used by userspace. +//! +//! The count is incremented by writes to the device. A write of `n` bytes results in an increment +//! of `n`. It is decremented by reads; each read results in the count being decremented by 1. If +//! the count is already zero, a read will block until another write increments it. +//! +//! This can be used in user space from the shell for example as follows (assuming a node called +//! `semaphore`): `cat semaphore` decrements the count by 1 (waiting for it to become non-zero +//! before decrementing); `echo -n 123 > semaphore` increments the semaphore by 3, potentially +//! unblocking up to 3 blocked readers. + +use core::sync::atomic::{AtomicU64, Ordering}; +use kernel::{ + condvar_init, declare_file_operations, + file::{self, File, IoctlCommand, IoctlHandler}, + io_buffer::{IoBufferReader, IoBufferWriter}, + miscdev::Registration, + mutex_init, + prelude::*, + sync::{CondVar, Mutex, Ref, UniqueRef}, + user_ptr::{UserSlicePtrReader, UserSlicePtrWriter}, +}; + +module! 
{ + type: RustSemaphore, + name: b"rust_semaphore", + author: b"Rust for Linux Contributors", + description: b"Rust semaphore sample", + license: b"GPL", +} + +struct SemaphoreInner { + count: usize, + max_seen: usize, +} + +struct Semaphore { + changed: CondVar, + inner: Mutex, +} + +struct FileState { + read_count: AtomicU64, + shared: Ref, +} + +impl FileState { + fn consume(&self) -> Result { + let mut inner = self.shared.inner.lock(); + while inner.count == 0 { + if self.shared.changed.wait(&mut inner) { + return Err(EINTR); + } + } + inner.count -= 1; + Ok(()) + } +} + +impl file::Operations for FileState { + type Data = Box; + type OpenData = Ref; + + declare_file_operations!(read, write, ioctl); + + fn open(shared: &Ref, _file: &File) -> Result> { + Ok(Box::try_new(Self { + read_count: AtomicU64::new(0), + shared: shared.clone(), + })?) + } + + fn read(this: &Self, _: &File, data: &mut impl IoBufferWriter, offset: u64) -> Result { + if data.is_empty() || offset > 0 { + return Ok(0); + } + this.consume()?; + data.write_slice(&[0u8; 1])?; + this.read_count.fetch_add(1, Ordering::Relaxed); + Ok(1) + } + + fn write(this: &Self, _: &File, data: &mut impl IoBufferReader, _offs: u64) -> Result { + { + let mut inner = this.shared.inner.lock(); + inner.count = inner.count.saturating_add(data.len()); + if inner.count > inner.max_seen { + inner.max_seen = inner.count; + } + } + + this.shared.changed.notify_all(); + Ok(data.len()) + } + + fn ioctl(this: &Self, file: &File, cmd: &mut IoctlCommand) -> Result { + cmd.dispatch::(this, file) + } +} + +struct RustSemaphore { + _dev: Pin>>, +} + +impl kernel::Module for RustSemaphore { + fn init(name: &'static CStr, _module: &'static ThisModule) -> Result { + pr_info!("Rust semaphore sample (init)\n"); + + let mut sema = Pin::from(UniqueRef::try_new(Semaphore { + // SAFETY: `condvar_init!` is called below. + changed: unsafe { CondVar::new() }, + + // SAFETY: `mutex_init!` is called below. 
+ inner: unsafe { + Mutex::new(SemaphoreInner { + count: 0, + max_seen: 0, + }) + }, + })?); + + // SAFETY: `changed` is pinned when `sema` is. + let pinned = unsafe { sema.as_mut().map_unchecked_mut(|s| &mut s.changed) }; + condvar_init!(pinned, "Semaphore::changed"); + + // SAFETY: `inner` is pinned when `sema` is. + let pinned = unsafe { sema.as_mut().map_unchecked_mut(|s| &mut s.inner) }; + mutex_init!(pinned, "Semaphore::inner"); + + Ok(Self { + _dev: Registration::new_pinned(fmt!("{name}"), sema.into())?, + }) + } +} + +impl Drop for RustSemaphore { + fn drop(&mut self) { + pr_info!("Rust semaphore sample (exit)\n"); + } +} + +const IOCTL_GET_READ_COUNT: u32 = 0x80086301; +const IOCTL_SET_READ_COUNT: u32 = 0x40086301; + +impl IoctlHandler for FileState { + type Target<'a> = &'a Self; + + fn read(this: &Self, _: &File, cmd: u32, writer: &mut UserSlicePtrWriter) -> Result { + match cmd { + IOCTL_GET_READ_COUNT => { + writer.write(&this.read_count.load(Ordering::Relaxed))?; + Ok(0) + } + _ => Err(EINVAL), + } + } + + fn write(this: &Self, _: &File, cmd: u32, reader: &mut UserSlicePtrReader) -> Result { + match cmd { + IOCTL_SET_READ_COUNT => { + this.read_count.store(reader.read()?, Ordering::Relaxed); + Ok(0) + } + _ => Err(EINVAL), + } + } +} diff --git a/samples/rust/rust_semaphore_c.c b/samples/rust/rust_semaphore_c.c new file mode 100644 index 00000000000000..7672b0b4c105be --- /dev/null +++ b/samples/rust/rust_semaphore_c.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Rust semaphore sample (in C, for comparison) + * + * This is a C implementation of `rust_semaphore.rs`. Refer to the description + * in that file for details on the device. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include + +#define IOCTL_GET_READ_COUNT _IOR('c', 1, u64) +#define IOCTL_SET_READ_COUNT _IOW('c', 1, u64) + +struct semaphore_state { + struct kref ref; + struct miscdevice miscdev; + wait_queue_head_t changed; + struct mutex mutex; + size_t count; + size_t max_seen; +}; + +struct file_state { + atomic64_t read_count; + struct semaphore_state *shared; +}; + +static int semaphore_consume(struct semaphore_state *state) +{ + DEFINE_WAIT(wait); + + mutex_lock(&state->mutex); + while (state->count == 0) { + prepare_to_wait(&state->changed, &wait, TASK_INTERRUPTIBLE); + mutex_unlock(&state->mutex); + schedule(); + finish_wait(&state->changed, &wait); + if (signal_pending(current)) + return -EINTR; + mutex_lock(&state->mutex); + } + + state->count--; + mutex_unlock(&state->mutex); + + return 0; +} + +static int semaphore_open(struct inode *nodp, struct file *filp) +{ + struct semaphore_state *shared = + container_of(filp->private_data, struct semaphore_state, miscdev); + struct file_state *state; + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + kref_get(&shared->ref); + state->shared = shared; + atomic64_set(&state->read_count, 0); + + filp->private_data = state; + + return 0; +} + +static ssize_t semaphore_write(struct file *filp, const char __user *buffer, size_t count, + loff_t *ppos) +{ + struct file_state *state = filp->private_data; + struct semaphore_state *shared = state->shared; + + mutex_lock(&shared->mutex); + + shared->count += count; + if (shared->count < count) + shared->count = SIZE_MAX; + + if (shared->count > shared->max_seen) + shared->max_seen = shared->count; + + mutex_unlock(&shared->mutex); + + wake_up_all(&shared->changed); + + return count; +} + +static ssize_t semaphore_read(struct file *filp, char __user *buffer, + size_t count, loff_t *ppos) +{ + struct file_state *state = filp->private_data; + char c = 0; 
+ int ret; + + if (count == 0 || *ppos > 0) + return 0; + + ret = semaphore_consume(state->shared); + if (ret) + return ret; + + if (copy_to_user(buffer, &c, sizeof(c))) + return -EFAULT; + + atomic64_add(1, &state->read_count); + *ppos += 1; + return 1; +} + +static long semaphore_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct file_state *state = filp->private_data; + void __user *buffer = (void __user *)arg; + u64 value; + + switch (cmd) { + case IOCTL_GET_READ_COUNT: + value = atomic64_read(&state->read_count); + if (copy_to_user(buffer, &value, sizeof(value))) + return -EFAULT; + return 0; + case IOCTL_SET_READ_COUNT: + if (copy_from_user(&value, buffer, sizeof(value))) + return -EFAULT; + atomic64_set(&state->read_count, value); + return 0; + default: + return -EINVAL; + } +} + +static void semaphore_free(struct kref *kref) +{ + struct semaphore_state *device; + + device = container_of(kref, struct semaphore_state, ref); + kfree(device); +} + +static int semaphore_release(struct inode *nodp, struct file *filp) +{ + struct file_state *state = filp->private_data; + + kref_put(&state->shared->ref, semaphore_free); + kfree(state); + return 0; +} + +static const struct file_operations semaphore_fops = { + .owner = THIS_MODULE, + .open = semaphore_open, + .read = semaphore_read, + .write = semaphore_write, + .compat_ioctl = semaphore_ioctl, + .release = semaphore_release, +}; + +static struct semaphore_state *device; + +static int __init semaphore_init(void) +{ + int ret; + struct semaphore_state *state; + + pr_info("Rust semaphore sample (in C, for comparison) (init)\n"); + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + mutex_init(&state->mutex); + kref_init(&state->ref); + init_waitqueue_head(&state->changed); + + state->miscdev.fops = &semaphore_fops; + state->miscdev.minor = MISC_DYNAMIC_MINOR; + state->miscdev.name = "semaphore"; + + ret = misc_register(&state->miscdev); + if (ret < 0) { + 
kfree(state); + return ret; + } + + device = state; + + return 0; +} + +static void __exit semaphore_exit(void) +{ + pr_info("Rust semaphore sample (in C, for comparison) (exit)\n"); + + misc_deregister(&device->miscdev); + kref_put(&device->ref, semaphore_free); +} + +module_init(semaphore_init); +module_exit(semaphore_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rust for Linux Contributors"); +MODULE_DESCRIPTION("Rust semaphore sample (in C, for comparison)"); diff --git a/samples/rust/rust_stack_probing.rs b/samples/rust/rust_stack_probing.rs new file mode 100644 index 00000000000000..1448fe8e1b5640 --- /dev/null +++ b/samples/rust/rust_stack_probing.rs @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust stack probing sample. + +use kernel::prelude::*; + +module! { + type: RustStackProbing, + name: b"rust_stack_probing", + author: b"Rust for Linux Contributors", + description: b"Rust stack probing sample", + license: b"GPL", +} + +struct RustStackProbing; + +impl kernel::Module for RustStackProbing { + fn init(_name: &'static CStr, _module: &'static ThisModule) -> Result { + pr_info!("Rust stack probing sample (init)\n"); + + // Including this large variable on the stack will trigger + // stack probing on the supported archs. + // This will verify that stack probing does not lead to + // any errors if we need to link `__rust_probestack`. + let x: [u64; 514] = core::hint::black_box([5; 514]); + pr_info!("Large array has length: {}\n", x.len()); + + Ok(RustStackProbing) + } +} + +impl Drop for RustStackProbing { + fn drop(&mut self) { + pr_info!("Rust stack probing sample (exit)\n"); + } +} diff --git a/samples/rust/rust_sync.rs b/samples/rust/rust_sync.rs new file mode 100644 index 00000000000000..46637ace2f7ffc --- /dev/null +++ b/samples/rust/rust_sync.rs @@ -0,0 +1,93 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Rust synchronisation primitives sample. 
+ +use kernel::prelude::*; +use kernel::{ + condvar_init, mutex_init, spinlock_init, + sync::{CondVar, Mutex, SpinLock}, +}; + +module! { + type: RustSync, + name: b"rust_sync", + author: b"Rust for Linux Contributors", + description: b"Rust synchronisation primitives sample", + license: b"GPL", +} + +kernel::init_static_sync! { + static SAMPLE_MUTEX: Mutex = 10; + static SAMPLE_CONDVAR: CondVar; +} + +struct RustSync; + +impl kernel::Module for RustSync { + fn init(_name: &'static CStr, _module: &'static ThisModule) -> Result { + pr_info!("Rust synchronisation primitives sample (init)\n"); + + // Test mutexes. + { + // SAFETY: `init` is called below. + let mut data = Pin::from(Box::try_new(unsafe { Mutex::new(0) })?); + mutex_init!(data.as_mut(), "RustSync::init::data1"); + *data.lock() = 10; + pr_info!("Value: {}\n", *data.lock()); + + // SAFETY: `init` is called below. + let mut cv = Pin::from(Box::try_new(unsafe { CondVar::new() })?); + condvar_init!(cv.as_mut(), "RustSync::init::cv1"); + + { + let mut guard = data.lock(); + while *guard != 10 { + let _ = cv.wait(&mut guard); + } + } + cv.notify_one(); + cv.notify_all(); + cv.free_waiters(); + } + + // Test static mutex + condvar. + *SAMPLE_MUTEX.lock() = 20; + + { + let mut guard = SAMPLE_MUTEX.lock(); + while *guard != 20 { + let _ = SAMPLE_CONDVAR.wait(&mut guard); + } + } + + // Test spinlocks. + { + // SAFETY: `init` is called below. + let mut data = Pin::from(Box::try_new(unsafe { SpinLock::new(0) })?); + spinlock_init!(data.as_mut(), "RustSync::init::data2"); + *data.lock() = 10; + pr_info!("Value: {}\n", *data.lock()); + + // SAFETY: `init` is called below. 
+ let mut cv = Pin::from(Box::try_new(unsafe { CondVar::new() })?); + condvar_init!(cv.as_mut(), "RustSync::init::cv2"); + { + let mut guard = data.lock(); + while *guard != 10 { + let _ = cv.wait(&mut guard); + } + } + cv.notify_one(); + cv.notify_all(); + cv.free_waiters(); + } + + Ok(RustSync) + } +} + +impl Drop for RustSync { + fn drop(&mut self) { + pr_info!("Rust synchronisation primitives sample (exit)\n"); + } +} From 19ae2b0ecba9fc75d0f9016e56f40fac7f81bdb5 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sat, 3 Jul 2021 17:24:21 +0200 Subject: [PATCH 0037/1250] MAINTAINERS: Rust Miguel, Alex and Wedson will be maintaining the Rust support. Reviewed-by: Kees Cook Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Wedson Almeida Filho Signed-off-by: Wedson Almeida Filho Signed-off-by: Miguel Ojeda --- MAINTAINERS | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index f468864fd268cf..b2c41b2ed14e28 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17149,6 +17149,21 @@ L: linux-rdma@vger.kernel.org S: Maintained F: drivers/infiniband/ulp/rtrs/ +RUST +M: Miguel Ojeda +M: Alex Gaynor +M: Wedson Almeida Filho +L: rust-for-linux@vger.kernel.org +S: Supported +W: https://github.com/Rust-for-Linux/linux +B: https://github.com/Rust-for-Linux/linux/issues +T: git https://github.com/Rust-for-Linux/linux.git rust-next +F: Documentation/rust/ +F: rust/ +F: samples/rust/ +F: scripts/*rust* +K: \b(?i:rust)\b + RXRPC SOCKETS (AF_RXRPC) M: David Howells M: Marc Dionne From 9b6df53414e1dbcb95461e5bf33464b178fd84e0 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Tue, 30 Nov 2021 21:03:44 +0100 Subject: [PATCH 0038/1250] [RFC] drivers: gpio: PrimeCell PL061 in Rust A port to Rust of the PrimeCell PL061 GPIO driver. This module is a work in progress and will be sent for review later on, as well as separately from the Rust support. 
However, it is included to show what an actual working module written in Rust may look like. Signed-off-by: Wedson Almeida Filho Signed-off-by: Miguel Ojeda --- drivers/gpio/Kconfig | 8 + drivers/gpio/Makefile | 1 + drivers/gpio/gpio_pl061_rust.rs | 370 ++++++++++++++++++++++++++++++++ 3 files changed, 379 insertions(+) create mode 100644 drivers/gpio/gpio_pl061_rust.rs diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 45764ec3b2ebe4..ad99b96f6d798e 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -481,6 +481,14 @@ config GPIO_PL061 help Say yes here to support the PrimeCell PL061 GPIO device. +config GPIO_PL061_RUST + tristate "PrimeCell PL061 GPIO support written in Rust" + depends on ARM_AMBA && RUST + select IRQ_DOMAIN + select GPIOLIB_IRQCHIP + help + Say yes here to support the PrimeCell PL061 GPIO device. + config GPIO_PMIC_EIC_SPRD tristate "Spreadtrum PMIC EIC support" depends on MFD_SC27XX_PMIC || COMPILE_TEST diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 14352f6dfe8e53..30141fec12be6e 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -118,6 +118,7 @@ obj-$(CONFIG_GPIO_PCIE_IDIO_24) += gpio-pcie-idio-24.o obj-$(CONFIG_GPIO_PCI_IDIO_16) += gpio-pci-idio-16.o obj-$(CONFIG_GPIO_PISOSR) += gpio-pisosr.o obj-$(CONFIG_GPIO_PL061) += gpio-pl061.o +obj-$(CONFIG_GPIO_PL061_RUST) += gpio_pl061_rust.o obj-$(CONFIG_GPIO_PMIC_EIC_SPRD) += gpio-pmic-eic-sprd.o obj-$(CONFIG_GPIO_PXA) += gpio-pxa.o obj-$(CONFIG_GPIO_RASPBERRYPI_EXP) += gpio-raspberrypi-exp.o diff --git a/drivers/gpio/gpio_pl061_rust.rs b/drivers/gpio/gpio_pl061_rust.rs new file mode 100644 index 00000000000000..90891408044283 --- /dev/null +++ b/drivers/gpio/gpio_pl061_rust.rs @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Driver for the ARM PrimeCell(tm) General Purpose Input/Output (PL061). +//! +//! Based on the C driver written by Baruch Siach .
+ +use kernel::{ + amba, bit, bits_iter, define_amba_id_table, device, gpio, + io_mem::IoMem, + irq::{self, ExtraResult, IrqData, LockedIrqData}, + power, + prelude::*, + sync::{RawSpinLock, Ref, RefBorrow}, +}; + +const GPIODIR: usize = 0x400; +const GPIOIS: usize = 0x404; +const GPIOIBE: usize = 0x408; +const GPIOIEV: usize = 0x40C; +const GPIOIE: usize = 0x410; +const GPIOMIS: usize = 0x418; +const GPIOIC: usize = 0x41C; +const GPIO_SIZE: usize = 0x1000; + +const PL061_GPIO_NR: u16 = 8; + +#[derive(Default)] +struct ContextSaveRegs { + gpio_data: u8, + gpio_dir: u8, + gpio_is: u8, + gpio_ibe: u8, + gpio_iev: u8, + gpio_ie: u8, +} + +#[derive(Default)] +struct PL061DataInner { + csave_regs: ContextSaveRegs, +} + +struct PL061Data { + dev: device::Device, + inner: RawSpinLock, +} + +struct PL061Resources { + base: IoMem, + parent_irq: u32, +} + +type PL061Registrations = gpio::RegistrationWithIrqChip; + +type DeviceData = device::Data; + +struct PL061Device; + +impl gpio::Chip for PL061Device { + type Data = Ref; + + kernel::declare_gpio_chip_operations!( + get_direction, + direction_input, + direction_output, + get, + set + ); + + fn get_direction(data: RefBorrow<'_, DeviceData>, offset: u32) -> Result { + let pl061 = data.resources().ok_or(ENXIO)?; + Ok(if pl061.base.readb(GPIODIR) & bit(offset) != 0 { + gpio::LineDirection::Out + } else { + gpio::LineDirection::In + }) + } + + fn direction_input(data: RefBorrow<'_, DeviceData>, offset: u32) -> Result { + let _guard = data.inner.lock_irqdisable(); + let pl061 = data.resources().ok_or(ENXIO)?; + let mut gpiodir = pl061.base.readb(GPIODIR); + gpiodir &= !bit(offset); + pl061.base.writeb(gpiodir, GPIODIR); + Ok(()) + } + + fn direction_output(data: RefBorrow<'_, DeviceData>, offset: u32, value: bool) -> Result { + let woffset = bit(offset + 2).into(); + let _guard = data.inner.lock_irqdisable(); + let pl061 = data.resources().ok_or(ENXIO)?; + pl061.base.try_writeb((value as u8) << offset, woffset)?; + let mut 
gpiodir = pl061.base.readb(GPIODIR); + gpiodir |= bit(offset); + pl061.base.writeb(gpiodir, GPIODIR); + + // gpio value is set again, because pl061 doesn't allow to set value of a gpio pin before + // configuring it in OUT mode. + pl061.base.try_writeb((value as u8) << offset, woffset)?; + Ok(()) + } + + fn get(data: RefBorrow<'_, DeviceData>, offset: u32) -> Result { + let pl061 = data.resources().ok_or(ENXIO)?; + Ok(pl061.base.try_readb(bit(offset + 2).into())? != 0) + } + + fn set(data: RefBorrow<'_, DeviceData>, offset: u32, value: bool) { + if let Some(pl061) = data.resources() { + let woffset = bit(offset + 2).into(); + let _ = pl061.base.try_writeb((value as u8) << offset, woffset); + } + } +} + +impl gpio::ChipWithIrqChip for PL061Device { + fn handle_irq_flow( + data: RefBorrow<'_, DeviceData>, + desc: &irq::Descriptor, + domain: &irq::Domain, + ) { + let chained = desc.enter_chained(); + + if let Some(pl061) = data.resources() { + let pending = pl061.base.readb(GPIOMIS); + for offset in bits_iter(pending) { + domain.generic_handle_chained(offset, &chained); + } + } + } +} + +impl irq::Chip for PL061Device { + type Data = Ref; + + kernel::declare_irq_chip_operations!(set_type, set_wake); + + fn set_type( + data: RefBorrow<'_, DeviceData>, + irq_data: &mut LockedIrqData, + trigger: u32, + ) -> Result { + let offset = irq_data.hwirq(); + let bit = bit(offset); + + if offset >= PL061_GPIO_NR.into() { + return Err(EINVAL); + } + + if trigger & (irq::Type::LEVEL_HIGH | irq::Type::LEVEL_LOW) != 0 + && trigger & (irq::Type::EDGE_RISING | irq::Type::EDGE_FALLING) != 0 + { + dev_err!( + data.dev, + "trying to configure line {} for both level and edge detection, choose one!\n", + offset + ); + return Err(EINVAL); + } + + let _guard = data.inner.lock_irqdisable(); + let pl061 = data.resources().ok_or(ENXIO)?; + + let mut gpioiev = pl061.base.readb(GPIOIEV); + let mut gpiois = pl061.base.readb(GPIOIS); + let mut gpioibe = pl061.base.readb(GPIOIBE); + + if trigger & 
(irq::Type::LEVEL_HIGH | irq::Type::LEVEL_LOW) != 0 { + let polarity = trigger & irq::Type::LEVEL_HIGH != 0; + + // Disable edge detection. + gpioibe &= !bit; + // Enable level detection. + gpiois |= bit; + // Select polarity. + if polarity { + gpioiev |= bit; + } else { + gpioiev &= !bit; + } + irq_data.set_level_handler(); + dev_dbg!( + data.dev, + "line {}: IRQ on {} level\n", + offset, + if polarity { "HIGH" } else { "LOW" } + ); + } else if (trigger & irq::Type::EDGE_BOTH) == irq::Type::EDGE_BOTH { + // Disable level detection. + gpiois &= !bit; + // Select both edges, setting this makes GPIOIEV be ignored. + gpioibe |= bit; + irq_data.set_edge_handler(); + dev_dbg!(data.dev, "line {}: IRQ on both edges\n", offset); + } else if trigger & (irq::Type::EDGE_RISING | irq::Type::EDGE_FALLING) != 0 { + let rising = trigger & irq::Type::EDGE_RISING != 0; + + // Disable level detection. + gpiois &= !bit; + // Clear detection on both edges. + gpioibe &= !bit; + // Select edge. + if rising { + gpioiev |= bit; + } else { + gpioiev &= !bit; + } + irq_data.set_edge_handler(); + dev_dbg!( + data.dev, + "line {}: IRQ on {} edge\n", + offset, + if rising { "RISING" } else { "FALLING" } + ); + } else { + // No trigger: disable everything.
+ gpiois &= !bit; + gpioibe &= !bit; + gpioiev &= !bit; + irq_data.set_bad_handler(); + dev_warn!(data.dev, "no trigger selected for line {}\n", offset); + } + + pl061.base.writeb(gpiois, GPIOIS); + pl061.base.writeb(gpioibe, GPIOIBE); + pl061.base.writeb(gpioiev, GPIOIEV); + + Ok(ExtraResult::None) + } + + fn mask(data: RefBorrow<'_, DeviceData>, irq_data: &IrqData) { + let mask = bit(irq_data.hwirq() % irq::HwNumber::from(PL061_GPIO_NR)); + let _guard = data.inner.lock(); + if let Some(pl061) = data.resources() { + let gpioie = pl061.base.readb(GPIOIE) & !mask; + pl061.base.writeb(gpioie, GPIOIE); + } + } + + fn unmask(data: RefBorrow<'_, DeviceData>, irq_data: &IrqData) { + let mask = bit(irq_data.hwirq() % irq::HwNumber::from(PL061_GPIO_NR)); + let _guard = data.inner.lock(); + if let Some(pl061) = data.resources() { + let gpioie = pl061.base.readb(GPIOIE) | mask; + pl061.base.writeb(gpioie, GPIOIE); + } + } + + // This gets called from the edge IRQ handler to ACK the edge IRQ in the GPIOIC + // (interrupt-clear) register. For level IRQs this is not needed: these go away when the level + // signal goes away. + fn ack(data: RefBorrow<'_, DeviceData>, irq_data: &IrqData) { + let mask = bit(irq_data.hwirq() % irq::HwNumber::from(PL061_GPIO_NR)); + let _guard = data.inner.lock(); + if let Some(pl061) = data.resources() { + pl061.base.writeb(mask.into(), GPIOIC); + } + } + + fn set_wake(data: RefBorrow<'_, DeviceData>, _irq_data: &IrqData, on: bool) -> Result { + let pl061 = data.resources().ok_or(ENXIO)?; + irq::set_wake(pl061.parent_irq, on) + } +} + +impl amba::Driver for PL061Device { + type Data = Ref; + type PowerOps = Self; + + define_amba_id_table! 
{(), [ + ({id: 0x00041061, mask: 0x000fffff}, None), + ]} + + fn probe(dev: &mut amba::Device, _data: Option<&Self::IdInfo>) -> Result> { + let res = dev.take_resource().ok_or(ENXIO)?; + let irq = dev.irq(0).ok_or(ENXIO)?; + + let mut data = kernel::new_device_data!( + gpio::RegistrationWithIrqChip::new(), + PL061Resources { + // SAFETY: This device doesn't support DMA. + base: unsafe { IoMem::try_new(res)? }, + parent_irq: irq, + }, + PL061Data { + dev: device::Device::from_dev(dev), + // SAFETY: We call `rawspinlock_init` below. + inner: unsafe { RawSpinLock::new(PL061DataInner::default()) }, + }, + "PL061::Registrations" + )?; + + // SAFETY: General part of the data is pinned when `data` is. + let gen_inner = unsafe { data.as_mut().map_unchecked_mut(|d| &mut (**d).inner) }; + kernel::rawspinlock_init!(gen_inner, "PL061Data::inner"); + + let data = Ref::::from(data); + + data.resources().ok_or(ENXIO)?.base.writeb(0, GPIOIE); // disable irqs + + data.registrations() + .ok_or(ENXIO)? + .as_pinned_mut() + .register::(PL061_GPIO_NR, None, dev, data.clone(), irq)?; + + dev_info!(data.dev, "PL061 GPIO chip registered\n"); + + Ok(data) + } +} + +impl power::Operations for PL061Device { + type Data = Ref; + + fn suspend(data: RefBorrow<'_, DeviceData>) -> Result { + let mut inner = data.inner.lock(); + let pl061 = data.resources().ok_or(ENXIO)?; + inner.csave_regs.gpio_data = 0; + inner.csave_regs.gpio_dir = pl061.base.readb(GPIODIR); + inner.csave_regs.gpio_is = pl061.base.readb(GPIOIS); + inner.csave_regs.gpio_ibe = pl061.base.readb(GPIOIBE); + inner.csave_regs.gpio_iev = pl061.base.readb(GPIOIEV); + inner.csave_regs.gpio_ie = pl061.base.readb(GPIOIE); + + for offset in 0..PL061_GPIO_NR { + if inner.csave_regs.gpio_dir & bit(offset) != 0 { + if let Ok(v) = ::get(data, offset.into()) { + inner.csave_regs.gpio_data |= (v as u8) << offset; + } + } + } + + Ok(()) + } + + fn resume(data: RefBorrow<'_, DeviceData>) -> Result { + let inner = data.inner.lock(); + let pl061 = 
data.resources().ok_or(ENXIO)?; + + for offset in 0..PL061_GPIO_NR { + if inner.csave_regs.gpio_dir & bit(offset) != 0 { + let value = inner.csave_regs.gpio_data & bit(offset) != 0; + let _ = ::direction_output(data, offset.into(), value); + } else { + let _ = ::direction_input(data, offset.into()); + } + } + + pl061.base.writeb(inner.csave_regs.gpio_is, GPIOIS); + pl061.base.writeb(inner.csave_regs.gpio_ibe, GPIOIBE); + pl061.base.writeb(inner.csave_regs.gpio_iev, GPIOIEV); + pl061.base.writeb(inner.csave_regs.gpio_ie, GPIOIE); + + Ok(()) + } + + fn freeze(data: RefBorrow<'_, DeviceData>) -> Result { + Self::suspend(data) + } + + fn restore(data: RefBorrow<'_, DeviceData>) -> Result { + Self::resume(data) + } +} + +module_amba_driver! { + type: PL061Device, + name: b"pl061_gpio", + author: b"Wedson Almeida Filho", + license: b"GPL", +} From f19584603a37c11d6cf6d5f46747fa63929d6246 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Tue, 30 Nov 2021 21:02:23 +0100 Subject: [PATCH 0039/1250] [RFC] drivers: android: Binder IPC in Rust A port to Rust of the Android Binder IPC mechanism. This module is a work in progress and will be sent for review later on, as well as separately from the Rust support. However, it is included to show how an actual working module written in Rust may look like. 
Co-developed-by: Alex Gaynor Signed-off-by: Alex Gaynor Co-developed-by: Finn Behrens Signed-off-by: Finn Behrens Co-developed-by: Sven Van Asbroeck Signed-off-by: Sven Van Asbroeck Co-developed-by: Gary Guo Signed-off-by: Gary Guo Co-developed-by: Wayne Campbell Signed-off-by: Wayne Campbell Signed-off-by: Wedson Almeida Filho Co-developed-by: Miguel Ojeda Signed-off-by: Miguel Ojeda --- drivers/android/Kconfig | 6 + drivers/android/Makefile | 2 + drivers/android/allocation.rs | 266 ++++++++ drivers/android/context.rs | 80 +++ drivers/android/defs.rs | 99 +++ drivers/android/node.rs | 476 ++++++++++++++ drivers/android/process.rs | 960 ++++++++++++++++++++++++++++ drivers/android/range_alloc.rs | 189 ++++++ drivers/android/rust_binder.rs | 111 ++++ drivers/android/thread.rs | 870 +++++++++++++++++++++++++ drivers/android/transaction.rs | 326 ++++++++++ include/uapi/linux/android/binder.h | 28 +- 12 files changed, 3400 insertions(+), 13 deletions(-) create mode 100644 drivers/android/allocation.rs create mode 100644 drivers/android/context.rs create mode 100644 drivers/android/defs.rs create mode 100644 drivers/android/node.rs create mode 100644 drivers/android/process.rs create mode 100644 drivers/android/range_alloc.rs create mode 100644 drivers/android/rust_binder.rs create mode 100644 drivers/android/thread.rs create mode 100644 drivers/android/transaction.rs diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 53b22e26266c3e..bc10eebd3ad382 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -20,6 +20,12 @@ config ANDROID_BINDER_IPC Android process, using Binder to identify, invoke and pass arguments between said processes. +config ANDROID_BINDER_IPC_RUST + bool "Android Binder IPC Driver in Rust" + depends on MMU && RUST + help + Implementation of the Binder IPC in Rust. 
+ config ANDROID_BINDERFS bool "Android Binderfs filesystem" depends on ANDROID_BINDER_IPC diff --git a/drivers/android/Makefile b/drivers/android/Makefile index c9d3d0c99c2571..c428f2ce2f05ec 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -4,3 +4,5 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o + +obj-$(CONFIG_ANDROID_BINDER_IPC_RUST) += rust_binder.o diff --git a/drivers/android/allocation.rs b/drivers/android/allocation.rs new file mode 100644 index 00000000000000..3ed7b649eeb7e2 --- /dev/null +++ b/drivers/android/allocation.rs @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: GPL-2.0 + +use core::mem::{replace, size_of, MaybeUninit}; +use kernel::{ + bindings, linked_list::List, pages::Pages, prelude::*, sync::Ref, user_ptr::UserSlicePtrReader, +}; + +use crate::{ + defs::*, + node::NodeRef, + process::{AllocationInfo, Process}, + thread::{BinderError, BinderResult}, + transaction::FileInfo, +}; + +pub(crate) struct Allocation<'a> { + pub(crate) offset: usize, + size: usize, + pub(crate) ptr: usize, + pages: Ref<[Pages<0>]>, + pub(crate) process: &'a Process, + allocation_info: Option, + free_on_drop: bool, + file_list: List>, +} + +impl<'a> Allocation<'a> { + pub(crate) fn new( + process: &'a Process, + offset: usize, + size: usize, + ptr: usize, + pages: Ref<[Pages<0>]>, + ) -> Self { + Self { + process, + offset, + size, + ptr, + pages, + allocation_info: None, + free_on_drop: true, + file_list: List::new(), + } + } + + pub(crate) fn take_file_list(&mut self) -> List> { + replace(&mut self.file_list, List::new()) + } + + pub(crate) fn add_file_info(&mut self, file: Box) { + self.file_list.push_back(file); + } + + fn iterate(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result + where + T: FnMut(&Pages<0>, usize, usize) -> Result, + { + // Check that 
the request is within the buffer. + if offset.checked_add(size).ok_or(EINVAL)? > self.size { + return Err(EINVAL); + } + offset += self.offset; + let mut page_index = offset >> bindings::PAGE_SHIFT; + offset &= (1 << bindings::PAGE_SHIFT) - 1; + while size > 0 { + let available = core::cmp::min(size, (1 << bindings::PAGE_SHIFT) as usize - offset); + cb(&self.pages[page_index], offset, available)?; + size -= available; + page_index += 1; + offset = 0; + } + Ok(()) + } + + pub(crate) fn copy_into( + &self, + reader: &mut UserSlicePtrReader, + offset: usize, + size: usize, + ) -> Result { + self.iterate(offset, size, |page, offset, to_copy| { + page.copy_into_page(reader, offset, to_copy) + }) + } + + pub(crate) fn read(&self, offset: usize) -> Result { + let mut out = MaybeUninit::::uninit(); + let mut out_offset = 0; + self.iterate(offset, size_of::(), |page, offset, to_copy| { + // SAFETY: Data buffer is allocated on the stack. + unsafe { + page.read( + (out.as_mut_ptr() as *mut u8).add(out_offset), + offset, + to_copy, + ) + }?; + out_offset += to_copy; + Ok(()) + })?; + // SAFETY: We just initialised the data. + Ok(unsafe { out.assume_init() }) + } + + pub(crate) fn write(&self, offset: usize, obj: &T) -> Result { + let mut obj_offset = 0; + self.iterate(offset, size_of::(), |page, offset, to_copy| { + // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T. + let obj_ptr = unsafe { (obj as *const T as *const u8).add(obj_offset) }; + // SAFETY: We have a reference to the object, so the pointer is valid. 
+ unsafe { page.write(obj_ptr, offset, to_copy) }?; + obj_offset += to_copy; + Ok(()) + }) + } + + pub(crate) fn keep_alive(mut self) { + self.process + .buffer_make_freeable(self.offset, self.allocation_info.take()); + self.free_on_drop = false; + } + + pub(crate) fn set_info(&mut self, info: AllocationInfo) { + self.allocation_info = Some(info); + } +} + +impl Drop for Allocation<'_> { + fn drop(&mut self) { + if !self.free_on_drop { + return; + } + + if let Some(info) = &self.allocation_info { + let offsets = info.offsets.clone(); + let view = AllocationView::new(self, offsets.start); + for i in offsets.step_by(size_of::()) { + if view.cleanup_object(i).is_err() { + pr_warn!("Error cleaning up object at offset {}\n", i) + } + } + } + + self.process.buffer_raw_free(self.ptr); + } +} + +pub(crate) struct AllocationView<'a, 'b> { + pub(crate) alloc: &'a mut Allocation<'b>, + limit: usize, +} + +impl<'a, 'b> AllocationView<'a, 'b> { + pub(crate) fn new(alloc: &'a mut Allocation<'b>, limit: usize) -> Self { + AllocationView { alloc, limit } + } + + pub(crate) fn read(&self, offset: usize) -> Result { + if offset.checked_add(size_of::()).ok_or(EINVAL)? > self.limit { + return Err(EINVAL); + } + self.alloc.read(offset) + } + + pub(crate) fn write(&self, offset: usize, obj: &T) -> Result { + if offset.checked_add(size_of::()).ok_or(EINVAL)? > self.limit { + return Err(EINVAL); + } + self.alloc.write(offset, obj) + } + + pub(crate) fn transfer_binder_object( + &self, + offset: usize, + strong: bool, + get_node: T, + ) -> BinderResult + where + T: FnOnce(&bindings::flat_binder_object) -> BinderResult, + { + // TODO: Do we want this function to take a &mut self? + let obj = self.read::(offset)?; + let node_ref = get_node(&obj)?; + + if core::ptr::eq(&*node_ref.node.owner, self.alloc.process) { + // The receiving process is the owner of the node, so send it a binder object (instead + // of a handle). 
+ let (ptr, cookie) = node_ref.node.get_id(); + let newobj = bindings::flat_binder_object { + hdr: bindings::binder_object_header { + type_: if strong { + BINDER_TYPE_BINDER + } else { + BINDER_TYPE_WEAK_BINDER + }, + }, + flags: obj.flags, + __bindgen_anon_1: bindings::flat_binder_object__bindgen_ty_1 { binder: ptr as _ }, + cookie: cookie as _, + }; + self.write(offset, &newobj)?; + + // Increment the user ref count on the node. It will be decremented as part of the + // destruction of the buffer, when we see a binder or weak-binder object. + node_ref.node.update_refcount(true, strong); + } else { + // The receiving process is different from the owner, so we need to insert a handle to + // the binder object. + let handle = self + .alloc + .process + .insert_or_update_handle(node_ref, false)?; + + let newobj = bindings::flat_binder_object { + hdr: bindings::binder_object_header { + type_: if strong { + BINDER_TYPE_HANDLE + } else { + BINDER_TYPE_WEAK_HANDLE + }, + }, + flags: obj.flags, + // TODO: To avoid padding, we write to `binder` instead of `handle` here. We need a + // better solution though. + __bindgen_anon_1: bindings::flat_binder_object__bindgen_ty_1 { + binder: handle as _, + }, + ..bindings::flat_binder_object::default() + }; + if self.write(offset, &newobj).is_err() { + // Decrement ref count on the handle we just created. + let _ = self.alloc.process.update_ref(handle, false, strong); + return Err(BinderError::new_failed()); + } + } + Ok(()) + } + + fn cleanup_object(&self, index_offset: usize) -> Result { + let offset = self.alloc.read(index_offset)?; + let header = self.read::(offset)?; + // TODO: Handle other types. + match header.type_ { + BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => { + let obj = self.read::(offset)?; + let strong = header.type_ == BINDER_TYPE_BINDER; + // SAFETY: The type is `BINDER_TYPE_{WEAK_}BINDER`, so the `binder` field is + // populated. 
+ let ptr = unsafe { obj.__bindgen_anon_1.binder } as usize; + let cookie = obj.cookie as usize; + self.alloc.process.update_node(ptr, cookie, strong, false); + Ok(()) + } + BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => { + let obj = self.read::(offset)?; + let strong = header.type_ == BINDER_TYPE_HANDLE; + // SAFETY: The type is `BINDER_TYPE_{WEAK_}HANDLE`, so the `handle` field is + // populated. + let handle = unsafe { obj.__bindgen_anon_1.handle } as _; + self.alloc.process.update_ref(handle, false, strong) + } + _ => Ok(()), + } + } +} diff --git a/drivers/android/context.rs b/drivers/android/context.rs new file mode 100644 index 00000000000000..2bb448df664152 --- /dev/null +++ b/drivers/android/context.rs @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 + +use kernel::{ + bindings, + prelude::*, + security, + sync::{Mutex, Ref, UniqueRef}, +}; + +use crate::{ + node::NodeRef, + thread::{BinderError, BinderResult}, +}; + +struct Manager { + node: Option, + uid: Option, +} + +pub(crate) struct Context { + manager: Mutex, +} + +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for Context {} +unsafe impl Sync for Context {} + +impl Context { + pub(crate) fn new() -> Result> { + let mut ctx = Pin::from(UniqueRef::try_new(Self { + // SAFETY: Init is called below. + manager: unsafe { + Mutex::new(Manager { + node: None, + uid: None, + }) + }, + })?); + + // SAFETY: `manager` is also pinned when `ctx` is. + let manager = unsafe { ctx.as_mut().map_unchecked_mut(|c| &mut c.manager) }; + kernel::mutex_init!(manager, "Context::manager"); + + Ok(ctx.into()) + } + + pub(crate) fn set_manager_node(&self, node_ref: NodeRef) -> Result { + let mut manager = self.manager.lock(); + if manager.node.is_some() { + return Err(EBUSY); + } + security::binder_set_context_mgr(&node_ref.node.owner.cred)?; + + // TODO: Get the actual caller id. 
+ let caller_uid = bindings::kuid_t::default(); + if let Some(ref uid) = manager.uid { + if uid.val != caller_uid.val { + return Err(EPERM); + } + } + + manager.node = Some(node_ref); + manager.uid = Some(caller_uid); + Ok(()) + } + + pub(crate) fn unset_manager_node(&self) { + let node_ref = self.manager.lock().node.take(); + drop(node_ref); + } + + pub(crate) fn get_manager_node(&self, strong: bool) -> BinderResult { + self.manager + .lock() + .node + .as_ref() + .ok_or_else(BinderError::new_dead)? + .clone(strong) + } +} diff --git a/drivers/android/defs.rs b/drivers/android/defs.rs new file mode 100644 index 00000000000000..ec2dde9b3dd84c --- /dev/null +++ b/drivers/android/defs.rs @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 + +use core::ops::{Deref, DerefMut}; +use kernel::{ + bindings, + bindings::*, + io_buffer::{ReadableFromBytes, WritableToBytes}, +}; + +macro_rules! pub_no_prefix { + ($prefix:ident, $($newname:ident),+) => { + $(pub(crate) const $newname: u32 = concat_idents!($prefix, $newname);)+ + }; +} + +pub_no_prefix!( + binder_driver_return_protocol_, + BR_OK, + BR_ERROR, + BR_TRANSACTION, + BR_REPLY, + BR_DEAD_REPLY, + BR_TRANSACTION_COMPLETE, + BR_INCREFS, + BR_ACQUIRE, + BR_RELEASE, + BR_DECREFS, + BR_NOOP, + BR_SPAWN_LOOPER, + BR_DEAD_BINDER, + BR_CLEAR_DEATH_NOTIFICATION_DONE, + BR_FAILED_REPLY +); + +pub_no_prefix!( + binder_driver_command_protocol_, + BC_TRANSACTION, + BC_REPLY, + BC_FREE_BUFFER, + BC_INCREFS, + BC_ACQUIRE, + BC_RELEASE, + BC_DECREFS, + BC_INCREFS_DONE, + BC_ACQUIRE_DONE, + BC_REGISTER_LOOPER, + BC_ENTER_LOOPER, + BC_EXIT_LOOPER, + BC_REQUEST_DEATH_NOTIFICATION, + BC_CLEAR_DEATH_NOTIFICATION, + BC_DEAD_BINDER_DONE +); + +pub_no_prefix!(transaction_flags_, TF_ONE_WAY, TF_ACCEPT_FDS); + +pub(crate) use bindings::{ + BINDER_TYPE_BINDER, BINDER_TYPE_FD, BINDER_TYPE_HANDLE, BINDER_TYPE_WEAK_BINDER, + BINDER_TYPE_WEAK_HANDLE, FLAT_BINDER_FLAG_ACCEPTS_FDS, +}; + +macro_rules! 
decl_wrapper { + ($newname:ident, $wrapped:ty) => { + #[derive(Copy, Clone, Default)] + pub(crate) struct $newname($wrapped); + + // TODO: This must be justified by inspecting the type, so should live outside the macro or + // the macro should be somehow marked unsafe. + unsafe impl ReadableFromBytes for $newname {} + unsafe impl WritableToBytes for $newname {} + + impl Deref for $newname { + type Target = $wrapped; + fn deref(&self) -> &Self::Target { + &self.0 + } + } + + impl DerefMut for $newname { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } + } + }; +} + +decl_wrapper!(BinderNodeDebugInfo, bindings::binder_node_debug_info); +decl_wrapper!(BinderNodeInfoForRef, bindings::binder_node_info_for_ref); +decl_wrapper!(FlatBinderObject, bindings::flat_binder_object); +decl_wrapper!(BinderTransactionData, bindings::binder_transaction_data); +decl_wrapper!(BinderWriteRead, bindings::binder_write_read); +decl_wrapper!(BinderVersion, bindings::binder_version); + +impl BinderVersion { + pub(crate) fn current() -> Self { + Self(bindings::binder_version { + protocol_version: bindings::BINDER_CURRENT_PROTOCOL_VERSION as _, + }) + } +} diff --git a/drivers/android/node.rs b/drivers/android/node.rs new file mode 100644 index 00000000000000..1a46de1e736c85 --- /dev/null +++ b/drivers/android/node.rs @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: GPL-2.0 + +use core::sync::atomic::{AtomicU64, Ordering}; +use kernel::{ + io_buffer::IoBufferWriter, + linked_list::{GetLinks, Links, List}, + prelude::*, + sync::{Guard, LockedBy, Mutex, Ref, SpinLock}, + user_ptr::UserSlicePtrWriter, +}; + +use crate::{ + defs::*, + process::{Process, ProcessInner}, + thread::{BinderError, BinderResult, Thread}, + DeliverToRead, +}; + +struct CountState { + count: usize, + has_count: bool, + is_biased: bool, +} + +impl CountState { + fn new() -> Self { + Self { + count: 0, + has_count: false, + is_biased: false, + } + } + + fn add_bias(&mut self) { + self.count += 1; + 
self.is_biased = true; + } +} + +struct NodeInner { + strong: CountState, + weak: CountState, + death_list: List>, +} + +struct NodeDeathInner { + dead: bool, + cleared: bool, + notification_done: bool, + + /// Indicates whether the normal flow was interrupted by removing the handle. In this case, we + /// need to behave as if the death notification didn't exist (i.e., we don't deliver anything to + /// the user). + aborted: bool, +} + +pub(crate) struct NodeDeath { + node: Ref, + process: Ref, + // TODO: Make this private. + pub(crate) cookie: usize, + work_links: Links, + // TODO: At the moment we're using this for two lists, which isn't safe because we want to + // remove from the list without knowing the list it's in. We need to separate this out. + death_links: Links, + inner: SpinLock, +} + +impl NodeDeath { + /// Constructs a new node death notification object. + /// + /// # Safety + /// + /// The caller must call `NodeDeath::init` before using the notification object. + pub(crate) unsafe fn new(node: Ref, process: Ref, cookie: usize) -> Self { + Self { + node, + process, + cookie, + work_links: Links::new(), + death_links: Links::new(), + inner: unsafe { + SpinLock::new(NodeDeathInner { + dead: false, + cleared: false, + notification_done: false, + aborted: false, + }) + }, + } + } + + pub(crate) fn init(self: Pin<&mut Self>) { + // SAFETY: `inner` is pinned when `self` is. + let inner = unsafe { self.map_unchecked_mut(|n| &mut n.inner) }; + kernel::spinlock_init!(inner, "NodeDeath::inner"); + } + + /// Sets the cleared flag to `true`. + /// + /// It removes `self` from the node's death notification list if needed. It must only be called + /// once. + /// + /// Returns whether it needs to be queued. + pub(crate) fn set_cleared(self: &Ref, abort: bool) -> bool { + let (needs_removal, needs_queueing) = { + // Update state and determine if we need to queue a work item.
We only need to do it + // when the node is not dead or if the user already completed the death notification. + let mut inner = self.inner.lock(); + inner.cleared = true; + if abort { + inner.aborted = true; + } + (!inner.dead, !inner.dead || inner.notification_done) + }; + + // Remove death notification from node. + if needs_removal { + let mut owner_inner = self.node.owner.inner.lock(); + let node_inner = self.node.inner.access_mut(&mut owner_inner); + unsafe { node_inner.death_list.remove(self) }; + } + + needs_queueing + } + + /// Sets the 'notification done' flag to `true`. + /// + /// Pushes `self` to `thread` with `push_work_if_looper` if it was already cleared. + pub(crate) fn set_notification_done(self: Ref, thread: &Thread) { + let needs_queueing = { + let mut inner = self.inner.lock(); + inner.notification_done = true; + inner.cleared + }; + + if needs_queueing { + let _ = thread.push_work_if_looper(self); + } + } + + /// Sets the 'dead' flag to `true` and queues a work item if needed. + pub(crate) fn set_dead(self: Ref) { + let needs_queueing = { + let mut inner = self.inner.lock(); + if inner.cleared { + false + } else { + inner.dead = true; + true + } + }; + + if needs_queueing { + // Push the death notification to the target process. There is nothing else to do if + // it's already dead.
+ let process = self.process.clone(); + let _ = process.push_work(self); + } + } +} + +impl GetLinks for NodeDeath { + type EntryType = NodeDeath; + fn get_links(data: &NodeDeath) -> &Links { + &data.death_links + } +} + +impl DeliverToRead for NodeDeath { + fn do_work(self: Ref, _thread: &Thread, writer: &mut UserSlicePtrWriter) -> Result { + let done = { + let inner = self.inner.lock(); + if inner.aborted { + return Ok(true); + } + inner.cleared && (!inner.dead || inner.notification_done) + }; + + let cookie = self.cookie; + let cmd = if done { + BR_CLEAR_DEATH_NOTIFICATION_DONE + } else { + let process = self.process.clone(); + let mut process_inner = process.inner.lock(); + let inner = self.inner.lock(); + if inner.aborted { + return Ok(true); + } + // We're still holding the inner lock, so it cannot be aborted while we insert it into + // the delivered list. + process_inner.death_delivered(self.clone()); + BR_DEAD_BINDER + }; + + writer.write(&cmd)?; + writer.write(&cookie)?; + + // Mimic the original code: we stop processing work items when we get to a death + // notification. 
+ Ok(cmd != BR_DEAD_BINDER) + } + + fn get_links(&self) -> &Links { + &self.work_links + } +} + +pub(crate) struct Node { + pub(crate) global_id: u64, + ptr: usize, + cookie: usize, + pub(crate) flags: u32, + pub(crate) owner: Ref, + inner: LockedBy>, + links: Links, +} + +impl Node { + pub(crate) fn new(ptr: usize, cookie: usize, flags: u32, owner: Ref) -> Self { + static NEXT_ID: AtomicU64 = AtomicU64::new(1); + let inner = LockedBy::new( + &owner.inner, + NodeInner { + strong: CountState::new(), + weak: CountState::new(), + death_list: List::new(), + }, + ); + Self { + global_id: NEXT_ID.fetch_add(1, Ordering::Relaxed), + ptr, + cookie, + flags, + owner, + inner, + links: Links::new(), + } + } + + pub(crate) fn get_id(&self) -> (usize, usize) { + (self.ptr, self.cookie) + } + + pub(crate) fn next_death( + &self, + guard: &mut Guard<'_, Mutex>, + ) -> Option> { + self.inner.access_mut(guard).death_list.pop_front() + } + + pub(crate) fn add_death( + &self, + death: Ref, + guard: &mut Guard<'_, Mutex>, + ) { + self.inner.access_mut(guard).death_list.push_back(death); + } + + pub(crate) fn update_refcount_locked( + &self, + inc: bool, + strong: bool, + biased: bool, + owner_inner: &mut ProcessInner, + ) -> bool { + let inner = self.inner.access_from_mut(owner_inner); + + // Get a reference to the state we'll update. + let state = if strong { + &mut inner.strong + } else { + &mut inner.weak + }; + + // Update biased state: if the count is not biased, there is nothing to do; otherwise, + // we're removing the bias, so mark the state as such. + if biased { + if !state.is_biased { + return false; + } + + state.is_biased = false; + } + + // Update the count and determine whether we need to push work. + // TODO: Here we may want to check the weak count being zero but the strong count being 1, + // because in such cases, we won't deliver anything to userspace, so we shouldn't queue + // either. 
+ if inc { + state.count += 1; + !state.has_count + } else { + state.count -= 1; + state.count == 0 && state.has_count + } + } + + pub(crate) fn update_refcount(self: &Ref, inc: bool, strong: bool) { + self.owner + .inner + .lock() + .update_node_refcount(self, inc, strong, false, None); + } + + pub(crate) fn populate_counts( + &self, + out: &mut BinderNodeInfoForRef, + guard: &Guard<'_, Mutex>, + ) { + let inner = self.inner.access(guard); + out.strong_count = inner.strong.count as _; + out.weak_count = inner.weak.count as _; + } + + pub(crate) fn populate_debug_info( + &self, + out: &mut BinderNodeDebugInfo, + guard: &Guard<'_, Mutex>, + ) { + out.ptr = self.ptr as _; + out.cookie = self.cookie as _; + let inner = self.inner.access(guard); + if inner.strong.has_count { + out.has_strong_ref = 1; + } + if inner.weak.has_count { + out.has_weak_ref = 1; + } + } + + pub(crate) fn force_has_count(&self, guard: &mut Guard<'_, Mutex>) { + let inner = self.inner.access_mut(guard); + inner.strong.has_count = true; + inner.weak.has_count = true; + } + + fn write(&self, writer: &mut UserSlicePtrWriter, code: u32) -> Result { + writer.write(&code)?; + writer.write(&self.ptr)?; + writer.write(&self.cookie)?; + Ok(()) + } +} + +impl DeliverToRead for Node { + fn do_work(self: Ref, _thread: &Thread, writer: &mut UserSlicePtrWriter) -> Result { + let mut owner_inner = self.owner.inner.lock(); + let inner = self.inner.access_mut(&mut owner_inner); + let strong = inner.strong.count > 0; + let has_strong = inner.strong.has_count; + let weak = strong || inner.weak.count > 0; + let has_weak = inner.weak.has_count; + inner.weak.has_count = weak; + inner.strong.has_count = strong; + + if !weak { + // Remove the node if there are no references to it. 
+ owner_inner.remove_node(self.ptr); + } else { + if !has_weak { + inner.weak.add_bias(); + } + + if !has_strong && strong { + inner.strong.add_bias(); + } + } + + drop(owner_inner); + + // This could be done more compactly but we write out all the possibilities for + // compatibility with the original implementation wrt the order of events. + if weak && !has_weak { + self.write(writer, BR_INCREFS)?; + } + + if strong && !has_strong { + self.write(writer, BR_ACQUIRE)?; + } + + if !strong && has_strong { + self.write(writer, BR_RELEASE)?; + } + + if !weak && has_weak { + self.write(writer, BR_DECREFS)?; + } + + Ok(true) + } + + fn get_links(&self) -> &Links { + &self.links + } +} + +pub(crate) struct NodeRef { + pub(crate) node: Ref, + strong_count: usize, + weak_count: usize, +} + +impl NodeRef { + pub(crate) fn new(node: Ref, strong_count: usize, weak_count: usize) -> Self { + Self { + node, + strong_count, + weak_count, + } + } + + pub(crate) fn absorb(&mut self, mut other: Self) { + self.strong_count += other.strong_count; + self.weak_count += other.weak_count; + other.strong_count = 0; + other.weak_count = 0; + } + + pub(crate) fn clone(&self, strong: bool) -> BinderResult { + if strong && self.strong_count == 0 { + return Err(BinderError::new_failed()); + } + + Ok(self + .node + .owner + .inner + .lock() + .new_node_ref(self.node.clone(), strong, None)) + } + + /// Updates (increments or decrements) the number of references held against the node. If the + /// count being updated transitions from 0 to 1 or from 1 to 0, the node is notified by having + /// its `update_refcount` function called. + /// + /// Returns whether `self` should be removed (when both counts are zero).
+ pub(crate) fn update(&mut self, inc: bool, strong: bool) -> bool { + if strong && self.strong_count == 0 { + return false; + } + + let (count, other_count) = if strong { + (&mut self.strong_count, self.weak_count) + } else { + (&mut self.weak_count, self.strong_count) + }; + + if inc { + if *count == 0 { + self.node.update_refcount(true, strong); + } + *count += 1; + } else { + *count -= 1; + if *count == 0 { + self.node.update_refcount(false, strong); + return other_count == 0; + } + } + + false + } +} + +impl Drop for NodeRef { + fn drop(&mut self) { + if self.strong_count > 0 { + self.node.update_refcount(false, true); + } + + if self.weak_count > 0 { + self.node.update_refcount(false, false); + } + } +} diff --git a/drivers/android/process.rs b/drivers/android/process.rs new file mode 100644 index 00000000000000..e77cd09df50f69 --- /dev/null +++ b/drivers/android/process.rs @@ -0,0 +1,960 @@ +// SPDX-License-Identifier: GPL-2.0 + +use core::{convert::TryFrom, mem::take, ops::Range}; +use kernel::{ + bindings, + cred::Credential, + file::{self, File, IoctlCommand, IoctlHandler, PollTable}, + io_buffer::{IoBufferReader, IoBufferWriter}, + linked_list::List, + mm, + pages::Pages, + prelude::*, + rbtree::RBTree, + sync::{Guard, Mutex, Ref, RefBorrow, UniqueRef}, + task::Task, + user_ptr::{UserSlicePtr, UserSlicePtrReader}, +}; + +use crate::{ + allocation::Allocation, + context::Context, + defs::*, + node::{Node, NodeDeath, NodeRef}, + range_alloc::RangeAllocator, + thread::{BinderError, BinderResult, Thread}, + DeliverToRead, DeliverToReadListAdapter, Either, +}; + +// TODO: Review this: +// Lock order: Process::node_refs -> Process::inner -> Thread::inner + +pub(crate) struct AllocationInfo { + /// Range within the allocation where we can find the offsets to the object descriptors. 
+ pub(crate) offsets: Range, +} + +struct Mapping { + address: usize, + alloc: RangeAllocator, + pages: Ref<[Pages<0>]>, +} + +impl Mapping { + fn new(address: usize, size: usize, pages: Ref<[Pages<0>]>) -> Result { + let alloc = RangeAllocator::new(size)?; + Ok(Self { + address, + alloc, + pages, + }) + } +} + +// TODO: Make this private. +pub(crate) struct ProcessInner { + is_manager: bool, + is_dead: bool, + threads: RBTree>, + ready_threads: List>, + work: List, + mapping: Option, + nodes: RBTree>, + + delivered_deaths: List>, + + /// The number of requested threads that haven't registered yet. + requested_thread_count: u32, + + /// The maximum number of threads used by the process thread pool. + max_threads: u32, + + /// The number of threads that started and registered with the thread pool. + started_thread_count: u32, +} + +impl ProcessInner { + fn new() -> Self { + Self { + is_manager: false, + is_dead: false, + threads: RBTree::new(), + ready_threads: List::new(), + work: List::new(), + mapping: None, + nodes: RBTree::new(), + requested_thread_count: 0, + max_threads: 0, + started_thread_count: 0, + delivered_deaths: List::new(), + } + } + + fn push_work(&mut self, work: Ref) -> BinderResult { + // Try to find a ready thread to which to push the work. + if let Some(thread) = self.ready_threads.pop_front() { + // Push to thread while holding state lock. This prevents the thread from giving up + // (for example, because of a signal) when we're about to deliver work. + thread.push_work(work) + } else if self.is_dead { + Err(BinderError::new_dead()) + } else { + // There are no ready threads. Push work to process queue. + self.work.push_back(work); + + // Wake up polling threads, if any. + for thread in self.threads.values() { + thread.notify_if_poll_ready(); + } + Ok(()) + } + } + + // TODO: Should this be private? + pub(crate) fn remove_node(&mut self, ptr: usize) { + self.nodes.remove(&ptr); + } + + /// Updates the reference count on the given node.
+ // TODO: Decide if this should be private. + pub(crate) fn update_node_refcount( + &mut self, + node: &Ref, + inc: bool, + strong: bool, + biased: bool, + othread: Option<&Thread>, + ) { + let push = node.update_refcount_locked(inc, strong, biased, self); + + // If we decided that we need to push work, push either to the process or to a thread if + // one is specified. + if push { + if let Some(thread) = othread { + thread.push_work_deferred(node.clone()); + } else { + let _ = self.push_work(node.clone()); + // Nothing to do: `push_work` may fail if the process is dead, but that's ok as in + // that case, it doesn't care about the notification. + } + } + } + + // TODO: Make this private. + pub(crate) fn new_node_ref( + &mut self, + node: Ref, + strong: bool, + thread: Option<&Thread>, + ) -> NodeRef { + self.update_node_refcount(&node, true, strong, false, thread); + let strong_count = if strong { 1 } else { 0 }; + NodeRef::new(node, strong_count, 1 - strong_count) + } + + /// Returns an existing node with the given pointer and cookie, if one exists. + /// + /// Returns an error if a node with the given pointer but a different cookie exists. + fn get_existing_node(&self, ptr: usize, cookie: usize) -> Result>> { + match self.nodes.get(&ptr) { + None => Ok(None), + Some(node) => { + let (_, node_cookie) = node.get_id(); + if node_cookie == cookie { + Ok(Some(node.clone())) + } else { + Err(EINVAL) + } + } + } + } + + /// Returns a reference to an existing node with the given pointer and cookie. It requires a + /// mutable reference because it needs to increment the ref count on the node, which may + /// require pushing work to the work queue (to notify userspace of 0 to 1 transitions). + fn get_existing_node_ref( + &mut self, + ptr: usize, + cookie: usize, + strong: bool, + thread: Option<&Thread>, + ) -> Result> { + Ok(self + .get_existing_node(ptr, cookie)? 
+ .map(|node| self.new_node_ref(node, strong, thread))) + } + + fn register_thread(&mut self) -> bool { + if self.requested_thread_count == 0 { + return false; + } + + self.requested_thread_count -= 1; + self.started_thread_count += 1; + true + } + + /// Finds a delivered death notification with the given cookie, removes it from the process's + /// delivered list, and returns it. + fn pull_delivered_death(&mut self, cookie: usize) -> Option> { + let mut cursor = self.delivered_deaths.cursor_front_mut(); + while let Some(death) = cursor.current() { + if death.cookie == cookie { + return cursor.remove_current(); + } + cursor.move_next(); + } + None + } + + pub(crate) fn death_delivered(&mut self, death: Ref) { + self.delivered_deaths.push_back(death); + } +} + +struct NodeRefInfo { + node_ref: NodeRef, + death: Option>, +} + +impl NodeRefInfo { + fn new(node_ref: NodeRef) -> Self { + Self { + node_ref, + death: None, + } + } +} + +struct ProcessNodeRefs { + by_handle: RBTree, + by_global_id: RBTree, +} + +impl ProcessNodeRefs { + fn new() -> Self { + Self { + by_handle: RBTree::new(), + by_global_id: RBTree::new(), + } + } +} + +pub(crate) struct Process { + ctx: Ref, + + // The task leader (process). + pub(crate) task: Task, + + // Credential associated with file when `Process` is created. + pub(crate) cred: ARef, + + // TODO: For now this is a mutex because we have allocations in RangeAllocator while holding the + // lock. We may want to split up the process state at some point to use a spin lock for the + // other fields. + // TODO: Make this private again. + pub(crate) inner: Mutex, + + // References are in a different mutex to avoid recursive acquisition when + // incrementing/decrementing a node in another process.
+ node_refs: Mutex, +} + +#[allow(clippy::non_send_fields_in_send_ty)] +unsafe impl Send for Process {} +unsafe impl Sync for Process {} + +impl Process { + fn new(ctx: Ref, cred: ARef) -> Result> { + let mut process = Pin::from(UniqueRef::try_new(Self { + ctx, + cred, + task: Task::current().group_leader().clone(), + // SAFETY: `inner` is initialised in the call to `mutex_init` below. + inner: unsafe { Mutex::new(ProcessInner::new()) }, + // SAFETY: `node_refs` is initialised in the call to `mutex_init` below. + node_refs: unsafe { Mutex::new(ProcessNodeRefs::new()) }, + })?); + + // SAFETY: `inner` is pinned when `Process` is. + let pinned = unsafe { process.as_mut().map_unchecked_mut(|p| &mut p.inner) }; + kernel::mutex_init!(pinned, "Process::inner"); + + // SAFETY: `node_refs` is pinned when `Process` is. + let pinned = unsafe { process.as_mut().map_unchecked_mut(|p| &mut p.node_refs) }; + kernel::mutex_init!(pinned, "Process::node_refs"); + + Ok(process.into()) + } + + /// Attempts to fetch a work item from the process queue. + pub(crate) fn get_work(&self) -> Option> { + self.inner.lock().work.pop_front() + } + + /// Attempts to fetch a work item from the process queue. If none is available, it registers the + /// given thread as ready to receive work directly. + /// + /// This must only be called when the thread is not participating in a transaction chain; when + /// it is, work will always be delivered directly to the thread (and not through the process + /// queue). + pub(crate) fn get_work_or_register<'a>( + &'a self, + thread: &'a Ref, + ) -> Either, Registration<'a>> { + let mut inner = self.inner.lock(); + + // Try to get work from the process queue. + if let Some(work) = inner.work.pop_front() { + return Either::Left(work); + } + + // Register the thread as ready. + Either::Right(Registration::new(self, thread, &mut inner)) + } + + fn get_thread(self: RefBorrow<'_, Self>, id: i32) -> Result> { + // TODO: Consider using read/write locks here instead. 
+ { + let inner = self.inner.lock(); + if let Some(thread) = inner.threads.get(&id) { + return Ok(thread.clone()); + } + } + + // Allocate a new `Thread` without holding any locks. + let ta = Thread::new(id, self.into())?; + let node = RBTree::try_allocate_node(id, ta.clone())?; + + let mut inner = self.inner.lock(); + + // Recheck. It's possible the thread was created while we were not holding the lock. + if let Some(thread) = inner.threads.get(&id) { + return Ok(thread.clone()); + } + + inner.threads.insert(node); + Ok(ta) + } + + pub(crate) fn push_work(&self, work: Ref) -> BinderResult { + self.inner.lock().push_work(work) + } + + fn set_as_manager( + self: RefBorrow<'_, Self>, + info: Option, + thread: &Thread, + ) -> Result { + let (ptr, cookie, flags) = if let Some(obj) = info { + ( + // SAFETY: The object type for this ioctl is implicitly `BINDER_TYPE_BINDER`, so it + // is safe to access the `binder` field. + unsafe { obj.__bindgen_anon_1.binder }, + obj.cookie, + obj.flags, + ) + } else { + (0, 0, 0) + }; + let node_ref = self.get_node(ptr as _, cookie as _, flags as _, true, Some(thread))?; + let node = node_ref.node.clone(); + self.ctx.set_manager_node(node_ref)?; + self.inner.lock().is_manager = true; + + // Force the state of the node to prevent the delivery of acquire/increfs. + let mut owner_inner = node.owner.inner.lock(); + node.force_has_count(&mut owner_inner); + Ok(()) + } + + pub(crate) fn get_node( + self: RefBorrow<'_, Self>, + ptr: usize, + cookie: usize, + flags: u32, + strong: bool, + thread: Option<&Thread>, + ) -> Result { + // Try to find an existing node. + { + let mut inner = self.inner.lock(); + if let Some(node) = inner.get_existing_node_ref(ptr, cookie, strong, thread)? { + return Ok(node); + } + } + + // Allocate the node before reacquiring the lock.
+ let node = Ref::try_new(Node::new(ptr, cookie, flags, self.into()))?; + let rbnode = RBTree::try_allocate_node(ptr, node.clone())?; + + let mut inner = self.inner.lock(); + if let Some(node) = inner.get_existing_node_ref(ptr, cookie, strong, thread)? { + return Ok(node); + } + + inner.nodes.insert(rbnode); + Ok(inner.new_node_ref(node, strong, thread)) + } + + pub(crate) fn insert_or_update_handle( + &self, + node_ref: NodeRef, + is_mananger: bool, + ) -> Result { + { + let mut refs = self.node_refs.lock(); + + // Do a lookup before inserting. + if let Some(handle_ref) = refs.by_global_id.get(&node_ref.node.global_id) { + let handle = *handle_ref; + let info = refs.by_handle.get_mut(&handle).unwrap(); + info.node_ref.absorb(node_ref); + return Ok(handle); + } + } + + // Reserve memory for tree nodes. + let reserve1 = RBTree::try_reserve_node()?; + let reserve2 = RBTree::try_reserve_node()?; + + let mut refs = self.node_refs.lock(); + + // Do a lookup again as node may have been inserted before the lock was reacquired. + if let Some(handle_ref) = refs.by_global_id.get(&node_ref.node.global_id) { + let handle = *handle_ref; + let info = refs.by_handle.get_mut(&handle).unwrap(); + info.node_ref.absorb(node_ref); + return Ok(handle); + } + + // Find id. + let mut target = if is_mananger { 0 } else { 1 }; + for handle in refs.by_handle.keys() { + if *handle > target { + break; + } + if *handle == target { + target = target.checked_add(1).ok_or(ENOMEM)?; + } + } + + // Ensure the process is still alive while we insert a new reference. + let inner = self.inner.lock(); + if inner.is_dead { + return Err(ESRCH); + } + refs.by_global_id + .insert(reserve1.into_node(node_ref.node.global_id, target)); + refs.by_handle + .insert(reserve2.into_node(target, NodeRefInfo::new(node_ref))); + Ok(target) + } + + pub(crate) fn get_transaction_node(&self, handle: u32) -> BinderResult { + // When handle is zero, try to get the context manager. 
+ if handle == 0 { + self.ctx.get_manager_node(true) + } else { + self.get_node_from_handle(handle, true) + } + } + + pub(crate) fn get_node_from_handle(&self, handle: u32, strong: bool) -> BinderResult { + self.node_refs + .lock() + .by_handle + .get(&handle) + .ok_or(ENOENT)? + .node_ref + .clone(strong) + } + + pub(crate) fn remove_from_delivered_deaths(&self, death: &Ref) { + let mut inner = self.inner.lock(); + let removed = unsafe { inner.delivered_deaths.remove(death) }; + drop(inner); + drop(removed); + } + + pub(crate) fn update_ref(&self, handle: u32, inc: bool, strong: bool) -> Result { + if inc && handle == 0 { + if let Ok(node_ref) = self.ctx.get_manager_node(strong) { + if core::ptr::eq(self, &*node_ref.node.owner) { + return Err(EINVAL); + } + let _ = self.insert_or_update_handle(node_ref, true); + return Ok(()); + } + } + + // To preserve original binder behaviour, we only fail requests where the manager tries to + // increment references on itself. + let mut refs = self.node_refs.lock(); + if let Some(info) = refs.by_handle.get_mut(&handle) { + if info.node_ref.update(inc, strong) { + // Clean up death if there is one attached to this node reference. + if let Some(death) = info.death.take() { + death.set_cleared(true); + self.remove_from_delivered_deaths(&death); + } + + // Remove reference from process tables. + let id = info.node_ref.node.global_id; + refs.by_handle.remove(&handle); + refs.by_global_id.remove(&id); + } + } + Ok(()) + } + + /// Decrements the refcount of the given node, if one exists. 
+ pub(crate) fn update_node(&self, ptr: usize, cookie: usize, strong: bool, biased: bool) { + let mut inner = self.inner.lock(); + if let Ok(Some(node)) = inner.get_existing_node(ptr, cookie) { + inner.update_node_refcount(&node, false, strong, biased, None); + } + } + + pub(crate) fn inc_ref_done(&self, reader: &mut UserSlicePtrReader, strong: bool) -> Result { + let ptr = reader.read::()?; + let cookie = reader.read::()?; + self.update_node(ptr, cookie, strong, true); + Ok(()) + } + + pub(crate) fn buffer_alloc(&self, size: usize) -> BinderResult> { + let mut inner = self.inner.lock(); + let mapping = inner.mapping.as_mut().ok_or_else(BinderError::new_dead)?; + + let offset = mapping.alloc.reserve_new(size)?; + Ok(Allocation::new( + self, + offset, + size, + mapping.address + offset, + mapping.pages.clone(), + )) + } + + // TODO: Review if we want an Option or a Result. + pub(crate) fn buffer_get(&self, ptr: usize) -> Option> { + let mut inner = self.inner.lock(); + let mapping = inner.mapping.as_mut()?; + let offset = ptr.checked_sub(mapping.address)?; + let (size, odata) = mapping.alloc.reserve_existing(offset).ok()?; + let mut alloc = Allocation::new(self, offset, size, ptr, mapping.pages.clone()); + if let Some(data) = odata { + alloc.set_info(data); + } + Some(alloc) + } + + pub(crate) fn buffer_raw_free(&self, ptr: usize) { + let mut inner = self.inner.lock(); + if let Some(ref mut mapping) = &mut inner.mapping { + if ptr < mapping.address + || mapping + .alloc + .reservation_abort(ptr - mapping.address) + .is_err() + { + pr_warn!( + "Pointer {:x} failed to free, base = {:x}\n", + ptr, + mapping.address + ); + } + } + } + + pub(crate) fn buffer_make_freeable(&self, offset: usize, data: Option) { + let mut inner = self.inner.lock(); + if let Some(ref mut mapping) = &mut inner.mapping { + if mapping.alloc.reservation_commit(offset, data).is_err() { + pr_warn!("Offset {} failed to be marked freeable\n", offset); + } + } + } + + fn create_mapping(&self, vma: 
&mut mm::virt::Area) -> Result { + let size = core::cmp::min(vma.end() - vma.start(), bindings::SZ_4M as usize); + let page_count = size / kernel::PAGE_SIZE; + + // Allocate and map all pages. + // + // N.B. If we fail halfway through mapping these pages, the kernel will unmap them. + let mut pages = Vec::new(); + pages.try_reserve_exact(page_count)?; + let mut address = vma.start(); + for _ in 0..page_count { + let page = Pages::<0>::new()?; + vma.insert_page(address, &page)?; + pages.try_push(page)?; + address += kernel::PAGE_SIZE; + } + + let ref_pages = Ref::try_from(pages)?; + + // Save pages for later. + let mut inner = self.inner.lock(); + match &inner.mapping { + None => inner.mapping = Some(Mapping::new(vma.start(), size, ref_pages)?), + Some(_) => return Err(EBUSY), + } + Ok(()) + } + + fn version(&self, data: UserSlicePtr) -> Result { + data.writer().write(&BinderVersion::current()) + } + + pub(crate) fn register_thread(&self) -> bool { + self.inner.lock().register_thread() + } + + fn remove_thread(&self, thread: Ref) { + self.inner.lock().threads.remove(&thread.id); + thread.release(); + } + + fn set_max_threads(&self, max: u32) { + self.inner.lock().max_threads = max; + } + + fn get_node_debug_info(&self, data: UserSlicePtr) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + + // Read the starting point. 
+ let ptr = reader.read::()?.ptr as usize; + let mut out = BinderNodeDebugInfo::default(); + + { + let inner = self.inner.lock(); + for (node_ptr, node) in &inner.nodes { + if *node_ptr > ptr { + node.populate_debug_info(&mut out, &inner); + break; + } + } + } + + writer.write(&out) + } + + fn get_node_info_from_ref(&self, data: UserSlicePtr) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + let mut out = reader.read::()?; + + if out.strong_count != 0 + || out.weak_count != 0 + || out.reserved1 != 0 + || out.reserved2 != 0 + || out.reserved3 != 0 + { + return Err(EINVAL); + } + + // Only the context manager is allowed to use this ioctl. + if !self.inner.lock().is_manager { + return Err(EPERM); + } + + let node_ref = self + .get_node_from_handle(out.handle, true) + .or(Err(EINVAL))?; + + // Get the counts from the node. + { + let owner_inner = node_ref.node.owner.inner.lock(); + node_ref.node.populate_counts(&mut out, &owner_inner); + } + + // Write the result back. + writer.write(&out) + } + + pub(crate) fn needs_thread(&self) -> bool { + let mut inner = self.inner.lock(); + let ret = inner.requested_thread_count == 0 + && inner.ready_threads.is_empty() + && inner.started_thread_count < inner.max_threads; + if ret { + inner.requested_thread_count += 1 + }; + ret + } + + pub(crate) fn request_death( + self: &Ref, + reader: &mut UserSlicePtrReader, + thread: &Thread, + ) -> Result { + let handle: u32 = reader.read()?; + let cookie: usize = reader.read()?; + + // TODO: First two should result in error, but not the others. + + // TODO: Do we care about the context manager dying? + + // Queue BR_ERROR if we can't allocate memory for the death notification. + let death = UniqueRef::try_new_uninit().map_err(|err| { + thread.push_return_work(BR_ERROR); + err + })?; + + let mut refs = self.node_refs.lock(); + let info = refs.by_handle.get_mut(&handle).ok_or(EINVAL)?; + + // Nothing to do if there is already a death notification request for this handle. 
+ if info.death.is_some() { + return Ok(()); + } + + let death = { + let mut pinned = Pin::from(death.write( + // SAFETY: `init` is called below. + unsafe { NodeDeath::new(info.node_ref.node.clone(), self.clone(), cookie) }, + )); + pinned.as_mut().init(); + Ref::::from(pinned) + }; + + info.death = Some(death.clone()); + + // Register the death notification. + { + let mut owner_inner = info.node_ref.node.owner.inner.lock(); + if owner_inner.is_dead { + drop(owner_inner); + let _ = self.push_work(death); + } else { + info.node_ref.node.add_death(death, &mut owner_inner); + } + } + Ok(()) + } + + pub(crate) fn clear_death(&self, reader: &mut UserSlicePtrReader, thread: &Thread) -> Result { + let handle: u32 = reader.read()?; + let cookie: usize = reader.read()?; + + let mut refs = self.node_refs.lock(); + let info = refs.by_handle.get_mut(&handle).ok_or(EINVAL)?; + + let death = info.death.take().ok_or(EINVAL)?; + if death.cookie != cookie { + info.death = Some(death); + return Err(EINVAL); + } + + // Update state and determine if we need to queue a work item. We only need to do it when + // the node is not dead or if the user already completed the death notification. 
+ if death.set_cleared(false) { + let _ = thread.push_work_if_looper(death); + } + + Ok(()) + } + + pub(crate) fn dead_binder_done(&self, cookie: usize, thread: &Thread) { + if let Some(death) = self.inner.lock().pull_delivered_death(cookie) { + death.set_notification_done(thread); + } + } +} + +impl IoctlHandler for Process { + type Target<'a> = RefBorrow<'a, Process>; + + fn write( + this: RefBorrow<'_, Process>, + _file: &File, + cmd: u32, + reader: &mut UserSlicePtrReader, + ) -> Result { + let thread = this.get_thread(Task::current().pid())?; + match cmd { + bindings::BINDER_SET_MAX_THREADS => this.set_max_threads(reader.read()?), + bindings::BINDER_SET_CONTEXT_MGR => this.set_as_manager(None, &thread)?, + bindings::BINDER_THREAD_EXIT => this.remove_thread(thread), + bindings::BINDER_SET_CONTEXT_MGR_EXT => { + this.set_as_manager(Some(reader.read()?), &thread)? + } + _ => return Err(EINVAL), + } + Ok(0) + } + + fn read_write( + this: RefBorrow<'_, Process>, + file: &File, + cmd: u32, + data: UserSlicePtr, + ) -> Result { + let thread = this.get_thread(Task::current().pid())?; + match cmd { + bindings::BINDER_WRITE_READ => thread.write_read(data, file.is_blocking())?, + bindings::BINDER_GET_NODE_DEBUG_INFO => this.get_node_debug_info(data)?, + bindings::BINDER_GET_NODE_INFO_FOR_REF => this.get_node_info_from_ref(data)?, + bindings::BINDER_VERSION => this.version(data)?, + _ => return Err(EINVAL), + } + Ok(0) + } +} + +impl file::Operations for Process { + type Data = Ref; + type OpenData = Ref; + + kernel::declare_file_operations!(ioctl, compat_ioctl, mmap, poll); + + fn open(ctx: &Ref, file: &File) -> Result { + Self::new(ctx.clone(), file.cred().into()) + } + + fn release(obj: Self::Data, _file: &File) { + // Mark this process as dead. We'll do the same for the threads later. + obj.inner.lock().is_dead = true; + + // If this process is the manager, unset it. 
+ if obj.inner.lock().is_manager { + obj.ctx.unset_manager_node(); + } + + // TODO: Do this in a worker? + + // Cancel all pending work items. + while let Some(work) = obj.get_work() { + work.cancel(); + } + + // Free any resources kept alive by allocated buffers. + let omapping = obj.inner.lock().mapping.take(); + if let Some(mut mapping) = omapping { + let address = mapping.address; + let pages = mapping.pages.clone(); + mapping.alloc.for_each(|offset, size, odata| { + let ptr = offset + address; + let mut alloc = Allocation::new(&obj, offset, size, ptr, pages.clone()); + if let Some(data) = odata { + alloc.set_info(data); + } + drop(alloc) + }); + } + + // Drop all references. We do this dance with `take` to avoid destroying the references + // while holding the lock. + let mut refs = obj.node_refs.lock(); + let mut node_refs = take(&mut refs.by_handle); + drop(refs); + + // Remove all death notifications from the nodes (that belong to a different process). + for info in node_refs.values_mut() { + let death = if let Some(existing) = info.death.take() { + existing + } else { + continue; + }; + + death.set_cleared(false); + } + + // Do a similar dance for the state lock. + let mut inner = obj.inner.lock(); + let threads = take(&mut inner.threads); + let nodes = take(&mut inner.nodes); + drop(inner); + + // Release all threads. + for thread in threads.values() { + thread.release(); + } + + // Deliver death notifications.
+ for node in nodes.values() { + loop { + let death = { + let mut inner = obj.inner.lock(); + if let Some(death) = node.next_death(&mut inner) { + death + } else { + break; + } + }; + + death.set_dead(); + } + } + } + + fn ioctl(this: RefBorrow<'_, Process>, file: &File, cmd: &mut IoctlCommand) -> Result { + cmd.dispatch::(this, file) + } + + fn compat_ioctl( + this: RefBorrow<'_, Process>, + file: &File, + cmd: &mut IoctlCommand, + ) -> Result { + cmd.dispatch::(this, file) + } + + fn mmap(this: RefBorrow<'_, Process>, _file: &File, vma: &mut mm::virt::Area) -> Result { + // We don't allow mmap to be used in a different process. + if !Task::current().group_leader().eq(&this.task) { + return Err(EINVAL); + } + + if vma.start() == 0 { + return Err(EINVAL); + } + + let mut flags = vma.flags(); + use mm::virt::flags::*; + if flags & WRITE != 0 { + return Err(EPERM); + } + + flags |= DONTCOPY | MIXEDMAP; + flags &= !MAYWRITE; + vma.set_flags(flags); + + // TODO: Set ops. We need to learn when the user unmaps so that we can stop using it. 
+ this.create_mapping(vma) + } + + fn poll(this: RefBorrow<'_, Process>, file: &File, table: &PollTable) -> Result { + let thread = this.get_thread(Task::current().pid())?; + let (from_proc, mut mask) = thread.poll(file, table); + if mask == 0 && from_proc && !this.inner.lock().work.is_empty() { + mask |= bindings::POLLIN; + } + Ok(mask) + } +} + +pub(crate) struct Registration<'a> { + process: &'a Process, + thread: &'a Ref, +} + +impl<'a> Registration<'a> { + fn new( + process: &'a Process, + thread: &'a Ref, + guard: &mut Guard<'_, Mutex>, + ) -> Self { + guard.ready_threads.push_back(thread.clone()); + Self { process, thread } + } +} + +impl Drop for Registration<'_> { + fn drop(&mut self) { + let mut inner = self.process.inner.lock(); + unsafe { inner.ready_threads.remove(self.thread) }; + } +} diff --git a/drivers/android/range_alloc.rs b/drivers/android/range_alloc.rs new file mode 100644 index 00000000000000..7b149048879b74 --- /dev/null +++ b/drivers/android/range_alloc.rs @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0 + +use core::ptr::NonNull; +use kernel::{ + linked_list::{CursorMut, GetLinks, Links, List}, + prelude::*, +}; + +pub(crate) struct RangeAllocator { + list: List>>, +} + +#[derive(Debug, PartialEq, Eq)] +enum DescriptorState { + Free, + Reserved, + Allocated, +} + +impl RangeAllocator { + pub(crate) fn new(size: usize) -> Result { + let desc = Box::try_new(Descriptor::new(0, size))?; + let mut list = List::new(); + list.push_back(desc); + Ok(Self { list }) + } + + fn find_best_match(&self, size: usize) -> Option>> { + // TODO: Use a binary tree instead of list for this lookup. 
+ let mut best = None; + let mut best_size = usize::MAX; + let mut cursor = self.list.cursor_front(); + while let Some(desc) = cursor.current() { + if desc.state == DescriptorState::Free { + if size == desc.size { + return Some(NonNull::from(desc)); + } + + if size < desc.size && desc.size < best_size { + best = Some(NonNull::from(desc)); + best_size = desc.size; + } + } + + cursor.move_next(); + } + best + } + + pub(crate) fn reserve_new(&mut self, size: usize) -> Result { + let desc_ptr = match self.find_best_match(size) { + None => return Err(ENOMEM), + Some(found) => found, + }; + + // SAFETY: We hold the only mutable reference to list, so it cannot have changed. + let desc = unsafe { &mut *desc_ptr.as_ptr() }; + if desc.size == size { + desc.state = DescriptorState::Reserved; + return Ok(desc.offset); + } + + // We need to break up the descriptor. + let new = Box::try_new(Descriptor::new(desc.offset + size, desc.size - size))?; + unsafe { self.list.insert_after(desc_ptr, new) }; + desc.state = DescriptorState::Reserved; + desc.size = size; + Ok(desc.offset) + } + + fn free_with_cursor(cursor: &mut CursorMut<'_, Box>>) -> Result { + let mut size = match cursor.current() { + None => return Err(EINVAL), + Some(ref mut entry) => { + match entry.state { + DescriptorState::Free => return Err(EINVAL), + DescriptorState::Allocated => return Err(EPERM), + DescriptorState::Reserved => {} + } + entry.state = DescriptorState::Free; + entry.size + } + }; + + // Try to merge with the next entry. + if let Some(next) = cursor.peek_next() { + if next.state == DescriptorState::Free { + next.offset -= size; + next.size += size; + size = next.size; + cursor.remove_current(); + } + } + + // Try to merge with the previous entry. 
+ if let Some(prev) = cursor.peek_prev() { + if prev.state == DescriptorState::Free { + prev.size += size; + cursor.remove_current(); + } + } + + Ok(()) + } + + fn find_at_offset(&mut self, offset: usize) -> Option>>> { + let mut cursor = self.list.cursor_front_mut(); + while let Some(desc) = cursor.current() { + if desc.offset == offset { + return Some(cursor); + } + + if desc.offset > offset { + return None; + } + + cursor.move_next(); + } + None + } + + pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result { + // TODO: The force case is currently O(n), but could be made O(1) with unsafe. + let mut cursor = self.find_at_offset(offset).ok_or(EINVAL)?; + Self::free_with_cursor(&mut cursor) + } + + pub(crate) fn reservation_commit(&mut self, offset: usize, data: Option) -> Result { + // TODO: This is currently O(n), make it O(1). + let mut cursor = self.find_at_offset(offset).ok_or(ENOENT)?; + let desc = cursor.current().unwrap(); + desc.state = DescriptorState::Allocated; + desc.data = data; + Ok(()) + } + + /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to + /// [`DescriptorState::Reserved`]. + /// + /// Returns the size of the existing entry and the data associated with it. + pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, Option)> { + // TODO: This is currently O(n), make it O(log n). 
+ let mut cursor = self.find_at_offset(offset).ok_or(ENOENT)?; + let desc = cursor.current().unwrap(); + if desc.state != DescriptorState::Allocated { + return Err(ENOENT); + } + desc.state = DescriptorState::Reserved; + Ok((desc.size, desc.data.take())) + } + + pub(crate) fn for_each)>(&mut self, callback: F) { + let mut cursor = self.list.cursor_front_mut(); + while let Some(desc) = cursor.current() { + if desc.state == DescriptorState::Allocated { + callback(desc.offset, desc.size, desc.data.take()); + } + + cursor.move_next(); + } + } +} + +struct Descriptor { + state: DescriptorState, + size: usize, + offset: usize, + links: Links>, + data: Option, +} + +impl Descriptor { + fn new(offset: usize, size: usize) -> Self { + Self { + size, + offset, + state: DescriptorState::Free, + links: Links::new(), + data: None, + } + } +} + +impl GetLinks for Descriptor { + type EntryType = Self; + fn get_links(desc: &Self) -> &Links { + &desc.links + } +} diff --git a/drivers/android/rust_binder.rs b/drivers/android/rust_binder.rs new file mode 100644 index 00000000000000..d059077238ce54 --- /dev/null +++ b/drivers/android/rust_binder.rs @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 + +//! Binder -- the Android IPC mechanism. +//! +//! TODO: This module is a work in progress. + +use kernel::{ + io_buffer::IoBufferWriter, + linked_list::{GetLinks, GetLinksWrapped, Links}, + miscdev::Registration, + prelude::*, + str::CStr, + sync::Ref, + user_ptr::UserSlicePtrWriter, +}; + +mod allocation; +mod context; +mod defs; +mod node; +mod process; +mod range_alloc; +mod thread; +mod transaction; + +use {context::Context, thread::Thread}; + +module! { + type: BinderModule, + name: b"rust_binder", + author: b"Wedson Almeida Filho", + description: b"Android Binder", + license: b"GPL", +} + +enum Either { + Left(L), + Right(R), +} + +trait DeliverToRead { + /// Performs work. 
Returns true if remaining work items in the queue should be processed + /// immediately, or false if it should return to caller before processing additional work + /// items. + fn do_work(self: Ref, thread: &Thread, writer: &mut UserSlicePtrWriter) -> Result; + + /// Cancels the given work item. This is called instead of [`DeliverToRead::do_work`] when work + /// won't be delivered. + fn cancel(self: Ref) {} + + /// Returns the linked list links for the work item. + fn get_links(&self) -> &Links; +} + +struct DeliverToReadListAdapter {} + +impl GetLinks for DeliverToReadListAdapter { + type EntryType = dyn DeliverToRead; + + fn get_links(data: &Self::EntryType) -> &Links { + data.get_links() + } +} + +impl GetLinksWrapped for DeliverToReadListAdapter { + type Wrapped = Ref; +} + +struct DeliverCode { + code: u32, + links: Links, +} + +impl DeliverCode { + fn new(code: u32) -> Self { + Self { + code, + links: Links::new(), + } + } +} + +impl DeliverToRead for DeliverCode { + fn do_work(self: Ref, _thread: &Thread, writer: &mut UserSlicePtrWriter) -> Result { + writer.write(&self.code)?; + Ok(true) + } + + fn get_links(&self) -> &Links { + &self.links + } +} + +const fn ptr_align(value: usize) -> usize { + let size = core::mem::size_of::() - 1; + (value + size) & !size +} + +unsafe impl Sync for BinderModule {} + +struct BinderModule { + _reg: Pin>>, +} + +impl kernel::Module for BinderModule { + fn init(name: &'static CStr, _module: &'static kernel::ThisModule) -> Result { + let ctx = Context::new()?; + let reg = Registration::new_pinned(fmt!("{name}"), ctx)?; + Ok(Self { _reg: reg }) + } +} diff --git a/drivers/android/thread.rs b/drivers/android/thread.rs new file mode 100644 index 00000000000000..817fca0890a725 --- /dev/null +++ b/drivers/android/thread.rs @@ -0,0 +1,870 @@ +// SPDX-License-Identifier: GPL-2.0 + +use core::{ + alloc::AllocError, + mem::size_of, + sync::atomic::{AtomicU32, Ordering}, +}; +use kernel::{ + bindings, + file::{File, PollTable}, + 
io_buffer::{IoBufferReader, IoBufferWriter}, + linked_list::{GetLinks, Links, List}, + prelude::*, + security, + sync::{CondVar, Ref, SpinLock, UniqueRef}, + user_ptr::{UserSlicePtr, UserSlicePtrWriter}, +}; + +use crate::{ + allocation::{Allocation, AllocationView}, + defs::*, + process::{AllocationInfo, Process}, + ptr_align, + transaction::{FileInfo, Transaction}, + DeliverCode, DeliverToRead, DeliverToReadListAdapter, Either, +}; + +pub(crate) type BinderResult = core::result::Result; + +pub(crate) struct BinderError { + pub(crate) reply: u32, +} + +impl BinderError { + pub(crate) fn new_failed() -> Self { + Self { + reply: BR_FAILED_REPLY, + } + } + + pub(crate) fn new_dead() -> Self { + Self { + reply: BR_DEAD_REPLY, + } + } +} + +impl From for BinderError { + fn from(_: Error) -> Self { + Self::new_failed() + } +} + +impl From for BinderError { + fn from(_: AllocError) -> Self { + Self::new_failed() + } +} + +const LOOPER_REGISTERED: u32 = 0x01; +const LOOPER_ENTERED: u32 = 0x02; +const LOOPER_EXITED: u32 = 0x04; +const LOOPER_INVALID: u32 = 0x08; +const LOOPER_WAITING: u32 = 0x10; +const LOOPER_POLL: u32 = 0x20; + +struct InnerThread { + /// Determines the looper state of the thread. It is a bit-wise combination of the constants + /// prefixed with `LOOPER_`. + looper_flags: u32, + + /// Determines if thread is dead. + is_dead: bool, + + /// Work item used to deliver error codes to the thread that started a transaction. When set to + /// `Some(x)`, it will hold the only reference to the object so that it can update the error + /// code to be delivered before queuing it. + reply_work: Option>, + + /// Work item used to deliver error codes to the current thread. When set to `Some(x)`, it will + /// hold the only reference to the object so that it can update the error code to be delivered + /// before queuing. + return_work: Option>, + + /// Determines whether the work list below should be processed. 
When set to false, `work_list` + /// is treated as if it were empty. + process_work_list: bool, + work_list: List, + current_transaction: Option>, +} + +impl InnerThread { + fn new() -> Self { + Self { + looper_flags: 0, + is_dead: false, + process_work_list: false, + work_list: List::new(), + current_transaction: None, + return_work: None, + reply_work: None, + } + } + + fn set_reply_work(&mut self, reply_work: Ref) { + self.reply_work = Some(reply_work); + } + + fn push_reply_work(&mut self, code: u32) { + let work = self.reply_work.take(); + self.push_existing_work(work, code); + } + + fn set_return_work(&mut self, return_work: Ref) { + self.return_work = Some(return_work); + } + + fn push_return_work(&mut self, code: u32) { + let work = self.return_work.take(); + self.push_existing_work(work, code); + } + + fn push_existing_work(&mut self, owork: Option>, code: u32) { + // TODO: Write some warning when the following fails. It should not happen, and + // if it does, there is likely something wrong. + if let Some(work) = owork { + // `error_code` is written to with relaxed semantics because the queue onto which it is + // being inserted is protected by a lock. The release barrier when the lock is released + // by the caller matches with the acquire barrier of the future reader to guarantee + // that `error_code` is visible. + work.error_code.store(code, Ordering::Relaxed); + self.push_work(work); + } + } + + fn pop_work(&mut self) -> Option> { + if !self.process_work_list { + return None; + } + + let ret = self.work_list.pop_front(); + // Once the queue is drained, we stop processing it until a non-deferred item is pushed + // again onto it. 
+ self.process_work_list = !self.work_list.is_empty(); + ret + } + + fn push_work_deferred(&mut self, work: Ref) { + self.work_list.push_back(work); + } + + fn push_work(&mut self, work: Ref) { + self.push_work_deferred(work); + self.process_work_list = true; + } + + fn has_work(&self) -> bool { + self.process_work_list && !self.work_list.is_empty() + } + + /// Fetches the transaction the thread can reply to. If the thread has a pending transaction + /// (that it could respond to) but it has also issued a transaction, it must first wait for the + /// previously-issued transaction to complete. + fn pop_transaction_to_reply(&mut self, thread: &Thread) -> Result> { + let transaction = self.current_transaction.take().ok_or(EINVAL)?; + + if core::ptr::eq(thread, transaction.from.as_ref()) { + self.current_transaction = Some(transaction); + return Err(EINVAL); + } + + // Find a new current transaction for this thread. + self.current_transaction = transaction.find_from(thread); + Ok(transaction) + } + + fn pop_transaction_replied(&mut self, transaction: &Ref) -> bool { + match self.current_transaction.take() { + None => false, + Some(old) => { + if !Ref::ptr_eq(transaction, &old) { + self.current_transaction = Some(old); + return false; + } + self.current_transaction = old.clone_next(); + true + } + } + } + + fn looper_enter(&mut self) { + self.looper_flags |= LOOPER_ENTERED; + if self.looper_flags & LOOPER_REGISTERED != 0 { + self.looper_flags |= LOOPER_INVALID; + } + } + + fn looper_register(&mut self, valid: bool) { + self.looper_flags |= LOOPER_REGISTERED; + if !valid || self.looper_flags & LOOPER_ENTERED != 0 { + self.looper_flags |= LOOPER_INVALID; + } + } + + fn looper_exit(&mut self) { + self.looper_flags |= LOOPER_EXITED; + } + + /// Determines whether the thread is part of a pool, i.e., if it is a looper. 
+ fn is_looper(&self) -> bool { + self.looper_flags & (LOOPER_ENTERED | LOOPER_REGISTERED) != 0 + } + + /// Determines whether the thread should attempt to fetch work items from the process queue + /// (when its own queue is empty). This is case when the thread is not part of a transaction + /// stack and it is registered as a looper. + fn should_use_process_work_queue(&self) -> bool { + self.current_transaction.is_none() && self.is_looper() + } + + fn poll(&mut self) -> u32 { + self.looper_flags |= LOOPER_POLL; + if self.has_work() { + bindings::POLLIN + } else { + 0 + } + } +} + +pub(crate) struct Thread { + pub(crate) id: i32, + pub(crate) process: Ref, + inner: SpinLock, + work_condvar: CondVar, + links: Links, +} + +impl Thread { + pub(crate) fn new(id: i32, process: Ref) -> Result> { + let return_work = Ref::try_new(ThreadError::new(InnerThread::set_return_work))?; + let reply_work = Ref::try_new(ThreadError::new(InnerThread::set_reply_work))?; + let mut thread = Pin::from(UniqueRef::try_new(Self { + id, + process, + // SAFETY: `inner` is initialised in the call to `spinlock_init` below. + inner: unsafe { SpinLock::new(InnerThread::new()) }, + // SAFETY: `work_condvar` is initialised in the call to `condvar_init` below. + work_condvar: unsafe { CondVar::new() }, + links: Links::new(), + })?); + + // SAFETY: `inner` is pinned when `thread` is. + let inner = unsafe { thread.as_mut().map_unchecked_mut(|t| &mut t.inner) }; + kernel::spinlock_init!(inner, "Thread::inner"); + + // SAFETY: `work_condvar` is pinned when `thread` is. 
+ let condvar = unsafe { thread.as_mut().map_unchecked_mut(|t| &mut t.work_condvar) }; + kernel::condvar_init!(condvar, "Thread::work_condvar"); + + { + let mut inner = thread.inner.lock(); + inner.set_reply_work(reply_work); + inner.set_return_work(return_work); + } + + Ok(thread.into()) + } + + pub(crate) fn set_current_transaction(&self, transaction: Ref) { + self.inner.lock().current_transaction = Some(transaction); + } + + /// Attempts to fetch a work item from the thread-local queue. The behaviour if the queue is + /// empty depends on `wait`: if it is true, the function waits for some work to be queued (or a + /// signal); otherwise it returns indicating that none is available. + fn get_work_local(self: &Ref, wait: bool) -> Result> { + // Try once if the caller does not want to wait. + if !wait { + return self.inner.lock().pop_work().ok_or(EAGAIN); + } + + // Loop waiting only on the local queue (i.e., not registering with the process queue). + let mut inner = self.inner.lock(); + loop { + if let Some(work) = inner.pop_work() { + return Ok(work); + } + + inner.looper_flags |= LOOPER_WAITING; + let signal_pending = self.work_condvar.wait(&mut inner); + inner.looper_flags &= !LOOPER_WAITING; + + if signal_pending { + return Err(ERESTARTSYS); + } + } + } + + /// Attempts to fetch a work item from the thread-local queue, falling back to the process-wide + /// queue if none is available locally. + /// + /// This must only be called when the thread is not participating in a transaction chain. If it + /// is, the local version (`get_work_local`) should be used instead. + fn get_work(self: &Ref, wait: bool) -> Result> { + // Try to get work from the thread's work queue, using only a local lock. + { + let mut inner = self.inner.lock(); + if let Some(work) = inner.pop_work() { + return Ok(work); + } + } + + // If the caller doesn't want to wait, try to grab work from the process queue. 
+ // + // We know nothing will have been queued directly to the thread queue because it is not in + // a transaction and it is not in the process' ready list. + if !wait { + return self.process.get_work().ok_or(EAGAIN); + } + + // Get work from the process queue. If none is available, atomically register as ready. + let reg = match self.process.get_work_or_register(self) { + Either::Left(work) => return Ok(work), + Either::Right(reg) => reg, + }; + + let mut inner = self.inner.lock(); + loop { + if let Some(work) = inner.pop_work() { + return Ok(work); + } + + inner.looper_flags |= LOOPER_WAITING; + let signal_pending = self.work_condvar.wait(&mut inner); + inner.looper_flags &= !LOOPER_WAITING; + + if signal_pending { + // A signal is pending. We need to pull the thread off the list, then check the + // state again after it's off the list to ensure that something was not queued in + // the meantime. If something has been queued, we just return it (instead of the + // error). + drop(inner); + drop(reg); + return self.inner.lock().pop_work().ok_or(ERESTARTSYS); + } + } + } + + pub(crate) fn push_work(&self, work: Ref) -> BinderResult { + { + let mut inner = self.inner.lock(); + if inner.is_dead { + return Err(BinderError::new_dead()); + } + inner.push_work(work); + } + self.work_condvar.notify_one(); + Ok(()) + } + + /// Attempts to push to given work item to the thread if it's a looper thread (i.e., if it's + /// part of a thread pool) and is alive. Otherwise, push the work item to the process instead. 
+ pub(crate) fn push_work_if_looper(&self, work: Ref) -> BinderResult { + let mut inner = self.inner.lock(); + if inner.is_looper() && !inner.is_dead { + inner.push_work(work); + Ok(()) + } else { + drop(inner); + self.process.push_work(work) + } + } + + pub(crate) fn push_work_deferred(&self, work: Ref) { + self.inner.lock().push_work_deferred(work); + } + + fn translate_object( + &self, + index_offset: usize, + view: &mut AllocationView<'_, '_>, + allow_fds: bool, + ) -> BinderResult { + let offset = view.alloc.read(index_offset)?; + let header = view.read::(offset)?; + // TODO: Handle other types. + match header.type_ { + BINDER_TYPE_WEAK_BINDER | BINDER_TYPE_BINDER => { + let strong = header.type_ == BINDER_TYPE_BINDER; + view.transfer_binder_object(offset, strong, |obj| { + // SAFETY: `binder` is a `binder_uintptr_t`; any bit pattern is a valid + // representation. + let ptr = unsafe { obj.__bindgen_anon_1.binder } as _; + let cookie = obj.cookie as _; + let flags = obj.flags as _; + let node = self.process.as_ref_borrow().get_node( + ptr, + cookie, + flags, + strong, + Some(self), + )?; + security::binder_transfer_binder(&self.process.cred, &view.alloc.process.cred)?; + Ok(node) + })?; + } + BINDER_TYPE_WEAK_HANDLE | BINDER_TYPE_HANDLE => { + let strong = header.type_ == BINDER_TYPE_HANDLE; + view.transfer_binder_object(offset, strong, |obj| { + // SAFETY: `handle` is a `u32`; any bit pattern is a valid representation. + let handle = unsafe { obj.__bindgen_anon_1.handle } as _; + let node = self.process.get_node_from_handle(handle, strong)?; + security::binder_transfer_binder(&self.process.cred, &view.alloc.process.cred)?; + Ok(node) + })?; + } + BINDER_TYPE_FD => { + if !allow_fds { + return Err(BinderError::new_failed()); + } + + let obj = view.read::(offset)?; + // SAFETY: `fd` is a `u32`; any bit pattern is a valid representation. 
+ let fd = unsafe { obj.__bindgen_anon_1.fd }; + let file = File::from_fd(fd)?; + security::binder_transfer_file( + &self.process.cred, + &view.alloc.process.cred, + &file, + )?; + let field_offset = + kernel::offset_of!(bindings::binder_fd_object, __bindgen_anon_1.fd) as usize; + let file_info = Box::try_new(FileInfo::new(file, offset + field_offset))?; + view.alloc.add_file_info(file_info); + } + _ => pr_warn!("Unsupported binder object type: {:x}\n", header.type_), + } + Ok(()) + } + + fn translate_objects( + &self, + alloc: &mut Allocation<'_>, + start: usize, + end: usize, + allow_fds: bool, + ) -> BinderResult { + let mut view = AllocationView::new(alloc, start); + for i in (start..end).step_by(size_of::()) { + if let Err(err) = self.translate_object(i, &mut view, allow_fds) { + alloc.set_info(AllocationInfo { offsets: start..i }); + return Err(err); + } + } + alloc.set_info(AllocationInfo { + offsets: start..end, + }); + Ok(()) + } + + pub(crate) fn copy_transaction_data<'a>( + &self, + to_process: &'a Process, + tr: &BinderTransactionData, + allow_fds: bool, + ) -> BinderResult> { + let data_size = tr.data_size as _; + let adata_size = ptr_align(data_size); + let offsets_size = tr.offsets_size as _; + let aoffsets_size = ptr_align(offsets_size); + + // This guarantees that at least `sizeof(usize)` bytes will be allocated. + let len = core::cmp::max( + adata_size.checked_add(aoffsets_size).ok_or(ENOMEM)?, + size_of::(), + ); + let mut alloc = to_process.buffer_alloc(len)?; + + // Copy raw data. + let mut reader = unsafe { UserSlicePtr::new(tr.data.ptr.buffer as _, data_size) }.reader(); + alloc.copy_into(&mut reader, 0, data_size)?; + + // Copy offsets if there are any. + if offsets_size > 0 { + let mut reader = + unsafe { UserSlicePtr::new(tr.data.ptr.offsets as _, offsets_size) }.reader(); + alloc.copy_into(&mut reader, adata_size, offsets_size)?; + + // Traverse the objects specified. 
+ self.translate_objects( + &mut alloc, + adata_size, + adata_size + aoffsets_size, + allow_fds, + )?; + } + + Ok(alloc) + } + + fn unwind_transaction_stack(self: &Ref) { + let mut thread = self.clone(); + while let Ok(transaction) = { + let mut inner = thread.inner.lock(); + inner.pop_transaction_to_reply(thread.as_ref()) + } { + let reply = Either::Right(BR_DEAD_REPLY); + if !transaction.from.deliver_single_reply(reply, &transaction) { + break; + } + + thread = transaction.from.clone(); + } + } + + pub(crate) fn deliver_reply( + &self, + reply: Either, u32>, + transaction: &Ref, + ) { + if self.deliver_single_reply(reply, transaction) { + transaction.from.unwind_transaction_stack(); + } + } + + /// Delivers a reply to the thread that started a transaction. The reply can either be a + /// reply-transaction or an error code to be delivered instead. + /// + /// Returns whether the thread is dead. If it is, the caller is expected to unwind the + /// transaction stack by completing transactions for threads that are dead. + fn deliver_single_reply( + &self, + reply: Either, u32>, + transaction: &Ref, + ) -> bool { + { + let mut inner = self.inner.lock(); + if !inner.pop_transaction_replied(transaction) { + return false; + } + + if inner.is_dead { + return true; + } + + match reply { + Either::Left(work) => inner.push_work(work), + Either::Right(code) => inner.push_reply_work(code), + } + } + + // Notify the thread now that we've released the inner lock. + self.work_condvar.notify_one(); + false + } + + /// Determines if the given transaction is the current transaction for this thread. 
+ fn is_current_transaction(&self, transaction: &Ref) -> bool { + let inner = self.inner.lock(); + match &inner.current_transaction { + None => false, + Some(current) => Ref::ptr_eq(current, transaction), + } + } + + fn transaction(self: &Ref, tr: &BinderTransactionData, inner: T) + where + T: FnOnce(&Ref, &BinderTransactionData) -> BinderResult, + { + if let Err(err) = inner(self, tr) { + self.inner.lock().push_return_work(err.reply); + } + } + + fn reply_inner(self: &Ref, tr: &BinderTransactionData) -> BinderResult { + let orig = self.inner.lock().pop_transaction_to_reply(self)?; + if !orig.from.is_current_transaction(&orig) { + return Err(BinderError::new_failed()); + } + + // We need to complete the transaction even if we cannot complete building the reply. + (|| -> BinderResult<_> { + let completion = Ref::try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?; + let process = orig.from.process.clone(); + let allow_fds = orig.flags & TF_ACCEPT_FDS != 0; + let reply = Transaction::new_reply(self, process, tr, allow_fds)?; + self.inner.lock().push_work(completion); + orig.from.deliver_reply(Either::Left(reply), &orig); + Ok(()) + })() + .map_err(|mut err| { + // At this point we only return `BR_TRANSACTION_COMPLETE` to the caller, and we must let + // the sender know that the transaction has completed (with an error in this case). + let reply = Either::Right(BR_FAILED_REPLY); + orig.from.deliver_reply(reply, &orig); + err.reply = BR_TRANSACTION_COMPLETE; + err + }) + } + + /// Determines the current top of the transaction stack. It fails if the top is in another + /// thread (i.e., this thread belongs to a stack but it has called another thread). The top is + /// [`None`] if the thread is not currently participating in a transaction stack. 
+ fn top_of_transaction_stack(&self) -> Result>> { + let inner = self.inner.lock(); + Ok(if let Some(cur) = &inner.current_transaction { + if core::ptr::eq(self, cur.from.as_ref()) { + return Err(EINVAL); + } + Some(cur.clone()) + } else { + None + }) + } + + fn oneway_transaction_inner(self: &Ref, tr: &BinderTransactionData) -> BinderResult { + let handle = unsafe { tr.target.handle }; + let node_ref = self.process.get_transaction_node(handle)?; + security::binder_transaction(&self.process.cred, &node_ref.node.owner.cred)?; + let completion = Ref::try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?; + let transaction = Transaction::new(node_ref, None, self, tr)?; + self.inner.lock().push_work(completion); + // TODO: Remove the completion on error? + transaction.submit()?; + Ok(()) + } + + fn transaction_inner(self: &Ref, tr: &BinderTransactionData) -> BinderResult { + let handle = unsafe { tr.target.handle }; + let node_ref = self.process.get_transaction_node(handle)?; + security::binder_transaction(&self.process.cred, &node_ref.node.owner.cred)?; + // TODO: We need to ensure that there isn't a pending transaction in the work queue. How + // could this happen? + let top = self.top_of_transaction_stack()?; + let completion = Ref::try_new(DeliverCode::new(BR_TRANSACTION_COMPLETE))?; + let transaction = Transaction::new(node_ref, top, self, tr)?; + + // Check that the transaction stack hasn't changed while the lock was released, then update + // it with the new transaction. + { + let mut inner = self.inner.lock(); + if !transaction.is_stacked_on(&inner.current_transaction) { + return Err(BinderError::new_failed()); + } + inner.current_transaction = Some(transaction.clone()); + } + + // We push the completion as a deferred work so that we wait for the reply before returning + // to userland. + self.push_work_deferred(completion); + // TODO: Remove completion if submission fails? 
+ transaction.submit()?; + Ok(()) + } + + fn write(self: &Ref, req: &mut BinderWriteRead) -> Result { + let write_start = req.write_buffer.wrapping_add(req.write_consumed); + let write_len = req.write_size - req.write_consumed; + let mut reader = unsafe { UserSlicePtr::new(write_start as _, write_len as _).reader() }; + + while reader.len() >= size_of::() && self.inner.lock().return_work.is_some() { + let before = reader.len(); + match reader.read::()? { + BC_TRANSACTION => { + let tr = reader.read::()?; + if tr.flags & TF_ONE_WAY != 0 { + self.transaction(&tr, Self::oneway_transaction_inner) + } else { + self.transaction(&tr, Self::transaction_inner) + } + } + BC_REPLY => self.transaction(&reader.read()?, Self::reply_inner), + BC_FREE_BUFFER => drop(self.process.buffer_get(reader.read()?)), + BC_INCREFS => self.process.update_ref(reader.read()?, true, false)?, + BC_ACQUIRE => self.process.update_ref(reader.read()?, true, true)?, + BC_RELEASE => self.process.update_ref(reader.read()?, false, true)?, + BC_DECREFS => self.process.update_ref(reader.read()?, false, false)?, + BC_INCREFS_DONE => self.process.inc_ref_done(&mut reader, false)?, + BC_ACQUIRE_DONE => self.process.inc_ref_done(&mut reader, true)?, + BC_REQUEST_DEATH_NOTIFICATION => self.process.request_death(&mut reader, self)?, + BC_CLEAR_DEATH_NOTIFICATION => self.process.clear_death(&mut reader, self)?, + BC_DEAD_BINDER_DONE => self.process.dead_binder_done(reader.read()?, self), + BC_REGISTER_LOOPER => { + let valid = self.process.register_thread(); + self.inner.lock().looper_register(valid); + } + BC_ENTER_LOOPER => self.inner.lock().looper_enter(), + BC_EXIT_LOOPER => self.inner.lock().looper_exit(), + + // TODO: Add support for BC_TRANSACTION_SG and BC_REPLY_SG. + // BC_ATTEMPT_ACQUIRE and BC_ACQUIRE_RESULT are no longer supported. + _ => return Err(EINVAL), + } + + // Update the number of write bytes consumed. 
+ req.write_consumed += (before - reader.len()) as u64; + } + Ok(()) + } + + fn read(self: &Ref, req: &mut BinderWriteRead, wait: bool) -> Result { + let read_start = req.read_buffer.wrapping_add(req.read_consumed); + let read_len = req.read_size - req.read_consumed; + let mut writer = unsafe { UserSlicePtr::new(read_start as _, read_len as _) }.writer(); + let (in_pool, getter) = { + let inner = self.inner.lock(); + ( + inner.is_looper(), + if inner.should_use_process_work_queue() { + Self::get_work + } else { + Self::get_work_local + }, + ) + }; + + // Reserve some room at the beginning of the read buffer so that we can send a + // BR_SPAWN_LOOPER if we need to. + if req.read_consumed == 0 { + writer.write(&BR_NOOP)?; + } + + // Loop doing work while there is room in the buffer. + let initial_len = writer.len(); + while writer.len() >= size_of::() { + match getter(self, wait && initial_len == writer.len()) { + Ok(work) => { + if !work.do_work(self, &mut writer)? { + break; + } + } + Err(err) => { + // Propagate the error if we haven't written anything else. + if initial_len == writer.len() { + return Err(err); + } else { + break; + } + } + } + } + + req.read_consumed += read_len - writer.len() as u64; + + // Write BR_SPAWN_LOOPER if the process needs more threads for its pool. + if in_pool && self.process.needs_thread() { + let mut writer = + unsafe { UserSlicePtr::new(req.read_buffer as _, req.read_size as _) }.writer(); + writer.write(&BR_SPAWN_LOOPER)?; + } + + Ok(()) + } + + pub(crate) fn write_read(self: &Ref, data: UserSlicePtr, wait: bool) -> Result { + let (mut reader, mut writer) = data.reader_writer(); + let mut req = reader.read::()?; + + // TODO: `write(&req)` happens in all exit paths from here on. Find a better way to encode + // it. + + // Go through the write buffer. + if req.write_size > 0 { + if let Err(err) = self.write(&mut req) { + req.read_consumed = 0; + writer.write(&req)?; + return Err(err); + } + } + + // Go through the work queue. 
+ let mut ret = Ok(()); + if req.read_size > 0 { + ret = self.read(&mut req, wait); + } + + // Write the request back so that the consumed fields are visible to the caller. + writer.write(&req)?; + ret + } + + pub(crate) fn poll(&self, file: &File, table: &PollTable) -> (bool, u32) { + // SAFETY: `free_waiters` is called on release. + unsafe { table.register_wait(file, &self.work_condvar) }; + let mut inner = self.inner.lock(); + (inner.should_use_process_work_queue(), inner.poll()) + } + + pub(crate) fn notify_if_poll_ready(&self) { + // Determine if we need to notify. This requires the lock. + let inner = self.inner.lock(); + let notify = inner.looper_flags & LOOPER_POLL != 0 + && inner.should_use_process_work_queue() + && !inner.has_work(); + drop(inner); + + // Now that the lock is no longer held, notify the waiters if we have to. + if notify { + self.work_condvar.notify_one(); + } + } + + pub(crate) fn push_return_work(&self, code: u32) { + self.inner.lock().push_return_work(code) + } + + pub(crate) fn release(self: &Ref) { + // Mark the thread as dead. + self.inner.lock().is_dead = true; + + // Cancel all pending work items. + while let Ok(work) = self.get_work_local(false) { + work.cancel(); + } + + // Complete the transaction stack as far as we can. + self.unwind_transaction_stack(); + + // Remove epoll items if polling was ever used on the thread. 
+ let poller = self.inner.lock().looper_flags & LOOPER_POLL != 0; + if poller { + self.work_condvar.free_waiters(); + + unsafe { bindings::synchronize_rcu() }; + } + } +} + +impl GetLinks for Thread { + type EntryType = Thread; + fn get_links(data: &Thread) -> &Links { + &data.links + } +} + +struct ThreadError { + error_code: AtomicU32, + return_fn: fn(&mut InnerThread, Ref), + links: Links, +} + +impl ThreadError { + fn new(return_fn: fn(&mut InnerThread, Ref)) -> Self { + Self { + error_code: AtomicU32::new(BR_OK), + return_fn, + links: Links::new(), + } + } +} + +impl DeliverToRead for ThreadError { + fn do_work(self: Ref, thread: &Thread, writer: &mut UserSlicePtrWriter) -> Result { + // See `ThreadInner::push_existing_work` for the reason why `error_code` is up to date even + // though we use relaxed semantics. + let code = self.error_code.load(Ordering::Relaxed); + + // Return the `ThreadError` to the thread. + (self.return_fn)(&mut *thread.inner.lock(), self); + + // Deliver the error code to userspace. + writer.write(&code)?; + Ok(true) + } + + fn get_links(&self) -> &Links { + &self.links + } +} diff --git a/drivers/android/transaction.rs b/drivers/android/transaction.rs new file mode 100644 index 00000000000000..8ddf6e21c9cd90 --- /dev/null +++ b/drivers/android/transaction.rs @@ -0,0 +1,326 @@ +// SPDX-License-Identifier: GPL-2.0 + +use core::sync::atomic::{AtomicBool, Ordering}; +use kernel::{ + bindings, + file::{File, FileDescriptorReservation}, + io_buffer::IoBufferWriter, + linked_list::List, + linked_list::{GetLinks, Links}, + prelude::*, + sync::{Ref, SpinLock, UniqueRef}, + user_ptr::UserSlicePtrWriter, + ScopeGuard, +}; + +use crate::{ + defs::*, + node::NodeRef, + process::Process, + ptr_align, + thread::{BinderResult, Thread}, + DeliverToRead, Either, +}; + +struct TransactionInner { + file_list: List>, +} + +pub(crate) struct Transaction { + inner: SpinLock, + // TODO: Node should be released when the buffer is released. 
+ node_ref: Option, + stack_next: Option>, + pub(crate) from: Ref, + to: Ref, + free_allocation: AtomicBool, + code: u32, + pub(crate) flags: u32, + data_size: usize, + offsets_size: usize, + data_address: usize, + links: Links, +} + +impl Transaction { + pub(crate) fn new( + node_ref: NodeRef, + stack_next: Option>, + from: &Ref, + tr: &BinderTransactionData, + ) -> BinderResult> { + let allow_fds = node_ref.node.flags & FLAT_BINDER_FLAG_ACCEPTS_FDS != 0; + let to = node_ref.node.owner.clone(); + let mut alloc = from.copy_transaction_data(&to, tr, allow_fds)?; + let data_address = alloc.ptr; + let file_list = alloc.take_file_list(); + alloc.keep_alive(); + let mut tr = Pin::from(UniqueRef::try_new(Self { + // SAFETY: `spinlock_init` is called below. + inner: unsafe { SpinLock::new(TransactionInner { file_list }) }, + node_ref: Some(node_ref), + stack_next, + from: from.clone(), + to, + code: tr.code, + flags: tr.flags, + data_size: tr.data_size as _, + data_address, + offsets_size: tr.offsets_size as _, + links: Links::new(), + free_allocation: AtomicBool::new(true), + })?); + + // SAFETY: `inner` is pinned when `tr` is. + let pinned = unsafe { tr.as_mut().map_unchecked_mut(|t| &mut t.inner) }; + kernel::spinlock_init!(pinned, "Transaction::inner"); + + Ok(tr.into()) + } + + pub(crate) fn new_reply( + from: &Ref, + to: Ref, + tr: &BinderTransactionData, + allow_fds: bool, + ) -> BinderResult> { + let mut alloc = from.copy_transaction_data(&to, tr, allow_fds)?; + let data_address = alloc.ptr; + let file_list = alloc.take_file_list(); + alloc.keep_alive(); + let mut tr = Pin::from(UniqueRef::try_new(Self { + // SAFETY: `spinlock_init` is called below. 
+ inner: unsafe { SpinLock::new(TransactionInner { file_list }) }, + node_ref: None, + stack_next: None, + from: from.clone(), + to, + code: tr.code, + flags: tr.flags, + data_size: tr.data_size as _, + data_address, + offsets_size: tr.offsets_size as _, + links: Links::new(), + free_allocation: AtomicBool::new(true), + })?); + + // SAFETY: `inner` is pinned when `tr` is. + let pinned = unsafe { tr.as_mut().map_unchecked_mut(|t| &mut t.inner) }; + kernel::spinlock_init!(pinned, "Transaction::inner"); + + Ok(tr.into()) + } + + /// Determines if the transaction is stacked on top of the given transaction. + pub(crate) fn is_stacked_on(&self, onext: &Option>) -> bool { + match (&self.stack_next, onext) { + (None, None) => true, + (Some(stack_next), Some(next)) => Ref::ptr_eq(stack_next, next), + _ => false, + } + } + + /// Returns a pointer to the next transaction on the transaction stack, if there is one. + pub(crate) fn clone_next(&self) -> Option> { + let next = self.stack_next.as_ref()?; + Some(next.clone()) + } + + /// Searches in the transaction stack for a thread that belongs to the target process. This is + /// useful when finding a target for a new transaction: if the node belongs to a process that + /// is already part of the transaction stack, we reuse the thread. + fn find_target_thread(&self) -> Option> { + let process = &self.node_ref.as_ref()?.node.owner; + + let mut it = &self.stack_next; + while let Some(transaction) = it { + if Ref::ptr_eq(&transaction.from.process, process) { + return Some(transaction.from.clone()); + } + it = &transaction.stack_next; + } + None + } + + /// Searches in the transaction stack for a transaction originating at the given thread. 
+ pub(crate) fn find_from(&self, thread: &Thread) -> Option> { + let mut it = &self.stack_next; + while let Some(transaction) = it { + if core::ptr::eq(thread, transaction.from.as_ref()) { + return Some(transaction.clone()); + } + + it = &transaction.stack_next; + } + None + } + + /// Submits the transaction to a work queue. Use a thread if there is one in the transaction + /// stack, otherwise use the destination process. + pub(crate) fn submit(self: Ref) -> BinderResult { + if let Some(thread) = self.find_target_thread() { + thread.push_work(self) + } else { + let process = self.to.clone(); + process.push_work(self) + } + } + + /// Prepares the file list for delivery to the caller. + fn prepare_file_list(&self) -> Result>> { + // Get list of files that are being transferred as part of the transaction. + let mut file_list = core::mem::replace(&mut self.inner.lock().file_list, List::new()); + + // If the list is non-empty, prepare the buffer. + if !file_list.is_empty() { + let alloc = self.to.buffer_get(self.data_address).ok_or(ESRCH)?; + let cleanup = ScopeGuard::new(|| { + self.free_allocation.store(false, Ordering::Relaxed); + }); + + let mut it = file_list.cursor_front_mut(); + while let Some(file_info) = it.current() { + let reservation = FileDescriptorReservation::new(bindings::O_CLOEXEC)?; + alloc.write(file_info.buffer_offset, &reservation.reserved_fd())?; + file_info.reservation = Some(reservation); + it.move_next(); + } + + alloc.keep_alive(); + cleanup.dismiss(); + } + + Ok(file_list) + } +} + +impl DeliverToRead for Transaction { + fn do_work(self: Ref, thread: &Thread, writer: &mut UserSlicePtrWriter) -> Result { + /* TODO: Initialise the following fields from tr: + pub sender_pid: pid_t, + pub sender_euid: uid_t, + */ + let send_failed_reply = ScopeGuard::new(|| { + if self.node_ref.is_some() && self.flags & TF_ONE_WAY == 0 { + let reply = Either::Right(BR_FAILED_REPLY); + self.from.deliver_reply(reply, &self); + } + }); + let mut file_list = if let 
Ok(list) = self.prepare_file_list() { + list + } else { + // On failure to process the list, we send a reply back to the sender and ignore the + // transaction on the recipient. + return Ok(true); + }; + + let mut tr = BinderTransactionData::default(); + + if let Some(nref) = &self.node_ref { + let (ptr, cookie) = nref.node.get_id(); + tr.target.ptr = ptr as _; + tr.cookie = cookie as _; + }; + + tr.code = self.code; + tr.flags = self.flags; + tr.data_size = self.data_size as _; + tr.data.ptr.buffer = self.data_address as _; + tr.offsets_size = self.offsets_size as _; + if tr.offsets_size > 0 { + tr.data.ptr.offsets = (self.data_address + ptr_align(self.data_size)) as _; + } + + let code = if self.node_ref.is_none() { + BR_REPLY + } else { + BR_TRANSACTION + }; + + // Write the transaction code and data to the user buffer. + writer.write(&code)?; + writer.write(&tr)?; + + // Dismiss the completion of transaction with a failure. No failure paths are allowed from + // here on out. + send_failed_reply.dismiss(); + + // Commit all files. + { + let mut it = file_list.cursor_front_mut(); + while let Some(file_info) = it.current() { + if let Some(reservation) = file_info.reservation.take() { + if let Some(file) = file_info.file.take() { + reservation.commit(file); + } + } + + it.move_next(); + } + } + + // When `drop` is called, we don't want the allocation to be freed because it is now the + // user's responsibility to free it. + // + // `drop` is guaranteed to see this relaxed store because `Ref` guarantees that everything + // that happens when an object is referenced happens-before the eventual `drop`. + self.free_allocation.store(false, Ordering::Relaxed); + + // When this is not a reply and not an async transaction, update `current_transaction`. If + // it's a reply, `current_transaction` has already been updated appropriately. 
+ if self.node_ref.is_some() && tr.flags & TF_ONE_WAY == 0 { + thread.set_current_transaction(self); + } + + Ok(false) + } + + fn cancel(self: Ref) { + let reply = Either::Right(BR_DEAD_REPLY); + self.from.deliver_reply(reply, &self); + } + + fn get_links(&self) -> &Links { + &self.links + } +} + +impl Drop for Transaction { + fn drop(&mut self) { + if self.free_allocation.load(Ordering::Relaxed) { + self.to.buffer_get(self.data_address); + } + } +} + +pub(crate) struct FileInfo { + links: Links, + + /// The file for which a descriptor will be created in the recipient process. + file: Option>, + + /// The file descriptor reservation on the recipient process. + reservation: Option, + + /// The offset in the buffer where the file descriptor is stored. + buffer_offset: usize, +} + +impl FileInfo { + pub(crate) fn new(file: ARef, buffer_offset: usize) -> Self { + Self { + file: Some(file), + reservation: None, + buffer_offset, + links: Links::new(), + } + } +} + +impl GetLinks for FileInfo { + type EntryType = Self; + + fn get_links(data: &Self::EntryType) -> &Links { + &data.links + } +} diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index 11157fae8a8e76..a982c30dbbf5c8 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -236,19 +236,21 @@ struct binder_frozen_status_info { __u32 async_recv; }; -#define BINDER_WRITE_READ _IOWR('b', 1, struct binder_write_read) -#define BINDER_SET_IDLE_TIMEOUT _IOW('b', 3, __s64) -#define BINDER_SET_MAX_THREADS _IOW('b', 5, __u32) -#define BINDER_SET_IDLE_PRIORITY _IOW('b', 6, __s32) -#define BINDER_SET_CONTEXT_MGR _IOW('b', 7, __s32) -#define BINDER_THREAD_EXIT _IOW('b', 8, __s32) -#define BINDER_VERSION _IOWR('b', 9, struct binder_version) -#define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) -#define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref) -#define BINDER_SET_CONTEXT_MGR_EXT _IOW('b', 13, 
struct flat_binder_object) -#define BINDER_FREEZE _IOW('b', 14, struct binder_freeze_info) -#define BINDER_GET_FROZEN_INFO _IOWR('b', 15, struct binder_frozen_status_info) -#define BINDER_ENABLE_ONEWAY_SPAM_DETECTION _IOW('b', 16, __u32) +enum { + BINDER_WRITE_READ = _IOWR('b', 1, struct binder_write_read), + BINDER_SET_IDLE_TIMEOUT = _IOW('b', 3, __s64), + BINDER_SET_MAX_THREADS = _IOW('b', 5, __u32), + BINDER_SET_IDLE_PRIORITY = _IOW('b', 6, __s32), + BINDER_SET_CONTEXT_MGR = _IOW('b', 7, __s32), + BINDER_THREAD_EXIT = _IOW('b', 8, __s32), + BINDER_VERSION = _IOWR('b', 9, struct binder_version), + BINDER_GET_NODE_DEBUG_INFO = _IOWR('b', 11, struct binder_node_debug_info), + BINDER_GET_NODE_INFO_FOR_REF = _IOWR('b', 12, struct binder_node_info_for_ref), + BINDER_SET_CONTEXT_MGR_EXT = _IOW('b', 13, struct flat_binder_object), + BINDER_FREEZE = _IOW('b', 14, struct binder_freeze_info), + BINDER_GET_FROZEN_INFO = _IOWR('b', 15, struct binder_frozen_status_info), + BINDER_ENABLE_ONEWAY_SPAM_DETECTION = _IOW('b', 16, __u32), +}; /* * NOTE: Two special error codes you should check for when calling From 9a5fe747d99e1d562dde1f39259bbe2d098262ae Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 10 Jan 2022 08:59:52 +0000 Subject: [PATCH 0040/1250] init/Kconfig: Specify the interpreter for rust-is-available.sh Some common tools like 'diff' don't support permissions of the files. 
Due to that, 'rust-is-available.sh' in some trees including '-mm' result in having no execution permission, and therefore build fails like below: $ make O=../linux.out/ olddefconfig make[1]: Entering directory 'linux.out' GEN Makefile sh: 1: linux/scripts/rust-is-available.sh: Permission denied init/Kconfig:71: syntax error init/Kconfig:70: invalid statement linux/scripts/kconfig/Makefile:77: recipe for target 'olddefconfig' failed make[2]: *** [olddefconfig] Error 1 linux/Makefile:666: recipe for target 'olddefconfig' failed make[1]: *** [olddefconfig] Error 2 make[1]: Leaving directory 'linux.out' Makefile:226: recipe for target '__sub-make' failed make: *** [__sub-make] Error 2 It's not a big deal, but not so fun. This commit works around the issue by specifying the interpreter for 'rust-is-available.sh' in the Kconfig file. The ugly work around wouldn't be needed once 'rust-is-available.sh' file is merged in the mainline with the execution permission. Signed-off-by: SeongJae Park Reviewed-by: Wei Liu [Edited for new script] Signed-off-by: Miguel Ojeda --- init/Kconfig | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 3457cf596588f5..70788df0db5ab9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -61,7 +61,13 @@ config LLD_VERSION default 0 config RUST_IS_AVAILABLE - def_bool $(success,$(srctree)/scripts/rust-is-available.sh) + # Because some common tools like 'diff' don't support permissions of + # the files, 'rust-is-available.sh' in some trees that are managed with such + # tools results in having no execution permission. As a temporary work + # around, we specify the interpreter ('/bin/sh'). It will be unneeded + # once 'rust-is-available.sh' is merged in the mainline with its execution + # permission. + def_bool $(success,/bin/sh $(srctree)/scripts/rust-is-available.sh) help This shows whether a suitable Rust toolchain is available (found). 
From b01c83388f54dc9a4946cbbf00ea86664d9d1cf7 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 23 May 2022 08:51:05 +0200 Subject: [PATCH 0041/1250] Revert "HID: Driver for Google Hangouts Meet Speakermic" This reverts commit e9c8c7c43b51b277026f94a1175c605436c7c829. The same functionality can be achieved by the following udev rule: evdev:input:b0003v18D1p8001* KEYBOARD_KEY_b002f=reserved So no need for this driver. Signed-off-by: Jiri Kosina --- drivers/hid/Kconfig | 12 -------- drivers/hid/Makefile | 1 - drivers/hid/hid-google-atrus.c | 55 ---------------------------------- drivers/hid/hid-ids.h | 1 - 4 files changed, 69 deletions(-) delete mode 100644 drivers/hid/hid-google-atrus.c diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index edab0809ced424..96750b3903449d 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -405,18 +405,6 @@ config HOLTEK_FF Say Y here if you have a Holtek On Line Grip based game controller and want to have force feedback support for it. -config HID_GOOGLE_ATRUS - tristate "Google Hangouts Meet Speakermic" - depends on USB_HID - help - This selects a driver for the Google Hangouts Meet Speakermic. - - This driver works around a problem with the HID usage sent by this - device for the mute button. It prevents key events from being generated - for that HID usage since they would be incorrect. - - Say Y here if you have a Google Hangouts Meet Speakermic. 
- config HID_GOOGLE_HAMMER tristate "Google Hammer Keyboard" depends on USB_HID && LEDS_CLASS && CROS_EC diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile index 2ba120c7edba1f..c2a2db16309474 100644 --- a/drivers/hid/Makefile +++ b/drivers/hid/Makefile @@ -50,7 +50,6 @@ obj-$(CONFIG_HID_FT260) += hid-ft260.o obj-$(CONFIG_HID_GEMBIRD) += hid-gembird.o obj-$(CONFIG_HID_GFRM) += hid-gfrm.o obj-$(CONFIG_HID_GLORIOUS) += hid-glorious.o -obj-$(CONFIG_HID_GOOGLE_ATRUS) += hid-google-atrus.o obj-$(CONFIG_HID_GOOGLE_HAMMER) += hid-google-hammer.o obj-$(CONFIG_HID_VIVALDI) += hid-vivaldi.o obj-$(CONFIG_HID_GT683R) += hid-gt683r.o diff --git a/drivers/hid/hid-google-atrus.c b/drivers/hid/hid-google-atrus.c deleted file mode 100644 index e136c70e9425c7..00000000000000 --- a/drivers/hid/hid-google-atrus.c +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * HID driver for Google Hangouts Meet Speakermic - * - * Copyright 2022 Google LLC. - */ - -#include -#include - -#include "hid-ids.h" - -/* - * This driver handles the telephony phone mute HID usage by ignoring it. This - * avoids the default handling by the hid-input driver which is to map this to - * a KEY_MICMUTE event. The issue is that this device implements the phone mute - * HID usage as a toggle switch, where 1 indicates muted, and 0 indicates - * unmuted. However, for an EV_KEY event 1 indicates the key has been pressed - * and 0 indicates it has been released. - */ - -static int atrus_event(struct hid_device *hid, struct hid_field *field, - struct hid_usage *usage, __s32 value) -{ - /* - * Return 1 to indicate no further processing should be done for this - * usage. 
- */ - return 1; -} - -static const struct hid_device_id atrus_devices[] = { - { HID_DEVICE(BUS_USB, HID_GROUP_GENERIC, - USB_VENDOR_ID_GOOGLE, USB_DEVICE_ID_GOOGLE_ATRUS) }, - { } -}; -MODULE_DEVICE_TABLE(hid, atrus_devices); - -static const struct hid_usage_id atrus_usages[] = { - /* Handle only the Telephony Phone Mute usage. */ - { HID_UP_TELEPHONY | 0x2f, EV_KEY, HID_ANY_ID }, - { HID_TERMINATOR, HID_TERMINATOR, HID_TERMINATOR } -}; - -static struct hid_driver atrus_driver = { - .name = "atrus", - .id_table = atrus_devices, - .usage_table = atrus_usages, - .event = atrus_event, -}; -module_hid_driver(atrus_driver); - -MODULE_AUTHOR("Pablo Ceballos "); -MODULE_DESCRIPTION("Google Hangouts Meet Speakermic USB HID Driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 9dfe0232c5ed74..d9eb676abe9600 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -514,7 +514,6 @@ #define USB_DEVICE_ID_GOOGLE_MOONBALL 0x5044 #define USB_DEVICE_ID_GOOGLE_DON 0x5050 #define USB_DEVICE_ID_GOOGLE_EEL 0x5057 -#define USB_DEVICE_ID_GOOGLE_ATRUS 0x8001 #define USB_VENDOR_ID_GOTOP 0x08f2 #define USB_DEVICE_ID_SUPER_Q2 0x007f From d2ca1fd2bc705a91a7da99f1e56da52817566ccd Mon Sep 17 00:00:00 2001 From: Wang Kefeng Date: Tue, 24 May 2022 09:03:46 +0100 Subject: [PATCH 0042/1250] ARM: 9207/1: amba: fix refcount underflow if amba_device_add() fails "ARM: 9192/1: amba: fix memory leak in amba_device_try_add()" leads to a refcount underflow if amba_device_add() fails, which called by of_amba_device_create(), the of_amba_device_create() already exists the error handling, so amba_put_device() only need to be added into amba_deferred_retry(). 
Fixes: 7719a68b2fa4 ("ARM: 9192/1: amba: fix memory leak in amba_device_try_add()") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Kefeng Wang Signed-off-by: Russell King (Oracle) --- drivers/amba/bus.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/amba/bus.c b/drivers/amba/bus.c index 7e775ba6fdd905..082f036e899beb 100644 --- a/drivers/amba/bus.c +++ b/drivers/amba/bus.c @@ -478,13 +478,8 @@ static int amba_device_try_add(struct amba_device *dev, struct resource *parent) goto skip_probe; ret = amba_read_periphid(dev); - if (ret) { - if (ret != -EPROBE_DEFER) { - amba_device_put(dev); - goto err_out; - } + if (ret) goto err_release; - } skip_probe: ret = device_add(&dev->dev); @@ -531,6 +526,7 @@ static int amba_deferred_retry(void) continue; list_del_init(&ddev->node); + amba_device_put(ddev->dev); kfree(ddev); } From bfcbea2c93e56bc5a0d7db6528fcce8243433c12 Mon Sep 17 00:00:00 2001 From: Jae Hyun Yoo Date: Mon, 23 May 2022 10:56:40 -0700 Subject: [PATCH 0043/1250] ARM: dts: aspeed: nuvia: rename vendor nuvia to qcom Nuvia has been acquired by Qualcomm and the vendor name 'nuvia' will not be used anymore so rename aspeed-bmc-nuvia-dc-scm.dts to aspeed-bmc-qcom-dc-scm-v1.dts and change 'nuvia' to 'qcom' as its vendor name in the file. 
Signed-off-by: Jae Hyun Yoo Link: https://lore.kernel.org/r/20220523175640.60155-1-quic_jaehyoo@quicinc.com Signed-off-by: Joel Stanley --- arch/arm/boot/dts/Makefile | 2 +- ...eed-bmc-nuvia-dc-scm.dts => aspeed-bmc-qcom-dc-scm-v1.dts} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename arch/arm/boot/dts/{aspeed-bmc-nuvia-dc-scm.dts => aspeed-bmc-qcom-dc-scm-v1.dts} (97%) diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile index e63cd6ed0faae3..6afb7c8102d3a8 100644 --- a/arch/arm/boot/dts/Makefile +++ b/arch/arm/boot/dts/Makefile @@ -1546,7 +1546,6 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-lenovo-hr630.dtb \ aspeed-bmc-lenovo-hr855xg2.dtb \ aspeed-bmc-microsoft-olympus.dtb \ - aspeed-bmc-nuvia-dc-scm.dtb \ aspeed-bmc-opp-lanyang.dtb \ aspeed-bmc-opp-mihawk.dtb \ aspeed-bmc-opp-mowgli.dtb \ @@ -1559,6 +1558,7 @@ dtb-$(CONFIG_ARCH_ASPEED) += \ aspeed-bmc-opp-witherspoon.dtb \ aspeed-bmc-opp-zaius.dtb \ aspeed-bmc-portwell-neptune.dtb \ + aspeed-bmc-qcom-dc-scm-v1.dtb \ aspeed-bmc-quanta-q71l.dtb \ aspeed-bmc-quanta-s6q.dtb \ aspeed-bmc-supermicro-x11spi.dtb \ diff --git a/arch/arm/boot/dts/aspeed-bmc-nuvia-dc-scm.dts b/arch/arm/boot/dts/aspeed-bmc-qcom-dc-scm-v1.dts similarity index 97% rename from arch/arm/boot/dts/aspeed-bmc-nuvia-dc-scm.dts rename to arch/arm/boot/dts/aspeed-bmc-qcom-dc-scm-v1.dts index f4a97cfb0f238c..259ef3f54c5ccd 100644 --- a/arch/arm/boot/dts/aspeed-bmc-nuvia-dc-scm.dts +++ b/arch/arm/boot/dts/aspeed-bmc-qcom-dc-scm-v1.dts @@ -6,8 +6,8 @@ #include "aspeed-g6.dtsi" / { - model = "Nuvia DC-SCM BMC"; - compatible = "nuvia,dc-scm-bmc", "aspeed,ast2600"; + model = "Qualcomm DC-SCM V1 BMC"; + compatible = "qcom,dc-scm-v1-bmc", "aspeed,ast2600"; aliases { serial4 = &uart5; From 73a8dbafd31adc3f35a8e04e80f81e991df355d3 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 26 May 2022 22:34:07 +0000 Subject: [PATCH 0044/1250] selftests/seccomp: Fix compile warning when CC=clang clang has -Wconstant-conversion by 
default, and the constant 0xAAAAAAAAA (9 As) being converted to an int, which is generally 32 bits, results in the compile warning: clang -Wl,-no-as-needed -Wall -isystem ../../../../usr/include/ -lpthread seccomp_bpf.c -lcap -o seccomp_bpf seccomp_bpf.c:812:67: warning: implicit conversion from 'long' to 'int' changes value from 45812984490 to -1431655766 [-Wconstant-conversion] int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAAA; ~~~~ ^~~~~~~~~~~ 1 warning generated. -1431655766 is the expected truncation, 0xAAAAAAAA (8 As), so use this directly in the code to avoid the warning. Fixes: 3932fcecd962 ("selftests/seccomp: Add test for unknown SECCOMP_RET kill behavior") Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220526223407.1686936-1-zhuyifei@google.com --- tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 136df5b76319d7..4ae6c899130740 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -809,7 +809,7 @@ void kill_thread_or_group(struct __test_metadata *_metadata, .len = (unsigned short)ARRAY_SIZE(filter_thread), .filter = filter_thread, }; - int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAAA; + int kill = kill_how == KILL_PROCESS ? 
SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA; struct sock_filter filter_process[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, offsetof(struct seccomp_data, nr)), From e6b8d9bf983420a09f2817958caeffcafcb315af Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 31 May 2022 12:15:51 +0200 Subject: [PATCH 0045/1250] KVM: s390: selftests: Use TAP interface in the memop test The memop test currently does not have any output (unless one of the TEST_ASSERT statement fails), so it's hard to say for a user whether a certain new sub-test has been included in the binary or not. Let's make this a little bit more user-friendly and include some TAP output via the kselftests.h interface. Reviewed-by: Janosch Frank Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20220531101554.36844-2-thuth@redhat.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/s390x/memop.c | 95 ++++++++++++++++++----- 1 file changed, 77 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c index 49f26f54412702..e704c6fa5758e8 100644 --- a/tools/testing/selftests/kvm/s390x/memop.c +++ b/tools/testing/selftests/kvm/s390x/memop.c @@ -14,6 +14,7 @@ #include "test_util.h" #include "kvm_util.h" +#include "kselftest.h" enum mop_target { LOGICAL, @@ -691,34 +692,92 @@ static void test_errors(void) kvm_vm_free(t.kvm_vm); } +struct testdef { + const char *name; + void (*test)(void); + int extension; +} testlist[] = { + { + .name = "simple copy", + .test = test_copy, + }, + { + .name = "generic error checks", + .test = test_errors, + }, + { + .name = "copy with storage keys", + .test = test_copy_key, + .extension = 1, + }, + { + .name = "copy with key storage protection override", + .test = test_copy_key_storage_prot_override, + .extension = 1, + }, + { + .name = "copy with key fetch protection", + .test = test_copy_key_fetch_prot, + .extension = 1, + }, + { + .name = "copy with key fetch protection override", + .test = 
test_copy_key_fetch_prot_override, + .extension = 1, + }, + { + .name = "error checks with key", + .test = test_errors_key, + .extension = 1, + }, + { + .name = "termination", + .test = test_termination, + .extension = 1, + }, + { + .name = "error checks with key storage protection override", + .test = test_errors_key_storage_prot_override, + .extension = 1, + }, + { + .name = "error checks without key fetch prot override", + .test = test_errors_key_fetch_prot_override_not_enabled, + .extension = 1, + }, + { + .name = "error checks with key fetch prot override", + .test = test_errors_key_fetch_prot_override_enabled, + .extension = 1, + }, +}; + int main(int argc, char *argv[]) { - int memop_cap, extension_cap; + int memop_cap, extension_cap, idx; setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ + ksft_print_header(); + memop_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP); extension_cap = kvm_check_cap(KVM_CAP_S390_MEM_OP_EXTENSION); if (!memop_cap) { - print_skip("CAP_S390_MEM_OP not supported"); - exit(KSFT_SKIP); + ksft_exit_skip("CAP_S390_MEM_OP not supported.\n"); } - test_copy(); - if (extension_cap > 0) { - test_copy_key(); - test_copy_key_storage_prot_override(); - test_copy_key_fetch_prot(); - test_copy_key_fetch_prot_override(); - test_errors_key(); - test_termination(); - test_errors_key_storage_prot_override(); - test_errors_key_fetch_prot_override_not_enabled(); - test_errors_key_fetch_prot_override_enabled(); - } else { - print_skip("storage key memop extension not supported"); + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + if (testlist[idx].extension >= extension_cap) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } else { + ksft_test_result_skip("%s - extension level %d not supported\n", + testlist[idx].name, + testlist[idx].extension); + } } - test_errors(); - return 0; + ksft_finished(); /* Print results and exit() accordingly */ } From 
933be397cc287829f3142dac30f8d2a13167fec0 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 31 May 2022 12:15:52 +0200 Subject: [PATCH 0046/1250] KVM: s390: selftests: Use TAP interface in the sync_regs test The sync_regs test currently does not have any output (unless one of the TEST_ASSERT statement fails), so it's hard to say for a user whether a certain new sub-test has been included in the binary or not. Let's make this a little bit more user-friendly and include some TAP output via the kselftests.h interface. To be able to distinguish the different sub-tests more easily, we also break up the huge main() function here in more fine grained parts. Acked-by: Janosch Frank Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20220531101554.36844-3-thuth@redhat.com Signed-off-by: Christian Borntraeger --- .../selftests/kvm/s390x/sync_regs_test.c | 87 ++++++++++++++----- 1 file changed, 66 insertions(+), 21 deletions(-) diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c index caf7b8859a94a5..9510739e226d8c 100644 --- a/tools/testing/selftests/kvm/s390x/sync_regs_test.c +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -21,6 +21,7 @@ #include "test_util.h" #include "kvm_util.h" #include "diag318_test_handler.h" +#include "kselftest.h" #define VCPU_ID 5 @@ -74,27 +75,9 @@ static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right) #define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS|KVM_SYNC_DIAG318) #define INVALID_SYNC_FIELD 0x80000000 -int main(int argc, char *argv[]) +void test_read_invalid(struct kvm_vm *vm, struct kvm_run *run) { - struct kvm_vm *vm; - struct kvm_run *run; - struct kvm_regs regs; - struct kvm_sregs sregs; - int rv, cap; - - /* Tell stdout not to buffer its content */ - setbuf(stdout, NULL); - - cap = kvm_check_cap(KVM_CAP_SYNC_REGS); - if (!cap) { - print_skip("CAP_SYNC_REGS not supported"); - exit(KSFT_SKIP); - } - - /* Create 
VM */ - vm = vm_create_default(VCPU_ID, 0, guest_code); - - run = vcpu_state(vm, VCPU_ID); + int rv; /* Request reading invalid register set from VCPU. */ run->kvm_valid_regs = INVALID_SYNC_FIELD; @@ -110,6 +93,11 @@ int main(int argc, char *argv[]) "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n", rv); vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0; +} + +void test_set_invalid(struct kvm_vm *vm, struct kvm_run *run) +{ + int rv; /* Request setting invalid register set into VCPU. */ run->kvm_dirty_regs = INVALID_SYNC_FIELD; @@ -125,6 +113,13 @@ int main(int argc, char *argv[]) "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n", rv); vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0; +} + +void test_req_and_verify_all_valid_regs(struct kvm_vm *vm, struct kvm_run *run) +{ + struct kvm_sregs sregs; + struct kvm_regs regs; + int rv; /* Request and verify all valid register sets. */ run->kvm_valid_regs = TEST_SYNC_FIELDS; @@ -146,6 +141,13 @@ int main(int argc, char *argv[]) vcpu_sregs_get(vm, VCPU_ID, &sregs); compare_sregs(&sregs, &run->s.regs); +} + +void test_set_and_verify_various_reg_values(struct kvm_vm *vm, struct kvm_run *run) +{ + struct kvm_sregs sregs; + struct kvm_regs regs; + int rv; /* Set and verify various register values */ run->s.regs.gprs[11] = 0xBAD1DEA; @@ -180,6 +182,11 @@ int main(int argc, char *argv[]) vcpu_sregs_get(vm, VCPU_ID, &sregs); compare_sregs(&sregs, &run->s.regs); +} + +void test_clear_kvm_dirty_regs_bits(struct kvm_vm *vm, struct kvm_run *run) +{ + int rv; /* Clear kvm_dirty_regs bits, verify new s.regs values are * overwritten with existing guest values. 
@@ -200,8 +207,46 @@ int main(int argc, char *argv[]) TEST_ASSERT(run->s.regs.diag318 != 0x4B1D, "diag318 sync regs value incorrect 0x%llx.", run->s.regs.diag318); +} + +struct testdef { + const char *name; + void (*test)(struct kvm_vm *vm, struct kvm_run *run); +} testlist[] = { + { "read invalid", test_read_invalid }, + { "set invalid", test_set_invalid }, + { "request+verify all valid regs", test_req_and_verify_all_valid_regs }, + { "set+verify various regs", test_set_and_verify_various_reg_values }, + { "clear kvm_dirty_regs bits", test_clear_kvm_dirty_regs_bits }, +}; + +int main(int argc, char *argv[]) +{ + static struct kvm_run *run; + static struct kvm_vm *vm; + int idx; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + ksft_print_header(); + + if (!kvm_check_cap(KVM_CAP_SYNC_REGS)) + ksft_exit_skip("CAP_SYNC_REGS not supported"); + + ksft_set_plan(ARRAY_SIZE(testlist)); + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + run = vcpu_state(vm, VCPU_ID); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(vm, run); + ksft_test_result_pass("%s\n", testlist[idx].name); + } kvm_vm_free(vm); - return 0; + ksft_finished(); /* Print results and exit() accordingly */ } From fd35ba6add67afe9c9ba7740aa29a87d0de10061 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 31 May 2022 12:15:53 +0200 Subject: [PATCH 0047/1250] KVM: s390: selftests: Use TAP interface in the tprot test The tprot test currently does not have any output (unless one of the TEST_ASSERT statement fails), so it's hard to say for a user whether a certain new sub-test has been included in the binary or not. Let's make this a little bit more user-friendly and include some TAP output via the kselftests.h interface. 
Reviewed-by: Janosch Frank Reviewed-by: Janis Schoetterl-Glausch Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20220531101554.36844-4-thuth@redhat.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/s390x/tprot.c | 29 +++++++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/s390x/tprot.c b/tools/testing/selftests/kvm/s390x/tprot.c index c097b9db495e46..14d74a9e7b3d44 100644 --- a/tools/testing/selftests/kvm/s390x/tprot.c +++ b/tools/testing/selftests/kvm/s390x/tprot.c @@ -8,6 +8,7 @@ #include <sys/mman.h> #include "test_util.h" #include "kvm_util.h" +#include "kselftest.h" #define PAGE_SHIFT 12 #define PAGE_SIZE (1 << PAGE_SHIFT) @@ -63,12 +64,12 @@ static enum permission test_protection(void *addr, uint8_t key) } enum stage { - STAGE_END, STAGE_INIT_SIMPLE, TEST_SIMPLE, STAGE_INIT_FETCH_PROT_OVERRIDE, TEST_FETCH_PROT_OVERRIDE, TEST_STORAGE_PROT_OVERRIDE, + STAGE_END /* must be the last entry (it's the amount of tests) */ }; struct test { @@ -182,7 +183,7 @@ static void guest_code(void) GUEST_SYNC(perform_next_stage(&i, mapped_0)); } -#define HOST_SYNC(vmp, stage) \ +#define HOST_SYNC_NO_TAP(vmp, stage) \ ({ \ struct kvm_vm *__vm = (vmp); \ struct ucall uc; \ @@ -198,12 +199,21 @@ static void guest_code(void) ASSERT_EQ(uc.args[1], __stage); \ }) +#define HOST_SYNC(vmp, stage) \ +({ \ + HOST_SYNC_NO_TAP(vmp, stage); \ + ksft_test_result_pass("" #stage "\n"); \ +}) + int main(int argc, char *argv[]) { struct kvm_vm *vm; struct kvm_run *run; vm_vaddr_t guest_0_page; + ksft_print_header(); + ksft_set_plan(STAGE_END); + vm = vm_create_default(VCPU_ID, 0, guest_code); run = vcpu_state(vm, VCPU_ID); @@ -212,9 +222,14 @@ int main(int argc, char *argv[]) HOST_SYNC(vm, TEST_SIMPLE); guest_0_page = vm_vaddr_alloc(vm, PAGE_SIZE, 0); - if (guest_0_page != 0) - print_skip("Did not allocate page at 0 for fetch protection override tests"); - HOST_SYNC(vm, STAGE_INIT_FETCH_PROT_OVERRIDE); + if
(guest_0_page != 0) { + /* Use NO_TAP so we don't get a PASS print */ + HOST_SYNC_NO_TAP(vm, STAGE_INIT_FETCH_PROT_OVERRIDE); + ksft_test_result_skip("STAGE_INIT_FETCH_PROT_OVERRIDE - " + "Did not allocate page at 0\n"); + } else { + HOST_SYNC(vm, STAGE_INIT_FETCH_PROT_OVERRIDE); + } if (guest_0_page == 0) mprotect(addr_gva2hva(vm, (vm_vaddr_t)0), PAGE_SIZE, PROT_READ); run->s.regs.crs[0] |= CR0_FETCH_PROTECTION_OVERRIDE; @@ -224,4 +239,8 @@ int main(int argc, char *argv[]) run->s.regs.crs[0] |= CR0_STORAGE_PROTECTION_OVERRIDE; run->kvm_dirty_regs = KVM_SYNC_CRS; HOST_SYNC(vm, TEST_STORAGE_PROT_OVERRIDE); + + kvm_vm_free(vm); + + ksft_finished(); /* Print results and exit() accordingly */ } From 242c04f01377122b137e13313190303a27773f70 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Tue, 31 May 2022 12:15:54 +0200 Subject: [PATCH 0048/1250] KVM: s390: selftests: Use TAP interface in the reset test Let's standardize the s390x KVM selftest output to the TAP output generated via the kselftests.h interface. 
Reviewed-by: Janosch Frank Signed-off-by: Thomas Huth Link: https://lore.kernel.org/r/20220531101554.36844-5-thuth@redhat.com Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/s390x/resets.c | 38 +++++++++++++++++----- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c index b143db6d8693b5..889449a22e7add 100644 --- a/tools/testing/selftests/kvm/s390x/resets.c +++ b/tools/testing/selftests/kvm/s390x/resets.c @@ -12,6 +12,7 @@ #include "test_util.h" #include "kvm_util.h" +#include "kselftest.h" #define VCPU_ID 3 #define LOCAL_IRQS 32 @@ -202,7 +203,7 @@ static void inject_irq(int cpu_id) static void test_normal(void) { - pr_info("Testing normal reset\n"); + ksft_print_msg("Testing normal reset\n"); /* Create VM */ vm = vm_create_default(VCPU_ID, 0, guest_code_initial); run = vcpu_state(vm, VCPU_ID); @@ -225,7 +226,7 @@ static void test_normal(void) static void test_initial(void) { - pr_info("Testing initial reset\n"); + ksft_print_msg("Testing initial reset\n"); vm = vm_create_default(VCPU_ID, 0, guest_code_initial); run = vcpu_state(vm, VCPU_ID); sync_regs = &run->s.regs; @@ -247,7 +248,7 @@ static void test_initial(void) static void test_clear(void) { - pr_info("Testing clear reset\n"); + ksft_print_msg("Testing clear reset\n"); vm = vm_create_default(VCPU_ID, 0, guest_code_initial); run = vcpu_state(vm, VCPU_ID); sync_regs = &run->s.regs; @@ -266,14 +267,35 @@ static void test_clear(void) kvm_vm_free(vm); } +struct testdef { + const char *name; + void (*test)(void); + bool needs_cap; +} testlist[] = { + { "initial", test_initial, false }, + { "normal", test_normal, true }, + { "clear", test_clear, true }, +}; + int main(int argc, char *argv[]) { + bool has_s390_vcpu_resets = kvm_check_cap(KVM_CAP_S390_VCPU_RESETS); + int idx; + setbuf(stdout, NULL); /* Tell stdout not to buffer its content */ - test_initial(); - if 
(kvm_check_cap(KVM_CAP_S390_VCPU_RESETS)) { - test_normal(); - test_clear(); + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + if (!testlist[idx].needs_cap || has_s390_vcpu_resets) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } else { + ksft_test_result_skip("%s - no VCPU_RESETS capability\n", + testlist[idx].name); + } } - return 0; + + ksft_finished(); /* Print results and exit() accordingly */ } From e1d3373352077f3be9cc1c8adb5fd59d0aa96e7a Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Tue, 31 May 2022 10:59:10 -0700 Subject: [PATCH 0049/1250] cfi: Fix __cfi_slowpath_diag RCU usage with cpuidle RCU_NONIDLE usage during __cfi_slowpath_diag can result in an invalid RCU state in the cpuidle code path: WARNING: CPU: 1 PID: 0 at kernel/rcu/tree.c:613 rcu_eqs_enter+0xe4/0x138 ... Call trace: rcu_eqs_enter+0xe4/0x138 rcu_idle_enter+0xa8/0x100 cpuidle_enter_state+0x154/0x3a8 cpuidle_enter+0x3c/0x58 do_idle.llvm.6590768638138871020+0x1f4/0x2ec cpu_startup_entry+0x28/0x2c secondary_start_kernel+0x1b8/0x220 __secondary_switched+0x94/0x98 Instead, call rcu_irq_enter/exit to wake up RCU only when needed and disable interrupts for the entire CFI shadow/module check when we do. 
Signed-off-by: Sami Tolvanen Link: https://lore.kernel.org/r/20220531175910.890307-1-samitolvanen@google.com Fixes: cf68fffb66d6 ("add support for Clang CFI") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook --- kernel/cfi.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/kernel/cfi.c b/kernel/cfi.c index 9594cfd1cf2cf7..08102d19ec15a2 100644 --- a/kernel/cfi.c +++ b/kernel/cfi.c @@ -281,6 +281,8 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr) static inline cfi_check_fn find_check_fn(unsigned long ptr) { cfi_check_fn fn = NULL; + unsigned long flags; + bool rcu_idle; if (is_kernel_text(ptr)) return __cfi_check; @@ -290,13 +292,21 @@ static inline cfi_check_fn find_check_fn(unsigned long ptr) * the shadow and __module_address use RCU, so we need to wake it * up if necessary. */ - RCU_NONIDLE({ - if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) - fn = find_shadow_check_fn(ptr); + rcu_idle = !rcu_is_watching(); + if (rcu_idle) { + local_irq_save(flags); + rcu_irq_enter(); + } + + if (IS_ENABLED(CONFIG_CFI_CLANG_SHADOW)) + fn = find_shadow_check_fn(ptr); + if (!fn) + fn = find_module_check_fn(ptr); - if (!fn) - fn = find_module_check_fn(ptr); - }); + if (rcu_idle) { + rcu_irq_exit(); + local_irq_restore(flags); + } return fn; } From d60ea31cb4343623a9d499cfbd05a577e58d9e79 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Thu, 2 Jun 2022 17:28:22 +0800 Subject: [PATCH 0050/1250] Bluetooth: btusb: Add support of IMC Networks PID 0x3568 It is 13d3:3568 for MediaTek MT7922 USB Bluetooth chip. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=13d3 ProdID=3568 Rev=01.00 S: Manufacturer=MediaTek Inc. S: Product=Wireless_Device S: SerialNumber=... C: #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA I: If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=03(Int.) 
MxPS= 16 Ivl=125us E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none) E: Ad=0a(O) Atr=03(Int.) MxPS= 64 Ivl=125us E: Ad=8a(I) Atr=03(Int.) MxPS= 64 Ivl=125us Signed-off-by: Aaron Ma Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btusb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index e25fcd49db702a..fb1a6718941240 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -477,6 +477,9 @@ static const struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x0489, 0xe0d9), .driver_info = BTUSB_MEDIATEK | BTUSB_WIDEBAND_SPEECH | BTUSB_VALID_LE_STATES }, + { USB_DEVICE(0x13d3, 0x3568), .driver_info = BTUSB_MEDIATEK | + BTUSB_WIDEBAND_SPEECH | + BTUSB_VALID_LE_STATES }, /* Additional Realtek 8723AE Bluetooth devices */ { USB_DEVICE(0x0930, 0x021d), .driver_info = BTUSB_REALTEK }, From 2072cdccd70be28111581d08f63e95305de12872 Mon Sep 17 00:00:00 2001 From: Sai Teja Aluvala Date: Fri, 27 May 2022 15:45:43 +0530 Subject: [PATCH 0051/1250] Bluetooth: hci_qca: Return wakeup for qca_wakeup This fixes the return value of qca_wakeup(), since .wakeup work inversely with original .prevent_wake. 
Fixes: 4539ca67fe8ed (Bluetooth: Rename driver .prevent_wake to .wakeup) Signed-off-by: Sai Teja Aluvala Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_qca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index eab34e24d94465..8df11016fd51b2 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1588,7 +1588,7 @@ static bool qca_wakeup(struct hci_dev *hdev) wakeup = device_may_wakeup(hu->serdev->ctrl->dev.parent); bt_dev_dbg(hu->hdev, "wakeup status : %d", wakeup); - return !wakeup; + return wakeup; } static int qca_regulator_init(struct hci_uart *hu) From 6d912cc3c21fd64c50451c62059683f1fa5102f7 Mon Sep 17 00:00:00 2001 From: Hakan Jansson Date: Mon, 30 May 2022 17:02:17 +0200 Subject: [PATCH 0052/1250] dt-bindings: net: broadcom-bluetooth: Add property for autobaud mode Add property, "brcm,requires-autobaud-mode", to enable autobaud mode selection. Some devices (e.g. CYW5557x) require autobaud mode to enable FW loading. Autobaud mode can also be required on some boards where the controller device is using a non-standard baud rate when first powered on. Signed-off-by: Hakan Jansson Reviewed-by: Krzysztof Kozlowski Signed-off-by: Marcel Holtmann --- .../devicetree/bindings/net/broadcom-bluetooth.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml index 5aac094fd21727..0a58d0fbcbc4bd 100644 --- a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml +++ b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml @@ -92,6 +92,13 @@ properties: pcm-sync-mode: slave, master pcm-clock-mode: slave, master + brcm,requires-autobaud-mode: + type: boolean + description: + Set this property if autobaud mode is required. 
Autobaud mode is required + if the device's initial baud rate in normal mode is not supported by the + host or if the device requires autobaud mode startup before loading FW. + interrupts: items: - description: Handle to the line HOST_WAKE used to wake From 3f125894bed7c4d613f43d4fdcf23e32a0201f32 Mon Sep 17 00:00:00 2001 From: Hakan Jansson Date: Mon, 30 May 2022 17:02:18 +0200 Subject: [PATCH 0053/1250] Bluetooth: hci_bcm: Add support for FW loading in autobaud mode Use the presence of a DT property, "brcm,requires-autobaud-mode", to enable startup in autobaud mode. If the property is present, the device is started in autobaud mode by asserting RTS (BT_UART_CTS_N) prior to powering on the device. Also prevent the use of unsupported commands for devices started in autobaud mode. Only a limited subset of HCI commands are supported in autobaud mode. Some devices (e.g. CYW5557x) require autobaud mode to enable FW loading. Autobaud mode can also be required on some boards where the controller device is using a non-standard baud rate in normal mode when first powered on. 
Signed-off-by: Hakan Jansson Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 31 +++++++++++++++++++++++-------- drivers/bluetooth/btbcm.h | 8 ++++---- drivers/bluetooth/hci_bcm.c | 16 +++++++++++++--- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 76fbb046bdbe80..cfe018a6c1fca4 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -403,6 +403,13 @@ static int btbcm_read_info(struct hci_dev *hdev) bt_dev_info(hdev, "BCM: chip id %u", skb->data[1]); kfree_skb(skb); + return 0; +} + +static int btbcm_print_controller_features(struct hci_dev *hdev) +{ + struct sk_buff *skb; + /* Read Controller Features */ skb = btbcm_read_controller_features(hdev); if (IS_ERR(skb)) @@ -514,7 +521,7 @@ static const char *btbcm_get_board_name(struct device *dev) #endif } -int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) +int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done, bool use_autobaud_mode) { u16 subver, rev, pid, vid; struct sk_buff *skb; @@ -551,9 +558,16 @@ int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) if (err) return err; } - err = btbcm_print_local_name(hdev); - if (err) - return err; + + if (!use_autobaud_mode) { + err = btbcm_print_controller_features(hdev); + if (err) + return err; + + err = btbcm_print_local_name(hdev); + if (err) + return err; + } bcm_subver_table = (hdev->bus == HCI_USB) ? 
bcm_usb_subver_table : bcm_uart_subver_table; @@ -636,13 +650,13 @@ int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) } EXPORT_SYMBOL_GPL(btbcm_initialize); -int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done) +int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done, bool use_autobaud_mode) { int err; /* Re-initialize if necessary */ if (*fw_load_done) { - err = btbcm_initialize(hdev, fw_load_done); + err = btbcm_initialize(hdev, fw_load_done, use_autobaud_mode); if (err) return err; } @@ -658,15 +672,16 @@ EXPORT_SYMBOL_GPL(btbcm_finalize); int btbcm_setup_patchram(struct hci_dev *hdev) { bool fw_load_done = false; + bool use_autobaud_mode = false; int err; /* Initialize */ - err = btbcm_initialize(hdev, &fw_load_done); + err = btbcm_initialize(hdev, &fw_load_done, use_autobaud_mode); if (err) return err; /* Re-initialize after loading Patch */ - return btbcm_finalize(hdev, &fw_load_done); + return btbcm_finalize(hdev, &fw_load_done, use_autobaud_mode); } EXPORT_SYMBOL_GPL(btbcm_setup_patchram); diff --git a/drivers/bluetooth/btbcm.h b/drivers/bluetooth/btbcm.h index 8bf01565fdfca4..b4cb24231a202e 100644 --- a/drivers/bluetooth/btbcm.h +++ b/drivers/bluetooth/btbcm.h @@ -62,8 +62,8 @@ int btbcm_write_pcm_int_params(struct hci_dev *hdev, int btbcm_setup_patchram(struct hci_dev *hdev); int btbcm_setup_apple(struct hci_dev *hdev); -int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done); -int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done); +int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done, bool use_autobaud_mode); +int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done, bool use_autobaud_mode); #else @@ -104,12 +104,12 @@ static inline int btbcm_setup_apple(struct hci_dev *hdev) return 0; } -static inline int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done) +static inline int btbcm_initialize(struct hci_dev *hdev, bool *fw_load_done, bool use_autobaud_mode) { return 0; } -static inline int 
btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done) +static inline int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done, bool use_autobaud_mode) { return 0; } diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 785f445dd60d5a..4309654f95a55a 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -99,6 +99,7 @@ struct bcm_device_data { * @no_early_set_baudrate: don't set_baudrate before setup() * @drive_rts_on_open: drive RTS signal on ->open() when platform requires it * @pcm_int_params: keep the initial PCM configuration + * @use_autobaud_mode: start Bluetooth device in autobaud mode */ struct bcm_device { /* Must be the first member, hci_serdev.c expects this. */ @@ -136,6 +137,7 @@ struct bcm_device { #endif bool no_early_set_baudrate; bool drive_rts_on_open; + bool use_autobaud_mode; u8 pcm_int_params[5]; }; @@ -472,7 +474,9 @@ static int bcm_open(struct hci_uart *hu) out: if (bcm->dev) { - if (bcm->dev->drive_rts_on_open) + if (bcm->dev->use_autobaud_mode) + hci_uart_set_flow_control(hu, false); /* Assert BT_UART_CTS_N */ + else if (bcm->dev->drive_rts_on_open) hci_uart_set_flow_control(hu, true); hu->init_speed = bcm->dev->init_speed; @@ -564,6 +568,7 @@ static int bcm_setup(struct hci_uart *hu) { struct bcm_data *bcm = hu->priv; bool fw_load_done = false; + bool use_autobaud_mode = (bcm->dev ? 
bcm->dev->use_autobaud_mode : 0); unsigned int speed; int err; @@ -572,7 +577,7 @@ static int bcm_setup(struct hci_uart *hu) hu->hdev->set_diag = bcm_set_diag; hu->hdev->set_bdaddr = btbcm_set_bdaddr; - err = btbcm_initialize(hu->hdev, &fw_load_done); + err = btbcm_initialize(hu->hdev, &fw_load_done, use_autobaud_mode); if (err) return err; @@ -616,7 +621,7 @@ static int bcm_setup(struct hci_uart *hu) btbcm_write_pcm_int_params(hu->hdev, &params); } - err = btbcm_finalize(hu->hdev, &fw_load_done); + err = btbcm_finalize(hu->hdev, &fw_load_done, use_autobaud_mode); if (err) return err; @@ -1197,6 +1202,11 @@ static int bcm_acpi_probe(struct bcm_device *dev) static int bcm_of_probe(struct bcm_device *bdev) { + bdev->use_autobaud_mode = device_property_read_bool(bdev->dev, + "brcm,requires-autobaud-mode"); + if (bdev->use_autobaud_mode) + bdev->no_early_set_baudrate = true; + device_property_read_u32(bdev->dev, "max-speed", &bdev->oper_speed); device_property_read_u8_array(bdev->dev, "brcm,bt-pcm-int-params", bdev->pcm_int_params, 5); From 5a4e1528d8405e207bdc2c9db0b9952b70ca1e4c Mon Sep 17 00:00:00 2001 From: Alain Michaud Date: Thu, 2 Jun 2022 15:30:03 +0000 Subject: [PATCH 0054/1250] Bluetooth: clear the temporary linkkey in hci_conn_cleanup If a hardware error occurs and the connections are flushed without a disconnection_complete event being signaled, the temporary linkkeys are not flushed. This change ensures that any outstanding flushable linkkeys are flushed when the connections are flushed from the hash table. Additionally, this also makes use of test_and_clear_bit to avoid multiple attempts to delete the link key that's already been flushed.
Signed-off-by: Alain Michaud Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 3 +++ net/bluetooth/hci_event.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ac06c9724c7f30..7829433d54c158 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -118,6 +118,9 @@ static void hci_conn_cleanup(struct hci_conn *conn) if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); + if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) + hci_remove_link_key(hdev, &conn->dst); + hci_chan_list_flush(conn); hci_conn_hash_del(hdev, conn); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index af17dfb20e017d..63585c0bb9ceeb 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2741,7 +2741,7 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags); if (conn->type == ACL_LINK) { - if (test_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) + if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) hci_remove_link_key(hdev, &conn->dst); } @@ -3368,7 +3368,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, void *data, reason, mgmt_connected); if (conn->type == ACL_LINK) { - if (test_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) + if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) hci_remove_link_key(hdev, &conn->dst); hci_req_update_scan(hdev); From 822e1b3ca0fbb495bf2d316ca8e7d5eccc77577a Mon Sep 17 00:00:00 2001 From: Jiasheng Jiang Date: Fri, 3 Jun 2022 09:24:36 +0800 Subject: [PATCH 0055/1250] Bluetooth: hci_intel: Add check for platform_driver_register As platform_driver_register() could fail, it should be better to deal with the return value in order to maintain the code consistency.
Fixes: 1ab1f239bf17 ("Bluetooth: hci_intel: Add support for platform driver") Signed-off-by: Jiasheng Jiang Signed-off-by: Marcel Holtmann --- drivers/bluetooth/hci_intel.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/bluetooth/hci_intel.c b/drivers/bluetooth/hci_intel.c index 7249b91d9b91ac..78afb9a348e705 100644 --- a/drivers/bluetooth/hci_intel.c +++ b/drivers/bluetooth/hci_intel.c @@ -1217,7 +1217,11 @@ static struct platform_driver intel_driver = { int __init intel_init(void) { - platform_driver_register(&intel_driver); + int err; + + err = platform_driver_register(&intel_driver); + if (err) + return err; return hci_uart_register_proto(&intel_proto); } From dba7abaead13f9dbf4e946f874127eae427d9947 Mon Sep 17 00:00:00 2001 From: Schspa Shi Date: Fri, 3 Jun 2022 16:19:14 +0800 Subject: [PATCH 0056/1250] Bluetooth: When HCI work queue is drained, only queue chained work The HCI command, event, and data packet processing workqueue is drained to avoid deadlock in commit 76727c02c1e1 ("Bluetooth: Call drain_workqueue() before resetting state"). There is another delayed work, which will queue command to this drained workqueue. 
Which results in the following error report: Bluetooth: hci2: command 0x040f tx timeout WARNING: CPU: 1 PID: 18374 at kernel/workqueue.c:1438 __queue_work+0xdad/0x1140 Workqueue: events hci_cmd_timeout RIP: 0010:__queue_work+0xdad/0x1140 RSP: 0000:ffffc90002cffc60 EFLAGS: 00010093 RAX: 0000000000000000 RBX: ffff8880b9d3ec00 RCX: 0000000000000000 RDX: ffff888024ba0000 RSI: ffffffff814e048d RDI: ffff8880b9d3ec08 RBP: 0000000000000008 R08: 0000000000000000 R09: 00000000b9d39700 R10: ffffffff814f73c6 R11: 0000000000000000 R12: ffff88807cce4c60 R13: 0000000000000000 R14: ffff8880796d8800 R15: ffff8880796d8800 FS: 0000000000000000(0000) GS:ffff8880b9d00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c0174b4000 CR3: 000000007cae9000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? queue_work_on+0xcb/0x110 ? lockdep_hardirqs_off+0x90/0xd0 queue_work_on+0xee/0x110 process_one_work+0x996/0x1610 ? pwq_dec_nr_in_flight+0x2a0/0x2a0 ? rwlock_bug.part.0+0x90/0x90 ? _raw_spin_lock_irq+0x41/0x50 worker_thread+0x665/0x1080 ? process_one_work+0x1610/0x1610 kthread+0x2e9/0x3a0 ? kthread_complete_and_exit+0x40/0x40 ret_from_fork+0x1f/0x30 To fix this, we can add a new HCI_DRAIN_WQ flag, and don't queue the timeout workqueue while command workqueue is draining. 
Fixes: 76727c02c1e1 ("Bluetooth: Call drain_workqueue() before resetting state") Reported-by: syzbot+63bed493aebbf6872647@syzkaller.appspotmail.com Signed-off-by: Schspa Shi Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_core.c | 10 +++++++++- net/bluetooth/hci_event.c | 5 +++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index fe7935be7dc443..4a45c48eb0d256 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -361,6 +361,7 @@ enum { HCI_QUALITY_REPORT, HCI_OFFLOAD_CODECS_ENABLED, HCI_LE_SIMULTANEOUS_ROLES, + HCI_CMD_DRAIN_WORKQUEUE, __HCI_NUM_FLAGS, }; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 5abb2ca5b12999..8539b4233da805 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -593,6 +593,11 @@ static int hci_dev_do_reset(struct hci_dev *hdev) skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); + /* Cancel these to avoid queueing non-chained pending work */ + hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); + cancel_delayed_work(&hdev->cmd_timer); + cancel_delayed_work(&hdev->ncmd_timer); + /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. 
*/ @@ -606,6 +611,8 @@ static int hci_dev_do_reset(struct hci_dev *hdev) if (hdev->flush) hdev->flush(hdev); + hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); + atomic_set(&hdev->cmd_cnt, 1); hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; @@ -3861,7 +3868,8 @@ static void hci_cmd_work(struct work_struct *work) if (res < 0) __hci_cmd_sync_cancel(hdev, -res); - if (test_bit(HCI_RESET, &hdev->flags)) + if (test_bit(HCI_RESET, &hdev->flags) || + hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) cancel_delayed_work(&hdev->cmd_timer); else schedule_delayed_work(&hdev->cmd_timer, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 63585c0bb9ceeb..34bec7446d005e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3768,8 +3768,9 @@ static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, u8 ncmd) cancel_delayed_work(&hdev->ncmd_timer); atomic_set(&hdev->cmd_cnt, 1); } else { - schedule_delayed_work(&hdev->ncmd_timer, - HCI_NCMD_TIMEOUT); + if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) + schedule_delayed_work(&hdev->ncmd_timer, + HCI_NCMD_TIMEOUT); } } } From 8d4b73539cca7a5927d84e57df6c654db34ec01b Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Thu, 2 Jun 2022 09:46:49 -0700 Subject: [PATCH 0057/1250] Bluetooth: Fix index added after unregister When a userchannel socket is released, we should check whether the hdev is already unregistered before sending out an IndexAdded. 
Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_sock.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 189e3115c8c62c..bd8358b44aa4cf 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -869,7 +869,8 @@ static int hci_sock_release(struct socket *sock) hdev = hci_pi(sk)->hdev; if (hdev) { - if (hci_pi(sk)->channel == HCI_CHANNEL_USER) { + if (hci_pi(sk)->channel == HCI_CHANNEL_USER && + !hci_dev_test_flag(hdev, HCI_UNREGISTER)) { /* When releasing a user channel exclusive access, * call hci_dev_do_close directly instead of calling * hci_dev_close to ensure the exclusive access will @@ -878,6 +879,11 @@ static int hci_sock_release(struct socket *sock) * The checking of HCI_AUTO_OFF is not needed in this * case since it will have been cleared already when * opening the user channel. + * + * Make sure to also check that we haven't already + * unregistered since all the cleanup will have already + * been complete and hdev will get released when we put + * below. */ hci_dev_do_close(hdev); hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); From d6bb2a91f95bf5f3276cd88e9e2baf74871b07e8 Mon Sep 17 00:00:00 2001 From: Abhishek Pandit-Subedi Date: Thu, 2 Jun 2022 09:46:50 -0700 Subject: [PATCH 0058/1250] Bluetooth: Unregister suspend with userchannel When HCI_USERCHANNEL is used, unregister the suspend notifier when binding and register when releasing. The userchannel socket should be left alone after open is completed. 
Signed-off-by: Abhishek Pandit-Subedi Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 33 ++++++++++++++++++++++++-------- net/bluetooth/hci_sock.c | 3 +++ 3 files changed, 30 insertions(+), 8 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 5a52a2018b56a3..5b92a9abe14116 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1289,6 +1289,8 @@ void hci_free_dev(struct hci_dev *hdev); int hci_register_dev(struct hci_dev *hdev); void hci_unregister_dev(struct hci_dev *hdev); void hci_release_dev(struct hci_dev *hdev); +int hci_register_suspend_notifier(struct hci_dev *hdev); +int hci_unregister_suspend_notifier(struct hci_dev *hdev); int hci_suspend_dev(struct hci_dev *hdev); int hci_resume_dev(struct hci_dev *hdev); int hci_reset_dev(struct hci_dev *hdev); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 8539b4233da805..3d8d2fcc0eb4a5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2646,12 +2646,8 @@ int hci_register_dev(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); - if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { - hdev->suspend_notifier.notifier_call = hci_suspend_notifier; - error = register_pm_notifier(&hdev->suspend_notifier); - if (error) - goto err_wqueue; - } + if (hci_register_suspend_notifier(hdev)) + goto err_wqueue; queue_work(hdev->req_workqueue, &hdev->power_on); @@ -2684,8 +2680,7 @@ void hci_unregister_dev(struct hci_dev *hdev) hci_cmd_sync_clear(hdev); - if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) - unregister_pm_notifier(&hdev->suspend_notifier); + hci_unregister_suspend_notifier(hdev); msft_unregister(hdev); @@ -2749,6 +2744,28 @@ void hci_release_dev(struct hci_dev *hdev) } EXPORT_SYMBOL(hci_release_dev); +int hci_register_suspend_notifier(struct hci_dev *hdev) +{ + int ret = 0; + + if 
(!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { + hdev->suspend_notifier.notifier_call = hci_suspend_notifier; + ret = register_pm_notifier(&hdev->suspend_notifier); + } + + return ret; +} + +int hci_unregister_suspend_notifier(struct hci_dev *hdev) +{ + int ret = 0; + + if (!test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) + ret = unregister_pm_notifier(&hdev->suspend_notifier); + + return ret; +} + /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index bd8358b44aa4cf..0d015d4a8e4146 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -887,6 +887,7 @@ static int hci_sock_release(struct socket *sock) */ hci_dev_do_close(hdev); hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); + hci_register_suspend_notifier(hdev); mgmt_index_added(hdev); } @@ -1215,6 +1216,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, } mgmt_index_removed(hdev); + hci_unregister_suspend_notifier(hdev); err = hci_dev_open(hdev->id); if (err) { @@ -1229,6 +1231,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, err = 0; } else { hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); + hci_register_suspend_notifier(hdev); mgmt_index_added(hdev); hci_dev_put(hdev); goto done; From 534fdae369a80c765ef54eb749efd886eba27cd7 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 24 May 2022 07:56:40 +0200 Subject: [PATCH 0059/1250] dt-bindings: bluetooth: broadcom: Add BCM4349B1 DT binding The BCM4349B1, aka CYW/BCM89359, is a WiFi+BT chip and its Bluetooth portion can be controlled over serial. Extend the binding with its DT compatible. 
Acked-by: Krzysztof Kozlowski Reviewed-by: Linus Walleij Signed-off-by: Ahmad Fatoum Signed-off-by: Marcel Holtmann --- Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml index 0a58d0fbcbc4bd..df59575840fe37 100644 --- a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml +++ b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml @@ -23,6 +23,7 @@ properties: - brcm,bcm4345c5 - brcm,bcm43540-bt - brcm,bcm4335a0 + - brcm,bcm4349-bt shutdown-gpios: maxItems: 1 From a589ee43644c0269a066fa06e1ad4b3599ae6b1d Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 24 May 2022 07:56:41 +0200 Subject: [PATCH 0060/1250] Bluetooth: hci_bcm: Add BCM4349B1 variant The BCM4349B1, aka CYW/BCM89359, is a WiFi+BT chip and its Bluetooth portion can be controlled over serial. Two subversions are added for the chip, because ROM firmware reports 002.002.013 (at least for the chips I have here), while depending on patchram firmware revision, either 002.002.013 or 002.002.014 is reported. 
Signed-off-by: Ahmad Fatoum Reviewed-by: Linus Walleij Signed-off-by: Marcel Holtmann --- drivers/bluetooth/btbcm.c | 2 ++ drivers/bluetooth/hci_bcm.c | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index cfe018a6c1fca4..3006e2a0f37e1f 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -461,6 +461,8 @@ static const struct bcm_subver_table bcm_uart_subver_table[] = { { 0x6606, "BCM4345C5" }, /* 003.006.006 */ { 0x230f, "BCM4356A2" }, /* 001.003.015 */ { 0x220e, "BCM20702A1" }, /* 001.002.014 */ + { 0x420d, "BCM4349B1" }, /* 002.002.013 */ + { 0x420e, "BCM4349B1" }, /* 002.002.014 */ { 0x4217, "BCM4329B1" }, /* 002.002.023 */ { 0x6106, "BCM4359C0" }, /* 003.001.006 */ { 0x4106, "BCM4335A0" }, /* 002.001.006 */ diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 4309654f95a55a..6f834ff1b44b16 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -1554,6 +1554,7 @@ static const struct of_device_id bcm_bluetooth_of_match[] = { { .compatible = "brcm,bcm43430a0-bt" }, { .compatible = "brcm,bcm43430a1-bt" }, { .compatible = "brcm,bcm43438-bt", .data = &bcm43438_device_data }, + { .compatible = "brcm,bcm4349-bt", .data = &bcm43438_device_data }, { .compatible = "brcm,bcm43540-bt", .data = &bcm4354_device_data }, { .compatible = "brcm,bcm4335a0" }, { }, From 40b88d536f4c6b4a149c5134e761f8a596e53f14 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Tue, 31 May 2022 09:49:24 +0100 Subject: [PATCH 0061/1250] ARM: 9208/1: entry: add .ltorg directive to keep literals in range LKP reports a build issue on Clang, related to a literal load of __current issued through the ldr_va macro. This turns out to be due to the fact that group relocations are disabled when CONFIG_COMPILE_TEST=y, which means that the ldr_va macro resolves to a pair of LDR instructions, the first one being a literal load issued too far from its literal pool. 
Due to the introduction of a couple of new uses of this macro in commit 508074607c7b95b2 ("ARM: 9195/1: entry: avoid explicit literal loads"), the literal pools end up getting rearranged in a way that causes the literal for __current to go out of range. Let's fix this up by putting a .ltorg directive in a suitable place in the code. Link: https://lore.kernel.org/all/202205290805.1vZLAr36-lkp@intel.com/ Fixes: 508074607c7b95b2 ("ARM: 9195/1: entry: avoid explicit literal loads") Reported-by: kernel test robot Signed-off-by: Ard Biesheuvel Tested-by: Nathan Chancellor Signed-off-by: Russell King (Oracle) --- arch/arm/kernel/entry-common.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 7aa3ded4af9292..6a447ac67d80df 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -302,6 +302,7 @@ local_restart: b ret_fast_syscall #endif ENDPROC(vector_swi) + .ltorg /* * This is the really slow path. We're going to be doing From 51fda8ab74642d43d7b0a334ee2f01842f945233 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 18 May 2022 14:51:28 +0300 Subject: [PATCH 0062/1250] dt-bindings: microchip-otpc: document Microchip OTPC Document Microchip OTP controller. 
Signed-off-by: Claudiu Beznea Reviewed-by: Krzysztof Kozlowski Signed-off-by: Srinivas Kandagatla --- .../nvmem/microchip,sama7g5-otpc.yaml | 50 +++++++++++++++++++ .../nvmem/microchip,sama7g5-otpc.h | 12 +++++ 2 files changed, 62 insertions(+) create mode 100644 Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml create mode 100644 include/dt-bindings/nvmem/microchip,sama7g5-otpc.h diff --git a/Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml b/Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml new file mode 100644 index 00000000000000..c3c96fd0baacef --- /dev/null +++ b/Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/nvmem/microchip,sama7g5-otpc.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Microchip SAMA7G5 OTP Controller (OTPC) + +maintainers: + - Claudiu Beznea + +description: | + OTP controller drives a NVMEM memory where system specific data + (e.g. calibration data for analog cells, hardware configuration + settings, chip identifiers) or user specific data could be stored. + +allOf: + - $ref: "nvmem.yaml#" + +properties: + compatible: + items: + - const: microchip,sama7g5-otpc + - const: syscon + + reg: + maxItems: 1 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + #include + + otpc: efuse@e8c00000 { + compatible = "microchip,sama7g5-otpc", "syscon"; + reg = <0xe8c00000 0xec>; + #address-cells = <1>; + #size-cells = <1>; + + temperature_calib: calib@1 { + reg = ; + }; + }; + +... 
diff --git a/include/dt-bindings/nvmem/microchip,sama7g5-otpc.h b/include/dt-bindings/nvmem/microchip,sama7g5-otpc.h new file mode 100644 index 00000000000000..f570b23165a23a --- /dev/null +++ b/include/dt-bindings/nvmem/microchip,sama7g5-otpc.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ + +#ifndef _DT_BINDINGS_NVMEM_MICROCHIP_OTPC_H +#define _DT_BINDINGS_NVMEM_MICROCHIP_OTPC_H + +/* + * Need to have it as a multiple of 4 as NVMEM memory is registered with + * stride = 4. + */ +#define OTP_PKT(id) ((id) * 4) + +#endif From 6b291610dd57a4c5694914de012aa37b906fa93e Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 18 May 2022 14:51:29 +0300 Subject: [PATCH 0063/1250] nvmem: microchip-otpc: add support Add support for Microchip OTP controller available on SAMA7G5. The OTPC controls the access to a non-volatile memory. The memory behind OTPC is organized into packets, packets are composed by a fixed length header (4 bytes long) and a variable length payload (payload length is available in the header). When software request the data at an offset in memory the OTPC will return (via header + data registers) the whole packet that has a word at that offset. For the OTP memory layout like below: offset OTP Memory layout . . . ... . . . 0x0E +-----------+ <--- packet X | header X | 0x12 +-----------+ | payload X | 0x16 | | | | 0x1A | | +-----------+ . . . ... . . . if user requests data at address 0x16 the data started at 0x0E will be returned by controller. User will be able to fetch the whole packet starting at 0x0E (or parts of the packet) via proper registers. The same packet will be returned if software request the data at offset 0x0E or 0x12 or 0x1A. The OTP will be populated by Microchip with at least 2 packets first one being boot configuration packet and the 2nd one being temperature calibration packet. The packet order will be preserved b/w different chip revisions but the packet sizes may change. 
For the above reasons and to keep the same software able to work on all chip variants the read function of the driver is working with a packet id instead of an offset in OTP memory. Signed-off-by: Claudiu Beznea Signed-off-by: Srinivas Kandagatla --- MAINTAINERS | 8 + drivers/nvmem/Kconfig | 7 + drivers/nvmem/Makefile | 2 + drivers/nvmem/microchip-otpc.c | 288 +++++++++++++++++++++++++++++++++ 4 files changed, 305 insertions(+) create mode 100644 drivers/nvmem/microchip-otpc.c diff --git a/MAINTAINERS b/MAINTAINERS index a6d3bd9d2a8d0f..e51eeb0ee0ed01 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13100,6 +13100,14 @@ S: Supported F: Documentation/devicetree/bindings/mtd/atmel-nand.txt F: drivers/mtd/nand/raw/atmel/* +MICROCHIP OTPC DRIVER +M: Claudiu Beznea +L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) +S: Supported +F: Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml +F: drivers/nvmem/microchip-otpc.c +F: dt-bindings/nvmem/microchip,sama7g5-otpc.h + MICROCHIP PWM DRIVER M: Claudiu Beznea L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) diff --git a/drivers/nvmem/Kconfig b/drivers/nvmem/Kconfig index 967d0084800e68..d72d879a6d342f 100644 --- a/drivers/nvmem/Kconfig +++ b/drivers/nvmem/Kconfig @@ -107,6 +107,13 @@ config MTK_EFUSE This driver can also be built as a module. If so, the module will be called efuse-mtk. +config MICROCHIP_OTPC + tristate "Microchip OTPC support" + depends on ARCH_AT91 || COMPILE_TEST + help + This driver enables the OTP controller available on Microchip SAMA7G5 + SoCs. It controls the access to the OTP memory connected to it.
+ config NVMEM_NINTENDO_OTP tristate "Nintendo Wii and Wii U OTP Support" depends on WII || COMPILE_TEST diff --git a/drivers/nvmem/Makefile b/drivers/nvmem/Makefile index 00e136a0a123b6..c710b64f9fe41c 100644 --- a/drivers/nvmem/Makefile +++ b/drivers/nvmem/Makefile @@ -67,3 +67,5 @@ obj-$(CONFIG_NVMEM_SUNPLUS_OCOTP) += nvmem_sunplus_ocotp.o nvmem_sunplus_ocotp-y := sunplus-ocotp.o obj-$(CONFIG_NVMEM_APPLE_EFUSES) += nvmem-apple-efuses.o nvmem-apple-efuses-y := apple-efuses.o +obj-$(CONFIG_MICROCHIP_OTPC) += nvmem-microchip-otpc.o +nvmem-microchip-otpc-y := microchip-otpc.o diff --git a/drivers/nvmem/microchip-otpc.c b/drivers/nvmem/microchip-otpc.c new file mode 100644 index 00000000000000..436e0dc4f33755 --- /dev/null +++ b/drivers/nvmem/microchip-otpc.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * OTP Memory controller + * + * Copyright (C) 2022 Microchip Technology Inc. and its subsidiaries + * + * Author: Claudiu Beznea + */ + +#include +#include +#include +#include +#include +#include + +#define MCHP_OTPC_CR (0x0) +#define MCHP_OTPC_CR_READ BIT(6) +#define MCHP_OTPC_MR (0x4) +#define MCHP_OTPC_MR_ADDR GENMASK(31, 16) +#define MCHP_OTPC_AR (0x8) +#define MCHP_OTPC_SR (0xc) +#define MCHP_OTPC_SR_READ BIT(6) +#define MCHP_OTPC_HR (0x20) +#define MCHP_OTPC_HR_SIZE GENMASK(15, 8) +#define MCHP_OTPC_DR (0x24) + +#define MCHP_OTPC_NAME "mchp-otpc" +#define MCHP_OTPC_SIZE (11 * 1024) + +/** + * struct mchp_otpc - OTPC private data structure + * @base: base address + * @dev: struct device pointer + * @packets: list of packets in OTP memory + * @npackets: number of packets in OTP memory + */ +struct mchp_otpc { + void __iomem *base; + struct device *dev; + struct list_head packets; + u32 npackets; +}; + +/** + * struct mchp_otpc_packet - OTPC packet data structure + * @list: list head + * @id: packet ID + * @offset: packet offset (in words) in OTP memory + */ +struct mchp_otpc_packet { + struct list_head list; + u32 id; + u32 offset; +}; + +static 
struct mchp_otpc_packet *mchp_otpc_id_to_packet(struct mchp_otpc *otpc, + u32 id) +{ + struct mchp_otpc_packet *packet; + + if (id >= otpc->npackets) + return NULL; + + list_for_each_entry(packet, &otpc->packets, list) { + if (packet->id == id) + return packet; + } + + return NULL; +} + +static int mchp_otpc_prepare_read(struct mchp_otpc *otpc, + unsigned int offset) +{ + u32 tmp; + + /* Set address. */ + tmp = readl_relaxed(otpc->base + MCHP_OTPC_MR); + tmp &= ~MCHP_OTPC_MR_ADDR; + tmp |= FIELD_PREP(MCHP_OTPC_MR_ADDR, offset); + writel_relaxed(tmp, otpc->base + MCHP_OTPC_MR); + + /* Set read. */ + tmp = readl_relaxed(otpc->base + MCHP_OTPC_CR); + tmp |= MCHP_OTPC_CR_READ; + writel_relaxed(tmp, otpc->base + MCHP_OTPC_CR); + + /* Wait for packet to be transferred into temporary buffers. */ + return read_poll_timeout(readl_relaxed, tmp, !(tmp & MCHP_OTPC_SR_READ), + 10000, 2000, false, otpc->base + MCHP_OTPC_SR); +} + +/* + * OTPC memory is organized into packets. Each packets contains a header and + * a payload. Header is 4 bytes long and contains the size of the payload. + * Payload size varies. The memory footprint is something as follows: + * + * Memory offset Memory footprint Packet ID + * ------------- ---------------- --------- + * + * 0x0 +------------+ <-- packet 0 + * | header 0 | + * 0x4 +------------+ + * | payload 0 | + * . . + * . ... . + * . . + * offset1 +------------+ <-- packet 1 + * | header 1 | + * offset1 + 0x4 +------------+ + * | payload 1 | + * . . + * . ... . + * . . + * offset2 +------------+ <-- packet 2 + * . . + * . ... . + * . . + * offsetN +------------+ <-- packet N + * | header N | + * offsetN + 0x4 +------------+ + * | payload N | + * . . + * . ... . + * . . + * +------------+ + * + * where offset1, offset2, offsetN depends on the size of payload 0, payload 1, + * payload N-1. 
+ * + * The access to memory is done on a per packet basis: the control registers + * need to be updated with an offset address (within a packet range) and the + * data registers will be updated by controller with information contained by + * that packet. E.g. if control registers are updated with any address within + * the range [offset1, offset2) the data registers are updated by controller + * with packet 1. Header data is accessible through MCHP_OTPC_HR register. + * Payload data is accessible through MCHP_OTPC_DR and MCHP_OTPC_AR registers. + * There is no direct mapping b/w the offset requested by software and the + * offset returned by hardware. + * + * For this, the read function will return the first requested bytes in the + * packet. The user will have to be aware of the memory footprint before doing + * the read request. + */ +static int mchp_otpc_read(void *priv, unsigned int off, void *val, + size_t bytes) +{ + struct mchp_otpc *otpc = priv; + struct mchp_otpc_packet *packet; + u32 *buf = val; + u32 offset; + size_t len = 0; + int ret, payload_size; + + /* + * We reach this point with off being multiple of stride = 4 to + * be able to cross the subsystem. Inside the driver we use continuous + * unsigned integer numbers for packet id, thus divide off by 4 + * before passing it to mchp_otpc_id_to_packet(). + */ + packet = mchp_otpc_id_to_packet(otpc, off / 4); + if (!packet) + return -EINVAL; + offset = packet->offset; + + while (len < bytes) { + ret = mchp_otpc_prepare_read(otpc, offset); + if (ret) + return ret; + + /* Read and save header content. */ + *buf++ = readl_relaxed(otpc->base + MCHP_OTPC_HR); + len += sizeof(*buf); + offset++; + if (len >= bytes) + break; + + /* Read and save payload content.
*/ + payload_size = FIELD_GET(MCHP_OTPC_HR_SIZE, *(buf - 1)); + writel_relaxed(0UL, otpc->base + MCHP_OTPC_AR); + do { + *buf++ = readl_relaxed(otpc->base + MCHP_OTPC_DR); + len += sizeof(*buf); + offset++; + payload_size--; + } while (payload_size >= 0 && len < bytes); + } + + return 0; +} + +static int mchp_otpc_init_packets_list(struct mchp_otpc *otpc, u32 *size) +{ + struct mchp_otpc_packet *packet; + u32 word, word_pos = 0, id = 0, npackets = 0, payload_size; + int ret; + + INIT_LIST_HEAD(&otpc->packets); + *size = 0; + + while (*size < MCHP_OTPC_SIZE) { + ret = mchp_otpc_prepare_read(otpc, word_pos); + if (ret) + return ret; + + word = readl_relaxed(otpc->base + MCHP_OTPC_HR); + payload_size = FIELD_GET(MCHP_OTPC_HR_SIZE, word); + if (!payload_size) + break; + + packet = devm_kzalloc(otpc->dev, sizeof(*packet), GFP_KERNEL); + if (!packet) + return -ENOMEM; + + packet->id = id++; + packet->offset = word_pos; + INIT_LIST_HEAD(&packet->list); + list_add_tail(&packet->list, &otpc->packets); + + /* Count size by adding header and payload sizes. */ + *size += 4 * (payload_size + 1); + /* Next word: this packet (header, payload) position + 1.
*/ + word_pos += payload_size + 2; + + npackets++; + } + + otpc->npackets = npackets; + + return 0; +} + +static struct nvmem_config mchp_nvmem_config = { + .name = MCHP_OTPC_NAME, + .type = NVMEM_TYPE_OTP, + .read_only = true, + .word_size = 4, + .stride = 4, + .reg_read = mchp_otpc_read, +}; + +static int mchp_otpc_probe(struct platform_device *pdev) +{ + struct nvmem_device *nvmem; + struct mchp_otpc *otpc; + u32 size; + int ret; + + otpc = devm_kzalloc(&pdev->dev, sizeof(*otpc), GFP_KERNEL); + if (!otpc) + return -ENOMEM; + + otpc->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(otpc->base)) + return PTR_ERR(otpc->base); + + otpc->dev = &pdev->dev; + ret = mchp_otpc_init_packets_list(otpc, &size); + if (ret) + return ret; + + mchp_nvmem_config.dev = otpc->dev; + mchp_nvmem_config.size = size; + mchp_nvmem_config.priv = otpc; + nvmem = devm_nvmem_register(&pdev->dev, &mchp_nvmem_config); + + return PTR_ERR_OR_ZERO(nvmem); +} + +static const struct of_device_id __maybe_unused mchp_otpc_ids[] = { + { .compatible = "microchip,sama7g5-otpc", }, + { }, +}; +MODULE_DEVICE_TABLE(of, mchp_otpc_ids); + +static struct platform_driver mchp_otpc_driver = { + .probe = mchp_otpc_probe, + .driver = { + .name = MCHP_OTPC_NAME, + .of_match_table = of_match_ptr(mchp_otpc_ids), + }, +}; +module_platform_driver(mchp_otpc_driver); + +MODULE_AUTHOR("Claudiu Beznea "); +MODULE_DESCRIPTION("Microchip SAMA7G5 OTPC driver"); +MODULE_LICENSE("GPL"); From e44850ee00a1556c1179179d92df1ad53d5c3347 Mon Sep 17 00:00:00 2001 From: keliu Date: Fri, 27 May 2022 07:30:53 +0000 Subject: [PATCH 0064/1250] drivers: slimbus: Directly use ida_alloc()/free() Use ida_alloc()/ida_free() instead of deprecated ida_simple_get()/ida_simple_remove() . 
Signed-off-by: keliu Signed-off-by: Srinivas Kandagatla --- drivers/slimbus/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/slimbus/core.c b/drivers/slimbus/core.c index 78480e332ab8f5..219483b79c09ce 100644 --- a/drivers/slimbus/core.c +++ b/drivers/slimbus/core.c @@ -250,7 +250,7 @@ int slim_register_controller(struct slim_controller *ctrl) { int id; - id = ida_simple_get(&ctrl_ida, 0, 0, GFP_KERNEL); + id = ida_alloc(&ctrl_ida, GFP_KERNEL); if (id < 0) return id; @@ -299,7 +299,7 @@ int slim_unregister_controller(struct slim_controller *ctrl) { /* Remove all clients */ device_for_each_child(ctrl->dev, NULL, slim_ctrl_remove_device); - ida_simple_remove(&ctrl_ida, ctrl->id); + ida_free(&ctrl_ida, ctrl->id); return 0; } @@ -323,7 +323,7 @@ void slim_report_absent(struct slim_device *sbdev) sbdev->is_laddr_valid = false; mutex_unlock(&ctrl->lock); if (!ctrl->get_laddr) - ida_simple_remove(&ctrl->laddr_ida, sbdev->laddr); + ida_free(&ctrl->laddr_ida, sbdev->laddr); slim_device_update_status(sbdev, SLIM_DEVICE_STATUS_DOWN); } EXPORT_SYMBOL_GPL(slim_report_absent); From 4ad3deabeea21f9fda6a49de10fd417c4199ffaf Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:11:38 +0200 Subject: [PATCH 0065/1250] slimbus: messaging: fix typos in comments Spelling mistakes (triple letters) in comments. Detected with the help of Coccinelle. 
Signed-off-by: Julia Lawall Signed-off-by: Srinivas Kandagatla --- drivers/slimbus/messaging.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/slimbus/messaging.c b/drivers/slimbus/messaging.c index e5ae26227bdbff..4ce0cb61e48135 100644 --- a/drivers/slimbus/messaging.c +++ b/drivers/slimbus/messaging.c @@ -79,7 +79,7 @@ int slim_alloc_txn_tid(struct slim_controller *ctrl, struct slim_msg_txn *txn) EXPORT_SYMBOL_GPL(slim_alloc_txn_tid); /** - * slim_free_txn_tid() - Freee tid of txn + * slim_free_txn_tid() - Free tid of txn * * @ctrl: Controller handle * @txn: transaction whose tid should be freed @@ -101,7 +101,7 @@ EXPORT_SYMBOL_GPL(slim_free_txn_tid); * @txn: Transaction to be sent over SLIMbus * * Called by controller to transmit messaging transactions not dealing with - * Interface/Value elements. (e.g. transmittting a message to assign logical + * Interface/Value elements. (e.g. transmitting a message to assign logical * address to a slave device * * Return: -ETIMEDOUT: If transmission of this message timed out From c98ebe065e07dc6c6c82bcbfd2a0351a7bde68da Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sun, 5 Jun 2022 18:05:07 +0200 Subject: [PATCH 0066/1250] pinctrl: samsung: do not use bindings header with constants The Samsung SoC pin controller driver uses only three defines from the bindings header with pin configuration register values, which proves the point that this header is not a proper bindings-type abstraction layer with IDs. Define the needed register values directly in the driver and stop using the bindings header. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Chanho Park Acked-by: Rob Herring Link: https://lore.kernel.org/r/20220605160508.134075-8-krzysztof.kozlowski@linaro.org --- drivers/pinctrl/samsung/pinctrl-exynos.c | 6 ++---- drivers/pinctrl/samsung/pinctrl-exynos.h | 3 +++ drivers/pinctrl/samsung/pinctrl-samsung.c | 4 +--- drivers/pinctrl/samsung/pinctrl-samsung.h | 8 ++++++++ 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/pinctrl/samsung/pinctrl-exynos.c b/drivers/pinctrl/samsung/pinctrl-exynos.c index 6d7ca1758292ba..a8212fc126bf28 100644 --- a/drivers/pinctrl/samsung/pinctrl-exynos.c +++ b/drivers/pinctrl/samsung/pinctrl-exynos.c @@ -27,8 +27,6 @@ #include #include -#include - #include "pinctrl-samsung.h" #include "pinctrl-exynos.h" @@ -173,7 +171,7 @@ static int exynos_irq_request_resources(struct irq_data *irqd) con = readl(bank->pctl_base + reg_con); con &= ~(mask << shift); - con |= EXYNOS_PIN_FUNC_EINT << shift; + con |= EXYNOS_PIN_CON_FUNC_EINT << shift; writel(con, bank->pctl_base + reg_con); raw_spin_unlock_irqrestore(&bank->slock, flags); @@ -196,7 +194,7 @@ static void exynos_irq_release_resources(struct irq_data *irqd) con = readl(bank->pctl_base + reg_con); con &= ~(mask << shift); - con |= EXYNOS_PIN_FUNC_INPUT << shift; + con |= PIN_CON_FUNC_INPUT << shift; writel(con, bank->pctl_base + reg_con); raw_spin_unlock_irqrestore(&bank->slock, flags); diff --git a/drivers/pinctrl/samsung/pinctrl-exynos.h b/drivers/pinctrl/samsung/pinctrl-exynos.h index bfad1ced80176f..7bd6d82c9f369a 100644 --- a/drivers/pinctrl/samsung/pinctrl-exynos.h +++ b/drivers/pinctrl/samsung/pinctrl-exynos.h @@ -16,6 +16,9 @@ #ifndef __PINCTRL_SAMSUNG_EXYNOS_H #define __PINCTRL_SAMSUNG_EXYNOS_H +/* Values for the pin CON register */ +#define EXYNOS_PIN_CON_FUNC_EINT 0xf + /* External GPIO and wakeup interrupt related definitions */ #define EXYNOS_GPIO_ECON_OFFSET 0x700 #define EXYNOS_GPIO_EFLTCON_OFFSET 0x800 diff --git 
a/drivers/pinctrl/samsung/pinctrl-samsung.c b/drivers/pinctrl/samsung/pinctrl-samsung.c index 26d309d2516d13..4837bceb767b45 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.c +++ b/drivers/pinctrl/samsung/pinctrl-samsung.c @@ -26,8 +26,6 @@ #include #include -#include - #include "../core.h" #include "pinctrl-samsung.h" @@ -614,7 +612,7 @@ static int samsung_gpio_set_direction(struct gpio_chip *gc, data = readl(reg); data &= ~(mask << shift); if (!input) - data |= EXYNOS_PIN_FUNC_OUTPUT << shift; + data |= PIN_CON_FUNC_OUTPUT << shift; writel(data, reg); return 0; diff --git a/drivers/pinctrl/samsung/pinctrl-samsung.h b/drivers/pinctrl/samsung/pinctrl-samsung.h index fc6f5199c548c3..9af93e3d8d9ff3 100644 --- a/drivers/pinctrl/samsung/pinctrl-samsung.h +++ b/drivers/pinctrl/samsung/pinctrl-samsung.h @@ -53,6 +53,14 @@ enum pincfg_type { #define PINCFG_UNPACK_TYPE(cfg) ((cfg) & PINCFG_TYPE_MASK) #define PINCFG_UNPACK_VALUE(cfg) (((cfg) & PINCFG_VALUE_MASK) >> \ PINCFG_VALUE_SHIFT) +/* + * Values for the pin CON register, choosing pin function. + * The basic set (input and output) are same between: S3C24xx, S3C64xx, S5PV210, + * Exynos ARMv7, Exynos ARMv8, Tesla FSD. + */ +#define PIN_CON_FUNC_INPUT 0x0 +#define PIN_CON_FUNC_OUTPUT 0x1 + /** * enum eint_type - possible external interrupt types. * @EINT_TYPE_NONE: bank does not support external interrupts From 4ff21ed9269793eaa7c64e06bfb4119608efa731 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 30 May 2022 11:46:25 +0900 Subject: [PATCH 0067/1250] arm64: dts: renesas: r8a779f0: Add IPMMU nodes Add IPMMU nodes for r8a779f0. 
Signed-off-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20220530024626.1870277-2-yoshihiro.shimoda.uh@renesas.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/r8a779f0.dtsi | 46 +++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi index df46fb87cffc55..512e0b57fd6ae7 100644 --- a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi @@ -343,6 +343,52 @@ dma-channels = <16>; }; + ipmmu_rt0: iommu@ee480000 { + compatible = "renesas,ipmmu-r8a779f0", + "renesas,rcar-gen4-ipmmu"; + reg = <0 0xee480000 0 0x20000>; + renesas,ipmmu-main = <&ipmmu_mm 10>; + power-domains = <&sysc R8A779F0_PD_ALWAYS_ON>; + #iommu-cells = <1>; + }; + + ipmmu_rt1: iommu@ee4c0000 { + compatible = "renesas,ipmmu-r8a779f0", + "renesas,rcar-gen4-ipmmu"; + reg = <0 0xee4c0000 0 0x20000>; + renesas,ipmmu-main = <&ipmmu_mm 19>; + power-domains = <&sysc R8A779F0_PD_ALWAYS_ON>; + #iommu-cells = <1>; + }; + + ipmmu_ds0: iommu@eed00000 { + compatible = "renesas,ipmmu-r8a779f0", + "renesas,rcar-gen4-ipmmu"; + reg = <0 0xeed00000 0 0x20000>; + renesas,ipmmu-main = <&ipmmu_mm 0>; + power-domains = <&sysc R8A779F0_PD_ALWAYS_ON>; + #iommu-cells = <1>; + }; + + ipmmu_hc: iommu@eed40000 { + compatible = "renesas,ipmmu-r8a779f0", + "renesas,rcar-gen4-ipmmu"; + reg = <0 0xeed40000 0 0x20000>; + renesas,ipmmu-main = <&ipmmu_mm 2>; + power-domains = <&sysc R8A779F0_PD_ALWAYS_ON>; + #iommu-cells = <1>; + }; + + ipmmu_mm: iommu@eefc0000 { + compatible = "renesas,ipmmu-r8a779f0", + "renesas,rcar-gen4-ipmmu"; + reg = <0 0xeefc0000 0 0x20000>; + interrupts = , + ; + power-domains = <&sysc R8A779F0_PD_ALWAYS_ON>; + #iommu-cells = <1>; + }; + gic: interrupt-controller@f1000000 { compatible = "arm,gic-v3"; #interrupt-cells = <3>; From b36be13ed6cb4619f26f9e963e41ffd74c3a2ef7 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 30 May 2022 11:46:26 
+0900 Subject: [PATCH 0068/1250] arm64: dts: renesas: r8a779f0: Add iommus to DMAC nodes Add iommus properties to the DMAC nodes for r8a779f0. Signed-off-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20220530024626.1870277-3-yoshihiro.shimoda.uh@renesas.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/r8a779f0.dtsi | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi index 512e0b57fd6ae7..ad8c77edb12699 100644 --- a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi @@ -306,6 +306,14 @@ resets = <&cpg 709>; #dma-cells = <1>; dma-channels = <16>; + iommus = <&ipmmu_ds0 0>, <&ipmmu_ds0 1>, + <&ipmmu_ds0 2>, <&ipmmu_ds0 3>, + <&ipmmu_ds0 4>, <&ipmmu_ds0 5>, + <&ipmmu_ds0 6>, <&ipmmu_ds0 7>, + <&ipmmu_ds0 8>, <&ipmmu_ds0 9>, + <&ipmmu_ds0 10>, <&ipmmu_ds0 11>, + <&ipmmu_ds0 12>, <&ipmmu_ds0 13>, + <&ipmmu_ds0 14>, <&ipmmu_ds0 15>; }; dmac1: dma-controller@e7351000 { @@ -341,6 +349,14 @@ resets = <&cpg 710>; #dma-cells = <1>; dma-channels = <16>; + iommus = <&ipmmu_ds0 16>, <&ipmmu_ds0 17>, + <&ipmmu_ds0 18>, <&ipmmu_ds0 19>, + <&ipmmu_ds0 20>, <&ipmmu_ds0 21>, + <&ipmmu_ds0 22>, <&ipmmu_ds0 23>, + <&ipmmu_ds0 24>, <&ipmmu_ds0 25>, + <&ipmmu_ds0 26>, <&ipmmu_ds0 27>, + <&ipmmu_ds0 28>, <&ipmmu_ds0 29>, + <&ipmmu_ds0 30>, <&ipmmu_ds0 31>; }; ipmmu_rt0: iommu@ee480000 { From 1105171d080eb051fbd51748b76fd5fed747505e Mon Sep 17 00:00:00 2001 From: "GONG, Ruiqi" Date: Mon, 6 Jun 2022 16:17:14 +0800 Subject: [PATCH 0069/1250] smack: Replace kzalloc + strncpy with kstrndup Simplify the code by using kstrndup instead of kzalloc and strncpy in smk_parse_smack(), which meanwhile remove strncpy as [1] suggests. 
[1]: https://github.com/KSPP/linux/issues/90 Signed-off-by: GONG, Ruiqi Signed-off-by: Casey Schaufler --- security/smack/smack_access.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index d2186e2757be81..585e5e35710b24 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -465,12 +465,9 @@ char *smk_parse_smack(const char *string, int len) if (i == 0 || i >= SMK_LONGLABEL) return ERR_PTR(-EINVAL); - smack = kzalloc(i + 1, GFP_NOFS); - if (smack == NULL) + smack = kstrndup(string, i, GFP_NOFS); + if (!smack) return ERR_PTR(-ENOMEM); - - strncpy(smack, string, i); - return smack; } From ad564394b3db3ae93ccf2f185a6d95719162e222 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 8 Jun 2022 16:46:13 +0300 Subject: [PATCH 0070/1250] Bluetooth: fix an error code in hci_register_dev() Preserve the error code from hci_register_suspend_notifier(). Don't return success. Fixes: d6bb2a91f95b ("Bluetooth: Unregister suspend with userchannel") Signed-off-by: Dan Carpenter Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 3d8d2fcc0eb4a5..6faae50d933dcc 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2646,7 +2646,8 @@ int hci_register_dev(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); - if (hci_register_suspend_notifier(hdev)) + error = hci_register_suspend_notifier(hdev); + if (error) goto err_wqueue; queue_work(hdev->req_workqueue, &hdev->power_on); From 0b537674e072a37dec2fcefef4df2317b58aaa3f Mon Sep 17 00:00:00 2001 From: Xiaohui Zhang Date: Tue, 7 Jun 2022 23:30:20 +0800 Subject: [PATCH 0071/1250] Bluetooth: use memset avoid memory leaks Similar to the handling of l2cap_ecred_connect in commit d3715b2333e9 ("Bluetooth: use memset avoid memory leaks"), we thought a 
patch might be needed here as well. Use memset to initialize structs to prevent memory leaks in l2cap_le_connect Signed-off-by: Xiaohui Zhang Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index ae78490ecd3d4b..09ecaf556de567 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1369,6 +1369,7 @@ static void l2cap_le_connect(struct l2cap_chan *chan) l2cap_le_flowctl_init(chan, 0); + memset(&req, 0, sizeof(req)); req.psm = chan->psm; req.scid = cpu_to_le16(chan->scid); req.mtu = cpu_to_le16(chan->imtu); From 575947d8537cbf5cd9594e67805da72fa46810c7 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 9 Jun 2022 18:40:32 +0800 Subject: [PATCH 0072/1250] mm: sysctl: fix missing numa_stat when !CONFIG_HUGETLB_PAGE "numa_stat" should not be included in the scope of CONFIG_HUGETLB_PAGE, if CONFIG_HUGETLB_PAGE is not configured even if CONFIG_NUMA is configured, "numa_stat" is missing from /proc. Move it out of CONFIG_HUGETLB_PAGE to fix it.
Fixes: 4518085e127d ("mm, sysctl: make NUMA stats configurable") Signed-off-by: Muchun Song Cc: Acked-by: Michal Hocko Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e52b6e372c602c..aaf0b1f1dc573f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2091,6 +2091,17 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO_HUNDRED, }, +#ifdef CONFIG_NUMA + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_HUGETLB_PAGE { .procname = "nr_hugepages", @@ -2107,15 +2118,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, - { - .procname = "numa_stat", - .data = &sysctl_vm_numa_stat, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = sysctl_vm_numa_stat_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #endif { .procname = "hugetlb_shm_group", From 30bb01fe2ff13042eef6454516526b46739f6896 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 17 May 2022 17:07:31 +0200 Subject: [PATCH 0073/1250] sysctl: Merge adjacent CONFIG_TREE_RCU blocks There are two adjacent sysctl entries protected by the same CONFIG_TREE_RCU config symbol. Merge them into a single block to improve readability. Use the more common "#ifdef" form while at it. Signed-off-by: Geert Uytterhoeven Reviewed-by: Paul E. 
McKenney Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index aaf0b1f1dc573f..55839c34ff85a2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2017,7 +2017,7 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#if defined(CONFIG_TREE_RCU) +#ifdef CONFIG_TREE_RCU { .procname = "panic_on_rcu_stall", .data = &sysctl_panic_on_rcu_stall, @@ -2027,8 +2027,6 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, -#endif -#if defined(CONFIG_TREE_RCU) { .procname = "max_rcu_stall_to_panic", .data = &sysctl_max_rcu_stall_to_panic, From acdc07ace871d307fb19550bfcdd47ac1a896e64 Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Sun, 22 May 2022 13:29:33 +0800 Subject: [PATCH 0074/1250] kernel/sysctl.c: Clean up indentation, replace spaces with tab. This patch fixes two coding style issues: 1. Clean up indentation, replace spaces with tab 2. 
Add space after ',' Signed-off-by: Fanjun Kong Signed-off-by: Luis Chamberlain --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 55839c34ff85a2..c4901612ecc4a0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1277,8 +1277,8 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - do_proc_dointvec_userhz_jiffies_conv,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_userhz_jiffies_conv, NULL); } /** From b50503990d8889a73bc412fd976ef00951cdb2b5 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Fri, 10 Jun 2022 17:23:07 +0800 Subject: [PATCH 0075/1250] smack: Remove the redundant lsm_inode_alloc It's not possible for inode->i_security to be NULL here because every inode will call inode_init_always and then lsm_inode_alloc to alloc memory for inode->security, this is what LSM infrastructure management do, so remove this redundant code. Signed-off-by: Xiu Jianfeng Signed-off-by: Casey Schaufler --- security/smack/smack_lsm.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 6207762dbdb131..001831458fa2c8 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -766,13 +766,6 @@ static int smack_set_mnt_opts(struct super_block *sb, if (sp->smk_flags & SMK_SB_INITIALIZED) return 0; - if (inode->i_security == NULL) { - int rc = lsm_inode_alloc(inode); - - if (rc) - return rc; - } - if (!smack_privileged(CAP_MAC_ADMIN)) { /* * Unprivileged mounts don't get to specify Smack values. 
From 51c6aad4190b47a91f1f164fb4ab7999f886ce2c Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 21 Mar 2022 08:33:56 +0000 Subject: [PATCH 0076/1250] media: v4l2-compat-ioctl32.c: zero buffer passed to v4l2_compat_get_array_args() The v4l2_compat_get_array_args() function can leave uninitialized memory in the buffer it is passed. So zero it before copying array elements from userspace into the buffer. Signed-off-by: Hans Verkuil Reported-by: syzbot+ff18193ff05f3f87f226@syzkaller.appspotmail.com Reviewed-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index 0f3d6b5667b07e..55c26e7d370e92 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -1040,6 +1040,8 @@ int v4l2_compat_get_array_args(struct file *file, void *mbuf, { int err = 0; + memset(mbuf, 0, array_size); + switch (cmd) { case VIDIOC_G_FMT32: case VIDIOC_S_FMT32: From ea37ee3a1561355fa94dbf574305c568fc23d560 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 18 May 2022 14:06:31 +0100 Subject: [PATCH 0077/1250] media: v4l2-ioctl.c: fix incorrect error path If allocating array_buf fails, or copying data from userspace into that buffer fails, then just free memory and return the error. Don't attempt to call video_put_user() since there is no point, and it would copy back data on error even if INFO_FL_ALWAYS_COPY wasn't set. So if writing the array back to userspace fails, then don't go to out_array_args, instead just continue with the regular code that just returns the error unless 'always_copy' is set. Update the VIDIOC_G/S/TRY_EXT_CTRLS ioctls to set the ALWAYS_COPY flag since they now need it. Before this worked due to this buggy code, but now that that is fixed these ioctls need to set this flag explicitly. 
Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/v4l2-core/v4l2-ioctl.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 21470de62d7236..6e0ddb70e341b2 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -2823,9 +2823,9 @@ static const struct v4l2_ioctl_info v4l2_ioctls[] = { IOCTL_INFO(VIDIOC_S_PRIORITY, v4l_s_priority, v4l_print_u32, INFO_FL_PRIO), IOCTL_INFO(VIDIOC_G_SLICED_VBI_CAP, v4l_g_sliced_vbi_cap, v4l_print_sliced_vbi_cap, INFO_FL_CLEAR(v4l2_sliced_vbi_cap, type)), IOCTL_INFO(VIDIOC_LOG_STATUS, v4l_log_status, v4l_print_newline, 0), - IOCTL_INFO(VIDIOC_G_EXT_CTRLS, v4l_g_ext_ctrls, v4l_print_ext_controls, INFO_FL_CTRL), - IOCTL_INFO(VIDIOC_S_EXT_CTRLS, v4l_s_ext_ctrls, v4l_print_ext_controls, INFO_FL_PRIO | INFO_FL_CTRL), - IOCTL_INFO(VIDIOC_TRY_EXT_CTRLS, v4l_try_ext_ctrls, v4l_print_ext_controls, INFO_FL_CTRL), + IOCTL_INFO(VIDIOC_G_EXT_CTRLS, v4l_g_ext_ctrls, v4l_print_ext_controls, INFO_FL_CTRL | INFO_FL_ALWAYS_COPY), + IOCTL_INFO(VIDIOC_S_EXT_CTRLS, v4l_s_ext_ctrls, v4l_print_ext_controls, INFO_FL_PRIO | INFO_FL_CTRL | INFO_FL_ALWAYS_COPY), + IOCTL_INFO(VIDIOC_TRY_EXT_CTRLS, v4l_try_ext_ctrls, v4l_print_ext_controls, INFO_FL_CTRL | INFO_FL_ALWAYS_COPY), IOCTL_INFO(VIDIOC_ENUM_FRAMESIZES, v4l_stub_enum_framesizes, v4l_print_frmsizeenum, INFO_FL_CLEAR(v4l2_frmsizeenum, pixel_format)), IOCTL_INFO(VIDIOC_ENUM_FRAMEINTERVALS, v4l_stub_enum_frameintervals, v4l_print_frmivalenum, INFO_FL_CLEAR(v4l2_frmivalenum, height)), IOCTL_INFO(VIDIOC_G_ENC_INDEX, v4l_stub_g_enc_index, v4l_print_enc_idx, 0), @@ -3318,8 +3318,7 @@ video_usercopy(struct file *file, unsigned int orig_cmd, unsigned long arg, array_buf = kvmalloc(array_size, GFP_KERNEL); err = -ENOMEM; if (array_buf == NULL) - goto out_array_args; - err = -EFAULT; + goto out; if (in_compat_syscall()) err = 
v4l2_compat_get_array_args(file, array_buf, user_ptr, array_size, @@ -3328,7 +3327,7 @@ video_usercopy(struct file *file, unsigned int orig_cmd, unsigned long arg, err = copy_from_user(array_buf, user_ptr, array_size) ? -EFAULT : 0; if (err) - goto out_array_args; + goto out; *kernel_ptr = array_buf; } @@ -3346,6 +3345,13 @@ video_usercopy(struct file *file, unsigned int orig_cmd, unsigned long arg, trace_v4l2_qbuf(video_devdata(file)->minor, parg); } + /* + * Some ioctls can return an error, but still have valid + * results that must be returned. + */ + if (err < 0 && !always_copy) + goto out; + if (has_array_args) { *kernel_ptr = (void __force *)user_ptr; if (in_compat_syscall()) { @@ -3360,16 +3366,8 @@ video_usercopy(struct file *file, unsigned int orig_cmd, unsigned long arg, } else if (copy_to_user(user_ptr, array_buf, array_size)) { err = -EFAULT; } - goto out_array_args; } - /* - * Some ioctls can return an error, but still have valid - * results that must be returned. - */ - if (err < 0 && !always_copy) - goto out; -out_array_args: if (video_put_user((void __user *)arg, parg, cmd, orig_cmd)) err = -EFAULT; out: From d3daf73380ce50f89f07954a2160a5fd414f8725 Mon Sep 17 00:00:00 2001 From: Hangyu Hua Date: Thu, 19 May 2022 03:17:43 +0100 Subject: [PATCH 0078/1250] media: dvb_vb2: fix possible out of bound access vb2_core_qbuf and vb2_core_querybuf don't check the range of b->index controlled by the user. Fix this by adding range checking code before using them. 
Fixes: 57868acc369a ("media: videobuf2: Add new uAPI for DVB streaming I/O") Signed-off-by: Hangyu Hua Reviewed-by: Sergey Senozhatsky Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/media/dvb-core/dvb_vb2.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/media/dvb-core/dvb_vb2.c b/drivers/media/dvb-core/dvb_vb2.c index a1bd6d9c9223cd..909df82fed3329 100644 --- a/drivers/media/dvb-core/dvb_vb2.c +++ b/drivers/media/dvb-core/dvb_vb2.c @@ -354,6 +354,12 @@ int dvb_vb2_reqbufs(struct dvb_vb2_ctx *ctx, struct dmx_requestbuffers *req) int dvb_vb2_querybuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b) { + struct vb2_queue *q = &ctx->vb_q; + + if (b->index >= q->num_buffers) { + dprintk(1, "[%s] buffer index out of range\n", ctx->name); + return -EINVAL; + } vb2_core_querybuf(&ctx->vb_q, b->index, b); dprintk(3, "[%s] index=%d\n", ctx->name, b->index); return 0; @@ -378,8 +384,13 @@ int dvb_vb2_expbuf(struct dvb_vb2_ctx *ctx, struct dmx_exportbuffer *exp) int dvb_vb2_qbuf(struct dvb_vb2_ctx *ctx, struct dmx_buffer *b) { + struct vb2_queue *q = &ctx->vb_q; int ret; + if (b->index >= q->num_buffers) { + dprintk(1, "[%s] buffer index out of range\n", ctx->name); + return -EINVAL; + } ret = vb2_core_qbuf(&ctx->vb_q, b->index, b, NULL); if (ret) { dprintk(1, "[%s] index=%d errno=%d\n", ctx->name, From 3d1c64c9c1e6ede50599677a9b3ff4d83b5641c7 Mon Sep 17 00:00:00 2001 From: Liao Chang Date: Wed, 25 May 2022 16:02:41 +0800 Subject: [PATCH 0079/1250] csky/kprobe: reclaim insn_slot on kprobe unregistration On kprobe registration the kernel allocates one insn_slot for the new kprobe, but it forgets to reclaim the insn_slot on unregistration, leading to a potential leak.
Reported-by: Chen Guokai Reviewed-by: Masami Hiramatsu (Google) Signed-off-by: Liao Chang Signed-off-by: Guo Ren --- arch/csky/kernel/probes/kprobes.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 34ba684d5962b1..3c6e5c725d8143 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -124,6 +124,10 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { + if (p->ainsn.api.insn) { + free_insn_slot(p->ainsn.api.insn, 0); + p->ainsn.api.insn = NULL; + } } static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) From 54b5189fe7098f92ccf1fd943b10f62a5d2d8bb7 Mon Sep 17 00:00:00 2001 From: Linh Phung Date: Wed, 25 May 2022 17:13:55 +0200 Subject: [PATCH 0080/1250] arm64: dts: renesas: r8a779f0: Add thermal support Add support for 3 TSC nodes of thermal. The 4th node is for the control domain and not for Linux. Signed-off-by: Linh Phung [wsa: rebased, fixed resource size, removed unused 4th node breaking probe] Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20220525151355.24175-1-wsa+renesas@sang-engineering.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/r8a779f0.dtsi | 56 +++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi index ad8c77edb12699..145447ac1fe267 100644 --- a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi @@ -157,6 +157,18 @@ #power-domain-cells = <1>; }; + tsc: thermal@e6198000 { + compatible = "renesas,r8a779f0-thermal"; + /* The 4th sensor is in control domain and not for Linux */ + reg = <0 0xe6198000 0 0x200>, + <0 0xe61a0000 0 0x200>, + <0 0xe61a8000 0 0x200>; + clocks = <&cpg CPG_MOD 919>; + power-domains = <&sysc R8A779F0_PD_ALWAYS_ON>; + resets = <&cpg 919>; + #thermal-sensor-cells = <1>; + 
}; + i2c0: i2c@e6500000 { compatible = "renesas,i2c-r8a779f0", "renesas,rcar-gen4-i2c"; @@ -422,6 +434,50 @@ }; }; + thermal-zones { + sensor_thermal1: sensor-thermal1 { + polling-delay-passive = <250>; + polling-delay = <1000>; + thermal-sensors = <&tsc 0>; + + trips { + sensor1_crit: sensor1-crit { + temperature = <120000>; + hysteresis = <1000>; + type = "critical"; + }; + }; + }; + + sensor_thermal2: sensor-thermal2 { + polling-delay-passive = <250>; + polling-delay = <1000>; + thermal-sensors = <&tsc 1>; + + trips { + sensor2_crit: sensor2-crit { + temperature = <120000>; + hysteresis = <1000>; + type = "critical"; + }; + }; + }; + + sensor_thermal3: sensor-thermal3 { + polling-delay-passive = <250>; + polling-delay = <1000>; + thermal-sensors = <&tsc 2>; + + trips { + sensor3_crit: sensor3-crit { + temperature = <120000>; + hysteresis = <1000>; + type = "critical"; + }; + }; + }; + }; + timer { compatible = "arm,armv8-timer"; interrupts-extended = <&gic GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(1) | IRQ_TYPE_LEVEL_LOW)>, From e358e16fc3fa87aa2831d0e4976e2266b975c39f Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 26 May 2022 22:42:31 +0200 Subject: [PATCH 0081/1250] arm64: dts: renesas: Adjust whitespace around '=' Fix whitespace coding style: use single space instead of tabs or multiple spaces around '=' sign in property assignment. No functional changes (same DTB). 
Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220526204231.832090-1-krzysztof.kozlowski@linaro.org Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/draak.dtsi | 2 +- arch/arm64/boot/dts/renesas/ebisu.dtsi | 2 +- arch/arm64/boot/dts/renesas/salvator-common.dtsi | 2 +- arch/arm64/boot/dts/renesas/ulcb-kf.dtsi | 14 +++++++------- arch/arm64/boot/dts/renesas/ulcb.dtsi | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm64/boot/dts/renesas/draak.dtsi b/arch/arm64/boot/dts/renesas/draak.dtsi index 7231f820d60113..ef3bb835d5c051 100644 --- a/arch/arm64/boot/dts/renesas/draak.dtsi +++ b/arch/arm64/boot/dts/renesas/draak.dtsi @@ -630,7 +630,7 @@ bitclock-master = <&rsnd_for_ak4613>; frame-master = <&rsnd_for_ak4613>; playback = <&ssi3>, <&src5>, <&dvc0>; - capture = <&ssi4>, <&src6>, <&dvc1>; + capture = <&ssi4>, <&src6>, <&dvc1>; }; }; }; diff --git a/arch/arm64/boot/dts/renesas/ebisu.dtsi b/arch/arm64/boot/dts/renesas/ebisu.dtsi index 9c311906fdaf39..8fc03491a11c43 100644 --- a/arch/arm64/boot/dts/renesas/ebisu.dtsi +++ b/arch/arm64/boot/dts/renesas/ebisu.dtsi @@ -711,7 +711,7 @@ rcar_sound,dai { dai0 { playback = <&ssi0>, <&src0>, <&dvc0>; - capture = <&ssi1>, <&src1>, <&dvc1>; + capture = <&ssi1>, <&src1>, <&dvc1>; }; }; diff --git a/arch/arm64/boot/dts/renesas/salvator-common.dtsi b/arch/arm64/boot/dts/renesas/salvator-common.dtsi index aa44bef0c370ff..b7c7911858b2c7 100644 --- a/arch/arm64/boot/dts/renesas/salvator-common.dtsi +++ b/arch/arm64/boot/dts/renesas/salvator-common.dtsi @@ -832,7 +832,7 @@ frame-master = <&rsnd_endpoint0>; playback = <&ssi0>, <&src0>, <&dvc0>; - capture = <&ssi1>, <&src1>, <&dvc1>; + capture = <&ssi1>, <&src1>, <&dvc1>; }; }; diff --git a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi index 5bcb84403ef682..408871c2859d14 100644 --- a/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi +++ b/arch/arm64/boot/dts/renesas/ulcb-kf.dtsi @@ -206,12 +206,12 
@@ clocks = <&clksndsel>; clock-names = "scki"; - VDD1-supply = <&snd_3p3v>; - VDD2-supply = <&snd_3p3v>; - VCCAD1-supply = <&snd_vcc5v>; - VCCAD2-supply = <&snd_vcc5v>; - VCCDA1-supply = <&snd_vcc5v>; - VCCDA2-supply = <&snd_vcc5v>; + VDD1-supply = <&snd_3p3v>; + VDD2-supply = <&snd_3p3v>; + VCCAD1-supply = <&snd_vcc5v>; + VCCAD2-supply = <&snd_vcc5v>; + VCCDA1-supply = <&snd_vcc5v>; + VCCDA2-supply = <&snd_vcc5v>; ports { #address-cells = <1>; @@ -438,7 +438,7 @@ bitclock-master; frame-master; dai-tdm-slot-num = <6>; - capture = <&ssi4>; + capture = <&ssi4>; }; }; }; diff --git a/arch/arm64/boot/dts/renesas/ulcb.dtsi b/arch/arm64/boot/dts/renesas/ulcb.dtsi index d5f0f75b249b46..0772dfe4adffee 100644 --- a/arch/arm64/boot/dts/renesas/ulcb.dtsi +++ b/arch/arm64/boot/dts/renesas/ulcb.dtsi @@ -411,7 +411,7 @@ bitclock-master; frame-master; playback = <&ssi0>, <&src0>, <&dvc0>; - capture = <&ssi1>, <&src1>, <&dvc1>; + capture = <&ssi1>, <&src1>, <&dvc1>; }; }; rsnd_port1: port@1 { From 584add1b6c6ac4f540e10ceb95c46dcc50e9b086 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 4 Jun 2022 01:29:40 +0200 Subject: [PATCH 0082/1250] arm64: dts: renesas: rzg2l-smarc: Use proper bool operator When checking for defined macros, we want the boolean AND not the binary one. 
Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20220603232940.21736-1-wsa+renesas@sang-engineering.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/rzg2lc-smarc.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/renesas/rzg2lc-smarc.dtsi b/arch/arm64/boot/dts/renesas/rzg2lc-smarc.dtsi index aa170492dd2b42..6be25a8a28db7b 100644 --- a/arch/arm64/boot/dts/renesas/rzg2lc-smarc.dtsi +++ b/arch/arm64/boot/dts/renesas/rzg2lc-smarc.dtsi @@ -29,7 +29,7 @@ #define SW_RSPI_CAN 1 #endif -#if (SW_SCIF_CAN & SW_RSPI_CAN) +#if (SW_SCIF_CAN && SW_RSPI_CAN) #error "Can not set 1 to both SW_SCIF_CAN and SW_RSPI_CAN due to HW routing" #endif From a6c0f41e7aef8919ccc203baa9fe5f44452da41d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Wed, 8 Jun 2022 11:08:50 +0200 Subject: [PATCH 0083/1250] ARM: dts: r9a06g032-rzn1d400-db: Enable rtc0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RZ/N1D-DB board does have a battery to power the RTC. Enable the RTC device on this board. 
Signed-off-by: Clément Léger Link: https://lore.kernel.org/r/20220608090850.92735-1-clement.leger@bootlin.com Signed-off-by: Geert Uytterhoeven --- arch/arm/boot/dts/r9a06g032-rzn1d400-db.dts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm/boot/dts/r9a06g032-rzn1d400-db.dts b/arch/arm/boot/dts/r9a06g032-rzn1d400-db.dts index 3f8f3ce87e122a..ca39e1d681c1a7 100644 --- a/arch/arm/boot/dts/r9a06g032-rzn1d400-db.dts +++ b/arch/arm/boot/dts/r9a06g032-rzn1d400-db.dts @@ -23,6 +23,10 @@ }; }; +&rtc0 { + status = "okay"; +}; + &uart0 { status = "okay"; }; From e37996ab191a196a737eec8e471596017d5ba284 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 8 Jun 2022 17:40:19 +0200 Subject: [PATCH 0084/1250] arm64: dts: renesas: r8a779f0: Add L3 cache controller Describe the cache configuration for the first Cortex-A55 CPU core on the Renesas R-Car S4-8 (R8A779F0) SoC. Extracted from a larger patch in the BSP by LUU HOAI. Signed-off-by: Geert Uytterhoeven Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/a63715ce1d2d2fcc7ab987f7a1b40847965e8d6a.1654701480.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779f0.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi index 145447ac1fe267..28f7af14a64cb9 100644 --- a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi @@ -23,6 +23,14 @@ reg = <0>; device_type = "cpu"; power-domains = <&sysc R8A779F0_PD_A1E0D0C0>; + next-level-cache = <&L3_CA55_0>; + }; + + L3_CA55_0: cache-controller-0 { + compatible = "cache"; + power-domains = <&sysc R8A779F0_PD_A2E0D0>; + cache-unified; + cache-level = <3>; }; }; From 1760712db37ac67711bdf7721b388a5f499ade52 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 8 Jun 2022 17:40:20 +0200 Subject: [PATCH 0085/1250] arm64: dts: renesas: r8a779f0: Add secondary CA55 CPU cores Complete 
the description of the Cortex-A55 CPU cores and L3 cache controllers on the Renesas R-Car S4-8 (R8A779F0) SoC, including CPU topology and PSCI support for enabling CPU cores. R-Car S4-8 has 8 Cortex-A55 cores, grouped in 4 clusters. Based on patches in the BSP by Takeshi Kihara. Signed-off-by: Geert Uytterhoeven Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/d6af5975090d5830cb053b52400439bd1cbe8fc7.1654701480.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779f0.dtsi | 138 +++++++++++++++++++++- 1 file changed, 133 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi index 28f7af14a64cb9..f918304506263a 100644 --- a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi @@ -18,12 +18,114 @@ #address-cells = <1>; #size-cells = <0>; + cpu-map { + cluster0 { + core0 { + cpu = <&a55_0>; + }; + core1 { + cpu = <&a55_1>; + }; + }; + + cluster1 { + core0 { + cpu = <&a55_2>; + }; + core1 { + cpu = <&a55_3>; + }; + }; + + cluster2 { + core0 { + cpu = <&a55_4>; + }; + core1 { + cpu = <&a55_5>; + }; + }; + + cluster3 { + core0 { + cpu = <&a55_6>; + }; + core1 { + cpu = <&a55_7>; + }; + }; + }; + a55_0: cpu@0 { compatible = "arm,cortex-a55"; reg = <0>; device_type = "cpu"; power-domains = <&sysc R8A779F0_PD_A1E0D0C0>; next-level-cache = <&L3_CA55_0>; + enable-method = "psci"; + }; + + a55_1: cpu@100 { + compatible = "arm,cortex-a55"; + reg = <0x100>; + device_type = "cpu"; + power-domains = <&sysc R8A779F0_PD_A1E0D0C1>; + next-level-cache = <&L3_CA55_0>; + enable-method = "psci"; + }; + + a55_2: cpu@10000 { + compatible = "arm,cortex-a55"; + reg = <0x10000>; + device_type = "cpu"; + power-domains = <&sysc R8A779F0_PD_A1E0D1C0>; + next-level-cache = <&L3_CA55_1>; + enable-method = "psci"; + }; + + a55_3: cpu@10100 { + compatible = "arm,cortex-a55"; + reg = <0x10100>; + device_type = "cpu"; + power-domains = 
<&sysc R8A779F0_PD_A1E0D1C1>; + next-level-cache = <&L3_CA55_1>; + enable-method = "psci"; + }; + + a55_4: cpu@20000 { + compatible = "arm,cortex-a55"; + reg = <0x20000>; + device_type = "cpu"; + power-domains = <&sysc R8A779F0_PD_A1E1D0C0>; + next-level-cache = <&L3_CA55_2>; + enable-method = "psci"; + }; + + a55_5: cpu@20100 { + compatible = "arm,cortex-a55"; + reg = <0x20100>; + device_type = "cpu"; + power-domains = <&sysc R8A779F0_PD_A1E1D0C1>; + next-level-cache = <&L3_CA55_2>; + enable-method = "psci"; + }; + + a55_6: cpu@30000 { + compatible = "arm,cortex-a55"; + reg = <0x30000>; + device_type = "cpu"; + power-domains = <&sysc R8A779F0_PD_A1E1D1C0>; + next-level-cache = <&L3_CA55_3>; + enable-method = "psci"; + }; + + a55_7: cpu@30100 { + compatible = "arm,cortex-a55"; + reg = <0x30100>; + device_type = "cpu"; + power-domains = <&sysc R8A779F0_PD_A1E1D1C1>; + next-level-cache = <&L3_CA55_3>; + enable-method = "psci"; }; L3_CA55_0: cache-controller-0 { @@ -32,6 +134,27 @@ cache-unified; cache-level = <3>; }; + + L3_CA55_1: cache-controller-1 { + compatible = "cache"; + power-domains = <&sysc R8A779F0_PD_A2E0D1>; + cache-unified; + cache-level = <3>; + }; + + L3_CA55_2: cache-controller-2 { + compatible = "cache"; + power-domains = <&sysc R8A779F0_PD_A2E1D0>; + cache-unified; + cache-level = <3>; + }; + + L3_CA55_3: cache-controller-3 { + compatible = "cache"; + power-domains = <&sysc R8A779F0_PD_A2E1D1>; + cache-unified; + cache-level = <3>; + }; }; extal_clk: extal { @@ -53,6 +176,11 @@ interrupts-extended = <&gic GIC_PPI 7 IRQ_TYPE_LEVEL_LOW>; }; + psci { + compatible = "arm,psci-1.0", "arm,psci-0.2"; + method = "smc"; + }; + /* External SCIF clock - to be overridden by boards that provide it */ scif_clk: scif { compatible = "fixed-clock"; @@ -433,7 +561,7 @@ reg = <0x0 0xf1000000 0 0x20000>, <0x0 0xf1060000 0 0x110000>; interrupts = ; + (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_HIGH)>; }; prr: chipid@fff00044 { @@ -488,9 +616,9 @@ timer { compatible = 
"arm,armv8-timer"; - interrupts-extended = <&gic GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(1) | IRQ_TYPE_LEVEL_LOW)>, - <&gic GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(1) | IRQ_TYPE_LEVEL_LOW)>, - <&gic GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(1) | IRQ_TYPE_LEVEL_LOW)>, - <&gic GIC_PPI 10 (GIC_CPU_MASK_SIMPLE(1) | IRQ_TYPE_LEVEL_LOW)>; + interrupts-extended = <&gic GIC_PPI 13 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>, + <&gic GIC_PPI 14 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>, + <&gic GIC_PPI 11 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>, + <&gic GIC_PPI 10 (GIC_CPU_MASK_SIMPLE(8) | IRQ_TYPE_LEVEL_LOW)>; }; }; From 25b6fd4309540f4a78b57962e58eb60842befd7e Mon Sep 17 00:00:00 2001 From: Tho Vu Date: Wed, 8 Jun 2022 17:40:21 +0200 Subject: [PATCH 0086/1250] arm64: dts: renesas: r8a779f0: Add CPUIdle support Support CPUIdle for ARM Cortex-A55 on R-Car S4-8. Signed-off-by: Tho Vu Signed-off-by: Geert Uytterhoeven Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/5310792ce4c06515a5373ff44ceb9b925f007489.1654701480.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779f0.dtsi | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi index f918304506263a..44996a22c83747 100644 --- a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi @@ -63,6 +63,7 @@ power-domains = <&sysc R8A779F0_PD_A1E0D0C0>; next-level-cache = <&L3_CA55_0>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a55_1: cpu@100 { @@ -72,6 +73,7 @@ power-domains = <&sysc R8A779F0_PD_A1E0D0C1>; next-level-cache = <&L3_CA55_0>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a55_2: cpu@10000 { @@ -81,6 +83,7 @@ power-domains = <&sysc R8A779F0_PD_A1E0D1C0>; next-level-cache = <&L3_CA55_1>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a55_3: cpu@10100 { @@ -90,6 +93,7 @@ power-domains = 
<&sysc R8A779F0_PD_A1E0D1C1>; next-level-cache = <&L3_CA55_1>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a55_4: cpu@20000 { @@ -99,6 +103,7 @@ power-domains = <&sysc R8A779F0_PD_A1E1D0C0>; next-level-cache = <&L3_CA55_2>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a55_5: cpu@20100 { @@ -108,6 +113,7 @@ power-domains = <&sysc R8A779F0_PD_A1E1D0C1>; next-level-cache = <&L3_CA55_2>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a55_6: cpu@30000 { @@ -117,6 +123,7 @@ power-domains = <&sysc R8A779F0_PD_A1E1D1C0>; next-level-cache = <&L3_CA55_3>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; a55_7: cpu@30100 { @@ -126,6 +133,7 @@ power-domains = <&sysc R8A779F0_PD_A1E1D1C1>; next-level-cache = <&L3_CA55_3>; enable-method = "psci"; + cpu-idle-states = <&CPU_SLEEP_0>; }; L3_CA55_0: cache-controller-0 { @@ -155,6 +163,19 @@ cache-unified; cache-level = <3>; }; + + idle-states { + entry-method = "psci"; + + CPU_SLEEP_0: cpu-sleep-0 { + compatible = "arm,idle-state"; + arm,psci-suspend-param = <0x0010000>; + local-timer-stop; + entry-latency-us = <400>; + exit-latency-us = <500>; + min-residency-us = <4000>; + }; + }; }; extal_clk: extal { From 8d41224d76f47d072dba87e8b000a2ee067f5bd3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 8 Jun 2022 17:40:22 +0200 Subject: [PATCH 0087/1250] arm64: dts: renesas: r8a779f0: Add CPU core clocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Describe the clocks for the eight Cortex-A55 CPU cores. CA55 Sub-System 0 (first 2 clusters / CPU cores 0-3) is clocked by Z0φ. CA55 Sub-System 1 (last 2 clusters / CPU cores 4-7) is clocked by Z1φ. For now no operating points are defined. 
Signed-off-by: Geert Uytterhoeven Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/c502087f9affa86dd665def0d990d277a51cc75c.1654701480.git.geert+renesas@glider.be --- arch/arm64/boot/dts/renesas/r8a779f0.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi index 44996a22c83747..180a0f5a860333 100644 --- a/arch/arm64/boot/dts/renesas/r8a779f0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a779f0.dtsi @@ -64,6 +64,7 @@ next-level-cache = <&L3_CA55_0>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779F0_CLK_Z0>; }; a55_1: cpu@100 { @@ -74,6 +75,7 @@ next-level-cache = <&L3_CA55_0>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779F0_CLK_Z0>; }; a55_2: cpu@10000 { @@ -84,6 +86,7 @@ next-level-cache = <&L3_CA55_1>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779F0_CLK_Z0>; }; a55_3: cpu@10100 { @@ -94,6 +97,7 @@ next-level-cache = <&L3_CA55_1>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779F0_CLK_Z0>; }; a55_4: cpu@20000 { @@ -104,6 +108,7 @@ next-level-cache = <&L3_CA55_2>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779F0_CLK_Z1>; }; a55_5: cpu@20100 { @@ -114,6 +119,7 @@ next-level-cache = <&L3_CA55_2>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779F0_CLK_Z1>; }; a55_6: cpu@30000 { @@ -124,6 +130,7 @@ next-level-cache = <&L3_CA55_3>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779F0_CLK_Z1>; }; a55_7: cpu@30100 { @@ -134,6 +141,7 @@ next-level-cache = <&L3_CA55_3>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + clocks = <&cpg CPG_CORE R8A779F0_CLK_Z1>; }; L3_CA55_0: cache-controller-0 { From 
a1553161374cec39760528727823c605f04c0c24 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 8 Jun 2022 18:30:24 +0100 Subject: [PATCH 0088/1250] arm64: dts: renesas: rzg2ul-smarc: Enable RSPI1 on carrier board RSPI1 (SPI1) interface is available on PMOD0 connector (J1) on the carrier board. This patch adds pinmux and spi1 nodes to the carrier board dtsi file and drops deleting pinctl* properties from board DTS file. Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20220608173025.22792-2-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/r9a07g043u11-smarc.dts | 6 ------ arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi | 7 +++++++ arch/arm64/boot/dts/renesas/rzg2ul-smarc.dtsi | 6 ++++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/arm64/boot/dts/renesas/r9a07g043u11-smarc.dts b/arch/arm64/boot/dts/renesas/r9a07g043u11-smarc.dts index 2d740bd420ca95..121e55282d1818 100644 --- a/arch/arm64/boot/dts/renesas/r9a07g043u11-smarc.dts +++ b/arch/arm64/boot/dts/renesas/r9a07g043u11-smarc.dts @@ -13,9 +13,3 @@ model = "Renesas SMARC EVK based on r9a07g043u11"; compatible = "renesas,smarc-evk", "renesas,r9a07g043u11", "renesas,r9a07g043"; }; - -&spi1 { - /delete-property/ pinctrl-0; - /delete-property/ pinctrl-names; - status = "disabled"; -}; diff --git a/arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi b/arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi index 429bdde579c3f0..8fdc956cd6c76a 100644 --- a/arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi +++ b/arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi @@ -209,6 +209,13 @@ pinmux = ; /* SD0_CD */ }; }; + + spi1_pins: rspi1 { + pinmux = , /* CK */ + , /* MOSI */ + , /* MISO */ + ; /* SSL */ + }; }; #if (SW_SW0_DEV_SEL) diff --git a/arch/arm64/boot/dts/renesas/rzg2ul-smarc.dtsi b/arch/arm64/boot/dts/renesas/rzg2ul-smarc.dtsi index 0051634d7b1c4f..f9835c12023e06 100644 --- a/arch/arm64/boot/dts/renesas/rzg2ul-smarc.dtsi +++ 
b/arch/arm64/boot/dts/renesas/rzg2ul-smarc.dtsi @@ -51,6 +51,12 @@ status = "disabled"; }; +&spi1 { + /delete-property/ pinctrl-0; + /delete-property/ pinctrl-names; + status = "disabled"; +}; + &ssi1 { /delete-property/ pinctrl-0; /delete-property/ pinctrl-names; From e14fe2221cadbd0bc6256a512bca8994006fd6dd Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 8 Jun 2022 18:30:25 +0100 Subject: [PATCH 0089/1250] arm64: dts: renesas: rzg2ul-smarc-som: Enable ADC on SMARC platform Enable the ADC found on RZ/G2UL SMARC SoM. Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20220608173025.22792-3-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi b/arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi index 8fdc956cd6c76a..cf3b3d118ef170 100644 --- a/arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi +++ b/arch/arm64/boot/dts/renesas/rzg2ul-smarc-som.dtsi @@ -57,6 +57,14 @@ #endif }; +#if (SW_SW0_DEV_SEL) +&adc { + pinctrl-0 = <&adc_pins>; + pinctrl-names = "default"; + status = "okay"; +}; +#endif + #if (!SW_ET0_EN_N) &eth0 { pinctrl-0 = <&eth0_pins>; @@ -124,6 +132,10 @@ }; &pinctrl { + adc_pins: adc { + pinmux = ; /* ADC_TRG */ + }; + eth0_pins: eth0 { pinmux = , /* ET0_LINKSTA */ , /* ET0_MDC */ From 21af7579da7ffcc71086cab00c3d4282305633a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Wed, 8 Jun 2022 19:57:28 +0200 Subject: [PATCH 0090/1250] arm64: dts: renesas: Add missing space after remote-endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the missing space after remote-endpoint in r8a774c0.dtsi and r8a77990.dtsi before the typo spreads to other files.
Signed-off-by: Niklas Söderlund Link: https://lore.kernel.org/r/20220608175728.1012550-1-niklas.soderlund+renesas@ragnatech.se Signed-off-by: Geert Uytterhoeven --- arch/arm64/boot/dts/renesas/r8a774c0.dtsi | 4 ++-- arch/arm64/boot/dts/renesas/r8a77990.dtsi | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/boot/dts/renesas/r8a774c0.dtsi b/arch/arm64/boot/dts/renesas/r8a774c0.dtsi index b6aeb22e883645..90588bbff7e0b8 100644 --- a/arch/arm64/boot/dts/renesas/r8a774c0.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a774c0.dtsi @@ -1281,7 +1281,7 @@ vin4csi40: endpoint@2 { reg = <2>; - remote-endpoint= <&csi40vin4>; + remote-endpoint = <&csi40vin4>; }; }; }; @@ -1309,7 +1309,7 @@ vin5csi40: endpoint@2 { reg = <2>; - remote-endpoint= <&csi40vin5>; + remote-endpoint = <&csi40vin5>; }; }; }; diff --git a/arch/arm64/boot/dts/renesas/r8a77990.dtsi b/arch/arm64/boot/dts/renesas/r8a77990.dtsi index d3302120263762..4c7c7feed70267 100644 --- a/arch/arm64/boot/dts/renesas/r8a77990.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a77990.dtsi @@ -1324,7 +1324,7 @@ vin4csi40: endpoint@2 { reg = <2>; - remote-endpoint= <&csi40vin4>; + remote-endpoint = <&csi40vin4>; }; }; }; @@ -1352,7 +1352,7 @@ vin5csi40: endpoint@2 { reg = <2>; - remote-endpoint= <&csi40vin5>; + remote-endpoint = <&csi40vin5>; }; }; }; From 2a258824306149512cc22ff65cfcf8d52c1a306b Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 7 Jun 2022 08:42:20 +0200 Subject: [PATCH 0091/1250] MAINTAINERS: rectify file pattern in MICROCHIP OTPC DRIVER Commit 6b291610dd57 ("nvmem: microchip-otpc: add support") adds the Microchip otpc driver and a corresponding MAINTAINERS section, but slips in a slightly wrong file pattern. Hence, ./scripts/get_maintainer.pl --self-test=patterns complains about a broken reference. Rectify this file pattern in MICROCHIP OTPC DRIVER. 
Signed-off-by: Lukas Bulwahn Acked-by: Claudiu Beznea Signed-off-by: Srinivas Kandagatla --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index e51eeb0ee0ed01..62a02b67db25ba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13106,7 +13106,7 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Supported F: Documentation/devicetree/bindings/nvmem/microchip,sama7g5-otpc.yaml F: drivers/nvmem/microchip-otpc.c -F: dt-bindings/nvmem/microchip,sama7g5-otpc.h +F: include/dt-bindings/nvmem/microchip,sama7g5-otpc.h MICROCHIP PWM DRIVER M: Claudiu Beznea From af0d041c4c3efa173af2cfa94e55c8dabb484558 Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Fri, 17 Jun 2022 17:31:29 +0800 Subject: [PATCH 0092/1250] dt-bindings: nvmem: convert mtk-efuse.txt to YAML schema Convert mtk-efuse.txt to YAML schema mediatek,efuse.yaml Reviewed-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Signed-off-by: Chunfeng Yun Signed-off-by: Srinivas Kandagatla --- .../bindings/nvmem/mediatek,efuse.yaml | 87 +++++++++++++++++++ .../devicetree/bindings/nvmem/mtk-efuse.txt | 43 --------- 2 files changed, 87 insertions(+), 43 deletions(-) create mode 100644 Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml delete mode 100644 Documentation/devicetree/bindings/nvmem/mtk-efuse.txt diff --git a/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml b/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml new file mode 100644 index 00000000000000..f6e01ddb7499b4 --- /dev/null +++ b/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml @@ -0,0 +1,87 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/nvmem/mediatek,efuse.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: MediaTek efuse + +description: | + MediaTek's efuse is used for storing calibration data, it can be accessed + on ARM devices using I/O mapped memory.
+ +maintainers: + - Andrew-CT Chen + - Lala Lin + +allOf: + - $ref: "nvmem.yaml#" + +properties: + $nodename: + pattern: "^efuse@[0-9a-f]+$" + + compatible: + oneOf: + - items: + - enum: + - mediatek,mt7622-efuse + - mediatek,mt7623-efuse + - mediatek,mt8173-efuse + - mediatek,mt8192-efuse + - mediatek,mt8195-efuse + - mediatek,mt8516-efuse + - const: mediatek,efuse + - const: mediatek,mt8173-efuse + deprecated: true + + reg: + maxItems: 1 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + efuse@11c10000 { + compatible = "mediatek,mt8195-efuse", "mediatek,efuse"; + reg = <0x11c10000 0x1000>; + #address-cells = <1>; + #size-cells = <1>; + + u3_tx_imp_p0: usb3-tx-imp@184,1 { + reg = <0x184 0x1>; + bits = <0 5>; + }; + u3_rx_imp_p0: usb3-rx-imp@184,2 { + reg = <0x184 0x2>; + bits = <5 5>; + }; + u3_intr_p0: usb3-intr@185 { + reg = <0x185 0x1>; + bits = <2 6>; + }; + comb_tx_imp_p1: usb3-tx-imp@186,1 { + reg = <0x186 0x1>; + bits = <0 5>; + }; + comb_rx_imp_p1: usb3-rx-imp@186,2 { + reg = <0x186 0x2>; + bits = <5 5>; + }; + comb_intr_p1: usb3-intr@187 { + reg = <0x187 0x1>; + bits = <2 6>; + }; + u2_intr_p0: usb2-intr-p0@188,1 { + reg = <0x188 0x1>; + bits = <0 5>; + }; + u2_intr_p1: usb2-intr-p1@188,2 { + reg = <0x188 0x2>; + bits = <5 5>; + }; + }; diff --git a/Documentation/devicetree/bindings/nvmem/mtk-efuse.txt b/Documentation/devicetree/bindings/nvmem/mtk-efuse.txt deleted file mode 100644 index 39d529599444df..00000000000000 --- a/Documentation/devicetree/bindings/nvmem/mtk-efuse.txt +++ /dev/null @@ -1,43 +0,0 @@ -= Mediatek MTK-EFUSE device tree bindings = - -This binding is intended to represent MTK-EFUSE which is found in most Mediatek SOCs. 
- -Required properties: -- compatible: should be - "mediatek,mt7622-efuse", "mediatek,efuse": for MT7622 - "mediatek,mt7623-efuse", "mediatek,efuse": for MT7623 - "mediatek,mt8173-efuse" or "mediatek,efuse": for MT8173 - "mediatek,mt8192-efuse", "mediatek,efuse": for MT8192 - "mediatek,mt8195-efuse", "mediatek,efuse": for MT8195 - "mediatek,mt8516-efuse", "mediatek,efuse": for MT8516 -- reg: Should contain registers location and length -- bits: contain the bits range by offset and size - -= Data cells = -Are child nodes of MTK-EFUSE, bindings of which as described in -bindings/nvmem/nvmem.txt - -Example: - - efuse: efuse@10206000 { - compatible = "mediatek,mt8173-efuse"; - reg = <0 0x10206000 0 0x1000>; - #address-cells = <1>; - #size-cells = <1>; - - /* Data cells */ - thermal_calibration: calib@528 { - reg = <0x528 0xc>; - }; - }; - -= Data consumers = -Are device nodes which consume nvmem data cells. - -For example: - - thermal { - ... - nvmem-cells = <&thermal_calibration>; - nvmem-cell-names = "calibration"; - }; From ccba200e4801aaf57848d71b7bd167e52370fbf2 Mon Sep 17 00:00:00 2001 From: Chunfeng Yun Date: Fri, 17 Jun 2022 17:31:30 +0800 Subject: [PATCH 0093/1250] dt-bindings: nvmem: mediatek: efuse: add support mt8183 Add "mediatek,mt8183-efuse" to fix dtbs check warning. 
Acked-by: Rob Herring Signed-off-by: Chunfeng Yun Signed-off-by: Srinivas Kandagatla --- Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml b/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml index f6e01ddb7499b4..7c7233e29ecf16 100644 --- a/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml +++ b/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml @@ -28,6 +28,7 @@ properties: - mediatek,mt7622-efuse - mediatek,mt7623-efuse - mediatek,mt8173-efuse + - mediatek,mt8183-efuse - mediatek,mt8192-efuse - mediatek,mt8195-efuse - mediatek,mt8516-efuse From dc150dfb081fcd17f8ecfa07d94813e4c445d808 Mon Sep 17 00:00:00 2001 From: Alexander Fomichev Date: Fri, 13 May 2022 22:37:02 +0300 Subject: [PATCH 0094/1250] ntb_perf: extend with burst latency measurement Burst latency is a delay between start to send 1 byte to the remote system and hardware readiness to send another byte. The measurement performed within bandwidth test procedure. The DMA Engine is off. Data integrity is not checked. This mode can be disabled by 'perf_latency=N' module parameter. Signed-off-by: Alexander Fomichev Reviewed-by: Dave Jiang Signed-off-by: Jon Mason --- drivers/ntb/test/ntb_perf.c | 140 ++++++++++++++++++++++++++++++++++-- 1 file changed, 136 insertions(+), 4 deletions(-) diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index 65e1e5cf1b29a6..23bde12eaf3d9a 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -6,6 +6,7 @@ * * Copyright(c) 2015 Intel Corporation. All rights reserved. * Copyright(c) 2017 T-Platforms. All Rights Reserved. + * Copyright(c) 2022 YADRO. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -15,6 +16,7 @@ * * Copyright(c) 2015 Intel Corporation. All rights reserved. 
* Copyright(c) 2017 T-Platforms. All Rights Reserved. + * Copyright(c) 2022 YADRO. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -85,7 +87,7 @@ #include #define DRIVER_NAME "ntb_perf" -#define DRIVER_VERSION "2.0" +#define DRIVER_VERSION "2.1" MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRIVER_VERSION); @@ -106,6 +108,9 @@ MODULE_DESCRIPTION("PCIe NTB Performance Measurement Tool"); #define PERF_BUF_LEN 1024 +#define LAT_MIN_TRIES 20 +#define RESCHEDULE_RATIO 10000 + static unsigned long max_mw_size; module_param(max_mw_size, ulong, 0644); MODULE_PARM_DESC(max_mw_size, "Upper limit of memory window size"); @@ -122,6 +127,14 @@ static bool use_dma; /* default to 0 */ module_param(use_dma, bool, 0644); MODULE_PARM_DESC(use_dma, "Use DMA engine to measure performance"); +static bool perf_latency = true; +module_param(perf_latency, bool, 0644); +MODULE_PARM_DESC(perf_latency, "Measure burst latency"); + +static unsigned long lat_time_ms = 1000; /* default 1s */ +module_param(lat_time_ms, ulong, 0644); +MODULE_PARM_DESC(lat_time_ms, "Time (in ms) to test latency"); + /*============================================================================== * Perf driver data definition *============================================================================== @@ -178,6 +191,8 @@ struct perf_thread { void *src; u64 copied; ktime_t duration; + ktime_t latency; + u64 tries; int status; struct work_struct work; }; @@ -783,7 +798,7 @@ static void perf_dma_copy_callback(void *data) } static int perf_copy_chunk(struct perf_thread *pthr, - void __iomem *dst, void *src, size_t len) + void __iomem *dst, void *src, size_t len, bool _use_dma) { struct dma_async_tx_descriptor *tx; struct dmaengine_unmap_data *unmap; @@ -794,7 +809,7 @@ static int perf_copy_chunk(struct perf_thread *pthr, void __iomem *dst_vaddr; dma_addr_t dst_dma_addr; - if (!use_dma) { + if 
(!_use_dma) { memcpy_toio(dst, src, len); goto ret_check_tsync; } @@ -940,7 +955,7 @@ static int perf_run_test(struct perf_thread *pthr) /* Copied field is cleared on test launch stage */ while (pthr->copied < total_size) { - ret = perf_copy_chunk(pthr, flt_dst, flt_src, chunk_size); + ret = perf_copy_chunk(pthr, flt_dst, flt_src, chunk_size, use_dma); if (ret) { dev_err(&perf->ntb->dev, "%d: Got error %d on test\n", pthr->tidx, ret); @@ -1018,6 +1033,67 @@ static void perf_clear_test(struct perf_thread *pthr) kfree(pthr->src); } +static int perf_run_latency(struct perf_thread *pthr) +{ + struct perf_peer *peer = pthr->perf->test_peer; + struct ntb_dev *ntb = pthr->perf->ntb; + void __iomem *flt_dst, *bnd_dst; + void *flt_src; + u64 stop_at; + int ret; + + pthr->tries = 0; + pthr->latency = ktime_get(); + flt_src = pthr->src; + flt_dst = peer->outbuf; + bnd_dst = peer->outbuf + peer->outbuf_size; + + stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC; + while (ktime_get_real_fast_ns() < stop_at) { + ret = perf_copy_chunk(pthr, flt_dst, flt_src, 1, false); + if (ret) { + dev_err(&ntb->dev, "%d: Latency testing error %d\n", + pthr->tidx, ret); + pthr->latency = ktime_set(0, 0); + return ret; + } + + pthr->tries++; + flt_dst++; + flt_src++; + + if (flt_dst >= bnd_dst || flt_dst < peer->outbuf) { + flt_dst = peer->outbuf; + flt_src = pthr->src; + } + + /* Avoid processor soft lock-ups */ + if (!(pthr->tries % RESCHEDULE_RATIO)) + schedule(); + } + + /* Stop timer */ + pthr->latency = ktime_sub(ktime_get(), pthr->latency); + + if (pthr->tries < LAT_MIN_TRIES) { + dev_err(&ntb->dev, + "%d: Too few steps (%llu) to measure Latency, recommended > %d. 
Increase value of 'lat_time_ms' parameter\n", + pthr->tidx, pthr->tries, LAT_MIN_TRIES); + pthr->latency = ktime_set(0, 0); + return -EINVAL; + } + + dev_dbg(&ntb->dev, "%d: made %llu tries, lasted %llu usecs\n", + pthr->tidx, pthr->tries, ktime_to_us(pthr->latency)); + + pthr->latency = ns_to_ktime(ktime_divns(pthr->latency, pthr->tries)); + + dev_dbg(&ntb->dev, "%d: latency %llu us (%llu ns)\n", pthr->tidx, + ktime_to_us(pthr->latency), ktime_to_ns(pthr->latency)); + + return 0; +} + static void perf_thread_work(struct work_struct *work) { struct perf_thread *pthr = to_thread_work(work); @@ -1043,6 +1119,11 @@ static void perf_thread_work(struct work_struct *work) } pthr->status = perf_sync_test(pthr); + if (pthr->status) + goto err_clear_test; + + if (perf_latency) + pthr->status = perf_run_latency(pthr); err_clear_test: perf_clear_test(pthr); @@ -1142,6 +1223,18 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf, "%d: copied %llu bytes in %llu usecs, %llu MBytes/s\n", tidx, pthr->copied, ktime_to_us(pthr->duration), div64_u64(pthr->copied, ktime_to_us(pthr->duration))); + + if (perf_latency && ktime_compare(pthr->latency, ktime_set(0, 0))) { + if (ktime_to_us(pthr->latency) < 10) { + (*pos) += scnprintf(buf + *pos, size - *pos, + "%d: latency %llu ns\n", + tidx, ktime_to_ns(pthr->latency)); + } else { + (*pos) += scnprintf(buf + *pos, size - *pos, + "%d: latency %llu us\n", + tidx, ktime_to_us(pthr->latency)); + } + } } clear_bit_unlock(0, &perf->busy_flag); @@ -1344,12 +1437,48 @@ static ssize_t perf_dbgfs_write_tcnt(struct file *filep, return size; } +static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf, + size_t size, loff_t *offp) +{ + size_t buf_size = min_t(size_t, size, PERF_BUF_LEN); + struct perf_ctx *perf = filep->private_data; + ssize_t pos, ret; + char *buf; + int tidx; + + buf = kmalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + pos = scnprintf(buf, buf_size, " Peer %d latency try count:\n", + 
perf->test_peer->pidx); + + for (tidx = 0; tidx < perf->tcnt; tidx++) { + struct perf_thread *pthr = &perf->threads[tidx]; + + pos += scnprintf(buf + pos, buf_size - pos, + "%d: made %llu tries\n", + tidx, pthr->tries); + } + + ret = simple_read_from_buffer(ubuf, size, offp, buf, pos); + + kfree(buf); + + return ret; +} + static const struct file_operations perf_dbgfs_tcnt = { .open = simple_open, .read = perf_dbgfs_read_tcnt, .write = perf_dbgfs_write_tcnt }; +static const struct file_operations perf_dbgfs_lattrs = { + .open = simple_open, + .read = perf_dbgfs_read_lattrs +}; + static void perf_setup_dbgfs(struct perf_ctx *perf) { struct pci_dev *pdev = perf->ntb->pdev; @@ -1375,6 +1504,9 @@ static void perf_setup_dbgfs(struct perf_ctx *perf) debugfs_create_u8("total_order", 0500, perf->dbgfs_dir, &total_order); debugfs_create_bool("use_dma", 0500, perf->dbgfs_dir, &use_dma); + + debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf, + &perf_dbgfs_lattrs); } static void perf_clear_dbgfs(struct perf_ctx *perf) From 9ace38ad4e8adffeb5ac292b8ceb74fb4f564452 Mon Sep 17 00:00:00 2001 From: Alexander Fomichev Date: Fri, 13 May 2022 22:37:03 +0300 Subject: [PATCH 0095/1250] ntb_perf: extend with poll latency measurement Poll latency is a delay between start to send 1 byte to the remote system and receiving the confirmation. The remote system needs to be run in server mode beforehand. Then the server polls the input buffer and on receiving data immediately sends the confirmation back. 
Signed-off-by: Alexander Fomichev Reviewed-by: Dave Jiang Signed-off-by: Jon Mason --- drivers/ntb/test/ntb_perf.c | 374 +++++++++++++++++++++++++++++++++++- 1 file changed, 373 insertions(+), 1 deletion(-) diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index 23bde12eaf3d9a..f0f3beba70a599 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -67,6 +67,14 @@ * * root@self# echo 0 > $DBG_DIR/run * root@self# cat $DBG_DIR/run + *----------------------------------------------------------------------------- + * Eg: start latency test with peer (index 0) poll-waiting and get the metrics + * + * Server side: + * root@self# echo 0 > $DBG_DIR/poll_latency/run_server + * Client side: + * root@self# echo 0 > $DBG_DIR/poll_latency/run_client + * root@self# cat $DBG_DIR/poll_latency/run_client */ #include @@ -87,7 +95,7 @@ #include #define DRIVER_NAME "ntb_perf" -#define DRIVER_VERSION "2.1" +#define DRIVER_VERSION "2.2" MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRIVER_VERSION); @@ -135,6 +143,10 @@ static unsigned long lat_time_ms = 1000; /* default 1s */ module_param(lat_time_ms, ulong, 0644); MODULE_PARM_DESC(lat_time_ms, "Time (in ms) to test latency"); +static unsigned long lat_timeout_us = 500; +module_param(lat_timeout_us, ulong, 0644); +MODULE_PARM_DESC(lat_timeout_us, "Timeout (in us) to wait for server reply"); + /*============================================================================== * Perf driver data definition *============================================================================== @@ -151,6 +163,11 @@ enum perf_cmd { PERF_STS_LNKUP = 6, /* link up state flag */ }; +enum run_mode { + RUN_PL_CLIENT, + RUN_PL_SERVER, +}; + struct perf_ctx; struct perf_peer { @@ -199,6 +216,21 @@ struct perf_thread { #define to_thread_work(__work) \ container_of(__work, struct perf_thread, work) +struct perf_poll_lat_data { + struct perf_ctx *perf; + void *src; + ktime_t latency; + u64 tries; + int status; + 
atomic_t running; + struct work_struct clt_work; + struct work_struct srv_work; +}; +#define to_pldata_clt_work(__work) \ + container_of(__work, struct perf_poll_lat_data, clt_work) +#define to_pldata_srv_work(__work) \ + container_of(__work, struct perf_poll_lat_data, srv_work) + struct perf_ctx { struct ntb_dev *ntb; @@ -206,6 +238,7 @@ struct perf_ctx { int gidx; int pcnt; struct perf_peer *peers; + struct perf_poll_lat_data pldata; /* Performance measuring work-threads interface */ unsigned long busy_flag; @@ -254,6 +287,8 @@ static struct dentry *perf_dbgfs_topdir; static struct workqueue_struct *perf_wq __read_mostly; +static const u8 stop_word = 0xFF; + /*============================================================================== * NTB cross-link commands execution service *============================================================================== @@ -1129,6 +1164,185 @@ static void perf_thread_work(struct work_struct *work) perf_clear_test(pthr); } +static int perf_init_pl(struct perf_poll_lat_data *pldata) +{ + struct perf_ctx *perf = pldata->perf; + struct perf_peer *peer = perf->test_peer; + u8 *bp; + + pldata->src = kmalloc_node(peer->outbuf_size, GFP_KERNEL, + dev_to_node(&perf->ntb->dev)); + if (!pldata->src) + return -ENOMEM; + + /* + * Prepare random data to send, guaranteed exclusion of 0x00 (unreceived) + * and 0xFF (stop_word) + */ + get_random_bytes(pldata->src, peer->outbuf_size); + for (bp = pldata->src; bp < (u8 *) pldata->src + peer->outbuf_size; bp++) + while (*bp == 0 || *bp == stop_word) + *bp = (u8)get_random_int(); + + memset(peer->inbuf, 0, peer->inbuf_size); + + return 0; +} + +static int perf_poll_peer_reply(volatile u8 *cur) +{ + u64 wait_till = ktime_get_real_fast_ns() + lat_timeout_us * NSEC_PER_USEC; + + while (ktime_get_real_fast_ns() < wait_till) { + if (*cur == stop_word) { + *cur = 0; + return 1; + } + if (*cur != 0) { + *cur = 0; + return 0; + } + } + return -EINTR; +} + +static int perf_run_pl_client(struct 
perf_poll_lat_data *pldata) +{ + struct perf_peer *peer = pldata->perf->test_peer; + struct ntb_dev *ntb = pldata->perf->ntb; + void *src = pldata->src; + u64 stop_at; + int ret; + + dev_dbg(&ntb->dev, "poll_lat: client started.\n"); + + pldata->tries = 0; + pldata->latency = ktime_get(); + + stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC; + while (ktime_get_real_fast_ns() < stop_at) { + memcpy_toio(peer->outbuf, src, 1); + + /* Avoid processor soft lock-ups */ + schedule(); + + ret = perf_poll_peer_reply(peer->inbuf); + if (ret < 0) { + dev_err(&ntb->dev, "Timeout waiting for peer reply on poll latency\n"); + pldata->latency = ktime_set(0, 0); + return -EINTR; + } else if (ret == 1) { + dev_warn(&ntb->dev, "Server terminated on poll latency, stopping\n"); + break; + } else if (!atomic_read(&pldata->running)) { + dev_err(&ntb->dev, "Poll latency client terminated\n"); + return -EINTR; + } + + pldata->tries++; + src++; + + if (src >= pldata->src + peer->outbuf_size) + src = pldata->src; + } + + /* Stop timer */ + pldata->latency = ktime_sub(ktime_get(), pldata->latency); + /* Send stop to peer */ + memcpy_toio(peer->outbuf, &stop_word, 1); + + if (pldata->tries < LAT_MIN_TRIES) { + dev_err(&ntb->dev, + "Too few steps (%llu) to measure Latency, recommended > %d. 
Increase value of 'lat_time_ms' parameter\n", + pldata->tries, LAT_MIN_TRIES); + pldata->latency = ktime_set(0, 0); + return -EINVAL; + } + + dev_dbg(&ntb->dev, "poll_lat: made %llu tries, lasted %llu usecs\n", + pldata->tries, ktime_to_us(pldata->latency)); + + pldata->latency = ns_to_ktime(ktime_divns(pldata->latency, pldata->tries)); + + dev_dbg(&ntb->dev, "poll_lat: latency %llu us (%llu ns)\n", + ktime_to_us(pldata->latency), ktime_to_ns(pldata->latency)); + + return 0; +} + +static int perf_run_pl_server(struct perf_poll_lat_data *pldata) +{ + struct perf_peer *peer = pldata->perf->test_peer; + struct ntb_dev *ntb = pldata->perf->ntb; + void *src = pldata->src; + int ret = 0; + + dev_dbg(&ntb->dev, "poll_lat: server started.\n"); + + pldata->tries = 0; + + while (ret != 1 && atomic_read(&pldata->running)) { + ret = perf_poll_peer_reply(peer->inbuf); + if (!ret) { + /* Pong to client */ + memcpy_toio(peer->outbuf, src++, 1); + if (src >= pldata->src + peer->outbuf_size) + src = pldata->src; + + pldata->tries++; + } + + /* Avoid processor soft lock-ups */ + schedule(); + } + + if (pldata->tries < LAT_MIN_TRIES) + dev_warn(&ntb->dev, + "Poll latency test terminated too early. Increase client's test time\n"); + + dev_dbg(&ntb->dev, "poll_lat: server stopped, had responded %llu times\n", + pldata->tries); + + return atomic_read(&pldata->running) ? 
-ENODATA : -EINTR; +} + +static void perf_clear_pl(struct perf_poll_lat_data *pldata) +{ + struct perf_ctx *perf = pldata->perf; + struct perf_peer *peer = perf->test_peer; + + memset(peer->inbuf, stop_word, 1); + atomic_set(&pldata->running, 0); + wake_up(&perf->twait); + kfree(pldata->src); +} + +static void perf_poll_lat_client_work(struct work_struct *work) +{ + struct perf_poll_lat_data *pldata = to_pldata_clt_work(work); + + pldata->status = perf_init_pl(pldata); + if (pldata->status) + return; + + pldata->status = perf_run_pl_client(pldata); + + perf_clear_pl(pldata); +} + +static void perf_poll_lat_server_work(struct work_struct *work) +{ + struct perf_poll_lat_data *pldata = to_pldata_srv_work(work); + + pldata->status = perf_init_pl(pldata); + if (pldata->status) + return; + + pldata->status = perf_run_pl_server(pldata); + + perf_clear_pl(pldata); +} + static int perf_set_tcnt(struct perf_ctx *perf, u8 tcnt) { if (tcnt == 0 || tcnt > MAX_THREADS_CNT) @@ -1149,7 +1363,10 @@ static void perf_terminate_test(struct perf_ctx *perf) int tidx; atomic_set(&perf->tsync, -1); + atomic_set(&perf->pldata.running, 0); wake_up(&perf->twait); + cancel_work_sync(&perf->pldata.srv_work); + cancel_work_sync(&perf->pldata.clt_work); for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) { wake_up(&perf->threads[tidx].dma_wait); @@ -1195,6 +1412,46 @@ static int perf_submit_test(struct perf_peer *peer) return ret; } +static int perf_submit_poll_lat(struct perf_peer *peer, enum run_mode mode) +{ + struct perf_ctx *perf = peer->perf; + int ret; + + ret = wait_for_completion_interruptible(&peer->init_comp); + if (ret < 0) + return ret; + + if (test_and_set_bit_lock(0, &perf->busy_flag)) + return -EBUSY; + + perf->test_peer = peer; + atomic_set(&perf->pldata.running, 1); + perf->pldata.status = -ENODATA; + perf->pldata.tries = 0; + perf->pldata.latency = ktime_set(0, 0); + + switch (mode) { + case RUN_PL_SERVER: + (void)queue_work(perf_wq, &perf->pldata.srv_work); + break; + case 
RUN_PL_CLIENT: + default: + (void)queue_work(perf_wq, &perf->pldata.clt_work); + break; + } + + ret = wait_event_interruptible(perf->twait, + !atomic_read(&perf->pldata.running)); + if (ret == -ERESTARTSYS) { + perf_terminate_test(perf); + ret = -EINTR; + } + + clear_bit_unlock(0, &perf->busy_flag); + + return ret; +} + static int perf_read_stats(struct perf_ctx *perf, char *buf, size_t size, ssize_t *pos) { @@ -1237,6 +1494,24 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf, } } + if (perf->pldata.status != -ENODATA) { + (*pos) += scnprintf(buf + *pos, size - *pos, "\n"); + if (perf->pldata.status) { + (*pos) += scnprintf(buf + *pos, size - *pos, + "poll latency: error status %d\n", perf->pldata.status); + } else { + if (ktime_to_us(perf->pldata.latency) < 10) { + (*pos) += scnprintf(buf + *pos, size - *pos, + "poll latency %llu ns\n", + ktime_to_ns(perf->pldata.latency)); + } else { + (*pos) += scnprintf(buf + *pos, size - *pos, + "poll latency %llu us\n", + ktime_to_us(perf->pldata.latency)); + } + } + } + clear_bit_unlock(0, &perf->busy_flag); return 0; @@ -1250,6 +1525,10 @@ static void perf_init_threads(struct perf_ctx *perf) perf->tcnt = DEF_THREADS_CNT; perf->test_peer = &perf->peers[0]; init_waitqueue_head(&perf->twait); + perf->pldata.perf = perf; + INIT_WORK(&perf->pldata.srv_work, perf_poll_lat_server_work); + INIT_WORK(&perf->pldata.clt_work, perf_poll_lat_client_work); + perf->pldata.status = -ENODATA; for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) { pthr = &perf->threads[tidx]; @@ -1406,6 +1685,64 @@ static const struct file_operations perf_dbgfs_run = { .write = perf_dbgfs_write_run }; +static ssize_t perf_dbgfs_write_run_pl(struct file *filep, const char __user *ubuf, + size_t size, loff_t *offp, enum run_mode mode) +{ + struct perf_ctx *perf = filep->private_data; + struct ntb_dev *ntb = perf->ntb; + struct perf_peer *peer; + int pidx, ret; + + ret = kstrtoint_from_user(ubuf, size, 0, &pidx); + if (ret) + return ret; + + if (pidx < 
0 && mode == RUN_PL_SERVER) { + dev_dbg(&ntb->dev, "poll_lat: kill server\n"); + if (test_bit(0, &perf->busy_flag)) { + peer = perf->test_peer; + /* Send stop to client */ + memcpy_toio(peer->outbuf, &stop_word, 1); + } + perf_terminate_test(perf); + clear_bit_unlock(0, &perf->busy_flag); + return size; + } + + if (pidx < 0 || pidx >= perf->pcnt) + return -EINVAL; + + peer = &perf->peers[pidx]; + + ret = perf_submit_poll_lat(peer, mode); + + return ret ? ret : size; +} + +static ssize_t perf_dbgfs_write_run_client(struct file *filep, const char __user *ubuf, + size_t size, loff_t *offp) +{ + return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_CLIENT); +} + +static const struct file_operations perf_dbgfs_run_client = { + .open = simple_open, + .read = perf_dbgfs_read_run, + .write = perf_dbgfs_write_run_client +}; + +static ssize_t perf_dbgfs_write_run_server(struct file *filep, const char __user *ubuf, + size_t size, loff_t *offp) +{ + return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_SERVER); +} + +static const struct file_operations perf_dbgfs_run_server = { + .open = simple_open, + .read = perf_dbgfs_read_run, + .write = perf_dbgfs_write_run_server +}; + static ssize_t perf_dbgfs_read_tcnt(struct file *filep, char __user *ubuf, size_t size, loff_t *offp) { @@ -1468,6 +1805,24 @@ static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf, return ret; } +static ssize_t perf_dbgfs_read_inbuf(struct file *filep, char __user *ubuf, + size_t size, loff_t *offp) +{ + struct perf_ctx *perf = filep->private_data; + char buf[32]; + ssize_t pos; + u64 *value; + + if (!perf->test_peer || !perf->test_peer->inbuf) { + pos = scnprintf(buf, sizeof(buf), "NULL\n"); + } else { + value = perf->test_peer->inbuf; + pos = scnprintf(buf, sizeof(buf), "0x%llx\n", *value); + } + + return simple_read_from_buffer(ubuf, size, offp, buf, pos); +} + static const struct file_operations perf_dbgfs_tcnt = { .open = simple_open, .read = 
perf_dbgfs_read_tcnt, @@ -1479,6 +1834,11 @@ static const struct file_operations perf_dbgfs_lattrs = { .read = perf_dbgfs_read_lattrs }; +static const struct file_operations perf_dbgfs_inbuf = { + .open = simple_open, + .read = perf_dbgfs_read_inbuf, +}; + static void perf_setup_dbgfs(struct perf_ctx *perf) { struct pci_dev *pdev = perf->ntb->pdev; @@ -1495,6 +1855,12 @@ static void perf_setup_dbgfs(struct perf_ctx *perf) debugfs_create_file("run", 0600, perf->dbgfs_dir, perf, &perf_dbgfs_run); + debugfs_create_file("run_client", 0600, perf->dbgfs_dir, perf, + &perf_dbgfs_run_client); + + debugfs_create_file("run_server", 0600, perf->dbgfs_dir, perf, + &perf_dbgfs_run_server); + debugfs_create_file("threads_count", 0600, perf->dbgfs_dir, perf, &perf_dbgfs_tcnt); @@ -1507,6 +1873,12 @@ static void perf_setup_dbgfs(struct perf_ctx *perf) debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf, &perf_dbgfs_lattrs); + + debugfs_create_u64("poll_latency_tries", 0400, perf->dbgfs_dir, + &perf->pldata.tries); + + debugfs_create_file("inbuf", 0400, perf->dbgfs_dir, perf, + &perf_dbgfs_inbuf); } static void perf_clear_dbgfs(struct perf_ctx *perf) From d3b4b9c2142ca288f81a373841989f592f138934 Mon Sep 17 00:00:00 2001 From: Alexander Fomichev Date: Fri, 13 May 2022 22:37:04 +0300 Subject: [PATCH 0096/1250] ntb_perf: extend with doorbell latency measurement Doorbell latency is a delay between start to ring an NTB doorbell and receiving the confirmation. The remote system needs to be run in server mode beforehand. Then the server waits for a doorbell event and immediately rings self doorbell to confirm. 
Thanks-to: Guo Zhengkui Signed-off-by: Alexander Fomichev Reviewed-by: Dave Jiang Signed-off-by: Jon Mason --- drivers/ntb/test/ntb_perf.c | 532 ++++++++++++++++++++++++++++-------- 1 file changed, 411 insertions(+), 121 deletions(-) diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index f0f3beba70a599..23e154bd41b94b 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -75,6 +75,14 @@ * Client side: * root@self# echo 0 > $DBG_DIR/poll_latency/run_client * root@self# cat $DBG_DIR/poll_latency/run_client + *----------------------------------------------------------------------------- + * Eg: start doorbell latency test with peer (index 0) and get the metrics + * + * Server side: + * root@self# echo 0 > $DBG_DIR/db_latency/run_server + * Client side: + * root@self# echo 0 > $DBG_DIR/db_latency/run_client + * root@self# cat $DBG_DIR/db_latency/run_client */ #include @@ -86,6 +94,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +104,7 @@ #include #define DRIVER_NAME "ntb_perf" -#define DRIVER_VERSION "2.2" +#define DRIVER_VERSION "2.3" MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRIVER_VERSION); @@ -147,6 +156,10 @@ static unsigned long lat_timeout_us = 500; module_param(lat_timeout_us, ulong, 0644); MODULE_PARM_DESC(lat_timeout_us, "Timeout (in us) to wait for server reply"); +static unsigned long peer_timeout_s = 60; +module_param(peer_timeout_s, ulong, 0644); +MODULE_PARM_DESC(peer_timeout_s, "Timeout (in s) to wait for peer link"); + /*============================================================================== * Perf driver data definition *============================================================================== @@ -166,9 +179,18 @@ enum perf_cmd { enum run_mode { RUN_PL_CLIENT, RUN_PL_SERVER, + RUN_DBL_CLIENT, + RUN_DBL_SERVER, }; struct perf_ctx; +struct perf_ext_lat_data; + +struct perf_ext_lat_ops { + int (*init)(struct perf_ext_lat_data *data); + int (*run)(struct 
perf_ext_lat_data *data); + void (*clear)(struct perf_ext_lat_data *data); +}; struct perf_peer { struct perf_ctx *perf; @@ -216,20 +238,21 @@ struct perf_thread { #define to_thread_work(__work) \ container_of(__work, struct perf_thread, work) -struct perf_poll_lat_data { +struct perf_ext_lat_data { struct perf_ctx *perf; - void *src; ktime_t latency; u64 tries; int status; - atomic_t running; - struct work_struct clt_work; - struct work_struct srv_work; + struct perf_ext_lat_ops ops; + struct work_struct work; + + union { + void *src; + int db; + }; }; -#define to_pldata_clt_work(__work) \ - container_of(__work, struct perf_poll_lat_data, clt_work) -#define to_pldata_srv_work(__work) \ - container_of(__work, struct perf_poll_lat_data, srv_work) +#define to_ext_lat_data(__work) \ + container_of(__work, struct perf_ext_lat_data, work) struct perf_ctx { struct ntb_dev *ntb; @@ -238,7 +261,12 @@ struct perf_ctx { int gidx; int pcnt; struct perf_peer *peers; - struct perf_poll_lat_data pldata; + + /* Ext latency tests interface */ + enum run_mode mode; + struct perf_ext_lat_data pldata; + struct perf_ext_lat_data dbldata; + atomic_t running; /* Performance measuring work-threads interface */ unsigned long busy_flag; @@ -551,6 +579,15 @@ static void perf_link_event(void *ctx) } } +static inline void perf_dbl_pong(struct perf_ctx *perf) +{ + struct perf_ext_lat_data *data = &perf->dbldata; + + ntb_db_clear(perf->ntb, BIT_ULL(data->db)); + data->tries++; + ntb_peer_db_set(perf->ntb, BIT_ULL(data->db)); +} + static void perf_db_event(void *ctx, int vec) { struct perf_ctx *perf = ctx; @@ -559,7 +596,11 @@ static void perf_db_event(void *ctx, int vec) ntb_db_vector_mask(perf->ntb, vec), ntb_db_read(perf->ntb)); /* Just receive all available commands */ - (void)perf_cmd_recv(perf); + if (perf->dbldata.db >= 0 && + BIT_ULL(perf->dbldata.db) & ntb_db_read(perf->ntb)) + perf_dbl_pong(perf); + else + (void)perf_cmd_recv(perf); } static void perf_msg_event(void *ctx) @@ -714,6 
+755,8 @@ static int perf_init_service(struct perf_ctx *perf) return -EINVAL; } + perf->dbldata.db = -1; + if (ntb_msg_count(perf->ntb) >= PERF_MSG_CNT) { perf->cmd_send = perf_msg_cmd_send; perf->cmd_recv = perf_msg_cmd_recv; @@ -1164,14 +1207,14 @@ static void perf_thread_work(struct work_struct *work) perf_clear_test(pthr); } -static int perf_init_pl(struct perf_poll_lat_data *pldata) +static int perf_init_pl(struct perf_ext_lat_data *pldata) { struct perf_ctx *perf = pldata->perf; struct perf_peer *peer = perf->test_peer; u8 *bp; pldata->src = kmalloc_node(peer->outbuf_size, GFP_KERNEL, - dev_to_node(&perf->ntb->dev)); + dev_to_node(&perf->ntb->dev)); if (!pldata->src) return -ENOMEM; @@ -1206,10 +1249,11 @@ static int perf_poll_peer_reply(volatile u8 *cur) return -EINTR; } -static int perf_run_pl_client(struct perf_poll_lat_data *pldata) +static int perf_run_pl_client(struct perf_ext_lat_data *pldata) { - struct perf_peer *peer = pldata->perf->test_peer; - struct ntb_dev *ntb = pldata->perf->ntb; + struct perf_ctx *perf = pldata->perf; + struct perf_peer *peer = perf->test_peer; + struct ntb_dev *ntb = perf->ntb; void *src = pldata->src; u64 stop_at; int ret; @@ -1234,7 +1278,7 @@ static int perf_run_pl_client(struct perf_poll_lat_data *pldata) } else if (ret == 1) { dev_warn(&ntb->dev, "Server terminated on poll latency, stopping\n"); break; - } else if (!atomic_read(&pldata->running)) { + } else if (!atomic_read(&perf->running)) { dev_err(&ntb->dev, "Poll latency client terminated\n"); return -EINTR; } @@ -1270,10 +1314,11 @@ static int perf_run_pl_client(struct perf_poll_lat_data *pldata) return 0; } -static int perf_run_pl_server(struct perf_poll_lat_data *pldata) +static int perf_run_pl_server(struct perf_ext_lat_data *pldata) { - struct perf_peer *peer = pldata->perf->test_peer; - struct ntb_dev *ntb = pldata->perf->ntb; + struct perf_ctx *perf = pldata->perf; + struct perf_peer *peer = perf->test_peer; + struct ntb_dev *ntb = perf->ntb; void *src = 
pldata->src; int ret = 0; @@ -1281,7 +1326,7 @@ static int perf_run_pl_server(struct perf_poll_lat_data *pldata) pldata->tries = 0; - while (ret != 1 && atomic_read(&pldata->running)) { + while (ret != 1 && atomic_read(&perf->running)) { ret = perf_poll_peer_reply(peer->inbuf); if (!ret) { /* Pong to client */ @@ -1303,44 +1348,131 @@ static int perf_run_pl_server(struct perf_poll_lat_data *pldata) dev_dbg(&ntb->dev, "poll_lat: server stopped, had responded %llu times\n", pldata->tries); - return atomic_read(&pldata->running) ? -ENODATA : -EINTR; + return atomic_read(&perf->running) ? -ENODATA : -EINTR; } -static void perf_clear_pl(struct perf_poll_lat_data *pldata) +static void perf_clear_pl(struct perf_ext_lat_data *pldata) { struct perf_ctx *perf = pldata->perf; struct perf_peer *peer = perf->test_peer; memset(peer->inbuf, stop_word, 1); - atomic_set(&pldata->running, 0); + atomic_set(&perf->running, 0); wake_up(&perf->twait); kfree(pldata->src); } -static void perf_poll_lat_client_work(struct work_struct *work) +static struct perf_ext_lat_ops perf_pl_client_ops = { + .init = perf_init_pl, + .run = perf_run_pl_client, + .clear = perf_clear_pl +}; + +static struct perf_ext_lat_ops perf_pl_server_ops = { + .init = perf_init_pl, + .run = perf_run_pl_server, + .clear = perf_clear_pl +}; + +static int perf_init_dbl(struct perf_ext_lat_data *data) { - struct perf_poll_lat_data *pldata = to_pldata_clt_work(work); + struct perf_ctx *perf = data->perf; - pldata->status = perf_init_pl(pldata); - if (pldata->status) - return; + data->db = get_bitmask_order(ntb_db_valid_mask(perf->ntb)) - 1; + dev_dbg(&perf->ntb->dev, "DB bit for latency test: %d\n", data->db); - pldata->status = perf_run_pl_client(pldata); + if (data->db <= perf->gidx) { + dev_err(&perf->ntb->dev, "No spare DoorBell found\n"); + data->db = -1; + return -ENOSPC; + } - perf_clear_pl(pldata); + return ntb_db_clear_mask(perf->ntb, BIT_ULL(data->db)); } -static void perf_poll_lat_server_work(struct work_struct 
*work) +static int perf_run_dbl_client(struct perf_ext_lat_data *data) { - struct perf_poll_lat_data *pldata = to_pldata_srv_work(work); + struct perf_ctx *perf = data->perf; + struct ntb_dev *ntb = perf->ntb; + u64 stop_at; + + dev_dbg(&ntb->dev, "db_lat: client started.\n"); + + data->tries = 0; + data->latency = ktime_get(); + + if (ntb_peer_db_set(perf->ntb, BIT_ULL(data->db))) + return -EIO; + + stop_at = ktime_get_real_fast_ns() + lat_time_ms * NSEC_PER_MSEC; + while (ktime_get_real_fast_ns() < stop_at) { + /* Avoid processor soft lock-ups */ + schedule(); - pldata->status = perf_init_pl(pldata); - if (pldata->status) + if (!atomic_read(&perf->running)) { + dev_err(&ntb->dev, "DoorBell latency client terminated\n"); + return -EINTR; + } + } + + /* Stop timer */ + data->latency = ktime_sub(ktime_get(), data->latency); + + if (data->tries < LAT_MIN_TRIES) { + dev_err(&ntb->dev, + "Too few steps (%llu) to measure Latency, recommended > %d. Increase value of 'lat_time_ms' parameter\n", + data->tries, LAT_MIN_TRIES); + data->latency = ktime_set(0, 0); + return -EINVAL; + } + + dev_dbg(&ntb->dev, "db_lat: made %llu tries, lasted %llu usecs\n", + data->tries, ktime_to_us(data->latency)); + + data->latency = ns_to_ktime(ktime_divns(data->latency, data->tries)); + + dev_dbg(&ntb->dev, "db_lat: latency %llu us (%llu ns)\n", + ktime_to_us(data->latency), ktime_to_ns(data->latency)); + + return 0; +} + +static void perf_clear_dbl(struct perf_ext_lat_data *data) +{ + struct perf_ctx *perf = data->perf; + + data->db = -1; + ntb_db_set_mask(perf->ntb, BIT_ULL(data->db)); + atomic_set(&perf->running, 0); + wake_up(&perf->twait); +} + +static struct perf_ext_lat_ops perf_dbl_client_ops = { + .init = perf_init_dbl, + .run = perf_run_dbl_client, + .clear = perf_clear_dbl +}; + +static void perf_ext_lat_work(struct work_struct *work) +{ + struct perf_ext_lat_data *data = to_ext_lat_data(work); + + if (!data->ops.init || !data->ops.run || !data->ops.clear) { + struct perf_ctx 
*perf = data->perf; + + data->status = -EFAULT; + atomic_set(&perf->running, 0); + wake_up(&perf->twait); return; + } - pldata->status = perf_run_pl_server(pldata); + data->status = data->ops.init(data); + if (data->status) + return; - perf_clear_pl(pldata); + data->status = data->ops.run(data); + + data->ops.clear(data); } static int perf_set_tcnt(struct perf_ctx *perf, u8 tcnt) @@ -1363,10 +1495,10 @@ static void perf_terminate_test(struct perf_ctx *perf) int tidx; atomic_set(&perf->tsync, -1); - atomic_set(&perf->pldata.running, 0); + atomic_set(&perf->running, 0); wake_up(&perf->twait); - cancel_work_sync(&perf->pldata.srv_work); - cancel_work_sync(&perf->pldata.clt_work); + cancel_work_sync(&perf->pldata.work); + cancel_work_sync(&perf->dbldata.work); for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) { wake_up(&perf->threads[tidx].dma_wait); @@ -1380,9 +1512,10 @@ static int perf_submit_test(struct perf_peer *peer) struct perf_thread *pthr; int tidx, ret; - ret = wait_for_completion_interruptible(&peer->init_comp); - if (ret < 0) - return ret; + ret = wait_for_completion_interruptible_timeout(&peer->init_comp, + msecs_to_jiffies(peer_timeout_s * 1000)); + if (ret <= 0) + return ret ? ret : -ETIMEDOUT; if (test_and_set_bit_lock(0, &perf->busy_flag)) return -EBUSY; @@ -1412,41 +1545,58 @@ static int perf_submit_test(struct perf_peer *peer) return ret; } -static int perf_submit_poll_lat(struct perf_peer *peer, enum run_mode mode) +static int perf_submit_ext_lat(struct perf_peer *peer) { struct perf_ctx *perf = peer->perf; int ret; - ret = wait_for_completion_interruptible(&peer->init_comp); - if (ret < 0) - return ret; + ret = wait_for_completion_interruptible_timeout(&peer->init_comp, + msecs_to_jiffies(peer_timeout_s * 1000)); + if (ret <= 0) + return ret ? 
ret : -ETIMEDOUT; if (test_and_set_bit_lock(0, &perf->busy_flag)) return -EBUSY; perf->test_peer = peer; - atomic_set(&perf->pldata.running, 1); + atomic_set(&perf->running, 1); perf->pldata.status = -ENODATA; perf->pldata.tries = 0; perf->pldata.latency = ktime_set(0, 0); + perf->dbldata.status = -ENODATA; + perf->dbldata.tries = 0; + perf->dbldata.latency = ktime_set(0, 0); - switch (mode) { + switch (perf->mode) { case RUN_PL_SERVER: - (void)queue_work(perf_wq, &perf->pldata.srv_work); + perf->pldata.ops = perf_pl_server_ops; + (void)queue_work(perf_wq, &perf->pldata.work); break; case RUN_PL_CLIENT: - default: - (void)queue_work(perf_wq, &perf->pldata.clt_work); + perf->pldata.ops = perf_pl_client_ops; + (void)queue_work(perf_wq, &perf->pldata.work); + break; + case RUN_DBL_SERVER: + ret = perf_init_dbl(&perf->dbldata); + dev_dbg(&perf->ntb->dev, "db_lat: server started.\n"); + goto submit_exit; + case RUN_DBL_CLIENT: + perf->dbldata.ops = perf_dbl_client_ops; + (void)queue_work(perf_wq, &perf->dbldata.work); break; + default: + ret = -EINVAL; + goto submit_exit; } ret = wait_event_interruptible(perf->twait, - !atomic_read(&perf->pldata.running)); + !atomic_read(&perf->running)); if (ret == -ERESTARTSYS) { perf_terminate_test(perf); ret = -EINTR; } +submit_exit: clear_bit_unlock(0, &perf->busy_flag); return ret; @@ -1494,30 +1644,12 @@ static int perf_read_stats(struct perf_ctx *perf, char *buf, } } - if (perf->pldata.status != -ENODATA) { - (*pos) += scnprintf(buf + *pos, size - *pos, "\n"); - if (perf->pldata.status) { - (*pos) += scnprintf(buf + *pos, size - *pos, - "poll latency: error status %d\n", perf->pldata.status); - } else { - if (ktime_to_us(perf->pldata.latency) < 10) { - (*pos) += scnprintf(buf + *pos, size - *pos, - "poll latency %llu ns\n", - ktime_to_ns(perf->pldata.latency)); - } else { - (*pos) += scnprintf(buf + *pos, size - *pos, - "poll latency %llu us\n", - ktime_to_us(perf->pldata.latency)); - } - } - } - clear_bit_unlock(0, 
&perf->busy_flag); return 0; } -static void perf_init_threads(struct perf_ctx *perf) +static void perf_init_workers(struct perf_ctx *perf) { struct perf_thread *pthr; int tidx; @@ -1525,11 +1657,15 @@ static void perf_init_threads(struct perf_ctx *perf) perf->tcnt = DEF_THREADS_CNT; perf->test_peer = &perf->peers[0]; init_waitqueue_head(&perf->twait); + perf->pldata.perf = perf; - INIT_WORK(&perf->pldata.srv_work, perf_poll_lat_server_work); - INIT_WORK(&perf->pldata.clt_work, perf_poll_lat_client_work); + INIT_WORK(&perf->pldata.work, perf_ext_lat_work); perf->pldata.status = -ENODATA; + perf->dbldata.perf = perf; + INIT_WORK(&perf->dbldata.work, perf_ext_lat_work); + perf->dbldata.status = -ENODATA; + for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) { pthr = &perf->threads[tidx]; @@ -1541,7 +1677,7 @@ static void perf_init_threads(struct perf_ctx *perf) } } -static void perf_clear_threads(struct perf_ctx *perf) +static void perf_clear_workers(struct perf_ctx *perf) { perf_terminate_test(perf); } @@ -1685,8 +1821,55 @@ static const struct file_operations perf_dbgfs_run = { .write = perf_dbgfs_write_run }; -static ssize_t perf_dbgfs_write_run_pl(struct file *filep, const char __user *ubuf, - size_t size, loff_t *offp, enum run_mode mode) +static ssize_t perf_dbgfs_read_run_pl(struct file *filep, char __user *ubuf, + size_t fsize, loff_t *offp) +{ + struct perf_ctx *perf = filep->private_data; + ssize_t size = PERF_BUF_LEN; + ssize_t pos = 0; + ssize_t ret; + char *buf; + + if (test_and_set_bit_lock(0, &perf->busy_flag)) + return -EBUSY; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + pos += scnprintf(buf + pos, size - pos, + " Peer %d test statistics:\n", perf->test_peer->pidx); + + if (perf->pldata.status != -ENODATA) { + if (perf->pldata.status) { + pos += scnprintf(buf + pos, size - pos, + "poll latency: error status %d\n", perf->pldata.status); + } else { + if (ktime_to_us(perf->pldata.latency) < 10) { + pos += scnprintf(buf + pos, size - 
pos, + "poll latency %llu ns\n", + ktime_to_ns(perf->pldata.latency)); + } else { + pos += scnprintf(buf + pos, size - pos, + "poll latency %llu us\n", + ktime_to_us(perf->pldata.latency)); + } + } + } else { + pos += scnprintf(buf + pos, size - pos, "Test did not run\n"); + } + + ret = simple_read_from_buffer(ubuf, fsize, offp, buf, pos); + + kfree(buf); + + clear_bit_unlock(0, &perf->busy_flag); + + return ret; +} + +static ssize_t perf_dbgfs_write_run_ext(struct file *filep, const char __user *ubuf, + size_t size, loff_t *offp, enum run_mode mode) { struct perf_ctx *perf = filep->private_data; struct ntb_dev *ntb = perf->ntb; @@ -1697,50 +1880,132 @@ static ssize_t perf_dbgfs_write_run_pl(struct file *filep, const char __user *ub if (ret) return ret; - if (pidx < 0 && mode == RUN_PL_SERVER) { - dev_dbg(&ntb->dev, "poll_lat: kill server\n"); - if (test_bit(0, &perf->busy_flag)) { - peer = perf->test_peer; - /* Send stop to client */ - memcpy_toio(peer->outbuf, &stop_word, 1); + if (pidx < 0) { + switch (mode) { + case RUN_PL_SERVER: + dev_dbg(&ntb->dev, "poll_lat: kill server\n"); + if (test_bit(0, &perf->busy_flag)) { + peer = perf->test_peer; + /* Send stop to client */ + memcpy_toio(peer->outbuf, &stop_word, 1); + } + perf_terminate_test(perf); + clear_bit_unlock(0, &perf->busy_flag); + return size; + case RUN_DBL_SERVER: + dev_dbg(&ntb->dev, "db_lat: kill server\n"); + perf_clear_dbl(&perf->dbldata); + clear_bit_unlock(0, &perf->busy_flag); + return size; + default: + return -EINVAL; } - perf_terminate_test(perf); - clear_bit_unlock(0, &perf->busy_flag); - return size; } - if (pidx < 0 || pidx >= perf->pcnt) + if (pidx >= perf->pcnt) return -EINVAL; peer = &perf->peers[pidx]; + perf->mode = mode; - ret = perf_submit_poll_lat(peer, mode); + ret = perf_submit_ext_lat(peer); return ret ? 
ret : size; } -static ssize_t perf_dbgfs_write_run_client(struct file *filep, const char __user *ubuf, - size_t size, loff_t *offp) +static ssize_t perf_dbgfs_write_run_pl_client(struct file *filep, + const char __user *ubuf, size_t size, loff_t *offp) { - return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_CLIENT); + return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_PL_CLIENT); } -static const struct file_operations perf_dbgfs_run_client = { +static const struct file_operations perf_dbgfs_run_pl_client = { .open = simple_open, - .read = perf_dbgfs_read_run, - .write = perf_dbgfs_write_run_client + .read = perf_dbgfs_read_run_pl, + .write = perf_dbgfs_write_run_pl_client }; -static ssize_t perf_dbgfs_write_run_server(struct file *filep, const char __user *ubuf, - size_t size, loff_t *offp) +static ssize_t perf_dbgfs_write_run_pl_server(struct file *filep, + const char __user *ubuf, size_t size, loff_t *offp) { - return perf_dbgfs_write_run_pl(filep, ubuf, size, offp, RUN_PL_SERVER); + return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_PL_SERVER); } -static const struct file_operations perf_dbgfs_run_server = { +static const struct file_operations perf_dbgfs_run_pl_server = { .open = simple_open, - .read = perf_dbgfs_read_run, - .write = perf_dbgfs_write_run_server + .read = perf_dbgfs_read_run_pl, + .write = perf_dbgfs_write_run_pl_server +}; + +static ssize_t perf_dbgfs_read_run_dbl(struct file *filep, char __user *ubuf, + size_t fsize, loff_t *offp) +{ + struct perf_ctx *perf = filep->private_data; + ssize_t size = PERF_BUF_LEN; + ssize_t pos = 0; + ssize_t ret; + char *buf; + + if (test_and_set_bit_lock(0, &perf->busy_flag)) + return -EBUSY; + + buf = kmalloc(size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + pos += scnprintf(buf + pos, size - pos, + " Peer %d test statistics:\n", perf->test_peer->pidx); + + if (perf->dbldata.status != -ENODATA) { + if (perf->dbldata.status) { + pos += scnprintf(buf + pos, size - pos, + 
"doorbell latency: error status %d\n", perf->dbldata.status); + } else { + if (ktime_to_us(perf->dbldata.latency) < 10) { + pos += scnprintf(buf + pos, size - pos, + "doorbell latency %llu ns\n", + ktime_to_ns(perf->dbldata.latency)); + } else { + pos += scnprintf(buf + pos, size - pos, + "doorbell latency %llu us\n", + ktime_to_us(perf->dbldata.latency)); + } + } + } else { + pos += scnprintf(buf + pos, size - pos, "Test did not run\n"); + } + + ret = simple_read_from_buffer(ubuf, fsize, offp, buf, pos); + + kfree(buf); + + clear_bit_unlock(0, &perf->busy_flag); + + return ret; +} + +static ssize_t perf_dbgfs_write_run_dbl_client(struct file *filep, + const char __user *ubuf, size_t size, loff_t *offp) +{ + return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_DBL_CLIENT); +} + +static const struct file_operations perf_dbgfs_run_dbl_client = { + .open = simple_open, + .read = perf_dbgfs_read_run_dbl, + .write = perf_dbgfs_write_run_dbl_client +}; + +static ssize_t perf_dbgfs_write_run_dbl_server(struct file *filep, + const char __user *ubuf, size_t size, loff_t *offp) +{ + return perf_dbgfs_write_run_ext(filep, ubuf, size, offp, RUN_DBL_SERVER); +} + +static const struct file_operations perf_dbgfs_run_dbl_server = { + .open = simple_open, + .read = perf_dbgfs_read_run_dbl, + .write = perf_dbgfs_write_run_dbl_server }; static ssize_t perf_dbgfs_read_tcnt(struct file *filep, char __user *ubuf, @@ -1794,8 +2059,7 @@ static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf, struct perf_thread *pthr = &perf->threads[tidx]; pos += scnprintf(buf + pos, buf_size - pos, - "%d: made %llu tries\n", - tidx, pthr->tries); + "%d: made %llu tries\n", tidx, pthr->tries); } ret = simple_read_from_buffer(ubuf, size, offp, buf, pos); @@ -1806,7 +2070,7 @@ static ssize_t perf_dbgfs_read_lattrs(struct file *filep, char __user *ubuf, } static ssize_t perf_dbgfs_read_inbuf(struct file *filep, char __user *ubuf, - size_t size, loff_t *offp) + size_t size, loff_t 
*offp) { struct perf_ctx *perf = filep->private_data; char buf[32]; @@ -1842,6 +2106,9 @@ static const struct file_operations perf_dbgfs_inbuf = { static void perf_setup_dbgfs(struct perf_ctx *perf) { struct pci_dev *pdev = perf->ntb->pdev; + struct dentry *burst_lat_dir; + struct dentry *poll_lat_dir; + struct dentry *db_lat_dir; perf->dbgfs_dir = debugfs_create_dir(pci_name(pdev), perf_dbgfs_topdir); if (!perf->dbgfs_dir) { @@ -1852,17 +2119,10 @@ static void perf_setup_dbgfs(struct perf_ctx *perf) debugfs_create_file("info", 0600, perf->dbgfs_dir, perf, &perf_dbgfs_info); - debugfs_create_file("run", 0600, perf->dbgfs_dir, perf, - &perf_dbgfs_run); + debugfs_create_symlink("run", perf->dbgfs_dir, "burst_latency/run"); - debugfs_create_file("run_client", 0600, perf->dbgfs_dir, perf, - &perf_dbgfs_run_client); - - debugfs_create_file("run_server", 0600, perf->dbgfs_dir, perf, - &perf_dbgfs_run_server); - - debugfs_create_file("threads_count", 0600, perf->dbgfs_dir, perf, - &perf_dbgfs_tcnt); + debugfs_create_symlink("threads_count", perf->dbgfs_dir, + "burst_latency/threads_count"); /* They are made read-only for test exec safety and integrity */ debugfs_create_u8("chunk_order", 0500, perf->dbgfs_dir, &chunk_order); @@ -1871,14 +2131,44 @@ static void perf_setup_dbgfs(struct perf_ctx *perf) debugfs_create_bool("use_dma", 0500, perf->dbgfs_dir, &use_dma); - debugfs_create_file("latency_tries", 0400, perf->dbgfs_dir, perf, + debugfs_create_file("inbuf", 0400, perf->dbgfs_dir, perf, + &perf_dbgfs_inbuf); + + /* burst_latency subdir */ + + burst_lat_dir = debugfs_create_dir("burst_latency", perf->dbgfs_dir); + + debugfs_create_file("run", 0600, burst_lat_dir, perf, &perf_dbgfs_run); + + debugfs_create_file("threads_count", 0600, burst_lat_dir, perf, + &perf_dbgfs_tcnt); + + debugfs_create_file("tries", 0400, burst_lat_dir, perf, &perf_dbgfs_lattrs); - debugfs_create_u64("poll_latency_tries", 0400, perf->dbgfs_dir, - &perf->pldata.tries); + /* poll_latency subdir */ - 
debugfs_create_file("inbuf", 0400, perf->dbgfs_dir, perf, - &perf_dbgfs_inbuf); + poll_lat_dir = debugfs_create_dir("poll_latency", perf->dbgfs_dir); + + debugfs_create_file("run_client", 0600, poll_lat_dir, perf, + &perf_dbgfs_run_pl_client); + + debugfs_create_file("run_server", 0600, poll_lat_dir, perf, + &perf_dbgfs_run_pl_server); + + debugfs_create_u64("tries", 0400, poll_lat_dir, &perf->pldata.tries); + + /* db_latency subdir */ + + db_lat_dir = debugfs_create_dir("db_latency", perf->dbgfs_dir); + + debugfs_create_file("run_client", 0600, db_lat_dir, perf, + &perf_dbgfs_run_dbl_client); + + debugfs_create_file("run_server", 0600, db_lat_dir, perf, + &perf_dbgfs_run_dbl_server); + + debugfs_create_u64("tries", 0400, db_lat_dir, &perf->dbldata.tries); } static void perf_clear_dbgfs(struct perf_ctx *perf) @@ -1998,7 +2288,7 @@ static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb) if (ret) return ret; - perf_init_threads(perf); + perf_init_workers(perf); ret = perf_init_service(perf); if (ret) @@ -2021,7 +2311,7 @@ static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb) perf_disable_service(perf); - perf_clear_threads(perf); + perf_clear_workers(perf); } static struct ntb_client perf_client = { From df19e18e21fda4aa4281b53764889f635c2446ec Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Aug 2020 16:14:34 -0700 Subject: [PATCH 0097/1250] tools/memory-model: Document locking corner cases Most Linux-kernel uses of locking are straightforward, but there are corner-case uses that rely on less well-known aspects of the lock and unlock primitives. This commit therefore adds a locking.txt and litmus tests in Documentation/litmus-tests/locking to explain these corner-case uses. Signed-off-by: Paul E. 
McKenney --- .../litmus-tests/locking/DCL-broken.litmus | 55 +++ .../litmus-tests/locking/DCL-fixed.litmus | 56 +++ .../litmus-tests/locking/RM-broken.litmus | 42 +++ .../litmus-tests/locking/RM-fixed.litmus | 42 +++ tools/memory-model/Documentation/locking.txt | 320 ++++++++++++++++++ 5 files changed, 515 insertions(+) create mode 100644 Documentation/litmus-tests/locking/DCL-broken.litmus create mode 100644 Documentation/litmus-tests/locking/DCL-fixed.litmus create mode 100644 Documentation/litmus-tests/locking/RM-broken.litmus create mode 100644 Documentation/litmus-tests/locking/RM-fixed.litmus create mode 100644 tools/memory-model/Documentation/locking.txt diff --git a/Documentation/litmus-tests/locking/DCL-broken.litmus b/Documentation/litmus-tests/locking/DCL-broken.litmus new file mode 100644 index 00000000000000..cfaa25ff82b1e3 --- /dev/null +++ b/Documentation/litmus-tests/locking/DCL-broken.litmus @@ -0,0 +1,55 @@ +C DCL-broken + +(* + * Result: Sometimes + * + * This litmus test demonstrates more than just locking is required to + * correctly implement double-checked locking. 
+ *) + +{ + int flag; + int data; + int lck; +} + +P0(int *flag, int *data, int *lck) +{ + int r0; + int r1; + int r2; + + r0 = READ_ONCE(*flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + WRITE_ONCE(*flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); +} + +P1(int *flag, int *data, int *lck) +{ + int r0; + int r1; + int r2; + + r0 = READ_ONCE(*flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + WRITE_ONCE(*flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); +} + +locations [flag;data;lck;0:r0;0:r1;1:r0;1:r1] +exists (0:r2=0 \/ 1:r2=0) diff --git a/Documentation/litmus-tests/locking/DCL-fixed.litmus b/Documentation/litmus-tests/locking/DCL-fixed.litmus new file mode 100644 index 00000000000000..579d6c246f167c --- /dev/null +++ b/Documentation/litmus-tests/locking/DCL-fixed.litmus @@ -0,0 +1,56 @@ +C DCL-fixed + +(* + * Result: Never + * + * This litmus test demonstrates that double-checked locking can be + * reliable given proper use of smp_load_acquire() and smp_store_release() + * in addition to the locking. 
+ *) + +{ + int flag; + int data; + int lck; +} + +P0(int *flag, int *data, int *lck) +{ + int r0; + int r1; + int r2; + + r0 = smp_load_acquire(flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + smp_store_release(flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); +} + +P1(int *flag, int *data, int *lck) +{ + int r0; + int r1; + int r2; + + r0 = smp_load_acquire(flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + smp_store_release(flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); +} + +locations [flag;data;lck;0:r0;0:r1;1:r0;1:r1] +exists (0:r2=0 \/ 1:r2=0) diff --git a/Documentation/litmus-tests/locking/RM-broken.litmus b/Documentation/litmus-tests/locking/RM-broken.litmus new file mode 100644 index 00000000000000..c586ae4b547de1 --- /dev/null +++ b/Documentation/litmus-tests/locking/RM-broken.litmus @@ -0,0 +1,42 @@ +C RM-broken + +(* + * Result: DEADLOCK + * + * This litmus test demonstrates that the old "roach motel" approach + * to locking, where code can be freely moved into critical sections, + * cannot be used in the Linux kernel. 
+ *) + +{ + int lck; + int x; + int y; +} + +P0(int *x, int *y, int *lck) +{ + int r2; + + spin_lock(lck); + r2 = atomic_inc_return(y); + WRITE_ONCE(*x, 1); + spin_unlock(lck); +} + +P1(int *x, int *y, int *lck) +{ + int r0; + int r1; + int r2; + + spin_lock(lck); + r0 = READ_ONCE(*x); + r1 = READ_ONCE(*x); + r2 = atomic_inc_return(y); + spin_unlock(lck); +} + +locations [x;lck;0:r2;1:r0;1:r1;1:r2] +filter (y=2 /\ 1:r0=0 /\ 1:r1=1) +exists (1:r2=1) diff --git a/Documentation/litmus-tests/locking/RM-fixed.litmus b/Documentation/litmus-tests/locking/RM-fixed.litmus new file mode 100644 index 00000000000000..672856736b42e8 --- /dev/null +++ b/Documentation/litmus-tests/locking/RM-fixed.litmus @@ -0,0 +1,42 @@ +C RM-fixed + +(* + * Result: Never + * + * This litmus test demonstrates that the old "roach motel" approach + * to locking, where code can be freely moved into critical sections, + * cannot be used in the Linux kernel. + *) + +{ + int lck; + int x; + int y; +} + +P0(int *x, int *y, int *lck) +{ + int r2; + + spin_lock(lck); + r2 = atomic_inc_return(y); + WRITE_ONCE(*x, 1); + spin_unlock(lck); +} + +P1(int *x, int *y, int *lck) +{ + int r0; + int r1; + int r2; + + r0 = READ_ONCE(*x); + r1 = READ_ONCE(*x); + spin_lock(lck); + r2 = atomic_inc_return(y); + spin_unlock(lck); +} + +locations [x;lck;0:r2;1:r0;1:r1;1:r2] +filter (y=2 /\ 1:r0=0 /\ 1:r1=1) +exists (1:r2=1) diff --git a/tools/memory-model/Documentation/locking.txt b/tools/memory-model/Documentation/locking.txt new file mode 100644 index 00000000000000..4e05c6d53ab724 --- /dev/null +++ b/tools/memory-model/Documentation/locking.txt @@ -0,0 +1,320 @@ +Locking +======= + +Locking is well-known and the common use cases are straightforward: Any +CPU holding a given lock sees any changes previously seen or made by any +CPU before it previously released that same lock. This last sentence +is the only part of this document that most developers will need to read. 
+ +However, developers who would like to also access lock-protected shared +variables outside of their corresponding locks should continue reading. + + +Locking and Prior Accesses +-------------------------- + +The basic rule of locking is worth repeating: + + Any CPU holding a given lock sees any changes previously seen + or made by any CPU before it previously released that same lock. + +Note that this statement is a bit stronger than "Any CPU holding a +given lock sees all changes made by any CPU during the time that CPU was +previously holding this same lock". For example, consider the following +pair of code fragments: + + /* See MP+polocks.litmus. */ + void CPU0(void) + { + WRITE_ONCE(x, 1); + spin_lock(&mylock); + WRITE_ONCE(y, 1); + spin_unlock(&mylock); + } + + void CPU1(void) + { + spin_lock(&mylock); + r0 = READ_ONCE(y); + spin_unlock(&mylock); + r1 = READ_ONCE(x); + } + +The basic rule guarantees that if CPU0() acquires mylock before CPU1(), +then both r0 and r1 must be set to the value 1. This also has the +consequence that if the final value of r0 is equal to 1, then the final +value of r1 must also be equal to 1. In contrast, the weaker rule would +say nothing about the final value of r1. + + +Locking and Subsequent Accesses +------------------------------- + +The converse to the basic rule also holds: Any CPU holding a given +lock will not see any changes that will be made by any CPU after it +subsequently acquires this same lock. This converse statement is +illustrated by the following litmus test: + + /* See MP+porevlocks.litmus. */ + void CPU0(void) + { + r0 = READ_ONCE(y); + spin_lock(&mylock); + r1 = READ_ONCE(x); + spin_unlock(&mylock); + } + + void CPU1(void) + { + spin_lock(&mylock); + WRITE_ONCE(x, 1); + spin_unlock(&mylock); + WRITE_ONCE(y, 1); + } + +This converse to the basic rule guarantees that if CPU0() acquires +mylock before CPU1(), then both r0 and r1 must be set to the value 0. 
+This also has the consequence that if the final value of r1 is equal +to 0, then the final value of r0 must also be equal to 0. In contrast, +the weaker rule would say nothing about the final value of r0. + +These examples show only a single pair of CPUs, but the effects of the +locking basic rule extend across multiple acquisitions of a given lock +across multiple CPUs. + + +Double-Checked Locking +---------------------- + +It is well known that more than just a lock is required to make +double-checked locking work correctly. This litmus test illustrates +one incorrect approach: + + /* See Documentation/litmus-tests/locking/DCL-broken.litmus. */ + P0(int *flag, int *data, int *lck) + { + int r0; + int r1; + int r2; + + r0 = READ_ONCE(*flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + WRITE_ONCE(*flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); + } + /* P1() is exactly the same as P0(). */ + +There are two problems. First, there is no ordering between the first +READ_ONCE() of "flag" and the READ_ONCE() of "data". Second, there is +no ordering between the two WRITE_ONCE() calls. It should therefore be +no surprise that "r2" can be zero, and a quick herd7 run confirms this. + +One way to fix this is to use smp_load_acquire() and smp_store_release() +as shown in this corrected version: + + /* See Documentation/litmus-tests/locking/DCL-fixed.litmus. */ + P0(int *flag, int *data, int *lck) + { + int r0; + int r1; + int r2; + + r0 = smp_load_acquire(flag); + if (r0 == 0) { + spin_lock(lck); + r1 = READ_ONCE(*flag); + if (r1 == 0) { + WRITE_ONCE(*data, 1); + smp_store_release(flag, 1); + } + spin_unlock(lck); + } + r2 = READ_ONCE(*data); + } + /* P1() is exactly the same as P0(). */ + +The smp_load_acquire() guarantees that its load from "flag" will +be ordered before the READ_ONCE() from data, thus solving the first +problem.
The smp_store_release() guarantees that its store will be +ordered after the WRITE_ONCE() to "data", solving the second problem. +The smp_store_release() pairs with the smp_load_acquire(), thus ensuring +that the ordering provided by each actually takes effect. Again, a +quick herd7 run confirms this. + +In short, if you access a lock-protected variable without holding the +corresponding lock, you will need to provide additional ordering, in +this case, via the smp_load_acquire() and the smp_store_release(). + + +Ordering Provided by a Lock to CPUs Not Holding That Lock +--------------------------------------------------------- + +It is not necessarily the case that accesses ordered by locking will be +seen as ordered by CPUs not holding that lock. Consider this example: + + /* See Z6.0+pooncelock+pooncelock+pombonce.litmus. */ + void CPU0(void) + { + spin_lock(&mylock); + WRITE_ONCE(x, 1); + WRITE_ONCE(y, 1); + spin_unlock(&mylock); + } + + void CPU1(void) + { + spin_lock(&mylock); + r0 = READ_ONCE(y); + WRITE_ONCE(z, 1); + spin_unlock(&mylock); + } + + void CPU2(void) + { + WRITE_ONCE(z, 2); + smp_mb(); + r1 = READ_ONCE(x); + } + +Counter-intuitive though it might be, it is quite possible to have +the final value of r0 be 1, the final value of z be 2, and the final +value of r1 be 0. The reason for this surprising outcome is that CPU2() +never acquired the lock, and thus did not fully benefit from the lock's +ordering properties. + +Ordering can be extended to CPUs not holding the lock by careful use +of smp_mb__after_spinlock(): + + /* See Z6.0+pooncelock+poonceLock+pombonce.litmus. 
*/ + void CPU0(void) + { + spin_lock(&mylock); + WRITE_ONCE(x, 1); + WRITE_ONCE(y, 1); + spin_unlock(&mylock); + } + + void CPU1(void) + { + spin_lock(&mylock); + smp_mb__after_spinlock(); + r0 = READ_ONCE(y); + WRITE_ONCE(z, 1); + spin_unlock(&mylock); + } + + void CPU2(void) + { + WRITE_ONCE(z, 2); + smp_mb(); + r1 = READ_ONCE(x); + } + +This addition of smp_mb__after_spinlock() strengthens the lock +acquisition sufficiently to rule out the counter-intuitive outcome. +In other words, the addition of the smp_mb__after_spinlock() prohibits +the counter-intuitive result where the final value of r0 is 1, the final +value of z is 2, and the final value of r1 is 0. + + +No Roach-Motel Locking! +----------------------- + +This example requires familiarity with the herd7 "filter" clause, so +please read up on that topic in litmus-tests.txt. + +It is tempting to allow memory-reference instructions to be pulled +into a critical section, but this cannot be allowed in the general case. +For example, consider a spin loop preceding a lock-based critical section. +Now, herd7 does not model spin loops, but we can emulate one with two +loads, with a "filter" clause to constrain the first to return the +initial value and the second to return the updated value, as shown below: + + /* See Documentation/litmus-tests/locking/RM-fixed.litmus. */ + P0(int *x, int *y, int *lck) + { + int r2; + + spin_lock(lck); + r2 = atomic_inc_return(y); + WRITE_ONCE(*x, 1); + spin_unlock(lck); + } + + P1(int *x, int *y, int *lck) + { + int r0; + int r1; + int r2; + + r0 = READ_ONCE(*x); + r1 = READ_ONCE(*x); + spin_lock(lck); + r2 = atomic_inc_return(y); + spin_unlock(lck); + } + + filter (y=2 /\ 1:r0=0 /\ 1:r1=1) + exists (1:r2=1) + +The variable "x" is the control variable for the emulated spin loop. 
+P0() sets it to "1" while holding the lock, and P1() emulates the +spin loop by reading it twice, first into "1:r0" (which should get the +initial value "0") and then into "1:r1" (which should get the updated +value "1"). + +The purpose of the variable "y" is to reject deadlocked executions. +Only those executions where the final value of "y" is "2" have avoided deadlock. + +The "filter" clause takes all this into account, constraining "y" to +equal "2", "1:r0" to equal "0", and "1:r1" to equal 1. + +Then the "exists" clause checks to see if P1() acquired its lock first, +which should not happen given the filter clause because P0() updates +"x" while holding the lock. And herd7 confirms this. + +But suppose that the compiler was permitted to reorder the spin loop +into P1()'s critical section, like this: + + /* See Documentation/litmus-tests/locking/RM-broken.litmus. */ + P0(int *x, int *y, int *lck) + { + int r2; + + spin_lock(lck); + r2 = atomic_inc_return(y); + WRITE_ONCE(*x, 1); + spin_unlock(lck); + } + + P1(int *x, int *y, int *lck) + { + int r0; + int r1; + int r2; + + spin_lock(lck); + r0 = READ_ONCE(*x); + r1 = READ_ONCE(*x); + r2 = atomic_inc_return(y); + spin_unlock(lck); + } + + locations [x;lck;0:r2;1:r0;1:r1;1:r2] + filter (y=2 /\ 1:r0=0 /\ 1:r1=1) + exists (1:r2=1) + +If "1:r0" is equal to "0", "1:r1" can never equal "1" because P0() +cannot update "x" while P1() holds the lock. And herd7 confirms this, +showing zero executions matching the "filter" criteria. + +And this is why Linux-kernel lock and unlock primitives must prevent +code from entering critical sections. It is not sufficient to only +prevent code from leaving them. From 3b75b8f485d30cc153678bb740c13243799a30af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Mar 2019 11:53:50 -0700 Subject: [PATCH 0098/1250] tools/memory-model: Make judgelitmus.sh note timeouts Currently, judgelitmus.sh treats timeouts (as in the "--timeout" argument) as "!!! Verification error".
This can be misleading because it is quite possible that running the test longer would have produced a verification. This commit therefore changes judgelitmus.sh to check for timeouts and to report them with "!!! Timeout". Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/judgelitmus.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 0cc63875e395d0..d3c313b9a458a7 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -42,6 +42,14 @@ grep '^Observation' $LKMM_DESTDIR/$litmus.out if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out then : +elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmus.out +then + echo ' !!! Timeout' $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo ' !!! Timeout' >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + exit 124 else echo ' !!! Verification error' $litmus if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out From e288d88305357368436c19ee132e62765388333f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Mar 2019 13:07:46 -0700 Subject: [PATCH 0099/1250] tools/memory-model: Make cmplitmushist.sh note timeouts Currently, cmplitmushist.sh treats timeouts (as in the "--timeout" argument) as "Missing Observation line". This can be misleading because it is quite possible that running the test longer would have produced a verification. This commit therefore changes cmplitmushist.sh to check for timeouts and to report them with "Timed out". Signed-off-by: Paul E. 
McKenney --- tools/memory-model/scripts/cmplitmushist.sh | 22 +++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/memory-model/scripts/cmplitmushist.sh b/tools/memory-model/scripts/cmplitmushist.sh index 0f498aeeccf5ec..b9c174dd80042a 100755 --- a/tools/memory-model/scripts/cmplitmushist.sh +++ b/tools/memory-model/scripts/cmplitmushist.sh @@ -12,12 +12,30 @@ trap 'rm -rf $T' 0 mkdir $T # comparetest oldpath newpath +timedout=0 perfect=0 obsline=0 noobsline=0 obsresult=0 badcompare=0 comparetest () { + if grep -q '^Command exited with non-zero status 124' $1 || + grep -q '^Command exited with non-zero status 124' $2 + then + if grep -q '^Command exited with non-zero status 124' $1 && + grep -q '^Command exited with non-zero status 124' $2 + then + echo Both runs timed out: $2 + elif grep -q '^Command exited with non-zero status 124' $1 + then + echo Old run timed out: $2 + elif grep -q '^Command exited with non-zero status 124' $2 + then + echo New run timed out: $2 + fi + timedout=`expr "$timedout" + 1` + return 0 + fi grep -v 'maxresident)k\|minor)pagefaults\|^Time' $1 > $T/oldout grep -v 'maxresident)k\|minor)pagefaults\|^Time' $2 > $T/newout if cmp -s $T/oldout $T/newout && grep -q '^Observation' $1 @@ -78,6 +96,10 @@ if test "$obsresult" -ne 0 then echo Matching Observation Always/Sometimes/Never result: $obsresult 1>&2 fi +if test "$timedout" -ne 0 +then + echo "!!!" Timed out: $timedout 1>&2 +fi if test "$badcompare" -ne 0 then echo "!!!" Result changed: $badcompare 1>&2 From 454cb4a71993e666fd3e0d929efe5bc64ab3062f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Mar 2019 13:40:57 -0700 Subject: [PATCH 0100/1250] tools/memory-model: Make judgelitmus.sh identify bad macros Currently, judgelitmus.sh treats use of unknown primitives (such as srcu_read_lock() prior to SRCU support) as "!!! Verification error". 
This can be misleading because it fails to call out typos and running a version of LKMM on a litmus test requiring a feature not provided by that version. This commit therefore changes judgelitmus.sh to check for unknown primitives and to report them, for example, with: '!!! Current LKMM version does not know "rcu_write_lock"'. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/cmplitmushist.sh | 31 ++++++++++++++++++--- tools/memory-model/scripts/judgelitmus.sh | 12 ++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/tools/memory-model/scripts/cmplitmushist.sh b/tools/memory-model/scripts/cmplitmushist.sh index b9c174dd80042a..ca1ac8b646144a 100755 --- a/tools/memory-model/scripts/cmplitmushist.sh +++ b/tools/memory-model/scripts/cmplitmushist.sh @@ -12,6 +12,7 @@ trap 'rm -rf $T' 0 mkdir $T # comparetest oldpath newpath +badmacnam=0 timedout=0 perfect=0 obsline=0 @@ -19,8 +20,26 @@ noobsline=0 obsresult=0 badcompare=0 comparetest () { - if grep -q '^Command exited with non-zero status 124' $1 || - grep -q '^Command exited with non-zero status 124' $2 + if grep -q ': Unknown macro ' $1 || grep -q ': Unknown macro ' $2 + then + if grep -q ': Unknown macro ' $1 + then + badname=`grep ': Unknown macro ' $1 | + sed -e 's/^.*: Unknown macro //' | + sed -e 's/ (User error).*$//'` + echo 'Current LKMM version does not know "'$badname'"' $1 + fi + if grep -q ': Unknown macro ' $2 + then + badname=`grep ': Unknown macro ' $2 | + sed -e 's/^.*: Unknown macro //' | + sed -e 's/ (User error).*$//'` + echo 'Current LKMM version does not know "'$badname'"' $2 + fi + badmacnam=`expr "$badmacnam" + 1` + return 0 + elif grep -q '^Command exited with non-zero status 124' $1 || + grep -q '^Command exited with non-zero status 124' $2 then if grep -q '^Command exited with non-zero status 124' $1 && grep -q '^Command exited with non-zero status 124' $2 @@ -56,7 +75,7 @@ comparetest () { return 0 fi else - echo Missing Observation line "(e.g., herd7
timeout)": $2 + echo Missing Observation line "(e.g., syntax error)": $2 noobsline=`expr "$noobsline" + 1` return 0 fi @@ -90,7 +109,7 @@ then fi if test "$noobsline" -ne 0 then - echo Missing Observation line "(e.g., herd7 timeout)": $noobsline 1>&2 + echo Missing Observation line "(e.g., syntax error)": $noobsline 1>&2 fi if test "$obsresult" -ne 0 then @@ -100,6 +119,10 @@ if test "$timedout" -ne 0 then echo "!!!" Timed out: $timedout 1>&2 fi +if test "$badmacnam" -ne 0 +then + echo "!!!" Unknown primitive: $badmacnam 1>&2 +fi if test "$badcompare" -ne 0 then echo "!!!" Result changed: $badcompare 1>&2 diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index d3c313b9a458a7..d40439c7b71e0f 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -42,6 +42,18 @@ grep '^Observation' $LKMM_DESTDIR/$litmus.out if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out then : +elif grep ': Unknown macro ' $LKMM_DESTDIR/$litmus.out +then + badname=`grep ': Unknown macro ' $LKMM_DESTDIR/$litmus.out | + sed -e 's/^.*: Unknown macro //' | + sed -e 's/ (User error).*$//'` + badmsg=' !!! Current LKMM version does not know "'$badname'"'" $litmus" + echo $badmsg + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo ' !!! '$badmsg >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + exit 254 elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmus.out then echo ' !!! Timeout' $litmus From f2bb81a27d1bd5c8e01189e003544fc512bd346f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2019 14:27:06 -0700 Subject: [PATCH 0101/1250] tools/memory-model: Make judgelitmus.sh detect hard deadlocks If a litmus test specifies "Result: Never" and if it contains an unconditional ("hard") deadlock, then running checklitmus.sh on it will not flag any errors, despite the fact that there are no executions. 
This commit therefore updates judgelitmus.sh to complain about tests with no executions that are marked, but not as "Result: DEADLOCK". Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/judgelitmus.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index d40439c7b71e0f..84c62eee321bf4 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -83,6 +83,14 @@ then fi ret=1 fi +elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$' +then + echo " !!! Unexpected non-$outcome deadlock" $litmus + if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + then + echo " !!! Unexpected non-$outcome deadlock" $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 + fi + ret=1 elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q $outcome || test "$outcome" = Maybe then ret=0 From deb2df812405620d8a6e510036c37d46ebb3d61e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 11 Apr 2019 07:33:18 -0700 Subject: [PATCH 0102/1250] tools/memory-model: Fix paulmck email address on pre-existing scripts Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkalllitmus.sh | 2 +- tools/memory-model/scripts/checklitmus.sh | 2 +- tools/memory-model/scripts/checklitmushist.sh | 2 +- tools/memory-model/scripts/judgelitmus.sh | 2 +- tools/memory-model/scripts/newlitmushist.sh | 2 +- tools/memory-model/scripts/parseargs.sh | 2 +- tools/memory-model/scripts/runlitmushist.sh | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh index 3c0c7fbbd223b7..10e14d94acee5c 100755 --- a/tools/memory-model/scripts/checkalllitmus.sh +++ b/tools/memory-model/scripts/checkalllitmus.sh @@ -17,7 +17,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney . 
scripts/parseargs.sh diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh index 11461ed40b5e45..638b8c610894b1 100755 --- a/tools/memory-model/scripts/checklitmus.sh +++ b/tools/memory-model/scripts/checklitmus.sh @@ -15,7 +15,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney litmus=$1 herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} diff --git a/tools/memory-model/scripts/checklitmushist.sh b/tools/memory-model/scripts/checklitmushist.sh index 1d210ffb7c8af6..406ecfc0aee4ca 100755 --- a/tools/memory-model/scripts/checklitmushist.sh +++ b/tools/memory-model/scripts/checklitmushist.sh @@ -12,7 +12,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney . scripts/parseargs.sh diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 84c62eee321bf4..d82133e75580c9 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -13,7 +13,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney litmus=$1 diff --git a/tools/memory-model/scripts/newlitmushist.sh b/tools/memory-model/scripts/newlitmushist.sh index 991f8f81488174..3f4b06e299886c 100755 --- a/tools/memory-model/scripts/newlitmushist.sh +++ b/tools/memory-model/scripts/newlitmushist.sh @@ -12,7 +12,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney . scripts/parseargs.sh diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 40f52080fdbd6e..afe7bd23de6b83 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -9,7 +9,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. 
McKenney T=/tmp/parseargs.sh.$$ mkdir $T diff --git a/tools/memory-model/scripts/runlitmushist.sh b/tools/memory-model/scripts/runlitmushist.sh index 6ed376f495bb4e..852786fef179f7 100755 --- a/tools/memory-model/scripts/runlitmushist.sh +++ b/tools/memory-model/scripts/runlitmushist.sh @@ -13,7 +13,7 @@ # # Copyright IBM Corporation, 2018 # -# Author: Paul E. McKenney +# Author: Paul E. McKenney T=/tmp/runlitmushist.sh.$$ trap 'rm -rf $T' 0 From e292cf7435ce8696d3d78e35aa67f46ebaf00af6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2019 15:59:26 -0700 Subject: [PATCH 0103/1250] tools/memory-model: Update parseargs.sh for hardware verification This commit adds a --hw argument to parseargs.sh to specify the CPU family for a hardware verification. For example, "--hw AArch64" will specify that a C-language litmus test is to be translated to ARMv8 and the result verified. This will set the LKMM_HW_MAP_FILE environment variable accordingly. If there is no --hw argument, this environment variable will be set to the empty string. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/parseargs.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index afe7bd23de6b83..5f016fc3f3af5d 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -27,6 +27,7 @@ initparam () { initparam LKMM_DESTDIR "." initparam LKMM_HERD_OPTIONS "-conf linux-kernel.cfg" +initparam LKMM_HW_MAP_FILE "" initparam LKMM_JOBS `getconf _NPROCESSORS_ONLN` initparam LKMM_PROCS "3" initparam LKMM_TIMEOUT "1m" @@ -37,10 +38,11 @@ usagehelp () { echo "Usage $scriptname [ arguments ]" echo " --destdir path (place for .litmus.out, default by .litmus)" echo " --herdopts -conf linux-kernel.cfg ..." 
+ echo " --hw AArch64" echo " --jobs N (number of jobs, default one per CPU)" echo " --procs N (litmus tests with at most this many processes)" echo " --timeout N (herd7 timeout (e.g., 10s, 1m, 2hr, 1d, '')" - echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'" + echo "Defaults: --destdir '$LKMM_DESTDIR_DEF' --herdopts '$LKMM_HERD_OPTIONS_DEF' --hw '$LKMM_HW_MAP_FILE' --jobs '$LKMM_JOBS_DEF' --procs '$LKMM_PROCS_DEF' --timeout '$LKMM_TIMEOUT_DEF'" exit 1 } @@ -95,6 +97,11 @@ do LKMM_HERD_OPTIONS="$2" shift ;; + --hw) + checkarg --hw "(.map file architecture name)" "$#" "$2" '^[A-Za-z0-9_-]\+' '^--' + LKMM_HW_MAP_FILE="$2" + shift + ;; -j[1-9]*) njobs="`echo $1 | sed -e 's/^-j//'`" trailchars="`echo $njobs | sed -e 's/[0-9]\+\(.*\)$/\1/'`" From 8bda372a2bdaa2be1f51c4f8b9a4fcb3cbbefa22 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2019 14:39:10 -0700 Subject: [PATCH 0104/1250] tools/memory-model: Make judgelitmus.sh handle hardware verifications This commit makes the judgelitmus.sh script check the --hw argument (AKA the LKMM_HW_MAP_FILE environment variable) and to adjust its judgment for a run where a C-language litmus test has been translated to assembly and the assembly version verified. In this case, the assembly verification output is checked against the C-language script's "Result:" comment. However, because hardware can be stronger than LKMM requires, the judgelitmus.sh script forgives verification mismatches featuring a "Sometimes" in the C-language script and an "Always" or "Never" assembly-language verification. Note that deadlock is not forgiven, however, this should not normally be an issue given that C-language tests containing locking, RCU, or SRCU cannot be translated to assembly. However, this issue can crop up in litmus tests that mimic deadlock by using the "filter" clause to ignore all executions. 
It can also crop up when certain herd arguments are used to autofilter everything that does not match the "exists" clause in cases where the "exists" clause cannot be satisfied. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/README | 8 +-- tools/memory-model/scripts/judgelitmus.sh | 75 ++++++++++++++--------- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/tools/memory-model/scripts/README b/tools/memory-model/scripts/README index 095c7eb36f9f90..0e29a52044c1af 100644 --- a/tools/memory-model/scripts/README +++ b/tools/memory-model/scripts/README @@ -43,10 +43,10 @@ initlitmushist.sh judgelitmus.sh - Given a .litmus file and its .litmus.out herd7 output, check the - .litmus.out file against the .litmus file's "Result:" comment to - judge whether the test ran correctly. Not normally run manually, - provided instead for use by other scripts. + Given a .litmus file and its herd7 output, check the output file + against the .litmus file's "Result:" comment to judge whether + the test ran correctly. Not normally run manually, provided + instead for use by other scripts. newlitmushist.sh diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index d82133e75580c9..6f3c60065c8b5b 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -1,9 +1,14 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# Given a .litmus test and the corresponding .litmus.out file, check -# the .litmus.out file against the "Result:" comment to judge whether -# the test ran correctly. +# Given a .litmus test and the corresponding litmus output file, check +# the .litmus.out file against the "Result:" comment to judge whether the +# test ran correctly. If the --hw argument is omitted, check against the +# LKMM output, which is assumed to be in file.litmus.out. 
If this argument +# is provided, this is assumed to be a hardware test, and the output is +# assumed to be in file.HW.litmus.out, where "HW" is the --hw argument. +# In addition, non-Sometimes verification results will be noted, but +# forgiven. # # Usage: # judgelitmus.sh file.litmus @@ -24,11 +29,18 @@ else echo ' --- ' error: \"$litmus\" is not a readable file exit 255 fi -if test -f "$LKMM_DESTDIR/$litmus".out -a -r "$LKMM_DESTDIR/$litmus".out +if test -z "$LKMM_HW_MAP_FILE" +then + litmusout=$litmus.out +else + litmusout="`echo $litmus | + sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'`.out" +fi +if test -f "$LKMM_DESTDIR/$litmusout" -a -r "$LKMM_DESTDIR/$litmusout" then : else - echo ' --- ' error: \"$LKMM_DESTDIR/$litmus\".out is not a readable file + echo ' --- ' error: \"$LKMM_DESTDIR/$litmusout is not a readable file exit 255 fi if grep -q '^ \* Result: ' $litmus @@ -38,69 +50,76 @@ else outcome=specified fi -grep '^Observation' $LKMM_DESTDIR/$litmus.out -if grep -q '^Observation' $LKMM_DESTDIR/$litmus.out +grep '^Observation' $LKMM_DESTDIR/$litmusout +if grep -q '^Observation' $LKMM_DESTDIR/$litmusout then : -elif grep ': Unknown macro ' $LKMM_DESTDIR/$litmus.out +elif grep ': Unknown macro ' $LKMM_DESTDIR/$litmusout then - badname=`grep ': Unknown macro ' $LKMM_DESTDIR/$litmus.out | + badname=`grep ': Unknown macro ' $LKMM_DESTDIR/$litmusout | sed -e 's/^.*: Unknown macro //' | sed -e 's/ (User error).*$//'` badmsg=' !!! Current LKMM version does not know "'$badname'"'" $litmus" echo $badmsg - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo ' !!! '$badmsg >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo ' !!! '$badmsg >> $LKMM_DESTDIR/$litmusout 2>&1 fi exit 254 -elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmus.out +elif grep '^Command exited with non-zero status 124' $LKMM_DESTDIR/$litmusout then echo ' !!! Timeout' $litmus - if ! grep -q '!!!' 
$LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo ' !!! Timeout' >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo ' !!! Timeout' >> $LKMM_DESTDIR/$litmusout 2>&1 fi exit 124 else echo ' !!! Verification error' $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo ' !!! Verification error' >> $LKMM_DESTDIR/$litmusout 2>&1 fi exit 255 fi if test "$outcome" = DEADLOCK then - if grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$' + if grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q 'Never 0 0$' then ret=0 else echo " !!! Unexpected non-$outcome verification" $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo " !!! Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmusout 2>&1 fi ret=1 fi -elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q 'Never 0 0$' +elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q 'Never 0 0$' then echo " !!! Unexpected non-$outcome deadlock" $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if ! grep -q '!!!' $LKMM_DESTDIR/$litmusout then - echo " !!! Unexpected non-$outcome deadlock" $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 + echo " !!! Unexpected non-$outcome deadlock" $litmus >> $LKMM_DESTDIR/$litmusout 2>&1 fi ret=1 -elif grep '^Observation' $LKMM_DESTDIR/$litmus.out | grep -q $outcome || test "$outcome" = Maybe +elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q $outcome || test "$outcome" = Maybe then ret=0 else - echo " !!! Unexpected non-$outcome verification" $litmus - if ! grep -q '!!!' $LKMM_DESTDIR/$litmus.out + if test -n "$LKMM_HW_MAP_FILE" -a "$outcome" = Sometimes then - echo " !!! 
Unexpected non-$outcome verification" >> $LKMM_DESTDIR/$litmus.out 2>&1 + flag="--- Forgiven" + ret=0 + else + flag="!!! Unexpected" + ret=1 + fi + echo " $flag non-$outcome verification" $litmus + if ! grep -qe "$flag" $LKMM_DESTDIR/$litmusout + then + echo " $flag non-$outcome verification" >> $LKMM_DESTDIR/$litmusout 2>&1 fi - ret=1 fi -tail -2 $LKMM_DESTDIR/$litmus.out | head -1 +tail -2 $LKMM_DESTDIR/$litmusout | head -1 exit $ret From 6b6c60b65adde696e7d4f3ffd7240e3290ecc392 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2019 16:21:09 -0700 Subject: [PATCH 0105/1250] tools/memory-model: Add simpletest.sh to check locking, RCU, and SRCU This commit abstracts out common function to check a given litmus test for locking, RCU, and SRCU in order to avoid duplicating code. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/simpletest.sh | 35 ++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100755 tools/memory-model/scripts/simpletest.sh diff --git a/tools/memory-model/scripts/simpletest.sh b/tools/memory-model/scripts/simpletest.sh new file mode 100755 index 00000000000000..7edc5d36166570 --- /dev/null +++ b/tools/memory-model/scripts/simpletest.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Give zero status if this is a simple test and non-zero otherwise. +# Simple tests do not contain locking, RCU, or SRCU. +# +# Usage: +# simpletest.sh file.litmus +# +# Copyright IBM Corporation, 2019 +# +# Author: Paul E. 
McKenney + + +litmus=$1 + +if test -f "$litmus" -a -r "$litmus" +then + : +else + echo ' --- ' error: \"$litmus\" is not a readable file + exit 255 +fi +exclude="^[[:space:]]*\(" +exclude="${exclude}spin_lock(\|spin_unlock(\|spin_trylock(\|spin_is_locked(" +exclude="${exclude}\|rcu_read_lock(\|rcu_read_unlock(" +exclude="${exclude}\|synchronize_rcu(\|synchronize_rcu_expedited(" +exclude="${exclude}\|srcu_read_lock(\|srcu_read_unlock(" +exclude="${exclude}\|synchronize_srcu(\|synchronize_srcu_expedited(" +exclude="${exclude}\)" +if grep -q $exclude $litmus +then + exit 255 +fi +exit 0 From c541c92f62974b341bcd3cc7fea427404dd101f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 19 Mar 2019 16:37:01 -0700 Subject: [PATCH 0106/1250] tools/memory-model: Fix checkalllitmus.sh comment The checkalllitmus.sh runs litmus tests in the litmus-tests directory, not those in the github archive, so this commit updates the comment to reflect this reality. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkalllitmus.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh index 10e14d94acee5c..54d8da8c338e17 100755 --- a/tools/memory-model/scripts/checkalllitmus.sh +++ b/tools/memory-model/scripts/checkalllitmus.sh @@ -30,8 +30,8 @@ else exit 255 fi -# Create any new directories that have appeared in the github litmus -# repo since the last run. +# Create any new directories that have appeared in the litmus-tests +# directory since the last run. if test "$LKMM_DESTDIR" != "." then find $litmusdir -type d -print | From 4416bb57a79ad019bf98691d1ba731282fd21b16 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 20 Mar 2019 12:39:27 -0700 Subject: [PATCH 0107/1250] tools/memory-model: Hardware checking for check{,all}litmus.sh This commit makes checklitmus.sh and checkalllitmus.sh check to see if a hardware verification was specified (via the --hw command-line argument, which sets the LKMM_HW_MAP_FILE environment variable). If so, the C-language litmus test is converted to the specified type of assembly-language litmus test and herd is run on it. Hardware is permitted to be stronger than LKMM requires, so "Always" and "Never" verifications of "Sometimes" C-language litmus tests are forgiven. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkalllitmus.sh | 23 +++++------ tools/memory-model/scripts/checklitmus.sh | 42 ++++++++++++++++++-- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/tools/memory-model/scripts/checkalllitmus.sh b/tools/memory-model/scripts/checkalllitmus.sh index 54d8da8c338e17..2d3ee850a8399e 100755 --- a/tools/memory-model/scripts/checkalllitmus.sh +++ b/tools/memory-model/scripts/checkalllitmus.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # SPDX-License-Identifier: GPL-2.0+ # # Run herd7 tests on all .litmus files in the litmus-tests directory @@ -8,6 +8,11 @@ # "^^^". It also outputs verification results to a file whose name is # that of the specified litmus test, but with ".out" appended. # +# If the --hw argument is specified, this script translates the .litmus +# C-language file to the specified type of assembly and verifies that. +# But in this case, litmus tests using complex synchronization (such as +# locking, RCU, and SRCU) are cheerfully ignored. +# # Usage: # checkalllitmus.sh # @@ -38,21 +43,15 @@ then ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) fi -# Find the checklitmus script. If it is not where we expect it, then -# assume that the caller has the PATH environment variable set -# appropriately. 
-if test -x scripts/checklitmus.sh -then - clscript=scripts/checklitmus.sh -else - clscript=checklitmus.sh -fi - # Run the script on all the litmus tests in the specified directory ret=0 for i in $litmusdir/*.litmus do - if ! $clscript $i + if test -n "$LKMM_HW_MAP_FILE" && ! scripts/simpletest.sh $i + then + continue + fi + if ! scripts/checklitmus.sh $i then ret=1 fi diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh index 638b8c610894b1..42ff11869cd626 100755 --- a/tools/memory-model/scripts/checklitmus.sh +++ b/tools/memory-model/scripts/checklitmus.sh @@ -6,6 +6,11 @@ # results to a file whose name is that of the specified litmus test, but # with ".out" appended. # +# If the --hw argument is specified, this script translates the .litmus +# C-language file to the specified type of assembly and verifies that. +# But in this case, litmus tests using complex synchronization (such as +# locking, RCU, and SRCU) are cheerfully ignored. +# # Usage: # checklitmus.sh file.litmus # @@ -18,8 +23,6 @@ # Author: Paul E. 
McKenney litmus=$1 -herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} - if test -f "$litmus" -a -r "$litmus" then : @@ -28,7 +31,38 @@ else exit 255 fi -echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out -/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 +if test -z "$LKMM_HW_MAP_FILE" +then + # LKMM run + herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} + echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out + /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 +else + # Hardware run + + T=/tmp/checklitmushw.sh.$$ + trap 'rm -rf $T' 0 2 + mkdir $T + + # Generate filenames + catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" + mapfile="Linux2${LKMM_HW_MAP_FILE}.map" + themefile="$T/${LKMM_HW_MAP_FILE}.theme" + herdoptions="-model $LKMM_HW_CAT_FILE" + hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` + hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` + + # Don't run on litmus tests with complex synchronization + if ! scripts/simpletest.sh $litmus + then + echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU + exit 254 + fi + + # Generate the assembly code and run herd7 on it. + gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile + jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out + /usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +fi scripts/judgelitmus.sh $litmus From 26ac8a58dac7f0b46015b7bf620d0e3f9f143342 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Mar 2019 14:37:46 -0700 Subject: [PATCH 0108/1250] tools/memory-model: Make judgelitmus.sh ransack .litmus.out files The judgelitmus.sh script currently relies solely on the "Result:" comment in the .litmus file. 
This is problematic when using the --hw argument, because it is necessary to check the hardware model against LKMM even in the absence of "Result:" comments. This commit therefore modifies judgelitmus.sh to check the observation in a .litmus.out file, in case one was generated by a previous LKMM run. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/judgelitmus.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 6f3c60065c8b5b..fe9131f8eb969a 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -8,7 +8,9 @@ # is provided, this is assumed to be a hardware test, and the output is # assumed to be in file.HW.litmus.out, where "HW" is the --hw argument. # In addition, non-Sometimes verification results will be noted, but -# forgiven. +# forgiven. Furthermore, if there is no "Result:" comment but there is +# an LKMM .litmus.out file, the observation in that file will be used +# to judge the assembly-language verification. # # Usage: # judgelitmus.sh file.litmus @@ -32,9 +34,11 @@ fi if test -z "$LKMM_HW_MAP_FILE" then litmusout=$litmus.out + lkmmout= else litmusout="`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'`.out" + lkmmout=$litmus.out fi if test -f "$LKMM_DESTDIR/$litmusout" -a -r "$LKMM_DESTDIR/$litmusout" then @@ -46,6 +50,9 @@ fi if grep -q '^ \* Result: ' $litmus then outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` +elif test -n "$LKMM_HW_MAP_FILE" && grep -q '^Observation' $LKMM_DESTDIR/$lkmmout > /dev/null 2>&1 +then + outcome=`grep -m 1 '^Observation ' $LKMM_DESTDIR/$lkmmout | awk '{ print $3 }'` else outcome=specified fi From eeaa50faf94cb149f62282ad2569b4425c4b625d Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 20 Mar 2019 14:57:56 -0700 Subject: [PATCH 0109/1250] tools/memory-model: Split runlitmus.sh out of checklitmus.sh This commit prepares for adding --hw capability to github litmus-test scripts by splitting runlitmus.sh (which simply runs the verification) out of checklitmus.sh (which also judges the results). Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checklitmus.sh | 57 ++----------------- tools/memory-model/scripts/runlitmus.sh | 69 +++++++++++++++++++++++ 2 files changed, 73 insertions(+), 53 deletions(-) create mode 100755 tools/memory-model/scripts/runlitmus.sh diff --git a/tools/memory-model/scripts/checklitmus.sh b/tools/memory-model/scripts/checklitmus.sh index 42ff11869cd626..4c1d0cf0ddadcf 100755 --- a/tools/memory-model/scripts/checklitmus.sh +++ b/tools/memory-model/scripts/checklitmus.sh @@ -1,15 +1,8 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# Run a herd7 test and invokes judgelitmus.sh to check the result against -# a "Result:" comment within the litmus test. It also outputs verification -# results to a file whose name is that of the specified litmus test, but -# with ".out" appended. -# -# If the --hw argument is specified, this script translates the .litmus -# C-language file to the specified type of assembly and verifies that. -# But in this case, litmus tests using complex synchronization (such as -# locking, RCU, and SRCU) are cheerfully ignored. +# Invokes runlitmus.sh and judgelitmus.sh on its arguments to run the +# specified litmus test and pass judgment on the results. # # Usage: # checklitmus.sh file.litmus @@ -22,47 +15,5 @@ # # Author: Paul E. 
McKenney -litmus=$1 -if test -f "$litmus" -a -r "$litmus" -then - : -else - echo ' --- ' error: \"$litmus\" is not a readable file - exit 255 -fi - -if test -z "$LKMM_HW_MAP_FILE" -then - # LKMM run - herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} - echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out - /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 -else - # Hardware run - - T=/tmp/checklitmushw.sh.$$ - trap 'rm -rf $T' 0 2 - mkdir $T - - # Generate filenames - catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" - mapfile="Linux2${LKMM_HW_MAP_FILE}.map" - themefile="$T/${LKMM_HW_MAP_FILE}.theme" - herdoptions="-model $LKMM_HW_CAT_FILE" - hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` - hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` - - # Don't run on litmus tests with complex synchronization - if ! scripts/simpletest.sh $litmus - then - echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU - exit 254 - fi - - # Generate the assembly code and run herd7 on it. - gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile - jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out - /usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 -fi - -scripts/judgelitmus.sh $litmus +scripts/runlitmus.sh $1 +scripts/judgelitmus.sh $1 diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh new file mode 100755 index 00000000000000..91af859c0e90c1 --- /dev/null +++ b/tools/memory-model/scripts/runlitmus.sh @@ -0,0 +1,69 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Without the -hw argument, runs a herd7 test and outputs verification +# results to a file whose name is that of the specified litmus test, +# but with ".out" appended. 
+# +# If the --hw argument is specified, this script translates the .litmus +# C-language file to the specified type of assembly and verifies that. +# But in this case, litmus tests using complex synchronization (such as +# locking, RCU, and SRCU) are cheerfully ignored. +# +# Either way, return the status of the herd7 command. +# +# Usage: +# runlitmus.sh file.litmus +# +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. The caller is expected to have +# properly set up the LKMM environment variables. +# +# Copyright IBM Corporation, 2019 +# +# Author: Paul E. McKenney + +litmus=$1 +if test -f "$litmus" -a -r "$litmus" +then + : +else + echo ' --- ' error: \"$litmus\" is not a readable file + exit 255 +fi + +if test -z "$LKMM_HW_MAP_FILE" +then + # LKMM run + herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} + echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out + /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 +else + # Hardware run + + T=/tmp/checklitmushw.sh.$$ + trap 'rm -rf $T' 0 2 + mkdir $T + + # Generate filenames + catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" + mapfile="Linux2${LKMM_HW_MAP_FILE}.map" + themefile="$T/${LKMM_HW_MAP_FILE}.theme" + herdoptions="-model $LKMM_HW_CAT_FILE" + hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` + hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` + + # Don't run on litmus tests with complex synchronization + if ! scripts/simpletest.sh $litmus + then + echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU + exit 254 + fi + + # Generate the assembly code and run herd on it. + gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile + jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out + /usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +fi + +exit $? 
From 07fcb46d8e76e3f3abdd113a1bcafd39303cd443 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 20 Mar 2019 16:41:41 -0700 Subject: [PATCH 0110/1250] tools/memory-model: Make runlitmus.sh generate .litmus.out for --hw In the absence of "Result:" comments, the runlitmus.sh script relies on litmus.out files from prior LKMM runs. This can be a bit user-hostile, so this commit makes runlitmus.sh generate any needed .litmus.out files that don't already exist. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/runlitmus.sh | 54 ++++++++++++++----------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index 91af859c0e90c1..2865a9661b0789 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -28,42 +28,48 @@ if test -f "$litmus" -a -r "$litmus" then : else - echo ' --- ' error: \"$litmus\" is not a readable file + echo ' !!! ' error: \"$litmus\" is not a readable file exit 255 fi -if test -z "$LKMM_HW_MAP_FILE" +if test -z "$LKMM_HW_MAP_FILE" -o ! -e $LKMM_DESTDIR/$litmus.out then # LKMM run herdoptions=${LKMM_HERD_OPTIONS--conf linux-kernel.cfg} echo Herd options: $herdoptions > $LKMM_DESTDIR/$litmus.out /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $litmus >> $LKMM_DESTDIR/$litmus.out 2>&1 -else - # Hardware run + ret=$? 
+ if test -z "$LKMM_HW_MAP_FILE" + then + exit $ret + fi + echo " --- " Automatically generated LKMM output for '"'--hw $LKMM_HW_MAP_FILE'"' run +fi - T=/tmp/checklitmushw.sh.$$ - trap 'rm -rf $T' 0 2 - mkdir $T +# Hardware run - # Generate filenames - catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" - mapfile="Linux2${LKMM_HW_MAP_FILE}.map" - themefile="$T/${LKMM_HW_MAP_FILE}.theme" - herdoptions="-model $LKMM_HW_CAT_FILE" - hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` - hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` +T=/tmp/checklitmushw.sh.$$ +trap 'rm -rf $T' 0 2 +mkdir $T - # Don't run on litmus tests with complex synchronization - if ! scripts/simpletest.sh $litmus - then - echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU - exit 254 - fi +# Generate filenames +catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" +mapfile="Linux2${LKMM_HW_MAP_FILE}.map" +themefile="$T/${LKMM_HW_MAP_FILE}.theme" +herdoptions="-model $LKMM_HW_CAT_FILE" +hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` +hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` - # Generate the assembly code and run herd on it. - gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile - jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out - /usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +# Don't run on litmus tests with complex synchronization +if ! scripts/simpletest.sh $litmus +then + echo ' --- ' error: \"$litmus\" contains locking, RCU, or SRCU + exit 254 fi +# Generate the assembly code and run herd7 on it. +gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile +jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 + exit $? 
From b77cadc213400eb86d10c7faf02bb78b667372b2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 14:06:27 -0700 Subject: [PATCH 0111/1250] tools/memory-model: Move from .AArch64.litmus.out to .litmus.AArch.out When the github scripts see ".litmus.out", they assume that there must be a corresponding C-language ".litmus" file. Won't they be disappointed when they instead see nothing, or, worse yet, the corresponding assembly-language litmus test? This commit therefore swaps the hardware tag with the "litmus" to avoid this sort of disappointment. This commit also adjusts the .gitignore file so as to avoid adding these new ".out" files to git. [ paulmck: Apply Akira Yokosawa feedback. ] Signed-off-by: Paul E. McKenney --- tools/memory-model/litmus-tests/.gitignore | 2 +- tools/memory-model/scripts/judgelitmus.sh | 4 ++-- tools/memory-model/scripts/runlitmus.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/memory-model/litmus-tests/.gitignore b/tools/memory-model/litmus-tests/.gitignore index c492a1ddad91d5..d65462d64816d5 100644 --- a/tools/memory-model/litmus-tests/.gitignore +++ b/tools/memory-model/litmus-tests/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -*.litmus.out +*.out diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index fe9131f8eb969a..9abda72fe013cb 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -6,7 +6,7 @@ # test ran correctly. If the --hw argument is omitted, check against the # LKMM output, which is assumed to be in file.litmus.out. If this argument # is provided, this is assumed to be a hardware test, and the output is -# assumed to be in file.HW.litmus.out, where "HW" is the --hw argument. +# assumed to be in file.litmus.HW.out, where "HW" is the --hw argument. # In addition, non-Sometimes verification results will be noted, but # forgiven. 
Furthermore, if there is no "Result:" comment but there is # an LKMM .litmus.out file, the observation in that file will be used @@ -37,7 +37,7 @@ then lkmmout= else litmusout="`echo $litmus | - sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'`.out" + sed -e 's/\.litmus$/.litmus.'${LKMM_HW_MAP_FILE}'/'`.out" lkmmout=$litmus.out fi if test -f "$LKMM_DESTDIR/$litmusout" -a -r "$LKMM_DESTDIR/$litmusout" diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index 2865a9661b0789..c84124b32bee6c 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -57,7 +57,7 @@ catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" mapfile="Linux2${LKMM_HW_MAP_FILE}.map" themefile="$T/${LKMM_HW_MAP_FILE}.theme" herdoptions="-model $LKMM_HW_CAT_FILE" -hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.'${LKMM_HW_MAP_FILE}'.litmus/'` +hwlitmus=`echo $litmus | sed -e 's/\.litmus$/.litmus.'${LKMM_HW_MAP_FILE}'/'` hwlitmusfile=`echo $hwlitmus | sed -e 's,^.*/,,'` # Don't run on litmus tests with complex synchronization From 5febfdb3c2b550e189c8156527c3ab9ade9d59a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Mar 2019 14:44:09 -0700 Subject: [PATCH 0112/1250] tools/memory-model: Keep assembly-language litmus tests This commit retains the assembly-language litmus tests generated from the C-language litmus tests, appending the hardware tag to the original C-language litmus test's filename. Thus, S+poonceonces.litmus.AArch64 contains the Armv8 assembly language corresponding to the C-language S+poonceonces.litmus test. This commit also updates the .gitignore to avoid committing these automatically generated assembly-language litmus tests. Signed-off-by: Paul E. 
McKenney --- tools/memory-model/litmus-tests/.gitignore | 2 +- tools/memory-model/scripts/runlitmus.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/memory-model/litmus-tests/.gitignore b/tools/memory-model/litmus-tests/.gitignore index d65462d64816d5..19c379cf069d23 100644 --- a/tools/memory-model/litmus-tests/.gitignore +++ b/tools/memory-model/litmus-tests/.gitignore @@ -1,2 +1,2 @@ # SPDX-License-Identifier: GPL-2.0-only -*.out +*.litmus.* diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index c84124b32bee6c..62b47c7e1ba93d 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -69,7 +69,7 @@ fi # Generate the assembly code and run herd7 on it. gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile -jingle7 -theme $themefile $litmus > $T/$hwlitmusfile 2> $T/$hwlitmusfile.jingle7.out -/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $T/$hwlitmusfile > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +jingle7 -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 exit $? From a95a565cd763624a408615a5a0666c33d42abb63 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Mar 2019 08:57:20 -0700 Subject: [PATCH 0113/1250] tools/memory-model: Allow herd to deduce CPU type Currently, the scripts specify the CPU's .cat file to herd. But this is pointless because herd will select a good and sufficient .cat file from the assembly-language litmus test itself. This commit therefore removes the -model argument to herd, allowing herd to figure the CPU family out itself. Note that the user can override herd's choice using the "--herdopts" argument to the scripts. Suggested-by: Luc Maranget Signed-off-by: Paul E. 
McKenney --- tools/memory-model/scripts/runlitmus.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index 62b47c7e1ba93d..afb196d7ef1061 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -53,7 +53,6 @@ trap 'rm -rf $T' 0 2 mkdir $T # Generate filenames -catfile="`echo $LKMM_HW_MAP_FILE | tr '[A-Z]' '[a-z]'`.cat" mapfile="Linux2${LKMM_HW_MAP_FILE}.map" themefile="$T/${LKMM_HW_MAP_FILE}.theme" herdoptions="-model $LKMM_HW_CAT_FILE" @@ -70,6 +69,6 @@ fi # Generate the assembly code and run herd7 on it. gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile jingle7 -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out -/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -model $catfile $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 exit $? From 38f3c8f5204e215684ca46b6e27d990449107c1c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 25 Mar 2019 17:20:51 -0700 Subject: [PATCH 0114/1250] tools/memory-model: Make runlitmus.sh check for jingle errors It turns out that the jingle7 tool is currently a bit picky about the litmus tests it is willing to process. This commit therefore ensures that jingle7 failures are reported. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/runlitmus.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index afb196d7ef1061..5f2d29b460ff0c 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -69,6 +69,11 @@ fi # Generate the assembly code and run herd7 on it. 
gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile jingle7 -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out +if grep -q "Generated 0 tests" $T/$hwlitmusfile.jingle7.out +then + echo ' !!! ' jingle7 failed, no $hwlitmus generated + exit 253 +fi /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 exit $? From da024cabe61ae3d1b042f290deebe3e42b29a200 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Apr 2019 12:34:56 -0700 Subject: [PATCH 0115/1250] tools/memory-model: Add -v flag to jingle7 runs Adding the -v flag to jingle7 invocations gives much useful information on why jingle7 didn't like a given litmus test. This commit therefore adds this flag and saves off any such information into a .err file. Suggested-by: Luc Maranget Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/runlitmus.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index 5f2d29b460ff0c..dfdb1f00fcc033 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -68,10 +68,11 @@ fi # Generate the assembly code and run herd7 on it. gen_theme7 -n 10 -map $mapfile -call Linux.call > $themefile -jingle7 -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out +jingle7 -v -theme $themefile $litmus > $LKMM_DESTDIR/$hwlitmus 2> $T/$hwlitmusfile.jingle7.out if grep -q "Generated 0 tests" $T/$hwlitmusfile.jingle7.out then - echo ' !!! ' jingle7 failed, no $hwlitmus generated + echo ' !!! ' jingle7 failed, errors in $hwlitmus.err + cp $T/$hwlitmusfile.jingle7.out $LKMM_DESTDIR/$hwlitmus.err exit 253 fi /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 From 2046702ac1ac49f6b6552c58ff26db160b5ed3b9 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 22 Mar 2019 17:18:43 -0700 Subject: [PATCH 0116/1250] tools/memory-model: Implement --hw support for checkghlitmus.sh This commit enables the "--hw" argument for the checkghlitmus.sh script, causing it to convert any applicable C-language litmus tests to the specified flavor of assembly language, to verify these assembly-language litmus tests, and to check compatibility of the outcomes. Note that the conversion does not yet handle locking, RCU, SRCU, plain C-language memory accesses, or casts. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkghlitmus.sh | 9 ++++--- tools/memory-model/scripts/hwfnseg.sh | 20 +++++++++++++++ tools/memory-model/scripts/runlitmushist.sh | 27 +++++++++++++-------- 3 files changed, 42 insertions(+), 14 deletions(-) create mode 100755 tools/memory-model/scripts/hwfnseg.sh diff --git a/tools/memory-model/scripts/checkghlitmus.sh b/tools/memory-model/scripts/checkghlitmus.sh index 6589fbb6f65383..2ea220d2564b93 100755 --- a/tools/memory-model/scripts/checkghlitmus.sh +++ b/tools/memory-model/scripts/checkghlitmus.sh @@ -10,6 +10,7 @@ # parseargs.sh scripts for arguments. . scripts/parseargs.sh +. scripts/hwfnseg.sh T=/tmp/checkghlitmus.sh.$$ trap 'rm -rf $T' 0 @@ -32,9 +33,9 @@ then ( cd "$LKMM_DESTDIR"; sed -e 's/^/mkdir -p /' | sh ) fi -# Create a list of the C-language litmus tests previously run. -( cd $LKMM_DESTDIR; find litmus -name '*.litmus.out' -print ) | - sed -e 's/\.out$//' | +# Create a list of the specified litmus tests previously run.
+( cd $LKMM_DESTDIR; find litmus -name "*.litmus${hwfnseg}.out" -print ) | + sed -e "s/${hwfnseg}"'\.out$//' | xargs -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' | xargs -r grep -L "^P${LKMM_PROCS}"> $T/list-C-already @@ -44,7 +45,7 @@ find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C xargs < $T/list-C -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' > $T/list-C-result xargs < $T/list-C-result -r grep -L "^P${LKMM_PROCS}" > $T/list-C-result-short -# Form list of tests without corresponding .litmus.out files +# Form list of tests without corresponding .out files sort $T/list-C-already $T/list-C-result-short | uniq -u > $T/list-C-needed # Run any needed tests. diff --git a/tools/memory-model/scripts/hwfnseg.sh b/tools/memory-model/scripts/hwfnseg.sh new file mode 100755 index 00000000000000..580c3281181c54 --- /dev/null +++ b/tools/memory-model/scripts/hwfnseg.sh @@ -0,0 +1,20 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Generate the hardware extension to the litmus-test filename, or the +# empty string if this is an LKMM run. The extension is placed in +# the shell variable hwfnseg. +# +# Usage: +# . hwfnseg.sh +# +# Copyright IBM Corporation, 2019 +# +# Author: Paul E. McKenney + +if test -z "$LKMM_HW_MAP_FILE" +then + hwfnseg= +else + hwfnseg=".$LKMM_HW_MAP_FILE" +fi diff --git a/tools/memory-model/scripts/runlitmushist.sh b/tools/memory-model/scripts/runlitmushist.sh index 852786fef179f7..c6c2bdc67a5021 100755 --- a/tools/memory-model/scripts/runlitmushist.sh +++ b/tools/memory-model/scripts/runlitmushist.sh @@ -15,6 +15,8 @@ # # Author: Paul E. McKenney +. scripts/hwfnseg.sh + T=/tmp/runlitmushist.sh.$$ trap 'rm -rf $T' 0 mkdir $T @@ -30,15 +32,12 @@ fi # Prefixes for per-CPU scripts for ((i=0;i<$LKMM_JOBS;i++)) do - echo dir="$LKMM_DESTDIR" > $T/$i.sh echo T=$T >> $T/$i.sh - echo herdoptions=\"$LKMM_HERD_OPTIONS\" >> $T/$i.sh cat << '___EOF___' >> $T/$i.sh runtest () { - echo ' ... 
' /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 '>' $dir/$1.out '2>&1' - if /usr/bin/time $LKMM_TIMEOUT_CMD herd7 $herdoptions $1 > $dir/$1.out 2>&1 + if scripts/runlitmus.sh $1 then - if ! grep -q '^Observation ' $dir/$1.out + if ! grep -q '^Observation ' $LKMM_DESTDIR/$1$2.out then echo ' !!! Herd failed, no Observation:' $1 fi @@ -47,10 +46,16 @@ do if test "$exitcode" -eq 124 then exitmsg="timed out" + elif test "$exitcode" -eq 253 + then + exitmsg= else exitmsg="failed, exit code $exitcode" fi - echo ' !!! Herd' ${exitmsg}: $1 + if test -n "$exitmsg" + then + echo ' !!! Herd' ${exitmsg}: $1 + fi fi } ___EOF___ @@ -59,11 +64,13 @@ done awk -v q="'" -v b='\\' ' { print "echo `grep " q "^P[0-9]" b "+(" q " " $0 " | tail -1 | sed -e " q "s/^P" b "([0-9]" b "+" b ")(.*$/" b "1/" q "` " $0 -}' | bash | -sort -k1n | -awk -v ncpu=$LKMM_JOBS -v t=$T ' +}' | sh | sort -k1n | +awk -v dq='"' -v hwfnseg="$hwfnseg" -v ncpu="$LKMM_JOBS" -v t="$T" ' { - print "runtest " $2 >> t "/" NR % ncpu ".sh"; + print "if test -z " dq hwfnseg dq " || scripts/simpletest.sh " dq $2 dq + print "then" + print "\techo runtest " dq $2 dq " " hwfnseg " >> " t "/" NR % ncpu ".sh"; + print "fi" } END { From 2915afd21ae5d816838f36cd699bb612c8fa7590 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Mar 2019 11:47:14 -0700 Subject: [PATCH 0117/1250] tools/memory-model: Fix scripting --jobs argument The parseargs.sh regular expression for the --jobs argument incorrectly requires that the number of jobs be at least 10, that is, have at least two digits. This commit therefore adjusts this regular expression to allow single-digit numbers of jobs to be specified. Signed-off-by: Paul E. 
McKenney --- tools/memory-model/scripts/parseargs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 5f016fc3f3af5d..25a81ac0dfdf46 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -113,7 +113,7 @@ do LKMM_JOBS="`echo $njobs | sed -e 's/^\([0-9]\+\).*$/\1/'`" ;; --jobs|--job|-j) - checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]\+$' '^--' + checkarg --jobs "(number)" "$#" "$2" '^[1-9][0-9]*$' '^--' LKMM_JOBS="$2" shift ;; From 4208482741f82deff83eca3a07b807375825e3d0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Apr 2019 09:27:28 -0700 Subject: [PATCH 0118/1250] tools/memory-model: Make checkghlitmus.sh use mselect7 The checkghlitmus.sh script currently uses grep to ignore non-C-language litmus tests, which is a bit fragile. This commit therefore enlists the aid of "mselect7 -arch C", given Luc Maranget's recent modifications that allow mselect7 to operate in filter mode. This change requires herdtools 7.52-32-g1da3e0e50977 or later. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/checkghlitmus.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/checkghlitmus.sh b/tools/memory-model/scripts/checkghlitmus.sh index 2ea220d2564b93..cedd0290b73f8d 100755 --- a/tools/memory-model/scripts/checkghlitmus.sh +++ b/tools/memory-model/scripts/checkghlitmus.sh @@ -41,7 +41,7 @@ fi # Create a list of C-language litmus tests with "Result:" commands and # no more than the specified number of processes.
-find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C +find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C xargs < $T/list-C -r egrep -l '^ \* Result: (Never|Sometimes|Always|DEADLOCK)' > $T/list-C-result xargs < $T/list-C-result -r grep -L "^P${LKMM_PROCS}" > $T/list-C-result-short From 22467ba0a44f8ea8dad70b8a5f091f30af398dee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 8 Apr 2019 10:02:23 -0700 Subject: [PATCH 0119/1250] tools/memory-model: Make history-check scripts use mselect7 The history-check scripts currently use grep to ignore non-C-language litmus tests, which is a bit fragile. This commit therefore enlists the aid of "mselect7 -arch C", given Luc Maranget's recent modifications that allow mselect7 to operate in filter mode. This change requires herdtools 7.52-32-g1da3e0e50977 or later. Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/initlitmushist.sh | 2 +- tools/memory-model/scripts/newlitmushist.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/memory-model/scripts/initlitmushist.sh b/tools/memory-model/scripts/initlitmushist.sh index 956b6957484d87..31ea782955d3f0 100755 --- a/tools/memory-model/scripts/initlitmushist.sh +++ b/tools/memory-model/scripts/initlitmushist.sh @@ -60,7 +60,7 @@ fi # Create a list of the C-language litmus tests with no more than the # specified number of processes (per the --procs argument).
-find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C +find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C xargs < $T/list-C -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short scripts/runlitmushist.sh < $T/list-C-short diff --git a/tools/memory-model/scripts/newlitmushist.sh b/tools/memory-model/scripts/newlitmushist.sh index 3f4b06e299886c..25235e2049cf05 100755 --- a/tools/memory-model/scripts/newlitmushist.sh +++ b/tools/memory-model/scripts/newlitmushist.sh @@ -43,7 +43,7 @@ fi # Form full list of litmus tests with no more than the specified # number of processes (per the --procs argument). -find litmus -name '*.litmus' -exec grep -l -m 1 "^C " {} \; > $T/list-C-all +find litmus -name '*.litmus' -print | mselect7 -arch C > $T/list-C-all xargs < $T/list-C-all -r grep -L "^P${LKMM_PROCS}" > $T/list-C-short # Form list of new tests. Note: This does not handle litmus-test deletion! From 45c841e305003777c3f527f1572d3d0e1c1f1f95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 May 2019 09:51:57 -0700 Subject: [PATCH 0120/1250] tools/memory-model: Add "--" to parseargs.sh for additional arguments Currently, parseargs.sh expects to consume all the command-line arguments, which prevents the calling script from having any of its own arguments. This commit therefore causes parseargs.sh to stop consuming arguments when it encounters a "--" argument, leaving any remaining arguments for the calling script. Signed-off-by: Paul E. 
McKenney --- tools/memory-model/scripts/parseargs.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 25a81ac0dfdf46..7aa58755adfc01 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -83,7 +83,7 @@ do echo "Cannot create directory --destdir '$LKMM_DESTDIR'" usage fi - if test -d "$LKMM_DESTDIR" -a -w "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR" + if test -d "$LKMM_DESTDIR" -a -x "$LKMM_DESTDIR" then : else @@ -127,6 +127,10 @@ do LKMM_TIMEOUT="$2" shift ;; + --) + shift + break + ;; *) echo Unknown argument $1 usage From edfb0a41ac6e60eb5d7c33fe1bf858f67855fa82 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 May 2019 10:03:29 -0700 Subject: [PATCH 0121/1250] tools/memory-model: Repair parseargs.sh header comment Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/parseargs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/parseargs.sh b/tools/memory-model/scripts/parseargs.sh index 7aa58755adfc01..08ded59098607d 100755 --- a/tools/memory-model/scripts/parseargs.sh +++ b/tools/memory-model/scripts/parseargs.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0+ # -# the corresponding .litmus.out file, and does not judge the result. +# Parse arguments common to the various scripts. # # . scripts/parseargs.sh # From f3d1f9769848f9f152631270efd3599f5c461e92 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 May 2019 10:05:14 -0700 Subject: [PATCH 0122/1250] tools/memory-model: Add checktheselitmus.sh to run specified litmus tests This commit adds a checktheselitmus.sh script that runs the litmus tests specified on the command line. This is useful for verifying fixes to specific litmus tests. Signed-off-by: Paul E. 
McKenney --- tools/memory-model/scripts/README | 8 ++++ .../memory-model/scripts/checktheselitmus.sh | 43 +++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100755 tools/memory-model/scripts/checktheselitmus.sh diff --git a/tools/memory-model/scripts/README b/tools/memory-model/scripts/README index 0e29a52044c1af..cc2c4e5be9ec16 100644 --- a/tools/memory-model/scripts/README +++ b/tools/memory-model/scripts/README @@ -27,6 +27,14 @@ checklitmushist.sh checklitmus.sh Check a single litmus test against its "Result:" expected result. + Not intended to for manual use. + +checktheselitmus.sh + + Check the specified list of litmus tests against their "Result:" + expected results. This takes optional parseargs.sh arguments, + followed by "--" followed by pathnames starting from the current + directory. cmplitmushist.sh diff --git a/tools/memory-model/scripts/checktheselitmus.sh b/tools/memory-model/scripts/checktheselitmus.sh new file mode 100755 index 00000000000000..10eeb5ecea6de6 --- /dev/null +++ b/tools/memory-model/scripts/checktheselitmus.sh @@ -0,0 +1,43 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0+ +# +# Invokes checklitmus.sh on its arguments to run the specified litmus +# test and pass judgment on the results. +# +# Usage: +# checktheselitmus.sh -- [ file1.litmus [ file2.litmus ... ] ] +# +# Run this in the directory containing the memory model, specifying the +# pathname of the litmus test to check. The usual parseargs.sh arguments +# can be specified prior to the "--". +# +# This script is intended for use with pathnames that start from the +# tools/memory-model directory. If some of the pathnames instead start at +# the root directory, they all must do so and the "--destdir /" parseargs.sh +# argument must be specified prior to the "--". Alternatively, some other +# "--destdir" argument can be supplied as long as the needed subdirectories +# are populated. +# +# Copyright IBM Corporation, 2018 +# +# Author: Paul E. McKenney + +. 
scripts/parseargs.sh + +ret=0 +for i in "$@" +do + if scripts/checklitmus.sh $i + then + : + else + ret=1 + fi +done +if test "$ret" -ne 0 +then + echo " ^^^ VERIFICATION MISMATCHES" 1>&2 +else + echo All litmus tests verified as was expected. 1>&2 +fi +exit $ret From 18bc38d07be9b1daf30e0adccb8c225498b90ff8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 3 May 2019 07:34:20 -0700 Subject: [PATCH 0123/1250] tools/memory-model: Add data-race capabilities to judgelitmus.sh This commit adds functionality to judgelitmus.sh to allow it to handle both the "DATARACE" markers in the "Result:" comments in litmus tests and the "Flag data-race" markers in LKMM output. For C-language tests, if either marker is present, the other must also be as well, at least for litmus tests having a "Result:" comment. If the LKMM output indicates a data race, then failures of the Always/Sometimes/Never portion of the "Result:" prediction are forgiven. The reason for forgiving "Result:" mispredictions is that data races can result in "interesting" compiler optimizations, so that all bets are off in the data-race case. [ paulmck: Apply Akira Yokosawa feedback. ] Signed-off-by: Paul E. McKenney --- tools/memory-model/scripts/judgelitmus.sh | 40 ++++++++++++++++++----- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 9abda72fe013cb..2700481d20f016 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -4,13 +4,19 @@ # Given a .litmus test and the corresponding litmus output file, check # the .litmus.out file against the "Result:" comment to judge whether the # test ran correctly. If the --hw argument is omitted, check against the -# LKMM output, which is assumed to be in file.litmus.out. 
If this argument -# is provided, this is assumed to be a hardware test, and the output is -# assumed to be in file.litmus.HW.out, where "HW" is the --hw argument. -# In addition, non-Sometimes verification results will be noted, but -# forgiven. Furthermore, if there is no "Result:" comment but there is -# an LKMM .litmus.out file, the observation in that file will be used -# to judge the assembly-language verification. +# LKMM output, which is assumed to be in file.litmus.out. If either a +# "DATARACE" marker in the "Result:" comment or a "Flag data-race" marker +# in the LKMM output is present, the other must also be as well, at least +# for litmus tests having a "Result:" comment. In this case, a failure of +# the Always/Sometimes/Never portion of the "Result:" prediction will be +# noted, but forgiven. +# +# If the --hw argument is provided, this is assumed to be a hardware +# test, and the output is assumed to be in file.litmus.HW.out, where +# "HW" is the --hw argument. In addition, non-Sometimes verification +# results will be noted, but forgiven. Furthermore, if there is no +# "Result:" comment but there is an LKMM .litmus.out file, the observation +# in that file will be used to judge the assembly-language verification. # # Usage: # judgelitmus.sh file.litmus @@ -47,9 +53,27 @@ else echo ' --- ' error: \"$LKMM_DESTDIR/$litmusout is not a readable file exit 255 fi +if grep -q '^Flag data-race$' "$LKMM_DESTDIR/$litmusout" +then + datarace_modeled=1 +fi if grep -q '^ \* Result: ' $litmus then outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` + if grep -m1 '^ \* Result: .* DATARACE' $litmus + then + datarace_predicted=1 + fi + if test -n "$datarace_predicted" -a -z "$datarace_modeled" -a -z "$LKMM_HW_MAP_FILE" + then + echo '!!! Predicted data race not modeled' $litmus + exit 252 + elif test -z "$datarace_predicted" -a -n "$datarace_modeled" + then + # Note that hardware models currently don't model data races + echo '!!! 
Unexpected data race modeled' $litmus + exit 253 + fi elif test -n "$LKMM_HW_MAP_FILE" && grep -q '^Observation' $LKMM_DESTDIR/$lkmmout > /dev/null 2>&1 then outcome=`grep -m 1 '^Observation ' $LKMM_DESTDIR/$lkmmout | awk '{ print $3 }'` @@ -114,7 +138,7 @@ elif grep '^Observation' $LKMM_DESTDIR/$litmusout | grep -q $outcome || test "$o then ret=0 else - if test -n "$LKMM_HW_MAP_FILE" -a "$outcome" = Sometimes + if test \( -n "$LKMM_HW_MAP_FILE" -a "$outcome" = Sometimes \) -o -n "$datarace_modeled" then flag="--- Forgiven" ret=0 From f98f4fc388579b0ed0b53c6b00b5313ee0693fd7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 Jun 2019 02:13:27 -0700 Subject: [PATCH 0124/1250] tools/memory-model: Make judgelitmus.sh handle scripted Result: tag The scripts that generate the litmus tests in the "auto" directory of the https://github.com/paulmckrcu/litmus archive place the "Result:" tag into a single-line ocaml comment, which judgelitmus.sh currently does not recognize. This commit therefore makes judgelitmus.sh recognize both the multiline comment format that it currently does and the automatically generated single-line format. Signed-off-by: Paul E. 
McKenney --- tools/memory-model/scripts/judgelitmus.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/memory-model/scripts/judgelitmus.sh b/tools/memory-model/scripts/judgelitmus.sh index 2700481d20f016..1ec5d89fcfbb2a 100755 --- a/tools/memory-model/scripts/judgelitmus.sh +++ b/tools/memory-model/scripts/judgelitmus.sh @@ -57,10 +57,10 @@ if grep -q '^Flag data-race$' "$LKMM_DESTDIR/$litmusout" then datarace_modeled=1 fi -if grep -q '^ \* Result: ' $litmus +if grep -q '^[( ]\* Result: ' $litmus then - outcome=`grep -m 1 '^ \* Result: ' $litmus | awk '{ print $3 }'` - if grep -m1 '^ \* Result: .* DATARACE' $litmus + outcome=`grep -m 1 '^[( ]\* Result: ' $litmus | awk '{ print $3 }'` + if grep -m1 '^[( ]\* Result: .* DATARACE' $litmus then datarace_predicted=1 fi From ae0c3e94367ea8b58a842d49278a93d9ca7322b0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 24 Jun 2019 22:30:32 -0700 Subject: [PATCH 0125/1250] tools/memory-model: Use "-unroll 0" to keep --hw runs finite Litmus tests involving atomic operations produce LL/SC loops on a number of architectures, and unrolling these loops can result in excessive verification times or even stack overflows. This commit therefore uses the "-unroll 0" herd7 argument to avoid unrolling, on the grounds that additional passes through an LL/SC loop should not change the verification. Note however, that certain bugs in the mapping of the LL/SC loop to machine instructions may go undetected. On the other hand, herd7 might not be the best vehicle for finding such bugs in any case. (You do stress-test your architecture-specific code, don't you?) Suggested-by: Luc Maranget Signed-off-by: Paul E. 
McKenney --- tools/memory-model/scripts/runlitmus.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/memory-model/scripts/runlitmus.sh b/tools/memory-model/scripts/runlitmus.sh index dfdb1f00fcc033..94608d4b6502e6 100755 --- a/tools/memory-model/scripts/runlitmus.sh +++ b/tools/memory-model/scripts/runlitmus.sh @@ -75,6 +75,6 @@ then cp $T/$hwlitmusfile.jingle7.out $LKMM_DESTDIR/$hwlitmus.err exit 253 fi -/usr/bin/time $LKMM_TIMEOUT_CMD herd7 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 +/usr/bin/time $LKMM_TIMEOUT_CMD herd7 -unroll 0 $LKMM_DESTDIR/$hwlitmus > $LKMM_DESTDIR/$hwlitmus.out 2>&1 exit $? From c8490f375393f076dbd99404fa7e606a1d7fec0e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 24 May 2022 13:02:45 -0700 Subject: [PATCH 0126/1250] Bluetooth: eir: Fix using strlen with hdev->{dev_name,short_name} Neither dev_name nor short_name is guaranteed to be NUL-terminated, so this commit uses strnlen instead and then determines whether the resulting string needs to be truncated. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216018 Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/eir.c | 41 ++++++++++++++++++++++++++--------------- net/bluetooth/mgmt.c | 4 ++-- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c index 7d77fb00c2bf39..776d27f7e18d5f 100644 --- a/net/bluetooth/eir.c +++ b/net/bluetooth/eir.c @@ -13,6 +13,20 @@ #define PNP_INFO_SVCLASS_ID 0x1200 +static u8 eir_append_name(u8 *eir, u16 eir_len, u8 type, u8 *data, u8 data_len) +{ + u8 name[HCI_MAX_SHORT_NAME_LENGTH + 1]; + + /* If data is already NULL terminated just pass it directly */ + if (data[data_len - 1] == '\0') + return eir_append_data(eir, eir_len, type, data, data_len); + + memcpy(name, data, HCI_MAX_SHORT_NAME_LENGTH); + name[HCI_MAX_SHORT_NAME_LENGTH] = '\0'; + + return eir_append_data(eir, eir_len, type, name, sizeof(name)); +} + u8 eir_append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) { size_t short_len; @@ -23,29 +37,26 @@ u8 eir_append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) return ad_len; /* use complete name if present and fits */ - complete_len = strlen(hdev->dev_name); + complete_len = strnlen(hdev->dev_name, sizeof(hdev->dev_name)); if (complete_len && complete_len <= HCI_MAX_SHORT_NAME_LENGTH) - return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE, + return eir_append_name(ptr, ad_len, EIR_NAME_COMPLETE, hdev->dev_name, complete_len + 1); /* use short name if present */ - short_len = strlen(hdev->short_name); + short_len = strnlen(hdev->short_name, sizeof(hdev->short_name)); if (short_len) - return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, - hdev->short_name, short_len + 1); + return eir_append_name(ptr, ad_len, EIR_NAME_SHORT, + hdev->short_name, + short_len == HCI_MAX_SHORT_NAME_LENGTH ? 
+ short_len : short_len + 1); /* use shortened full name if present, we already know that name * is longer then HCI_MAX_SHORT_NAME_LENGTH */ - if (complete_len) { - u8 name[HCI_MAX_SHORT_NAME_LENGTH + 1]; - - memcpy(name, hdev->dev_name, HCI_MAX_SHORT_NAME_LENGTH); - name[HCI_MAX_SHORT_NAME_LENGTH] = '\0'; - - return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, name, - sizeof(name)); - } + if (complete_len) + return eir_append_name(ptr, ad_len, EIR_NAME_SHORT, + hdev->dev_name, + HCI_MAX_SHORT_NAME_LENGTH); return ad_len; } @@ -181,7 +192,7 @@ void eir_create(struct hci_dev *hdev, u8 *data) u8 *ptr = data; size_t name_len; - name_len = strlen(hdev->dev_name); + name_len = strnlen(hdev->dev_name, sizeof(hdev->dev_name)); if (name_len > 0) { /* EIR Data type */ diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 74937a8346488d..325695bf2fa972 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1082,11 +1082,11 @@ static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir) eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE, hdev->appearance); - name_len = strlen(hdev->dev_name); + name_len = strnlen(hdev->dev_name, sizeof(hdev->dev_name)); eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE, hdev->dev_name, name_len); - name_len = strlen(hdev->short_name); + name_len = strnlen(hdev->short_name, sizeof(hdev->short_name)); eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT, hdev->short_name, name_len); From 221a7a1d32ed0cadf2a0dc25e3400bd09c0d52b5 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 8 Jun 2022 15:00:01 -0700 Subject: [PATCH 0127/1250] Bluetooth: HCI: Fix not always setting Scan Response/Advertising Data The scan response and advertising data need to be tracked on a per-instance (adv_info) basis, since when these instances are removed so is their data. To fix that, new flags are introduced which are used to mark when the data changes and are then checked to confirm when the data needs to be synced with the 
controller. Tested-by: Tedd Ho-Jeong An Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 11 ++++++ net/bluetooth/hci_core.c | 42 ++++++++++---------- net/bluetooth/hci_sync.c | 66 ++++++++++++++++++++++---------- 3 files changed, 76 insertions(+), 43 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 5b92a9abe14116..15237ee5f76164 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -246,8 +246,10 @@ struct adv_info { __u16 duration; __u16 adv_data_len; __u8 adv_data[HCI_MAX_EXT_AD_LENGTH]; + bool adv_data_changed; __u16 scan_rsp_len; __u8 scan_rsp_data[HCI_MAX_EXT_AD_LENGTH]; + bool scan_rsp_changed; __s8 tx_power; __u32 min_interval; __u32 max_interval; @@ -261,6 +263,15 @@ struct adv_info { #define HCI_ADV_TX_POWER_NO_PREFERENCE 0x7F +#define DATA_CMP(_d1, _l1, _d2, _l2) \ + (_l1 == _l2 ? memcmp(_d1, _d2, _l1) : _l1 - _l2) + +#define ADV_DATA_CMP(_adv, _data, _len) \ + DATA_CMP((_adv)->adv_data, (_adv)->adv_data_len, _data, _len) + +#define SCAN_RSP_CMP(_adv, _data, _len) \ + DATA_CMP((_adv)->scan_rsp_data, (_adv)->scan_rsp_len, _data, _len) + struct monitored_device { struct list_head list; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 6faae50d933dcc..05c13f639b9475 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1727,18 +1727,12 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, } adv_instance->flags = flags; - adv_instance->adv_data_len = adv_data_len; - adv_instance->scan_rsp_len = scan_rsp_len; adv_instance->min_interval = min_interval; adv_instance->max_interval = max_interval; adv_instance->tx_power = tx_power; - if (adv_data_len) - memcpy(adv_instance->adv_data, adv_data, adv_data_len); - - if (scan_rsp_len) - memcpy(adv_instance->scan_rsp_data, - scan_rsp_data, scan_rsp_len); + hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data, + scan_rsp_len, 
scan_rsp_data); adv_instance->timeout = timeout; adv_instance->remaining_time = timeout; @@ -1761,29 +1755,33 @@ int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data) { - struct adv_info *adv_instance; + struct adv_info *adv; - adv_instance = hci_find_adv_instance(hdev, instance); + adv = hci_find_adv_instance(hdev, instance); /* If advertisement doesn't exist, we can't modify its data */ - if (!adv_instance) + if (!adv) return -ENOENT; - if (adv_data_len) { - memset(adv_instance->adv_data, 0, - sizeof(adv_instance->adv_data)); - memcpy(adv_instance->adv_data, adv_data, adv_data_len); - adv_instance->adv_data_len = adv_data_len; + if (adv_data_len && ADV_DATA_CMP(adv, adv_data, adv_data_len)) { + memset(adv->adv_data, 0, sizeof(adv->adv_data)); + memcpy(adv->adv_data, adv_data, adv_data_len); + adv->adv_data_len = adv_data_len; + adv->adv_data_changed = true; } - if (scan_rsp_len) { - memset(adv_instance->scan_rsp_data, 0, - sizeof(adv_instance->scan_rsp_data)); - memcpy(adv_instance->scan_rsp_data, - scan_rsp_data, scan_rsp_len); - adv_instance->scan_rsp_len = scan_rsp_len; + if (scan_rsp_len && SCAN_RSP_CMP(adv, scan_rsp_data, scan_rsp_len)) { + memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); + memcpy(adv->scan_rsp_data, scan_rsp_data, scan_rsp_len); + adv->scan_rsp_len = scan_rsp_len; + adv->scan_rsp_changed = true; } + /* Mark as changed if there are flags which would affect it */ + if (((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) || + adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) + adv->scan_rsp_changed = true; + return 0; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 4d2203c5f1bb41..e5602e209b637c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -849,26 +849,38 @@ static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) u8 data[HCI_MAX_EXT_AD_LENGTH]; } pdu; u8 len; + struct adv_info *adv = 
NULL; + int err; memset(&pdu, 0, sizeof(pdu)); - len = eir_create_scan_rsp(hdev, instance, pdu.data); - - if (hdev->scan_rsp_data_len == len && - !memcmp(pdu.data, hdev->scan_rsp_data, len)) - return 0; + if (instance) { + adv = hci_find_adv_instance(hdev, instance); + if (!adv || !adv->scan_rsp_changed) + return 0; + } - memcpy(hdev->scan_rsp_data, pdu.data, len); - hdev->scan_rsp_data_len = len; + len = eir_create_scan_rsp(hdev, instance, pdu.data); pdu.cp.handle = instance; pdu.cp.length = len; pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, - sizeof(pdu.cp) + len, &pdu.cp, - HCI_CMD_TIMEOUT); + err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, + sizeof(pdu.cp) + len, &pdu.cp, + HCI_CMD_TIMEOUT); + if (err) + return err; + + if (adv) { + adv->scan_rsp_changed = false; + } else { + memcpy(hdev->scan_rsp_data, pdu.data, len); + hdev->scan_rsp_data_len = len; + } + + return 0; } static int __hci_set_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance) @@ -1119,27 +1131,39 @@ static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance) u8 data[HCI_MAX_EXT_AD_LENGTH]; } pdu; u8 len; + struct adv_info *adv = NULL; + int err; memset(&pdu, 0, sizeof(pdu)); - len = eir_create_adv_data(hdev, instance, pdu.data); - - /* There's nothing to do if the data hasn't changed */ - if (hdev->adv_data_len == len && - memcmp(pdu.data, hdev->adv_data, len) == 0) - return 0; + if (instance) { + adv = hci_find_adv_instance(hdev, instance); + if (!adv || !adv->adv_data_changed) + return 0; + } - memcpy(hdev->adv_data, pdu.data, len); - hdev->adv_data_len = len; + len = eir_create_adv_data(hdev, instance, pdu.data); pdu.cp.length = len; pdu.cp.handle = instance; pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA, - sizeof(pdu.cp) + len, &pdu.cp, 
- HCI_CMD_TIMEOUT); + err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA, + sizeof(pdu.cp) + len, &pdu.cp, + HCI_CMD_TIMEOUT); + if (err) + return err; + + /* Update data if the command succeed */ + if (adv) { + adv->adv_data_changed = false; + } else { + memcpy(hdev->adv_data, pdu.data, len); + hdev->adv_data_len = len; + } + + return 0; } static int hci_set_adv_data_sync(struct hci_dev *hdev, u8 instance) From d9cc9d78ca85210132c7a7cbe75975db16464390 Mon Sep 17 00:00:00 2001 From: Zhengping Jiang Date: Mon, 13 Jun 2022 14:43:27 -0700 Subject: [PATCH 0128/1250] Bluetooth: mgmt: Fix refresh cached connection info Set the connection data before calling get_conn_info_sync, so it can be verified the connection is still connected, before refreshing cached values. Fixes: 47db6b42991e6 ("Bluetooth: hci_sync: Convert MGMT_OP_GET_CONN_INFO") Signed-off-by: Zhengping Jiang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 325695bf2fa972..ef8371975c4ebc 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6828,11 +6828,14 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, cmd = mgmt_pending_new(sk, MGMT_OP_GET_CONN_INFO, hdev, data, len); - if (!cmd) + if (!cmd) { err = -ENOMEM; - else + } else { + hci_conn_hold(conn); + cmd->user_data = hci_conn_get(conn); err = hci_cmd_sync_queue(hdev, get_conn_info_sync, cmd, get_conn_info_complete); + } if (err < 0) { mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO, @@ -6844,9 +6847,6 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - hci_conn_hold(conn); - cmd->user_data = hci_conn_get(conn); - conn->conn_info_timestamp = jiffies; } else { /* Cache is valid, just reply with values cached in hci_conn */ From 052404a72040dd15a6cb92a16122e6512d6c6c13 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 13 Apr 2022 17:46:15 -0700 Subject: [PATCH 0129/1250] rcu: Switch polled grace-period APIs to ->gp_seq_polled This commit switches the existing polled grace-period APIs to use a new ->gp_seq_polled counter in the rcu_state structure. An additional ->gp_seq_polled_snap counter in that same structure allows the normal grace period kthread to interact properly with the !SMP !PREEMPT fastpath through synchronize_rcu(). The first of the two to note the end of a given grace period will make knowledge of this transition available to the polled API. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/tree.h | 2 ++ 2 files changed, 89 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 46cfceea878477..bb50fdd93bee59 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1775,6 +1775,78 @@ static void rcu_strict_gp_boundary(void *unused) invoke_rcu_core(); } +// Has rcu_init() been invoked? This is used (for example) to determine +// whether spinlocks may be acquired safely. +static bool rcu_init_invoked(void) +{ + return !!rcu_state.n_online_cpus; +} + +// Make the polled API aware of the beginning of a grace period. +static void rcu_poll_gp_seq_start(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If RCU was idle, note beginning of GP. + if (!rcu_seq_state(rcu_state.gp_seq_polled)) + rcu_seq_start(&rcu_state.gp_seq_polled); + + // Either way, record current state. 
+ *snap = rcu_state.gp_seq_polled; +} + +// Make the polled API aware of the end of a grace period. +static void rcu_poll_gp_seq_end(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) + raw_lockdep_assert_held_rcu_node(rnp); + + // If the previously noted GP is still in effect, record the + // end of that GP. Either way, zero counter to avoid counter-wrap + // problems. + if (*snap && *snap == rcu_state.gp_seq_polled) { + rcu_seq_end(&rcu_state.gp_seq_polled); + rcu_state.gp_seq_polled_snap = 0; + } else { + *snap = 0; + } +} + +// Make the polled API aware of the beginning of a grace period, but +// where caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_start(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + +// Make the polled API aware of the end of a grace period, but where +// caller does not hold the root rcu_node structure's lock. +static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) +{ + struct rcu_node *rnp = rcu_get_root(); + + if (rcu_init_invoked()) { + lockdep_assert_irqs_enabled(); + raw_spin_lock_irq_rcu_node(rnp); + } + rcu_poll_gp_seq_end(snap); + if (rcu_init_invoked()) + raw_spin_unlock_irq_rcu_node(rnp); +} + /* * Initialize a new grace period. Return false if no grace period required. 
*/ @@ -1810,6 +1882,7 @@ static noinline_for_stack bool rcu_gp_init(void) rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); + rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -2069,6 +2142,7 @@ static noinline void rcu_gp_cleanup(void) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ + rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); /* @@ -3837,8 +3911,18 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); - if (rcu_blocking_is_gp()) + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs + // at process level. Therefore, this GP overlaps with other + // GPs only by being fully nested within them, which allows + // reuse of ->gp_seq_polled_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); + if (rcu_init_invoked()) + cond_resched_tasks_rcu_qs(); return; // Context allows vacuous grace periods. + } if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else @@ -3860,7 +3944,7 @@ unsigned long get_state_synchronize_rcu(void) * before the load from ->gp_seq. 
*/ smp_mb(); /* ^^^ */ - return rcu_seq_snap(&rcu_state.gp_seq); + return rcu_seq_snap(&rcu_state.gp_seq_polled); } EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); @@ -3925,7 +4009,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu); bool poll_state_synchronize_rcu(unsigned long oldstate) { if (oldstate == RCU_GET_STATE_COMPLETED || - rcu_seq_done_exact(&rcu_state.gp_seq, oldstate)) { + rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) { smp_mb(); /* Ensure GP ends before subsequent accesses. */ return true; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 2ccf5845957df4..9c853033f159d6 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -323,6 +323,8 @@ struct rcu_state { short gp_state; /* GP kthread sleep state. */ unsigned long gp_wake_time; /* Last GP kthread wake. */ unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ + unsigned long gp_seq_polled; /* GP seq for polled API. */ + unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ /* End of fields guarded by root rcu_node's lock. */ From f87e0dcb6b86848328869abc5ade2cf049451b50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Apr 2022 06:56:35 -0700 Subject: [PATCH 0130/1250] rcu: Make polled grace-period API account for expedited grace periods Currently, this code could splat: oldstate = get_state_synchronize_rcu(); synchronize_rcu_expedited(); WARN_ON_ONCE(!poll_state_synchronize_rcu(oldstate)); This situation is counter-intuitive and user-unfriendly. After all, there really was a perfectly valid full grace period right after the call to get_state_synchronize_rcu(), so why shouldn't poll_state_synchronize_rcu() know about it? This commit therefore makes the polled grace-period API aware of expedited grace periods in addition to the normal grace periods that it is already aware of. With this change, the above code is guaranteed not to splat. 
Please note that the above code can still splat due to counter wrap on the one hand and situations involving partially overlapping normal/expedited grace periods on the other. On 64-bit systems, the second is of course much more likely than the first. It is possible to modify this approach to prevent overlapping grace periods from causing splats, but only at the expense of greatly increasing the probability of counter wrap, as in within milliseconds on 32-bit systems and within minutes on 64-bit systems. This commit is in preparation for polled expedited grace periods. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- kernel/rcu/tree.h | 1 + kernel/rcu/tree_exp.h | 16 ++++++++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bb50fdd93bee59..90909109c141fd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1812,6 +1812,7 @@ static void rcu_poll_gp_seq_end(unsigned long *snap) if (*snap && *snap == rcu_state.gp_seq_polled) { rcu_seq_end(&rcu_state.gp_seq_polled); rcu_state.gp_seq_polled_snap = 0; + rcu_state.gp_seq_polled_exp_snap = 0; } else { *snap = 0; } @@ -3913,10 +3914,10 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (rcu_blocking_is_gp()) { // Note well that this code runs with !PREEMPT && !SMP. - // In addition, all code that advances grace periods runs - // at process level. Therefore, this GP overlaps with other - // GPs only by being fully nested within them, which allows - // reuse of ->gp_seq_polled_snap. + // In addition, all code that advances grace periods runs at + // process level. 
Therefore, this normal GP overlaps with + // other normal GPs only by being fully nested within them, + // which allows reuse of ->gp_seq_polled_snap. rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap); rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap); if (rcu_init_invoked()) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9c853033f159d6..5634e76106c48f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -325,6 +325,7 @@ struct rcu_state { unsigned long gp_wake_seq; /* ->gp_seq at ^^^. */ unsigned long gp_seq_polled; /* GP seq for polled API. */ unsigned long gp_seq_polled_snap; /* ->gp_seq_polled at normal GP start. */ + unsigned long gp_seq_polled_exp_snap; /* ->gp_seq_polled at expedited GP start. */ /* End of fields guarded by root rcu_node's lock. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 0f70f62039a909..e0258066b881ef 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -18,6 +18,7 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp); static void rcu_exp_gp_seq_start(void) { rcu_seq_start(&rcu_state.expedited_sequence); + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); } /* @@ -34,6 +35,7 @@ static __maybe_unused unsigned long rcu_exp_gp_seq_endval(void) */ static void rcu_exp_gp_seq_end(void) { + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); rcu_seq_end(&rcu_state.expedited_sequence); smp_mb(); /* Ensure that consecutive grace periods serialize. */ } @@ -913,8 +915,18 @@ void synchronize_rcu_expedited(void) "Illegal synchronize_rcu_expedited() in RCU read-side critical section"); /* Is the state is such that the call is a grace period? */ - if (rcu_blocking_is_gp()) - return; + if (rcu_blocking_is_gp()) { + // Note well that this code runs with !PREEMPT && !SMP. + // In addition, all code that advances grace periods runs + // at process level. 
Therefore, this expedited GP overlaps + // with other expedited GPs only by being fully nested within + // them, which allows reuse of ->gp_seq_polled_exp_snap. + rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_exp_snap); + rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_exp_snap); + if (rcu_init_invoked()) + cond_resched(); + return; // Context allows vacuous grace periods. + } /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { From 21d6b7f1aadd9cb24a33d7cdf56153efdb029553 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Apr 2022 11:49:58 -0700 Subject: [PATCH 0131/1250] rcu: Make Tiny RCU grace periods visible to polled APIs This commit makes the Tiny RCU implementation of synchronize_rcu() increment the rcu_ctrlblk.gp_seq counter, thus making both synchronize_rcu() and synchronize_rcu_expedited() visible to get_state_synchronize_rcu() and friends. Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index dbee6bea672698..60071817d93998 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -139,8 +139,10 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused /* * Wait for a grace period to elapse. But it is illegal to invoke * synchronize_rcu() from within an RCU read-side critical section. - * Therefore, any legal call to synchronize_rcu() is a quiescent - * state, and so on a UP system, synchronize_rcu() need do nothing. 
+ * Therefore, any legal call to synchronize_rcu() is a quiescent state, + * and so on a UP system, synchronize_rcu() need do nothing, other than + * let the polled APIs know that another grace period elapsed. + * * (But Lai Jiangshan points out the benefits of doing might_sleep() * to reduce latency.) * @@ -152,6 +154,7 @@ void synchronize_rcu(void) lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); + WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2); } EXPORT_SYMBOL_GPL(synchronize_rcu); From e9baafdb398004077430353ff3532b60ba84c2fa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 14 Apr 2022 09:09:11 -0700 Subject: [PATCH 0132/1250] rcutorture: Verify that polled GP API sees synchronous grace periods This commit causes rcu_torture_writer() to use WARN_ON_ONCE() to check that the cookie returned by the current RCU flavor's ->get_gp_state() function (get_state_synchronize_rcu() for vanilla RCU) causes that flavor's ->poll_gp_state function (poll_state_synchronize_rcu() for vanilla RCU) to unconditionally return true. Note that a pair of calls to synchronous grace-period-wait functions is used. This is necessary to account for partially overlapping normal and expedited grace periods aligning in just the wrong way with polled API invocations, which can cause those polled API invocations to ignore one or the other of those partially overlapping grace periods. It is unlikely that this sort of ignored grace period will be a problem in production, but rcutorture can make it happen within a few tens of seconds. This commit is in preparation for polled expedited grace periods. [ paulmck: Apply feedback from Frederic Weisbecker.
] Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4ceec9f4169c75..d2edc763bb92a6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1269,7 +1269,12 @@ rcu_torture_writer(void *arg) break; case RTWS_EXP_SYNC: rcu_torture_writer_state = RTWS_EXP_SYNC; + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); cur_ops->exp_sync(); + cur_ops->exp_sync(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); rcu_torture_pipe_update(old_rp); break; case RTWS_COND_GET: @@ -1291,7 +1296,12 @@ rcu_torture_writer(void *arg) break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); cur_ops->sync(); + cur_ops->sync(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ON_ONCE(!cur_ops->poll_gp_state(cookie)); rcu_torture_pipe_update(old_rp); break; default: From bb1142b15824013ee799465759f5fb08a5401647 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 15 Apr 2022 10:55:42 -0700 Subject: [PATCH 0133/1250] rcu: Add polled expedited grace-period primitives This commit adds expedited grace-period functionality to RCU's polled grace-period API, adding start_poll_synchronize_rcu_expedited() and cond_synchronize_rcu_expedited(), which are similar to the existing start_poll_synchronize_rcu() and cond_synchronize_rcu() functions, respectively. 
Note that although start_poll_synchronize_rcu_expedited() can be invoked very early, the resulting expedited grace periods are not guaranteed to start until after workqueues are fully initialized. On the other hand, both synchronize_rcu() and synchronize_rcu_expedited() can also be invoked very early, and the resulting grace periods will be taken into account as they occur. [ paulmck: Apply feedback from Neeraj Upadhyay. ] Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 10 +++++ include/linux/rcutree.h | 2 + kernel/rcu/tree.c | 17 ++++++--- kernel/rcu/tree.h | 7 ++++ kernel/rcu/tree_exp.h | 85 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 116 insertions(+), 5 deletions(-) diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5fed476f977f63..ab7e20dfb07b02 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -23,6 +23,16 @@ static inline void cond_synchronize_rcu(unsigned long oldstate) might_sleep(); } +static inline unsigned long start_poll_synchronize_rcu_expedited(void) +{ + return start_poll_synchronize_rcu(); +} + +static inline void cond_synchronize_rcu_expedited(unsigned long oldstate) +{ + cond_synchronize_rcu(oldstate); +} + extern void rcu_barrier(void); static inline void synchronize_rcu_expedited(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 9c6cfb742504f6..20dbaa9a388207 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -40,6 +40,8 @@ bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); void kfree_rcu_scheduler_running(void); bool rcu_gp_might_be_stalled(void); +unsigned long start_poll_synchronize_rcu_expedited(void); +void cond_synchronize_rcu_expedited(unsigned long oldstate); unsigned 
long get_state_synchronize_rcu(void); unsigned long start_poll_synchronize_rcu(void); bool poll_state_synchronize_rcu(unsigned long oldstate); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 90909109c141fd..92b65c8aa7567e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4021,20 +4021,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); /** * cond_synchronize_rcu - Conditionally wait for an RCU grace period * - * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() * * If a full RCU grace period has elapsed since the earlier call to * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. * Otherwise, invoke synchronize_rcu() to wait for a full grace period. * - * Yes, this function does not take counter wrap into account. But - * counter wrap is harmless. If the counter wraps, we have waited for + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), - * so waiting for one additional grace period should be just fine. + * so waiting for a couple of additional grace periods should be just fine. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call - * to the function that provided @oldstate, and that returned at the end + * to the function that provided @oldstate and that returned at the end * of this function. 
*/ void cond_synchronize_rcu(unsigned long oldstate) @@ -4787,6 +4787,9 @@ static void __init rcu_init_one(void) init_waitqueue_head(&rnp->exp_wq[3]); spin_lock_init(&rnp->exp_lock); mutex_init(&rnp->boost_kthread_mutex); + raw_spin_lock_init(&rnp->exp_poll_lock); + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp); } } @@ -5012,6 +5015,10 @@ void __init rcu_init(void) qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark; else qovld_calc = qovld; + + // Kick-start any polled grace periods that started early. + if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1)) + (void)start_poll_synchronize_rcu_expedited(); } #include "tree_stall.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 5634e76106c48f..fb77deca5f5c6f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -133,6 +133,10 @@ struct rcu_node { wait_queue_head_t exp_wq[4]; struct rcu_exp_work rew; bool exp_need_flush; /* Need to flush workitem? */ + raw_spinlock_t exp_poll_lock; + /* Lock and data for polled expedited grace periods. */ + unsigned long exp_seq_poll_rq; + struct work_struct exp_poll_wq; } ____cacheline_internodealigned_in_smp; /* @@ -484,3 +488,6 @@ static void rcu_iw_handler(struct irq_work *iwp); static void check_cpu_stall(struct rcu_data *rdp); static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, const unsigned long gpssdelay); + +/* Forward declarations for tree_exp.h. */ +static void sync_rcu_do_polled_gp(struct work_struct *wp); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index e0258066b881ef..571b0a700ccedb 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -962,3 +962,88 @@ void synchronize_rcu_expedited(void) synchronize_rcu_expedited_destroy_work(&rew); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Ensure that start_poll_synchronize_rcu_expedited() has the expedited + * RCU grace periods that it needs. 
+ */ +static void sync_rcu_do_polled_gp(struct work_struct *wp) +{ + unsigned long flags; + struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq); + unsigned long s; + + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + if (s == RCU_GET_STATE_COMPLETED) + return; + while (!poll_state_synchronize_rcu(s)) + synchronize_rcu_expedited(); + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + s = rnp->exp_seq_poll_rq; + if (poll_state_synchronize_rcu(s)) + rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED; + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); +} + +/** + * start_poll_synchronize_rcu_expedited - Snapshot current RCU state and start expedited grace period + * + * Returns a cookie to pass to a call to cond_synchronize_rcu(), + * cond_synchronize_rcu_expedited(), or poll_state_synchronize_rcu(), + * allowing them to determine whether or not any sort of grace period has + * elapsed in the meantime. If the needed expedited grace period is not + * already slated to start, initiates that grace period. 
+ */ +unsigned long start_poll_synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + unsigned long s; + + s = get_state_synchronize_rcu(); + rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); + rnp = rdp->mynode; + if (rcu_init_invoked()) + raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); + if (!poll_state_synchronize_rcu(s)) { + rnp->exp_seq_poll_rq = s; + if (rcu_init_invoked()) + queue_work(rcu_gp_wq, &rnp->exp_poll_wq); + } + if (rcu_init_invoked()) + raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); + + return s; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu_expedited); + +/** + * cond_synchronize_rcu_expedited - Conditionally wait for an expedited RCU grace period + * + * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() + * + * If any type of full RCU grace period has elapsed since the earlier + * call to get_state_synchronize_rcu(), start_poll_synchronize_rcu(), + * or start_poll_synchronize_rcu_expedited(), just return. Otherwise, + * invoke synchronize_rcu_expedited() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. + * But counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for a couple of additional grace periods should be just fine. + * + * This function provides the same memory-ordering guarantees that + * would be provided by a synchronize_rcu() that was invoked at the call + * to the function that provided @oldstate and that returned at the end + * of this function. + */ +void cond_synchronize_rcu_expedited(unsigned long oldstate) +{ + if (!poll_state_synchronize_rcu(oldstate)) + synchronize_rcu_expedited(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_rcu_expedited); From 0c3a779e8f0187fc4a7dfc91605f9aeef41aa50d Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 1 Feb 2022 07:01:20 -0800 Subject: [PATCH 0134/1250] rcutorture: Test polled expedited grace-period primitives This commit adds tests of start_poll_synchronize_rcu_expedited() and poll_state_synchronize_rcu_expedited(). Link: https://lore.kernel.org/all/20220121142454.1994916-1-bfoster@redhat.com/ Link: https://docs.google.com/document/d/1RNKWW9jQyfjxw2E8dsXVTdvZYh0HnYeSHDKog9jhdN8/edit?usp=sharing Cc: Brian Foster Cc: Dave Chinner Cc: Al Viro Cc: Ian Kent Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 87 +++++++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 13 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d2edc763bb92a6..0788ef2a449111 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -86,10 +86,12 @@ torture_param(int, fwd_progress_holdoff, 60, torture_param(bool, fwd_progress_need_resched, 1, "Hide cond_resched() behind need_resched()"); torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); +torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); +torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -209,12 +211,16 @@ static int rcu_torture_writer_state; #define RTWS_DEF_FREE 3 #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 -#define RTWS_COND_SYNC 6 -#define RTWS_POLL_GET 7 -#define RTWS_POLL_WAIT 8 -#define RTWS_SYNC 9 -#define RTWS_STUTTER 10 -#define RTWS_STOPPING 11 +#define RTWS_COND_GET_EXP 6 
+#define RTWS_COND_SYNC 7 +#define RTWS_COND_SYNC_EXP 8 +#define RTWS_POLL_GET 9 +#define RTWS_POLL_GET_EXP 10 +#define RTWS_POLL_WAIT 11 +#define RTWS_POLL_WAIT_EXP 12 +#define RTWS_SYNC 13 +#define RTWS_STUTTER 14 +#define RTWS_STOPPING 15 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -222,9 +228,13 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_DEF_FREE", "RTWS_EXP_SYNC", "RTWS_COND_GET", + "RTWS_COND_GET_EXP", "RTWS_COND_SYNC", + "RTWS_COND_SYNC_EXP", "RTWS_POLL_GET", + "RTWS_POLL_GET_EXP", "RTWS_POLL_WAIT", + "RTWS_POLL_WAIT_EXP", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -337,6 +347,10 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); + unsigned long (*get_gp_state_exp)(void); + unsigned long (*start_gp_poll_exp)(void); + bool (*poll_gp_state_exp)(unsigned long oldstate); + void (*cond_sync_exp)(unsigned long oldstate); unsigned long (*get_gp_state)(void); unsigned long (*get_gp_completed)(void); unsigned long (*start_gp_poll)(void); @@ -509,6 +523,10 @@ static struct rcu_torture_ops rcu_ops = { .start_gp_poll = start_poll_synchronize_rcu, .poll_gp_state = poll_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, + .get_gp_state_exp = get_state_synchronize_rcu, + .start_gp_poll_exp = start_poll_synchronize_rcu_expedited, + .poll_gp_state_exp = poll_state_synchronize_rcu, + .cond_sync_exp = cond_synchronize_rcu_expedited, .call = call_rcu, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, @@ -1138,9 +1156,8 @@ rcu_torture_fqs(void *arg) return 0; } -// Used by writers to randomly choose from the available grace-period -// primitives. The only purpose of the initialization is to size the array. -static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; +// Used by writers to randomly choose from the available grace-period primitives. 
+static int synctype[ARRAY_SIZE(rcu_torture_writer_state_names)] = { }; static int nsynctypes; /* @@ -1148,18 +1165,27 @@ static int nsynctypes; */ static void rcu_torture_write_types(void) { - bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; - bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; + bool gp_cond1 = gp_cond, gp_cond_exp1 = gp_cond_exp, gp_exp1 = gp_exp; + bool gp_poll_exp1 = gp_poll_exp, gp_normal1 = gp_normal, gp_poll1 = gp_poll; + bool gp_sync1 = gp_sync; /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) - gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true; + if (!gp_cond1 && !gp_cond_exp1 && !gp_exp1 && !gp_poll_exp && + !gp_normal1 && !gp_poll1 && !gp_sync1) + gp_cond1 = gp_cond_exp1 = gp_exp1 = gp_poll_exp1 = + gp_normal1 = gp_poll1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); } + if (gp_cond_exp1 && cur_ops->get_gp_state_exp && cur_ops->cond_sync_exp) { + synctype[nsynctypes++] = RTWS_COND_GET_EXP; + pr_info("%s: Testing conditional expedited GPs.\n", __func__); + } else if (gp_cond_exp && (!cur_ops->get_gp_state_exp || !cur_ops->cond_sync_exp)) { + pr_alert("%s: gp_cond_exp without primitives.\n", __func__); + } if (gp_exp1 && cur_ops->exp_sync) { synctype[nsynctypes++] = RTWS_EXP_SYNC; pr_info("%s: Testing expedited GPs.\n", __func__); @@ -1178,6 +1204,12 @@ static void rcu_torture_write_types(void) } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { pr_alert("%s: gp_poll without primitives.\n", __func__); } + if (gp_poll_exp1 && cur_ops->start_gp_poll_exp && cur_ops->poll_gp_state_exp) { + synctype[nsynctypes++] = RTWS_POLL_GET_EXP; + pr_info("%s: Testing 
polling expedited GPs.\n", __func__); + } else if (gp_poll_exp && (!cur_ops->start_gp_poll_exp || !cur_ops->poll_gp_state_exp)) { + pr_alert("%s: gp_poll_exp without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); @@ -1285,6 +1317,14 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_COND_GET_EXP: + rcu_torture_writer_state = RTWS_COND_GET_EXP; + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + rcu_torture_writer_state = RTWS_COND_SYNC_EXP; + cur_ops->cond_sync_exp(gp_snap); + rcu_torture_pipe_update(old_rp); + break; case RTWS_POLL_GET: rcu_torture_writer_state = RTWS_POLL_GET; gp_snap = cur_ops->start_gp_poll(); @@ -1294,6 +1334,15 @@ rcu_torture_writer(void *arg) &rand); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET_EXP: + rcu_torture_writer_state = RTWS_POLL_GET_EXP; + gp_snap = cur_ops->start_gp_poll_exp(); + rcu_torture_writer_state = RTWS_POLL_WAIT_EXP; + while (!cur_ops->poll_gp_state_exp(gp_snap)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; if (cur_ops->get_gp_state && cur_ops->poll_gp_state) @@ -1400,6 +1449,11 @@ rcu_torture_fakewriter(void *arg) torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); cur_ops->cond_sync(gp_snap); break; + case RTWS_COND_GET_EXP: + gp_snap = cur_ops->get_gp_state_exp(); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); + cur_ops->cond_sync_exp(gp_snap); + break; case RTWS_POLL_GET: gp_snap = cur_ops->start_gp_poll(); while (!cur_ops->poll_gp_state(gp_snap)) { @@ -1407,6 +1461,13 @@ rcu_torture_fakewriter(void *arg) &rand); } break; + case RTWS_POLL_GET_EXP: + gp_snap = cur_ops->start_gp_poll_exp(); + while (!cur_ops->poll_gp_state_exp(gp_snap)) { + 
torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); + } + break; case RTWS_SYNC: cur_ops->sync(); break; From 97566a931913ed4dc57c96a2e54836a9752f358a Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 22 Apr 2022 21:15:18 +0800 Subject: [PATCH 0135/1250] rcu: Put panic_on_rcu_stall() after expedited RCU CPU stall warnings When a normal RCU CPU stall warning is encountered with the panic_on_rcu_stall sysctl variable set, the system panics only after the stall warning is printed. But when an expedited RCU CPU stall warning is encountered with the panic_on_rcu_stall sysctl variable set, the system panics first, thus never printing the stall warning. This commit therefore brings the expedited stall warning into line with the normal stall warning by printing first and panicking afterwards. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 571b0a700ccedb..f05a15b11fa0ce 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -623,7 +623,6 @@ static void synchronize_rcu_expedited_wait(void) return; if (rcu_stall_is_suppressed()) continue; - panic_on_rcu_stall(); trace_rcu_stall_warning(rcu_state.name, TPS("ExpeditedStall")); pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rcu_state.name); @@ -671,6 +670,7 @@ static void synchronize_rcu_expedited_wait(void) } } jiffies_stall = 3 * rcu_exp_jiffies_till_stall_check() + 3; + panic_on_rcu_stall(); } } From 2bc46e7bd100dd3f879908eb7e6d19676becc0c6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 May 2022 09:49:05 -0700 Subject: [PATCH 0136/1250] rcu: Diagnose extended sync_rcu_do_polled_gp() loops This commit dumps out state when the sync_rcu_do_polled_gp() function loops more than expected. This is a debugging aid. Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree_exp.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f05a15b11fa0ce..4c7037b507032d 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -970,6 +970,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); static void sync_rcu_do_polled_gp(struct work_struct *wp) { unsigned long flags; + int i = 0; struct rcu_node *rnp = container_of(wp, struct rcu_node, exp_poll_wq); unsigned long s; @@ -979,8 +980,12 @@ static void sync_rcu_do_polled_gp(struct work_struct *wp) raw_spin_unlock_irqrestore(&rnp->exp_poll_lock, flags); if (s == RCU_GET_STATE_COMPLETED) return; - while (!poll_state_synchronize_rcu(s)) + while (!poll_state_synchronize_rcu(s)) { synchronize_rcu_expedited(); + if (i == 10 || i == 20) + pr_info("%s: i = %d s = %lx gp_seq_polled = %lx\n", __func__, i, s, READ_ONCE(rcu_state.gp_seq_polled)); + i++; + } raw_spin_lock_irqsave(&rnp->exp_poll_lock, flags); s = rnp->exp_seq_poll_rq; if (poll_state_synchronize_rcu(s)) From 18d5c58fdaadcd6c8c5cb42a8c956b9c70ab9e50 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Wed, 18 May 2022 19:43:10 +0800 Subject: [PATCH 0137/1250] rcu: Add irqs-disabled indicator to expedited RCU CPU stall warnings If a CPU has interrupts disabled continuously starting before the beginning of a given expedited RCU grace period, that CPU will not execute that grace period's IPI handler. This will in turn mean that the ->cpu_no_qs.b.exp field in that CPU's rcu_data structure will continue to contain the boolean value false. Knowing whether or not a CPU has had interrupts disabled can be helpful when debugging an expedited RCU CPU stall warning, so this commit adds a "D" indicator to expedited RCU CPU stall warnings that signifies that the corresponding CPU has had interrupts disabled throughout.
This capability was tested as follows: runqemu kvm slirp nographic qemuparams="-m 4096 -smp 4" bootparams= "isolcpus=2,3 nohz_full=2,3 rcu_nocbs=2,3 rcutree.dump_tree=1 rcutorture.stall_cpu_holdoff=30 rcutorture.stall_cpu=40 rcutorture.stall_cpu_irqsoff=1 rcutorture.stall_cpu_block=0 rcutorture.stall_no_softlockup=1" -d The rcu_torture_stall() function ran on CPU 1, which displays the "D" as expected given the rcutorture.stall_cpu_irqsoff=1 module parameter: ............ rcu: INFO: rcu_preempt detected expedited stalls on CPUs/tasks: { 1-...D } 26467 jiffies s: 13317 root: 0x1/. rcu: blocking rcu_node structures (internal RCU debug): l=1:0-1:0x2/. Task dump for CPU 1: task:rcu_torture_sta state:R running task stack: 0 pid: 76 ppid: 2 flags:0x00004008 Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 4c7037b507032d..f092c7f18a5f3f 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -637,10 +637,11 @@ static void synchronize_rcu_expedited_wait(void) continue; ndetected++; rdp = per_cpu_ptr(&rcu_data, cpu); - pr_cont(" %d-%c%c%c", cpu, + pr_cont(" %d-%c%c%c%c", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)], + "D."[!!(rdp->cpu_no_qs.b.exp)]); } } pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", From 6ee324afdf30b8704490f1f56ea8cc3cae8cebc9 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 13 Jun 2022 13:43:24 +0100 Subject: [PATCH 0138/1250] drivers/thermal/cpufreq_cooling: Use private callback ops for each cooling device It is very unlikely that one CPU cluster would have the EM and some other won't have it (because EM registration failed or DT lacks needed entry). Although, we should avoid modifying global variable with callbacks anyway. 
Redesign this and add safety for such situation. Signed-off-by: Lukasz Luba Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20220613124327.30766-2-lukasz.luba@arm.com Signed-off-by: Daniel Lezcano --- drivers/thermal/cpufreq_cooling.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index b8151d95a8068b..ad8b86f5281b85 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -59,6 +59,7 @@ struct time_in_idle { * @cdev: thermal_cooling_device pointer to keep track of the * registered cooling device. * @policy: cpufreq policy. + * @cooling_ops: cpufreq callbacks to thermal cooling device ops * @idle_time: idle time stats * @qos_req: PM QoS contraint to apply * @@ -71,6 +72,7 @@ struct cpufreq_cooling_device { unsigned int max_level; struct em_perf_domain *em; struct cpufreq_policy *policy; + struct thermal_cooling_device_ops cooling_ops; #ifndef CONFIG_SMP struct time_in_idle *idle_time; #endif @@ -485,14 +487,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, return ret; } -/* Bind cpufreq callbacks to thermal cooling device ops */ - -static struct thermal_cooling_device_ops cpufreq_cooling_ops = { - .get_max_state = cpufreq_get_max_state, - .get_cur_state = cpufreq_get_cur_state, - .set_cur_state = cpufreq_set_cur_state, -}; - /** * __cpufreq_cooling_register - helper function to create cpufreq cooling device * @np: a valid struct device_node to the cooling device device tree node @@ -554,7 +548,10 @@ __cpufreq_cooling_register(struct device_node *np, /* max_level is an index, not a counter */ cpufreq_cdev->max_level = i - 1; - cooling_ops = &cpufreq_cooling_ops; + cooling_ops = &cpufreq_cdev->cooling_ops; + cooling_ops->get_max_state = cpufreq_get_max_state; + cooling_ops->get_cur_state = cpufreq_get_cur_state; + cooling_ops->set_cur_state = cpufreq_set_cur_state; #ifdef 
CONFIG_THERMAL_GOV_POWER_ALLOCATOR if (em_is_sane(cpufreq_cdev, em)) { From b279c684469073cfc08d35c45b05eea95fe28249 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 13 Jun 2022 13:43:25 +0100 Subject: [PATCH 0139/1250] drivers/thermal/cpufreq_cooling : Refactor thermal_power_cpu_get_power tracing Simplify the thermal_power_cpu_get_power trace event by removing complicated cpumask and variable length array. Now the tools parsing trace output don't have to hassle to get this power data. The simplified format version uses 'policy->cpu'. Remove also the 'load' information completely since there is very little value of it in this trace event. To get the CPUs' load (or utilization) there are other dedicated trace hooks in the kernel. This patch also simplifies and speeds-up the main cooling code when that trace event is enabled. Rename the trace event to avoid confusion of tools which parse the trace file. Acked-by: Viresh Kumar Signed-off-by: Lukasz Luba Link: https://lore.kernel.org/r/20220613124327.30766-3-lukasz.luba@arm.com Signed-off-by: Daniel Lezcano --- drivers/thermal/cpufreq_cooling.c | 18 +----------------- include/trace/events/thermal.h | 28 ++++++++-------------------- 2 files changed, 9 insertions(+), 37 deletions(-) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index ad8b86f5281b85..492a67e267e8f0 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -216,16 +216,9 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev, u32 total_load = 0; struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; struct cpufreq_policy *policy = cpufreq_cdev->policy; - u32 *load_cpu = NULL; freq = cpufreq_quick_get(policy->cpu); - if (trace_thermal_power_cpu_get_power_enabled()) { - u32 ncpus = cpumask_weight(policy->related_cpus); - - load_cpu = kcalloc(ncpus, sizeof(*load_cpu), GFP_KERNEL); - } - for_each_cpu(cpu, policy->related_cpus) { u32 load; @@ -235,22 +228,13 
@@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev, load = 0; total_load += load; - if (load_cpu) - load_cpu[i] = load; - - i++; } cpufreq_cdev->last_load = total_load; *power = get_dynamic_power(cpufreq_cdev, freq); - if (load_cpu) { - trace_thermal_power_cpu_get_power(policy->related_cpus, freq, - load_cpu, i, *power); - - kfree(load_cpu); - } + trace_thermal_power_cpu_get_power_simple(policy->cpu, *power); return 0; } diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h index 8a5f04888abd7e..e58bf3072f3244 100644 --- a/include/trace/events/thermal.h +++ b/include/trace/events/thermal.h @@ -92,34 +92,22 @@ TRACE_EVENT(thermal_zone_trip, ); #ifdef CONFIG_CPU_THERMAL -TRACE_EVENT(thermal_power_cpu_get_power, - TP_PROTO(const struct cpumask *cpus, unsigned long freq, u32 *load, - size_t load_len, u32 dynamic_power), +TRACE_EVENT(thermal_power_cpu_get_power_simple, + TP_PROTO(int cpu, u32 power), - TP_ARGS(cpus, freq, load, load_len, dynamic_power), + TP_ARGS(cpu, power), TP_STRUCT__entry( - __bitmask(cpumask, num_possible_cpus()) - __field(unsigned long, freq ) - __dynamic_array(u32, load, load_len) - __field(size_t, load_len ) - __field(u32, dynamic_power ) + __field(int, cpu) + __field(u32, power) ), TP_fast_assign( - __assign_bitmask(cpumask, cpumask_bits(cpus), - num_possible_cpus()); - __entry->freq = freq; - memcpy(__get_dynamic_array(load), load, - load_len * sizeof(*load)); - __entry->load_len = load_len; - __entry->dynamic_power = dynamic_power; + __entry->cpu = cpu; + __entry->power = power; ), - TP_printk("cpus=%s freq=%lu load={%s} dynamic_power=%d", - __get_bitmask(cpumask), __entry->freq, - __print_array(__get_dynamic_array(load), __entry->load_len, 4), - __entry->dynamic_power) + TP_printk("cpu=%d power=%u", __entry->cpu, __entry->power) ); TRACE_EVENT(thermal_power_cpu_limit, From 3c1325a6c02a5a9b38a4c4eef0443a0c0cc780ac Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 13 Jun 2022 13:43:26 +0100 
Subject: [PATCH 0140/1250] drivers/thermal/cpufreq_cooling: Update outdated comments The code has moved and left some comments stale. Update them where there is a need. Acked-by: Viresh Kumar Signed-off-by: Lukasz Luba Link: https://lore.kernel.org/r/20220613124327.30766-4-lukasz.luba@arm.com Signed-off-by: Daniel Lezcano --- drivers/thermal/cpufreq_cooling.c | 44 +++++++++++++------------------ 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c index 492a67e267e8f0..50f8b90abba69f 100644 --- a/drivers/thermal/cpufreq_cooling.c +++ b/drivers/thermal/cpufreq_cooling.c @@ -206,7 +206,7 @@ static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev, * complex code may be needed if experiments show that it's not * accurate enough. * - * Return: 0 on success, -E* if getting the static power failed. + * Return: 0 on success, this function doesn't fail. */ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev, u32 *power) @@ -249,9 +249,8 @@ static int cpufreq_get_requested_power(struct thermal_cooling_device *cdev, * milliwatts assuming 100% load. Store the calculated power in * @power. * - * Return: 0 on success, -EINVAL if the cooling device state could not - * be converted into a frequency or other -E* if there was an error - * when calculating the static power. + * Return: 0 on success, -EINVAL if the cooling device state is bigger + * than maximum allowed. */ static int cpufreq_state2power(struct thermal_cooling_device *cdev, unsigned long state, u32 *power) @@ -281,15 +280,11 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev, * Calculate a cooling device state for the cpus described by @cdev * that would allow them to consume at most @power mW and store it in * @state. Note that this calculation depends on external factors - * such as the cpu load or the current static power. 
Calling this - * function with the same power as input can yield different cooling - * device states depending on those external factors. - * - * Return: 0 on success, -ENODEV if no cpus are online or -EINVAL if - * the calculated frequency could not be converted to a valid state. - * The latter should not happen unless the frequencies available to - * cpufreq have changed since the initialization of the cpu cooling - * device. + * such as the CPUs load. Calling this function with the same power + * as input can yield different cooling device states depending on those + * external factors. + * + * Return: 0 on success, this function doesn't fail. */ static int cpufreq_power2state(struct thermal_cooling_device *cdev, u32 power, unsigned long *state) @@ -401,7 +396,7 @@ static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev, * Callback for the thermal cooling device to return the cpufreq * max cooling state. * - * Return: 0 on success, an error code otherwise. + * Return: 0 on success, this function doesn't fail. */ static int cpufreq_get_max_state(struct thermal_cooling_device *cdev, unsigned long *state) @@ -420,7 +415,7 @@ static int cpufreq_get_max_state(struct thermal_cooling_device *cdev, * Callback for the thermal cooling device to return the cpufreq * current cooling state. * - * Return: 0 on success, an error code otherwise. + * Return: 0 on success, this function doesn't fail. */ static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev, unsigned long *state) @@ -479,7 +474,7 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev, * @em: Energy Model of the cpufreq policy * * This interface function registers the cpufreq cooling device with the name - * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq + * "cpufreq-%s". This API can support multiple instances of cpufreq * cooling devices. 
It also gives the opportunity to link the cooling device * with a device tree node, in order to bind it via the thermal DT code. * @@ -590,8 +585,8 @@ __cpufreq_cooling_register(struct device_node *np, * @policy: cpufreq policy * * This interface function registers the cpufreq cooling device with the name - * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq - * cooling devices. + * "cpufreq-%s". This API can support multiple instances of cpufreq cooling + * devices. * * Return: a valid struct thermal_cooling_device pointer on success, * on failure, it returns a corresponding ERR_PTR(). @@ -608,17 +603,14 @@ EXPORT_SYMBOL_GPL(cpufreq_cooling_register); * @policy: cpufreq policy * * This interface function registers the cpufreq cooling device with the name - * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq - * cooling devices. Using this API, the cpufreq cooling device will be - * linked to the device tree node provided. + * "cpufreq-%s". This API can support multiple instances of cpufreq cooling + * devices. Using this API, the cpufreq cooling device will be linked to the + * device tree node provided. * * Using this function, the cooling device will implement the power - * extensions by using a simple cpu power model. The cpus must have + * extensions by using the Energy Model (if present). The cpus must have * registered their OPPs using the OPP library. * - * It also takes into account, if property present in policy CPU node, the - * static power consumed by the cpu. - * * Return: a valid struct thermal_cooling_device pointer on success, * and NULL on failure. */ @@ -654,7 +646,7 @@ EXPORT_SYMBOL_GPL(of_cpufreq_cooling_register); * cpufreq_cooling_unregister - function to remove cpufreq cooling device. * @cdev: thermal cooling device pointer. * - * This interface function unregisters the "thermal-cpufreq-%x" cooling device. + * This interface function unregisters the "cpufreq-%x" cooling device. 
*/ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev) { From 77988a4351b2dfaecd571b49e14501df4909c722 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Mon, 13 Jun 2022 13:43:27 +0100 Subject: [PATCH 0141/1250] drivers/thermal/devfreq_cooling: Extend the devfreq_cooling_device with ops Remove unneeded global variable devfreq_cooling_ops which is used only as a copy pattern. Instead, extend the struct devfreq_cooling_device with the needed ops structure. This also simplifies the allocation/free code during the setup/cleanup. Signed-off-by: Lukasz Luba Link: https://lore.kernel.org/r/20220613124327.30766-5-lukasz.luba@arm.com Signed-off-by: Daniel Lezcano --- drivers/thermal/devfreq_cooling.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c index 8c76f9655e5774..67b618b1afc88b 100644 --- a/drivers/thermal/devfreq_cooling.c +++ b/drivers/thermal/devfreq_cooling.c @@ -28,6 +28,7 @@ * struct devfreq_cooling_device - Devfreq cooling device * devfreq_cooling_device registered. * @cdev: Pointer to associated thermal cooling device. + * @cooling_ops: devfreq callbacks to thermal cooling device ops * @devfreq: Pointer to associated devfreq device. * @cooling_state: Current cooling state. 
* @freq_table: Pointer to a table with the frequencies sorted in descending @@ -48,6 +49,7 @@ */ struct devfreq_cooling_device { struct thermal_cooling_device *cdev; + struct thermal_cooling_device_ops cooling_ops; struct devfreq *devfreq; unsigned long cooling_state; u32 *freq_table; @@ -290,12 +292,6 @@ static int devfreq_cooling_power2state(struct thermal_cooling_device *cdev, return 0; } -static struct thermal_cooling_device_ops devfreq_cooling_ops = { - .get_max_state = devfreq_cooling_get_max_state, - .get_cur_state = devfreq_cooling_get_cur_state, - .set_cur_state = devfreq_cooling_set_cur_state, -}; - /** * devfreq_cooling_gen_tables() - Generate frequency table. * @dfc: Pointer to devfreq cooling device. @@ -363,18 +359,18 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, char *name; int err, num_opps; - ops = kmemdup(&devfreq_cooling_ops, sizeof(*ops), GFP_KERNEL); - if (!ops) - return ERR_PTR(-ENOMEM); dfc = kzalloc(sizeof(*dfc), GFP_KERNEL); - if (!dfc) { - err = -ENOMEM; - goto free_ops; - } + if (!dfc) + return ERR_PTR(-ENOMEM); dfc->devfreq = df; + ops = &dfc->cooling_ops; + ops->get_max_state = devfreq_cooling_get_max_state; + ops->get_cur_state = devfreq_cooling_get_cur_state; + ops->set_cur_state = devfreq_cooling_set_cur_state; + em = em_pd_get(dev); if (em && !em_is_artificial(em)) { dfc->em_pd = em; @@ -437,8 +433,6 @@ of_devfreq_cooling_register_power(struct device_node *np, struct devfreq *df, kfree(dfc->freq_table); free_dfc: kfree(dfc); -free_ops: - kfree(ops); return ERR_PTR(err); } @@ -520,13 +514,11 @@ EXPORT_SYMBOL_GPL(devfreq_cooling_em_register); void devfreq_cooling_unregister(struct thermal_cooling_device *cdev) { struct devfreq_cooling_device *dfc; - const struct thermal_cooling_device_ops *ops; struct device *dev; if (IS_ERR_OR_NULL(cdev)) return; - ops = cdev->ops; dfc = cdev->devdata; dev = dfc->devfreq->dev.parent; @@ -537,6 +529,5 @@ void devfreq_cooling_unregister(struct thermal_cooling_device 
*cdev) kfree(dfc->freq_table); kfree(dfc); - kfree(ops); } EXPORT_SYMBOL_GPL(devfreq_cooling_unregister); From 7a8696f727e1a713046f12af45da0578ddaacb6f Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 13 Jun 2022 14:43:09 +0200 Subject: [PATCH 0142/1250] MAINTAINERS: add include/dt-bindings/thermal to THERMAL Maintainers of the directory Documentation/devicetree/bindings/thermal are also the maintainers of the corresponding directory include/dt-bindings/thermal. Add the file entry for include/dt-bindings/thermal to the appropriate section in MAINTAINERS. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20220613124309.28790-1-lukas.bulwahn@gmail.com Signed-off-by: Daniel Lezcano --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1fc9ead83d2aa3..c1beed2d9ab46f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19770,6 +19770,7 @@ F: Documentation/ABI/testing/sysfs-class-thermal F: Documentation/devicetree/bindings/thermal/ F: Documentation/driver-api/thermal/ F: drivers/thermal/ +F: include/dt-bindings/thermal/ F: include/linux/cpu_cooling.h F: include/linux/thermal.h F: include/uapi/linux/thermal.h From 3be4812d6594b2844c305521056c118cdba9dac1 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 10 Jun 2022 22:04:59 +0200 Subject: [PATCH 0143/1250] drivers/thermal/rcar_gen3_thermal: Improve logging during probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When setting up a new board, a plain "Can't register thermal zone" didn't help me much because the thermal zones in DT were all fine. I just had a sensor entry too much in the parent TSC node. Reword the failure/success messages to contain the sensor number to make it easier to understand which sensor is affected. 
Example output now: rcar_gen3_thermal e6198000.thermal: Sensor 0: Loaded 1 trip points rcar_gen3_thermal e6198000.thermal: Sensor 1: Loaded 1 trip points rcar_gen3_thermal e6198000.thermal: Sensor 2: Loaded 1 trip points rcar_gen3_thermal e6198000.thermal: Sensor 3: Can't register thermal zone Signed-off-by: Wolfram Sang Reviewed-by: Geert Uytterhoeven Reviewed-by: Niklas Söderlund Link: https://lore.kernel.org/r/20220610200500.6727-1-wsa+renesas@sang-engineering.com Signed-off-by: Daniel Lezcano --- drivers/thermal/rcar_gen3_thermal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/rcar_gen3_thermal.c b/drivers/thermal/rcar_gen3_thermal.c index 43eb25b167bc00..e2020c6308cc18 100644 --- a/drivers/thermal/rcar_gen3_thermal.c +++ b/drivers/thermal/rcar_gen3_thermal.c @@ -507,7 +507,7 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev) zone = devm_thermal_zone_of_sensor_register(dev, i, tsc, &rcar_gen3_tz_of_ops); if (IS_ERR(zone)) { - dev_err(dev, "Can't register thermal zone\n"); + dev_err(dev, "Sensor %u: Can't register thermal zone\n", i); ret = PTR_ERR(zone); goto error_unregister; } @@ -529,7 +529,7 @@ static int rcar_gen3_thermal_probe(struct platform_device *pdev) if (ret < 0) goto error_unregister; - dev_info(dev, "TSC%u: Loaded %d trip points\n", i, ret); + dev_info(dev, "Sensor %u: Loaded %d trip points\n", i, ret); } if (!priv->num_tscs) { From a5c34de172886fe776b1fd002807cddd81493d02 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 8 Jun 2022 13:27:01 +0200 Subject: [PATCH 0144/1250] dt-bindings: thermal: qcom,spmi-temp-alarm: convert to dtschema Convert the Qualcomm QPNP PMIC Temperature Alarm to DT Schema. 
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20220608112702.80873-1-krzysztof.kozlowski@linaro.org Signed-off-by: Daniel Lezcano --- .../thermal/qcom,spmi-temp-alarm.yaml | 85 +++++++++++++++++++ .../bindings/thermal/qcom-spmi-temp-alarm.txt | 51 ----------- 2 files changed, 85 insertions(+), 51 deletions(-) create mode 100644 Documentation/devicetree/bindings/thermal/qcom,spmi-temp-alarm.yaml delete mode 100644 Documentation/devicetree/bindings/thermal/qcom-spmi-temp-alarm.txt diff --git a/Documentation/devicetree/bindings/thermal/qcom,spmi-temp-alarm.yaml b/Documentation/devicetree/bindings/thermal/qcom,spmi-temp-alarm.yaml new file mode 100644 index 00000000000000..5f08b6e59b8a5c --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/qcom,spmi-temp-alarm.yaml @@ -0,0 +1,85 @@ +# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/qcom,spmi-temp-alarm.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Qualcomm QPNP PMIC Temperature Alarm + +maintainers: + - Bjorn Andersson + +description: + QPNP temperature alarm peripherals are found inside of Qualcomm PMIC chips + that utilize the Qualcomm SPMI implementation. These peripherals provide an + interrupt signal and status register to identify high PMIC die temperature. 
+ +allOf: + - $ref: thermal-sensor.yaml# + +properties: + compatible: + const: qcom,spmi-temp-alarm + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + io-channels: + items: + - description: ADC channel, which reports chip die temperature + + io-channel-names: + items: + - const: thermal + + '#thermal-sensor-cells': + const: 0 + +required: + - compatible + - reg + - interrupts + - '#thermal-sensor-cells' + +additionalProperties: false + +examples: + - | + #include + + pmic { + #address-cells = <1>; + #size-cells = <0>; + + pm8350_temp_alarm: temperature-sensor@a00 { + compatible = "qcom,spmi-temp-alarm"; + reg = <0xa00>; + interrupts = <0x1 0xa 0x0 IRQ_TYPE_EDGE_BOTH>; + #thermal-sensor-cells = <0>; + }; + }; + + thermal-zones { + pm8350_thermal: pm8350c-thermal { + polling-delay-passive = <100>; + polling-delay = <0>; + thermal-sensors = <&pm8350_temp_alarm>; + + trips { + pm8350_trip0: trip0 { + temperature = <95000>; + hysteresis = <0>; + type = "passive"; + }; + + pm8350_crit: pm8350c-crit { + temperature = <115000>; + hysteresis = <0>; + type = "critical"; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/thermal/qcom-spmi-temp-alarm.txt b/Documentation/devicetree/bindings/thermal/qcom-spmi-temp-alarm.txt deleted file mode 100644 index 2d5b2ad03314bc..00000000000000 --- a/Documentation/devicetree/bindings/thermal/qcom-spmi-temp-alarm.txt +++ /dev/null @@ -1,51 +0,0 @@ -Qualcomm QPNP PMIC Temperature Alarm - -QPNP temperature alarm peripherals are found inside of Qualcomm PMIC chips -that utilize the Qualcomm SPMI implementation. These peripherals provide an -interrupt signal and status register to identify high PMIC die temperature. - -Required properties: -- compatible: Should contain "qcom,spmi-temp-alarm". -- reg: Specifies the SPMI address. -- interrupts: PMIC temperature alarm interrupt. -- #thermal-sensor-cells: Should be 0. See Documentation/devicetree/bindings/thermal/thermal-sensor.yaml for a description. 
- -Optional properties: -- io-channels: Should contain IIO channel specifier for the ADC channel, - which report chip die temperature. -- io-channel-names: Should contain "thermal". - -Example: - - pm8941_temp: thermal-alarm@2400 { - compatible = "qcom,spmi-temp-alarm"; - reg = <0x2400>; - interrupts = <0 0x24 0 IRQ_TYPE_EDGE_RISING>; - #thermal-sensor-cells = <0>; - - io-channels = <&pm8941_vadc VADC_DIE_TEMP>; - io-channel-names = "thermal"; - }; - - thermal-zones { - pm8941 { - polling-delay-passive = <250>; - polling-delay = <1000>; - - thermal-sensors = <&pm8941_temp>; - - trips { - stage1 { - temperature = <105000>; - hysteresis = <2000>; - type = "passive"; - }; - stage2 { - temperature = <125000>; - hysteresis = <2000>; - type = "critical"; - }; - }; - }; - }; - From 78516a12d7428176015997f655b4990112563ad3 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 10 Jun 2022 22:17:00 +0200 Subject: [PATCH 0145/1250] dt-bindings: thermal: rcar-gen3-thermal: use positive logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When handling the V3U/r8a779a0 exception, avoid using 'not:' because then its subschemas are far away in the 'else:' branch. Keep them together using positive logic. 
Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20220610201701.7946-1-wsa+renesas@sang-engineering.com Signed-off-by: Daniel Lezcano --- .../bindings/thermal/rcar-gen3-thermal.yaml | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml index 1368d90da0e859..72dc7eb27f8d51 100644 --- a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml @@ -57,31 +57,30 @@ required: - "#thermal-sensor-cells" if: - not: - properties: - compatible: - contains: - enum: - - renesas,r8a779a0-thermal + properties: + compatible: + contains: + enum: + - renesas,r8a779a0-thermal then: properties: reg: - minItems: 2 items: + - description: TSC0 registers - description: TSC1 registers - description: TSC2 registers - description: TSC3 registers - required: - - interrupts + - description: TSC4 registers else: properties: reg: + minItems: 2 items: - - description: TSC0 registers - description: TSC1 registers - description: TSC2 registers - description: TSC3 registers - - description: TSC4 registers + required: + - interrupts additionalProperties: false From 4768f717d85c67c5158331c6b6bc661994462948 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 10 Jun 2022 22:17:01 +0200 Subject: [PATCH 0146/1250] dt-bindings: thermal: rcar-gen3-thermal: Add r8a779f0 support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for R-Car S4. The S4 IP differs a bit from its siblings in such a way that it has 3 out of 4 TSC nodes for Linux and the interrupts are not routed to the INTC-AP but to the ECM.
Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Reviewed-by: Geert Uytterhoeven Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20220610201701.7946-2-wsa+renesas@sang-engineering.com Signed-off-by: Daniel Lezcano --- .../bindings/thermal/rcar-gen3-thermal.yaml | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml index 72dc7eb27f8d51..0f05f5c886c5fe 100644 --- a/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml +++ b/Documentation/devicetree/bindings/thermal/rcar-gen3-thermal.yaml @@ -8,9 +8,9 @@ $schema: http://devicetree.org/meta-schemas/core.yaml# title: Renesas R-Car Gen3 Thermal Sensor description: - On R-Car Gen3 SoCs, the thermal sensor controllers (TSC) control the thermal - sensors (THS) which are the analog circuits for measuring temperature (Tj) - inside the LSI. + On most R-Car Gen3 and later SoCs, the thermal sensor controllers (TSC) + control the thermal sensors (THS) which are the analog circuits for + measuring temperature (Tj) inside the LSI. maintainers: - Niklas Söderlund @@ -27,6 +27,7 @@ properties: - renesas,r8a77965-thermal # R-Car M3-N - renesas,r8a77980-thermal # R-Car V3H - renesas,r8a779a0-thermal # R-Car V3U + - renesas,r8a779f0-thermal # R-Car S4-8 reg: true @@ -79,8 +80,16 @@ else: - description: TSC1 registers - description: TSC2 registers - description: TSC3 registers - required: - - interrupts + if: + not: + properties: + compatible: + contains: + enum: + - renesas,r8a779f0-thermal + then: + required: + - interrupts additionalProperties: false From ced86c536be191ce39a16cf9fcbcffc3f30ae1b4 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 15 Jun 2022 17:53:24 -0700 Subject: [PATCH 0147/1250] ARM: dts: xilinx: align gpio-key node names with dtschema The node names should be generic and DT schema expects certain pattern (e.g. 
with key/button/switch). Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220616005333.18491-31-krzysztof.kozlowski@linaro.org Signed-off-by: Michal Simek --- arch/arm/boot/dts/zynq-zc702.dts | 4 ++-- arch/arm/boot/dts/zynq-zturn-common.dtsi | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/zynq-zc702.dts b/arch/arm/boot/dts/zynq-zc702.dts index cf70aff26c6648..d23201ba8cd7d1 100644 --- a/arch/arm/boot/dts/zynq-zc702.dts +++ b/arch/arm/boot/dts/zynq-zc702.dts @@ -30,14 +30,14 @@ gpio-keys { compatible = "gpio-keys"; autorepeat; - sw14 { + switch-14 { label = "sw14"; gpios = <&gpio0 12 0>; linux,code = <108>; /* down */ wakeup-source; autorepeat; }; - sw13 { + switch-13 { label = "sw13"; gpios = <&gpio0 14 0>; linux,code = <103>; /* up */ diff --git a/arch/arm/boot/dts/zynq-zturn-common.dtsi b/arch/arm/boot/dts/zynq-zturn-common.dtsi index bf5d1c4568b0d3..dfb1fbafe3aa4e 100644 --- a/arch/arm/boot/dts/zynq-zturn-common.dtsi +++ b/arch/arm/boot/dts/zynq-zturn-common.dtsi @@ -49,7 +49,7 @@ gpio-keys { compatible = "gpio-keys"; autorepeat; - K1 { + key { label = "K1"; gpios = <&gpio0 0x32 0x1>; linux,code = <0x66>; From b98274483f62f8caaebc04b1677256845c550d8c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 22 Jun 2022 14:45:22 -0400 Subject: [PATCH 0148/1250] fs: dlm: add deprecation Kconfig and warnings for timeouts This patch adds a CONFIG_DLM_DEPRECATED_API Kconfig option that must be enabled to use two timeout-related features that we intend to remove in kernel v5.22. Warnings are printed if either is enabled and used. Neither has ever been used as far as we know. . The DLM_LSFL_TIMEWARN lockspace creation flag will be removed, along with the associated configfs entry for setting the timeout. Setting the flag and configfs file would cause dlm to track how long locks were waiting for reply messages. After a timeout, a kernel message would be logged, and a netlink message would be sent to userspace. 
Recently, midcomms messages have been added that produce much better logging about actual problems with messages. No use has ever been found for the netlink messages. . The userspace libdlm API has allowed the DLM_LKF_TIMEOUT flag with a timeout value to be set in lock requests. The lock request would be cancelled after the timeout. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/Kconfig | 9 +++++++++ fs/dlm/lockspace.c | 11 ++++++++++- fs/dlm/user.c | 8 ++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index ee92634196a8e7..1105ce3c80cbdd 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -9,6 +9,15 @@ menuconfig DLM A general purpose distributed lock manager for kernel or userspace applications. +config DLM_DEPRECATED_API + bool "DLM deprecated API" + depends on DLM + help + Enables deprecated DLM timeout features that will be removed in + later Linux kernel releases. + + If you are unsure, say N. + config DLM_DEBUG bool "DLM debugging" depends on DLM diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index ca1eca0809d416..c9ec107001153a 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -489,8 +489,17 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_ops_arg = ops_arg; } - if (flags & DLM_LSFL_TIMEWARN) + if (flags & DLM_LSFL_TIMEWARN) { +#ifdef CONFIG_DLM_DEPRECATED_API + pr_warn_once("===============================================================\n" + "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n" + " will be removed in v5.22!\n" + " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n" + "===============================================================\n"); +#endif + set_bit(LSFL_TIMEWARN, &ls->ls_flags); + } /* ls_exflags are forced to match among nodes, and we don't need to require all nodes to have some flags set */ diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 4b2a24a6a15da3..1fccb08bd8259b 100644 --- a/fs/dlm/user.c +++ 
b/fs/dlm/user.c @@ -250,6 +250,14 @@ static int device_user_lock(struct dlm_user_proc *proc, goto out; } +#ifdef CONFIG_DLM_DEPRECATED_API + if (params->timeout) + pr_warn_once("========================================================\n" + "WARNING: the lkb timeout feature is being deprecated and\n" + " will be removed in v5.22!\n" + "========================================================\n"); +#endif + ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS); if (!ua) goto out; From 09248df31b4794e1ba1e3d3d9d8d91804f3e392c Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 22 Jun 2022 14:45:23 -0400 Subject: [PATCH 0149/1250] fs: dlm: don't use deprecated timeout features by default This patch will disable use of deprecated timeout features if CONFIG_DLM_DEPRECATED_API is not set. The deprecated features will be removed in upcoming kernel release v5.22. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/Makefile | 2 +- fs/dlm/config.c | 14 +++++++++++++ fs/dlm/config.h | 2 ++ fs/dlm/dlm_internal.h | 19 +++++++++++++++++- fs/dlm/lock.c | 46 +++++++++++++++++++++++++++++++++++++++++++ fs/dlm/lock.h | 14 +++++++++++++ fs/dlm/lockspace.c | 14 ++++++++++--- fs/dlm/user.c | 12 +++++++++++ 8 files changed, 118 insertions(+), 5 deletions(-) diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile index 3545fdafc6fbbe..71dab733cf9a83 100644 --- a/fs/dlm/Makefile +++ b/fs/dlm/Makefile @@ -9,7 +9,6 @@ dlm-y := ast.o \ member.o \ memory.o \ midcomms.o \ - netlink.o \ lowcomms.o \ plock.o \ rcom.o \ @@ -18,5 +17,6 @@ dlm-y := ast.o \ requestqueue.o \ user.o \ util.o +dlm-$(CONFIG_DLM_DEPRECATED_API) += netlink.o dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 081fd201e3a855..ac8b62106ce0e0 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -75,7 +75,9 @@ struct dlm_cluster { unsigned int cl_log_info; unsigned int cl_protocol; unsigned int cl_mark; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned int cl_timewarn_cs; 
+#endif unsigned int cl_new_rsb_count; unsigned int cl_recover_callbacks; char cl_cluster_name[DLM_LOCKSPACE_LEN]; @@ -101,7 +103,9 @@ enum { CLUSTER_ATTR_LOG_INFO, CLUSTER_ATTR_PROTOCOL, CLUSTER_ATTR_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR_TIMEWARN_CS, +#endif CLUSTER_ATTR_NEW_RSB_COUNT, CLUSTER_ATTR_RECOVER_CALLBACKS, CLUSTER_ATTR_CLUSTER_NAME, @@ -222,7 +226,9 @@ CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); +#ifdef CONFIG_DLM_DEPRECATED_API CLUSTER_ATTR(timewarn_cs, dlm_check_zero); +#endif CLUSTER_ATTR(new_rsb_count, NULL); CLUSTER_ATTR(recover_callbacks, NULL); @@ -237,7 +243,9 @@ static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_LOG_INFO] = &cluster_attr_log_info, [CLUSTER_ATTR_PROTOCOL] = &cluster_attr_protocol, [CLUSTER_ATTR_MARK] = &cluster_attr_mark, +#ifdef CONFIG_DLM_DEPRECATED_API [CLUSTER_ATTR_TIMEWARN_CS] = &cluster_attr_timewarn_cs, +#endif [CLUSTER_ATTR_NEW_RSB_COUNT] = &cluster_attr_new_rsb_count, [CLUSTER_ATTR_RECOVER_CALLBACKS] = &cluster_attr_recover_callbacks, [CLUSTER_ATTR_CLUSTER_NAME] = &cluster_attr_cluster_name, @@ -428,7 +436,9 @@ static struct config_group *make_cluster(struct config_group *g, cl->cl_log_debug = dlm_config.ci_log_debug; cl->cl_log_info = dlm_config.ci_log_info; cl->cl_protocol = dlm_config.ci_protocol; +#ifdef CONFIG_DLM_DEPRECATED_API cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs; +#endif cl->cl_new_rsb_count = dlm_config.ci_new_rsb_count; cl->cl_recover_callbacks = dlm_config.ci_recover_callbacks; memcpy(cl->cl_cluster_name, dlm_config.ci_cluster_name, @@ -949,7 +959,9 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) #define DEFAULT_LOG_INFO 1 #define DEFAULT_PROTOCOL DLM_PROTO_TCP #define DEFAULT_MARK 0 +#ifdef CONFIG_DLM_DEPRECATED_API #define DEFAULT_TIMEWARN_CS 500 /* 5 sec = 500 centiseconds */ +#endif #define DEFAULT_NEW_RSB_COUNT 128 #define DEFAULT_RECOVER_CALLBACKS 0 
#define DEFAULT_CLUSTER_NAME "" @@ -965,7 +977,9 @@ struct dlm_config_info dlm_config = { .ci_log_info = DEFAULT_LOG_INFO, .ci_protocol = DEFAULT_PROTOCOL, .ci_mark = DEFAULT_MARK, +#ifdef CONFIG_DLM_DEPRECATED_API .ci_timewarn_cs = DEFAULT_TIMEWARN_CS, +#endif .ci_new_rsb_count = DEFAULT_NEW_RSB_COUNT, .ci_recover_callbacks = DEFAULT_RECOVER_CALLBACKS, .ci_cluster_name = DEFAULT_CLUSTER_NAME diff --git a/fs/dlm/config.h b/fs/dlm/config.h index cb23d018e863cd..55c5f2c13ebd6d 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -37,7 +37,9 @@ struct dlm_config_info { int ci_log_info; int ci_protocol; int ci_mark; +#ifdef CONFIG_DLM_DEPRECATED_API int ci_timewarn_cs; +#endif int ci_new_rsb_count; int ci_recover_callbacks; char ci_cluster_name[DLM_LOCKSPACE_LEN]; diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 84dad619081ee1..8aca8085d24e72 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -145,7 +145,9 @@ struct dlm_args { void (*bastfn) (void *astparam, int mode); int mode; struct dlm_lksb *lksb; +#ifdef CONFIG_DLM_DEPRECATED_API unsigned long timeout; +#endif }; @@ -203,8 +205,10 @@ struct dlm_args { #define DLM_IFL_OVERLAP_UNLOCK 0x00080000 #define DLM_IFL_OVERLAP_CANCEL 0x00100000 #define DLM_IFL_ENDOFLIFE 0x00200000 +#ifdef CONFIG_DLM_DEPRECATED_API #define DLM_IFL_WATCH_TIMEWARN 0x00400000 #define DLM_IFL_TIMEOUT_CANCEL 0x00800000 +#endif #define DLM_IFL_DEADLOCK_CANCEL 0x01000000 #define DLM_IFL_STUB_MS 0x02000000 /* magic number for m_flags */ /* least significant 2 bytes are message changed, they are full transmitted @@ -257,9 +261,12 @@ struct dlm_lkb { struct list_head lkb_rsb_lookup; /* waiting for rsb lookup */ struct list_head lkb_wait_reply; /* waiting for remote reply */ struct list_head lkb_ownqueue; /* list of locks for a process */ - struct list_head lkb_time_list; ktime_t lkb_timestamp; + +#ifdef CONFIG_DLM_DEPRECATED_API + struct list_head lkb_time_list; unsigned long lkb_timeout_cs; +#endif struct mutex 
lkb_cb_mutex; struct work_struct lkb_cb_work; @@ -575,8 +582,10 @@ struct dlm_ls { struct mutex ls_orphans_mutex; struct list_head ls_orphans; +#ifdef CONFIG_DLM_DEPRECATED_API struct mutex ls_timeout_mutex; struct list_head ls_timeout; +#endif spinlock_t ls_new_rsb_spin; int ls_new_rsb_count; @@ -695,7 +704,9 @@ struct dlm_ls { #define LSFL_RCOM_READY 5 #define LSFL_RCOM_WAIT 6 #define LSFL_UEVENT_WAIT 7 +#ifdef CONFIG_DLM_DEPRECATED_API #define LSFL_TIMEWARN 8 +#endif #define LSFL_CB_DELAY 9 #define LSFL_NODIR 10 @@ -748,9 +759,15 @@ static inline int dlm_no_directory(struct dlm_ls *ls) return test_bit(LSFL_NODIR, &ls->ls_flags); } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_netlink_init(void); void dlm_netlink_exit(void); void dlm_timeout_warn(struct dlm_lkb *lkb); +#else +static inline int dlm_netlink_init(void) { return 0; } +static inline void dlm_netlink_exit(void) { }; +static inline void dlm_timeout_warn(struct dlm_lkb *lkb) { }; +#endif int dlm_plock_init(void); void dlm_plock_exit(void); diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 12d4cc74230851..739f09d0951c7b 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -296,12 +296,14 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb);); +#ifdef CONFIG_DLM_DEPRECATED_API /* if the operation was a cancel, then return -DLM_ECANCEL, if a timeout caused the cancel then return -ETIMEDOUT */ if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL; rv = -ETIMEDOUT; } +#endif if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) { lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL; @@ -1210,7 +1212,9 @@ static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret, kref_init(&lkb->lkb_ref); INIT_LIST_HEAD(&lkb->lkb_ownqueue); INIT_LIST_HEAD(&lkb->lkb_rsb_lookup); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&lkb->lkb_time_list); +#endif INIT_LIST_HEAD(&lkb->lkb_cb_list); 
mutex_init(&lkb->lkb_cb_mutex); INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work); @@ -1772,6 +1776,7 @@ void dlm_scan_rsbs(struct dlm_ls *ls) } } +#ifdef CONFIG_DLM_DEPRECATED_API static void add_timeout(struct dlm_lkb *lkb) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; @@ -1893,6 +1898,10 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); } +#else +static void add_timeout(struct dlm_lkb *lkb) { } +static void del_timeout(struct dlm_lkb *lkb) { } +#endif /* lkb is master or local copy */ @@ -2757,12 +2766,20 @@ static void confirm_master(struct dlm_rsb *r, int error) } } +#ifdef CONFIG_DLM_DEPRECATED_API static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, int namelen, unsigned long timeout_cs, void (*ast) (void *astparam), void *astparam, void (*bast) (void *astparam, int mode), struct dlm_args *args) +#else +static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, + int namelen, void (*ast)(void *astparam), + void *astparam, + void (*bast)(void *astparam, int mode), + struct dlm_args *args) +#endif { int rv = -EINVAL; @@ -2815,7 +2832,9 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, args->astfn = ast; args->astparam = astparam; args->bastfn = bast; +#ifdef CONFIG_DLM_DEPRECATED_API args->timeout = timeout_cs; +#endif args->mode = mode; args->lksb = lksb; rv = 0; @@ -2871,7 +2890,9 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, lkb->lkb_lksb = args->lksb; lkb->lkb_lvbptr = args->lksb->sb_lvbptr; lkb->lkb_ownpid = (int) current->pid; +#ifdef CONFIG_DLM_DEPRECATED_API lkb->lkb_timeout_cs = args->timeout; +#endif rv = 0; out: if (rv) @@ -3394,8 +3415,13 @@ int dlm_lock(dlm_lockspace_t *lockspace, trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags); +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, lksb, flags, namelen, 0, ast, astarg, bast, &args); +#else + 
error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast, + &args); +#endif if (error) goto out_put; @@ -5759,9 +5785,14 @@ int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) return 0; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs) +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, + int mode, uint32_t flags, void *name, unsigned int namelen) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5784,8 +5815,13 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, goto out; } } +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) { kfree(ua->lksb.sb_lvbptr); ua->lksb.sb_lvbptr = NULL; @@ -5824,9 +5860,14 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, return error; } +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs) +#else +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in) +#endif { struct dlm_lkb *lkb; struct dlm_args args; @@ -5861,8 +5902,13 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, ua->bastaddr = ua_tmp->bastaddr; ua->user_lksb = ua_tmp->user_lksb; +#ifdef CONFIG_DLM_DEPRECATED_API error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs, fake_astfn, ua, fake_bastfn, &args); +#else + error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua, + fake_bastfn, &args); +#endif if (error) goto out_put; diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h index 8c99e1b6eefa92..a7b6474f009dcd 100644 
--- a/fs/dlm/lock.h +++ b/fs/dlm/lock.h @@ -24,8 +24,15 @@ int dlm_put_lkb(struct dlm_lkb *lkb); void dlm_scan_rsbs(struct dlm_ls *ls); int dlm_lock_recovery_try(struct dlm_ls *ls); void dlm_unlock_recovery(struct dlm_ls *ls); + +#ifdef CONFIG_DLM_DEPRECATED_API void dlm_scan_timeout(struct dlm_ls *ls); void dlm_adjust_timeouts(struct dlm_ls *ls); +#else +static inline void dlm_scan_timeout(struct dlm_ls *ls) { } +static inline void dlm_adjust_timeouts(struct dlm_ls *ls) { } +#endif + int dlm_master_lookup(struct dlm_ls *ls, int nodeid, char *name, int len, unsigned int flags, int *r_nodeid, int *result); @@ -40,12 +47,19 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls); int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); +#ifdef CONFIG_DLM_DEPRECATED_API int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, uint32_t flags, void *name, unsigned int namelen, unsigned long timeout_cs); int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, uint32_t lkid, char *lvb_in, unsigned long timeout_cs); +#else +int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua, int mode, + uint32_t flags, void *name, unsigned int namelen); +int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, + int mode, uint32_t flags, uint32_t lkid, char *lvb_in); +#endif int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp, int mode, uint32_t flags, void *name, unsigned int namelen, uint32_t *lkid); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index c9ec107001153a..6e449abdc5f4d5 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -489,22 +489,28 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_ops_arg = ops_arg; } - if (flags & DLM_LSFL_TIMEWARN) { #ifdef CONFIG_DLM_DEPRECATED_API + if (flags & DLM_LSFL_TIMEWARN) { 
pr_warn_once("===============================================================\n" "WARNING: the dlm DLM_LSFL_TIMEWARN flag is being deprecated and\n" " will be removed in v5.22!\n" " Inclusive DLM_LSFL_TIMEWARN define in UAPI header!\n" "===============================================================\n"); -#endif set_bit(LSFL_TIMEWARN, &ls->ls_flags); } /* ls_exflags are forced to match among nodes, and we don't - need to require all nodes to have some flags set */ + * need to require all nodes to have some flags set + */ ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#else + /* ls_exflags are forced to match among nodes, and we don't + * need to require all nodes to have some flags set + */ + ls->ls_exflags = (flags & ~(DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); +#endif size = READ_ONCE(dlm_config.ci_rsbtbl_size); ls->ls_rsbtbl_size = size; @@ -535,8 +541,10 @@ static int new_lockspace(const char *name, const char *cluster, mutex_init(&ls->ls_waiters_mutex); INIT_LIST_HEAD(&ls->ls_orphans); mutex_init(&ls->ls_orphans_mutex); +#ifdef CONFIG_DLM_DEPRECATED_API INIT_LIST_HEAD(&ls->ls_timeout); mutex_init(&ls->ls_timeout_mutex); +#endif INIT_LIST_HEAD(&ls->ls_new_rsb); spin_lock_init(&ls->ls_new_rsb_spin); diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1fccb08bd8259b..999918348b3171 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -270,10 +270,16 @@ static int device_user_lock(struct dlm_user_proc *proc, ua->xid = params->xid; if (params->flags & DLM_LKF_CONVERT) { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_convert(ls, ua, params->mode, params->flags, params->lkid, params->lvb, (unsigned long) params->timeout); +#else + error = dlm_user_convert(ls, ua, + params->mode, params->flags, + params->lkid, params->lvb); +#endif } else if (params->flags & DLM_LKF_ORPHAN) { error = dlm_user_adopt_orphan(ls, ua, params->mode, params->flags, @@ -282,10 +288,16 @@ static int device_user_lock(struct dlm_user_proc *proc, if (!error) error = lkid; 
} else { +#ifdef CONFIG_DLM_DEPRECATED_API error = dlm_user_request(ls, ua, params->mode, params->flags, params->name, params->namelen, (unsigned long) params->timeout); +#else + error = dlm_user_request(ls, ua, + params->mode, params->flags, + params->name, params->namelen); +#endif if (!error) error = ua->lksb.sb_lkid; } From 0f24debbb84758c45ad34bd6f7f0665f632bd66f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 23 Jun 2022 21:28:58 -0400 Subject: [PATCH 0150/1250] fs: dlm: move kref_put assert for lkb structs The unhold_lkb() function decrements the lock's kref, and asserts that the ref count was not the final one. Use the kref_put release function (which should not be called) to call the assert, rather than doing the assert based on the kref_put return value. Using kill_lkb() as the release function doesn't make sense if we only want to assert. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lock.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 739f09d0951c7b..dac7eb75dba95c 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1310,6 +1310,13 @@ static inline void hold_lkb(struct dlm_lkb *lkb) kref_get(&lkb->lkb_ref); } +static void unhold_lkb_assert(struct kref *kref) +{ + struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref); + + DLM_ASSERT(false, dlm_print_lkb(lkb);); +} + /* This is called when we need to remove a reference and are certain it's not the last ref. e.g. del_lkb is always called between a find_lkb/put_lkb and is always the inverse of a previous add_lkb. 
@@ -1317,9 +1324,7 @@ static inline void hold_lkb(struct dlm_lkb *lkb) static inline void unhold_lkb(struct dlm_lkb *lkb) { - int rv; - rv = kref_put(&lkb->lkb_ref, kill_lkb); - DLM_ASSERT(!rv, dlm_print_lkb(lkb);); + kref_put(&lkb->lkb_ref, unhold_lkb_assert); } static void lkb_add_ordered(struct list_head *new, struct list_head *head, From 148ec6305a54f3922e62b83a166d3344b4cebb70 Mon Sep 17 00:00:00 2001 From: Jilin Yuan Date: Sun, 8 May 2022 11:02:24 +0800 Subject: [PATCH 0151/1250] ARC: Fix comment typo - Remove one of the repeated 'call' in comment line 396. - Delete the redundant word 'to', 'since' Signed-off-by: Jilin Yuan Signed-off-by: Vineet Gupta --- arch/arc/kernel/smp.c | 2 +- arch/arc/mm/cache.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index d947473f1e6da5..6c22a53711e9bc 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -393,7 +393,7 @@ irqreturn_t do_IPI(int irq, void *dev_id) * API called by platform code to hookup arch-common ISR to their IPI IRQ * * Note: If IPI is provided by platform (vs. say ARC MCIP), their intc setup/map - * function needs to call call irq_set_percpu_devid() for IPI IRQ, otherwise + * function needs to call irq_set_percpu_devid() for IPI IRQ, otherwise * request_percpu_irq() below will fail */ static DEFINE_PER_CPU(int, ipi_dev); diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 5446967ea98d3c..55c6de138eae03 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -750,7 +750,7 @@ static inline void arc_slc_enable(void) * -In SMP, if hardware caches are coherent * * There's a corollary case, where kernel READs from a userspace mapped page. - * If the U-mapping is not congruent to to K-mapping, former needs flushing. + * If the U-mapping is not congruent to K-mapping, former needs flushing. 
*/ void flush_dcache_page(struct page *page) { @@ -910,7 +910,7 @@ EXPORT_SYMBOL(flush_icache_range); * @vaddr is typically user vaddr (breakpoint) or kernel vaddr (vmalloc) * However in one instance, when called by kprobe (for a breakpt in * builtin kernel code) @vaddr will be paddr only, meaning CDU operation will - * use a paddr to index the cache (despite VIPT). This is fine since since a + * use a paddr to index the cache (despite VIPT). This is fine since a * builtin kernel page will not have any virtual mappings. * kprobe on loadable module will be kernel vaddr. */ From d6448b8a00546162e93a9ac1dd5070cce854865c Mon Sep 17 00:00:00 2001 From: Zhang Jiaming Date: Thu, 23 Jun 2022 15:46:23 +0800 Subject: [PATCH 0152/1250] ARC: Fix comment typo Change 'seperate' to 'separate'. Signed-off-by: Zhang Jiaming Signed-off-by: Vineet Gupta --- arch/arc/include/asm/entry-compact.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/include/asm/entry-compact.h b/arch/arc/include/asm/entry-compact.h index 5aab4f93ab8aa4..67ff06e15ceaf4 100644 --- a/arch/arc/include/asm/entry-compact.h +++ b/arch/arc/include/asm/entry-compact.h @@ -21,7 +21,7 @@ * r25 contains the kernel current task ptr * - Defined Stack Switching Macro to be reused in all intr/excp hdlrs * - Shaved off 11 instructions from RESTORE_ALL_INT1 by using the - * address Write back load ld.ab instead of seperate ld/add instn + * address Write back load ld.ab instead of separate ld/add instn * * Amit Bhor, Sameer Dhavale: Codito Technologies 2004 */ From 1163fb3b6a308ffeee0b06328b9673ae3ce48aa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Amadeusz=20S=C5=82awi=C5=84ski?= Date: Fri, 27 May 2022 13:53:43 +0200 Subject: [PATCH 0153/1250] ARC: bitops: Change __fls to return unsigned long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As per asm-generic definition and other architectures __fls should return unsigned long. 
No functional change is expected as return value should fit in unsigned long. Reviewed-by: Cezary Rojewski Signed-off-by: Amadeusz Sławiński Signed-off-by: Vineet Gupta --- arch/arc/include/asm/bitops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index bdb7e190a294e7..f5a936496f0600 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -82,7 +82,7 @@ static inline __attribute__ ((const)) int fls(unsigned int x) /* * __fls: Similar to fls, but zero based (0-31) */ -static inline __attribute__ ((const)) int __fls(unsigned long x) +static inline __attribute__ ((const)) unsigned long __fls(unsigned long x) { if (!x) return 0; @@ -131,7 +131,7 @@ static inline __attribute__ ((const)) int fls(unsigned int x) /* * __fls: Similar to fls, but zero based (0-31). Also 0 if no bit set */ -static inline __attribute__ ((const)) int __fls(unsigned long x) +static inline __attribute__ ((const)) unsigned long __fls(unsigned long x) { /* FLS insn has exactly same semantics as the API */ return __builtin_arc_fls(x); From 952deecb065e23784fd6fcab3020de397d51d296 Mon Sep 17 00:00:00 2001 From: Serge Semin Date: Fri, 24 Jun 2022 17:16:17 +0300 Subject: [PATCH 0154/1250] arc: dts: Harmonize EHCI/OHCI DT nodes name In accordance with the Generic EHCI/OHCI bindings the corresponding node name is supposed to comply with the Generic USB HCD DT schema, which requires the USB nodes to have the name acceptable by the regexp: "^usb(@.*)?". Make sure the "generic-ehci" and "generic-ohci"-compatible nodes are correctly named. 
Signed-off-by: Serge Semin Acked-by: Alexey Brodkin Acked-by: Krzysztof Kozlowski Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/axc003.dtsi | 4 ++-- arch/arc/boot/dts/axc003_idu.dtsi | 4 ++-- arch/arc/boot/dts/axs10x_mb.dtsi | 4 ++-- arch/arc/boot/dts/hsdk.dts | 4 ++-- arch/arc/boot/dts/vdk_axs10x_mb.dtsi | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index cd1edcf4f95efe..3434c8131ecd54 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -103,11 +103,11 @@ dma-coherent; }; - ehci@40000 { + usb@40000 { dma-coherent; }; - ohci@60000 { + usb@60000 { dma-coherent; }; diff --git a/arch/arc/boot/dts/axc003_idu.dtsi b/arch/arc/boot/dts/axc003_idu.dtsi index 70779386ca7963..67556f4b70574e 100644 --- a/arch/arc/boot/dts/axc003_idu.dtsi +++ b/arch/arc/boot/dts/axc003_idu.dtsi @@ -110,11 +110,11 @@ dma-coherent; }; - ehci@40000 { + usb@40000 { dma-coherent; }; - ohci@60000 { + usb@60000 { dma-coherent; }; diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi index 99d3e7175bf70a..b6443538530495 100644 --- a/arch/arc/boot/dts/axs10x_mb.dtsi +++ b/arch/arc/boot/dts/axs10x_mb.dtsi @@ -87,13 +87,13 @@ mac-address = [00 00 00 00 00 00]; /* Filled in by U-Boot */ }; - ehci@40000 { + usb@40000 { compatible = "generic-ehci"; reg = < 0x40000 0x100 >; interrupts = < 8 >; }; - ohci@60000 { + usb@60000 { compatible = "generic-ohci"; reg = < 0x60000 0x100 >; interrupts = < 8 >; diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index f48ba03e9b5e7d..6691f425507788 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -234,7 +234,7 @@ }; }; - ohci@60000 { + usb@60000 { compatible = "snps,hsdk-v1.0-ohci", "generic-ohci"; reg = <0x60000 0x100>; interrupts = <15>; @@ -242,7 +242,7 @@ dma-coherent; }; - ehci@40000 { + usb@40000 { compatible = "snps,hsdk-v1.0-ehci", "generic-ehci"; reg = <0x40000 0x100>; 
interrupts = <15>; diff --git a/arch/arc/boot/dts/vdk_axs10x_mb.dtsi b/arch/arc/boot/dts/vdk_axs10x_mb.dtsi index cbb179770293e7..90a412026e6433 100644 --- a/arch/arc/boot/dts/vdk_axs10x_mb.dtsi +++ b/arch/arc/boot/dts/vdk_axs10x_mb.dtsi @@ -46,7 +46,7 @@ clock-names = "stmmaceth"; }; - ehci@40000 { + usb@40000 { compatible = "generic-ehci"; reg = < 0x40000 0x100 >; interrupts = < 8 >; From 750a914adb7ded3669b47daed6d9d749110b4107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Fri, 17 Jun 2022 21:39:06 +0100 Subject: [PATCH 0155/1250] media: mediatek: vcodec: Drop platform_get_resource(IORESOURCE_IRQ) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a1a2b7125e10 ("of/platform: Drop static setup of IRQ resource from DT core") removed support for calling platform_get_resource(..., IORESOURCE_IRQ, ...) on DT-based drivers, but the probe() function of mtk-vcodec's encoder was still making use of it. This caused the encoder driver to fail probe. Since the platform_get_resource() call was only being used to check for the presence of the interrupt (its returned resource wasn't even used) and platform_get_irq() was already being used to get the IRQ, simply drop the use of platform_get_resource(IORESOURCE_IRQ) and handle the failure of platform_get_irq(), to get the driver probing again. [hverkuil: drop unused struct resource *res] Fixes: a1a2b7125e10 ("of/platform: Drop static setup of IRQ resource from DT core") Signed-off-by: Nícolas F. R. A. 
Prado Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- .../media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c b/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c index 95e8c29ccc6511..d2f5f30582a9ce 100644 --- a/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c +++ b/drivers/media/platform/mediatek/vcodec/mtk_vcodec_enc_drv.c @@ -228,7 +228,6 @@ static int mtk_vcodec_probe(struct platform_device *pdev) { struct mtk_vcodec_dev *dev; struct video_device *vfd_enc; - struct resource *res; phandle rproc_phandle; enum mtk_vcodec_fw_type fw_type; int ret; @@ -272,14 +271,12 @@ static int mtk_vcodec_probe(struct platform_device *pdev) goto err_res; } - res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (res == NULL) { - dev_err(&pdev->dev, "failed to get irq resource"); - ret = -ENOENT; + dev->enc_irq = platform_get_irq(pdev, 0); + if (dev->enc_irq < 0) { + ret = dev->enc_irq; goto err_res; } - dev->enc_irq = platform_get_irq(pdev, 0); irq_set_status_flags(dev->enc_irq, IRQ_NOAUTOEN); ret = devm_request_irq(&pdev->dev, dev->enc_irq, mtk_vcodec_enc_irq_handler, From b8f6770624f699d3ead20b9589fb849665f26ded Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Thu, 23 Jun 2022 18:07:35 +0200 Subject: [PATCH 0156/1250] vdpa: Add suspend operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This operation is optional: If it's not implemented, the backend feature bit will not be exposed. Signed-off-by: Eugenio Pérez Message-Id: <20220623160738.632852-2-eperezma@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- include/linux/vdpa.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 7b4a13d3bd9190..d282f464d2f1a7 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -218,6 +218,9 @@ struct vdpa_map_file { * @reset: Reset device * @vdev: vdpa device * Returns integer: success (0) or error (< 0) + * @suspend: Suspend or resume the device (optional) + * @vdev: vdpa device + * Returns integer: success (0) or error (< 0) * @get_config_size: Get the size of the configuration space includes * fields that are conditional on feature bits. * @vdev: vdpa device @@ -319,6 +322,7 @@ struct vdpa_config_ops { u8 (*get_status)(struct vdpa_device *vdev); void (*set_status)(struct vdpa_device *vdev, u8 status); int (*reset)(struct vdpa_device *vdev); + int (*suspend)(struct vdpa_device *vdev); size_t (*get_config_size)(struct vdpa_device *vdev); void (*get_config)(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len); From 94ab17d00bbfe73d05477ca75346196e8bd831db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Thu, 23 Jun 2022 18:07:36 +0200 Subject: [PATCH 0157/1250] vhost-vdpa: introduce SUSPEND backend feature bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userland knows if it can suspend the device or not by checking this feature bit. It's only offered if the vdpa driver backend implements the suspend() operation callback, and to offer it or userland to ack it if the backend does not offer that callback is an error. Signed-off-by: Eugenio Pérez Message-Id: <20220623160738.632852-3-eperezma@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/vdpa.c | 16 +++++++++++++++- include/uapi/linux/vhost_types.h | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 23dcbfdfa13b19..3d636e1920614f 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -347,6 +347,14 @@ static long vhost_vdpa_set_config(struct vhost_vdpa *v, return 0; } +static bool vhost_vdpa_can_suspend(const struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + return ops->suspend; +} + static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) { struct vdpa_device *vdpa = v->vdpa; @@ -577,7 +585,11 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, if (cmd == VHOST_SET_BACKEND_FEATURES) { if (copy_from_user(&features, featurep, sizeof(features))) return -EFAULT; - if (features & ~VHOST_VDPA_BACKEND_FEATURES) + if (features & ~(VHOST_VDPA_BACKEND_FEATURES | + BIT_ULL(VHOST_BACKEND_F_SUSPEND))) + return -EOPNOTSUPP; + if ((features & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) && + !vhost_vdpa_can_suspend(v)) return -EOPNOTSUPP; vhost_set_backend_features(&v->vdev, features); return 0; @@ -628,6 +640,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, break; case VHOST_GET_BACKEND_FEATURES: features = VHOST_VDPA_BACKEND_FEATURES; + if (vhost_vdpa_can_suspend(v)) + features |= BIT_ULL(VHOST_BACKEND_F_SUSPEND); if (copy_to_user(featurep, &features, sizeof(features))) r = -EFAULT; break; diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 634cee485abbcf..1bdd6e363f4c9f 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -161,5 +161,7 @@ struct vhost_vdpa_iova_range { * message */ #define VHOST_BACKEND_F_IOTLB_ASID 0x3 +/* Device can be suspended */ +#define VHOST_BACKEND_F_SUSPEND 0x4 #endif From 3b3bcc55308cfd47a5c78b2176b92223046b1904 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Thu, 23 Jun 2022 18:07:37 +0200 Subject: [PATCH 0158/1250] vhost-vdpa: uAPI to suspend the device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The ioctl adds support for suspending the device from userspace. This is a must before getting virtqueue indexes (base) for live migration, since the device could modify them after userland gets them. There are individual ways to perform that action for some devices (VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING, ...) but there was no way to perform it for any vhost device (and, in particular, vhost-vdpa). After a successful return of the ioctl call the device must not process more virtqueue descriptors. The device can answer to read or writes of config fields as if it were not suspended. In particular, writing to "queue_enable" with a value of 1 will not make the device start processing buffers of the virtqueue. Signed-off-by: Eugenio Pérez Message-Id: <20220623160738.632852-4-eperezma@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vdpa.c | 19 +++++++++++++++++++ include/uapi/linux/vhost.h | 14 ++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 3d636e1920614f..7fa671ac4bdfc6 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -478,6 +478,22 @@ static long vhost_vdpa_get_vqs_count(struct vhost_vdpa *v, u32 __user *argp) return 0; } +/* After a successful return of ioctl the device must not process more + * virtqueue descriptors. The device can answer to read or writes of config + * fields as if it were not suspended. In particular, writing to "queue_enable" + * with a value of 1 will not make the device start processing buffers. 
+ */ +static long vhost_vdpa_suspend(struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + if (!ops->suspend) + return -EOPNOTSUPP; + + return ops->suspend(vdpa); +} + static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, void __user *argp) { @@ -654,6 +670,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, case VHOST_VDPA_GET_VQS_COUNT: r = vhost_vdpa_get_vqs_count(v, argp); break; + case VHOST_VDPA_SUSPEND: + r = vhost_vdpa_suspend(v); + break; default: r = vhost_dev_ioctl(&v->vdev, cmd, argp); if (r == -ENOIOCTLCMD) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index cab645d4a64555..6d9f4516315571 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -171,4 +171,18 @@ #define VHOST_VDPA_SET_GROUP_ASID _IOW(VHOST_VIRTIO, 0x7C, \ struct vhost_vring_state) +/* Suspend or resume a device so it does not process virtqueue requests anymore + * + * After the return of ioctl with suspend != 0, the device must finish any + * pending operations like in flight requests. It must also preserve all the + * necessary state (the virtqueue vring base plus the possible device specific + * states) that is required for restoring in the future. The device must not + * change its configuration after that point. + * + * After the return of ioctl with suspend == 0, the device can continue + * processing buffers as long as typical conditions are met (vq is enabled, + * DRIVER_OK status bit is enabled, etc). 
+ */ +#define VHOST_VDPA_SUSPEND _IOW(VHOST_VIRTIO, 0x7D, int) + #endif From 6bae751ff31d158cfcd3b583e4caaa3477ac22ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= Date: Thu, 23 Jun 2022 18:07:38 +0200 Subject: [PATCH 0159/1250] vdpa_sim: Implement suspend vdpa op MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the suspend operation for vdpa_sim devices, so vhost-vdpa will offer that backend feature and userspace can effectively suspend the device. This is a must before getting virtqueue indexes (base) for live migration, since the device could modify them after userland gets them. There are individual ways to perform that action for some devices (VHOST_NET_SET_BACKEND, VHOST_VSOCK_SET_RUNNING, ...) but there was no way to perform it for any vhost device (and, in particular, vhost-vdpa). Reviewed-by: Stefano Garzarella Signed-off-by: Eugenio Pérez Message-Id: <20220623160738.632852-5-eperezma@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 21 +++++++++++++++++++++ drivers/vdpa/vdpa_sim/vdpa_sim.h | 1 + drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 3 +++ drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 3 +++ 4 files changed, 28 insertions(+) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 0f28658996472b..213883487f9b49 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -107,6 +107,7 @@ static void vdpasim_do_reset(struct vdpasim *vdpasim) for (i = 0; i < vdpasim->dev_attr.nas; i++) vhost_iotlb_reset(&vdpasim->iommu[i]); + vdpasim->running = true; spin_unlock(&vdpasim->iommu_lock); vdpasim->features = 0; @@ -505,6 +506,24 @@ static int vdpasim_reset(struct vdpa_device *vdpa) return 0; } +static int vdpasim_suspend(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + int i; + + spin_lock(&vdpasim->lock); + vdpasim->running = false; + if (vdpasim->running) { + /* Check for missed buffers */ + for (i = 0; i < vdpasim->dev_attr.nvqs; ++i) + vdpasim_kick_vq(vdpa, i); + + } + spin_unlock(&vdpasim->lock); + + return 0; +} + static size_t vdpasim_get_config_size(struct vdpa_device *vdpa) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); @@ -694,6 +713,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, .reset = vdpasim_reset, + .suspend = vdpasim_suspend, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, @@ -726,6 +746,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, .reset = vdpasim_reset, + .suspend = vdpasim_suspend, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 622782e922391c..061986f30911a7 100644 --- 
a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -66,6 +66,7 @@ struct vdpasim { u32 generation; u64 features; u32 groups; + bool running; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index 42d401d4391171..bcdb1982c378ea 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -204,6 +204,9 @@ static void vdpasim_blk_work(struct work_struct *work) if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) goto out; + if (!vdpasim->running) + goto out; + for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) { struct vdpasim_virtqueue *vq = &vdpasim->vqs[i]; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index 5125976a4df87c..886449e885026a 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -154,6 +154,9 @@ static void vdpasim_net_work(struct work_struct *work) spin_lock(&vdpasim->lock); + if (!vdpasim->running) + goto out; + if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) goto out; From 83ada2237e639518274503c2858ce19a3d60e644 Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 24 Jun 2022 10:55:41 +0800 Subject: [PATCH 0160/1250] remoteproc: rename len of rpoc_vring to num Rename the member len in the structure rproc_vring to num. And remove 'in bytes' from the comment of it. This is misleading. Because this actually refers to the size of the virtio vring to be created. The unit is not bytes. Signed-off-by: Xuan Zhuo Message-Id: <20220624025621.128843-2-xuanzhuo@linux.alibaba.com> Signed-off-by: Michael S.
Tsirkin --- drivers/remoteproc/remoteproc_core.c | 4 ++-- drivers/remoteproc/remoteproc_virtio.c | 10 +++++----- include/linux/remoteproc.h | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 02a04ab34a2308..2d2f3bab588852 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -334,7 +334,7 @@ int rproc_alloc_vring(struct rproc_vdev *rvdev, int i) size_t size; /* actual size of vring (in bytes) */ - size = PAGE_ALIGN(vring_size(rvring->len, rvring->align)); + size = PAGE_ALIGN(vring_size(rvring->num, rvring->align)); rsc = (void *)rproc->table_ptr + rvdev->rsc_offset; @@ -401,7 +401,7 @@ rproc_parse_vring(struct rproc_vdev *rvdev, struct fw_rsc_vdev *rsc, int i) return -EINVAL; } - rvring->len = vring->num; + rvring->num = vring->num; rvring->align = vring->align; rvring->rvdev = rvdev; diff --git a/drivers/remoteproc/remoteproc_virtio.c b/drivers/remoteproc/remoteproc_virtio.c index 70ab496d0431c5..d43d74733f0a6b 100644 --- a/drivers/remoteproc/remoteproc_virtio.c +++ b/drivers/remoteproc/remoteproc_virtio.c @@ -87,7 +87,7 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev, struct fw_rsc_vdev *rsc; struct virtqueue *vq; void *addr; - int len, size; + int num, size; /* we're temporarily limited to two virtqueues per rvdev */ if (id >= ARRAY_SIZE(rvdev->vring)) @@ -104,20 +104,20 @@ static struct virtqueue *rp_find_vq(struct virtio_device *vdev, rvring = &rvdev->vring[id]; addr = mem->va; - len = rvring->len; + num = rvring->num; /* zero vring */ - size = vring_size(len, rvring->align); + size = vring_size(num, rvring->align); memset(addr, 0, size); dev_dbg(dev, "vring%d: va %pK qsz %d notifyid %d\n", - id, addr, len, rvring->notifyid); + id, addr, num, rvring->notifyid); /* * Create the new vq, and tell virtio we're not interested in * the 'weak' smp barriers, since we're talking with a real device. 
*/ - vq = vring_new_virtqueue(id, len, rvring->align, vdev, false, ctx, + vq = vring_new_virtqueue(id, num, rvring->align, vdev, false, ctx, addr, rproc_virtio_notify, callback, name); if (!vq) { dev_err(dev, "vring_new_virtqueue %s failed\n", name); diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index 7c943f0a2fc40a..aea79c77db0ff2 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -597,7 +597,7 @@ struct rproc_subdev { /** * struct rproc_vring - remoteproc vring state * @va: virtual address - * @len: length, in bytes + * @num: vring size * @da: device address * @align: vring alignment * @notifyid: rproc-specific unique vring index @@ -606,7 +606,7 @@ struct rproc_subdev { */ struct rproc_vring { void *va; - int len; + int num; u32 da; u32 align; int notifyid; From 55bd341385a570e446a3c918803bea1269904d1b Mon Sep 17 00:00:00 2001 From: Xuan Zhuo Date: Fri, 24 Jun 2022 10:55:45 +0800 Subject: [PATCH 0161/1250] virtio_ring: remove the arg vq of vring_alloc_desc_extra() The parameter vq of vring_alloc_desc_extra() is useless. This patch removes this parameter. Subsequent patches will call this function to avoid passing useless arguments. Signed-off-by: Xuan Zhuo Acked-by: Jason Wang Message-Id: <20220624025621.128843-6-xuanzhuo@linux.alibaba.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/virtio/virtio_ring.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 643ca779fcc635..a5ec724c01d8cd 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -1637,8 +1637,7 @@ static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) return NULL; } -static struct vring_desc_extra *vring_alloc_desc_extra(struct vring_virtqueue *vq, - unsigned int num) +static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) { struct vring_desc_extra *desc_extra; unsigned int i; @@ -1759,7 +1758,7 @@ static struct virtqueue *vring_create_virtqueue_packed( /* Put everything in free lists. */ vq->free_head = 0; - vq->packed.desc_extra = vring_alloc_desc_extra(vq, num); + vq->packed.desc_extra = vring_alloc_desc_extra(num); if (!vq->packed.desc_extra) goto err_desc_extra; @@ -2248,7 +2247,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, if (!vq->split.desc_state) goto err_state; - vq->split.desc_extra = vring_alloc_desc_extra(vq, vring.num); + vq->split.desc_extra = vring_alloc_desc_extra(vring.num); if (!vq->split.desc_extra) goto err_extra; From fc356a71565bbc35c6e188e2c15c910fa3a2e261 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Fri, 24 Jun 2022 09:56:56 +0200 Subject: [PATCH 0162/1250] vringh: iterate on iotlb_translate to handle large translations iotlb_translate() can return -ENOBUFS if the bio_vec is not big enough to contain all the ranges for translation. This can happen for example if the VMM maps a large bounce buffer, without using hugepages, that requires more than 16 ranges to translate the addresses. To handle this case, let's extend iotlb_translate() to also return the number of bytes successfully translated. In copy_from_iotlb()/copy_to_iotlb() loops by calling iotlb_translate() several times until we complete the translation. 
Signed-off-by: Stefano Garzarella Message-Id: <20220624075656.13997-1-sgarzare@redhat.com> Signed-off-by: Michael S. Tsirkin --- drivers/vhost/vringh.c | 78 ++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 22 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index eab55accf381f8..11f59dd06a74e1 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -1095,7 +1095,8 @@ EXPORT_SYMBOL(vringh_need_notify_kern); #if IS_REACHABLE(CONFIG_VHOST_IOTLB) static int iotlb_translate(const struct vringh *vrh, - u64 addr, u64 len, struct bio_vec iov[], + u64 addr, u64 len, u64 *translated, + struct bio_vec iov[], int iov_size, u32 perm) { struct vhost_iotlb_map *map; @@ -1136,43 +1137,76 @@ static int iotlb_translate(const struct vringh *vrh, spin_unlock(vrh->iotlb_lock); + if (translated) + *translated = min(len, s); + return ret; } static inline int copy_from_iotlb(const struct vringh *vrh, void *dst, void *src, size_t len) { - struct iov_iter iter; - struct bio_vec iov[16]; - int ret; + u64 total_translated = 0; - ret = iotlb_translate(vrh, (u64)(uintptr_t)src, - len, iov, 16, VHOST_MAP_RO); - if (ret < 0) - return ret; + while (total_translated < len) { + struct bio_vec iov[16]; + struct iov_iter iter; + u64 translated; + int ret; - iov_iter_bvec(&iter, READ, iov, ret, len); + ret = iotlb_translate(vrh, (u64)(uintptr_t)src, + len - total_translated, &translated, + iov, ARRAY_SIZE(iov), VHOST_MAP_RO); + if (ret == -ENOBUFS) + ret = ARRAY_SIZE(iov); + else if (ret < 0) + return ret; - ret = copy_from_iter(dst, len, &iter); + iov_iter_bvec(&iter, READ, iov, ret, translated); - return ret; + ret = copy_from_iter(dst, translated, &iter); + if (ret < 0) + return ret; + + src += translated; + dst += translated; + total_translated += translated; + } + + return total_translated; } static inline int copy_to_iotlb(const struct vringh *vrh, void *dst, void *src, size_t len) { - struct iov_iter iter; - struct bio_vec iov[16]; 
- int ret; + u64 total_translated = 0; - ret = iotlb_translate(vrh, (u64)(uintptr_t)dst, - len, iov, 16, VHOST_MAP_WO); - if (ret < 0) - return ret; + while (total_translated < len) { + struct bio_vec iov[16]; + struct iov_iter iter; + u64 translated; + int ret; + + ret = iotlb_translate(vrh, (u64)(uintptr_t)dst, + len - total_translated, &translated, + iov, ARRAY_SIZE(iov), VHOST_MAP_WO); + if (ret == -ENOBUFS) + ret = ARRAY_SIZE(iov); + else if (ret < 0) + return ret; - iov_iter_bvec(&iter, WRITE, iov, ret, len); + iov_iter_bvec(&iter, WRITE, iov, ret, translated); + + ret = copy_to_iter(src, translated, &iter); + if (ret < 0) + return ret; + + src += translated; + dst += translated; + total_translated += translated; + } - return copy_to_iter(src, len, &iter); + return total_translated; } static inline int getu16_iotlb(const struct vringh *vrh, @@ -1183,7 +1217,7 @@ static inline int getu16_iotlb(const struct vringh *vrh, int ret; /* Atomic read is needed for getu16 */ - ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, &iov, 1, VHOST_MAP_RO); if (ret < 0) return ret; @@ -1204,7 +1238,7 @@ static inline int putu16_iotlb(const struct vringh *vrh, int ret; /* Atomic write is needed for putu16 */ - ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), + ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL, &iov, 1, VHOST_MAP_WO); if (ret < 0) return ret; From 197f80d97e9ccc8a496f4935ba939f3ed7432d53 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 22 Jun 2022 22:49:32 +0200 Subject: [PATCH 0163/1250] drbd: bm_page_async_io: fix spurious bitmap "IO error" on large volumes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We usually do all our bitmap IO in units of PAGE_SIZE. 
With very small or oddly sized external meta data, or with PAGE_SIZE != 4k, it can happen that our last on-disk bitmap page is not fully PAGE_SIZE aligned, so we may need to adjust the size of the IO. We used to do that with min_t(unsigned int, PAGE_SIZE, last_allowed_sector - current_offset); And for just the right diff, (unsigned int)(diff) will result in 0. A bio of length 0 will correctly be rejected with an IO error (and some scary WARN_ON_ONCE()) by the scsi layer. Do the calculation properly. Signed-off-by: Lars Ellenberg Signed-off-by: Christoph Böhmwalder Link: https://lore.kernel.org/r/20220622204932.196830-1-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_bitmap.c | 49 +++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 9e060e49b3f8c7..bd2133ef6e0aec 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -974,25 +974,58 @@ static void drbd_bm_endio(struct bio *bio) } } +/* For the layout, see comment above drbd_md_set_sector_offsets(). */ +static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->md.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + bdev->md.al_offset -1; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset + bdev->md.md_size_sect -1; + } +} + static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) { struct drbd_device *device = ctx->device; unsigned int op = (ctx->flags & BM_AIO_READ) ? 
REQ_OP_READ : REQ_OP_WRITE; - struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, - GFP_NOIO, &drbd_md_io_bio_set); struct drbd_bitmap *b = device->bitmap; + struct bio *bio; struct page *page; + sector_t last_bm_sect; + sector_t first_bm_sect; + sector_t on_disk_sector; unsigned int len; - sector_t on_disk_sector = - device->ldev->md.md_offset + device->ldev->md.bm_offset; - on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); + first_bm_sect = device->ldev->md.md_offset + device->ldev->md.bm_offset; + on_disk_sector = first_bm_sect + (((sector_t)page_nr) << (PAGE_SHIFT-SECTOR_SHIFT)); /* this might happen with very small * flexible external meta data device, * or with PAGE_SIZE > 4k */ - len = min_t(unsigned int, PAGE_SIZE, - (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9); + last_bm_sect = drbd_md_last_bitmap_sector(device->ldev); + if (first_bm_sect <= on_disk_sector && last_bm_sect >= on_disk_sector) { + sector_t len_sect = last_bm_sect - on_disk_sector + 1; + if (len_sect < PAGE_SIZE/SECTOR_SIZE) + len = (unsigned int)len_sect*SECTOR_SIZE; + else + len = PAGE_SIZE; + } else { + if (__ratelimit(&drbd_ratelimit_state)) { + drbd_err(device, "Invalid offset during on-disk bitmap access: " + "page idx %u, sector %llu\n", page_nr, on_disk_sector); + } + ctx->error = -EIO; + bm_set_page_io_err(b->bm_pages[page_nr]); + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&device->misc_wait); + kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); + } + return; + } /* serialize IO on this page */ bm_page_lock_io(device, page_nr); @@ -1007,6 +1040,8 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bm_store_page_idx(page, page_nr); } else page = b->bm_pages[page_nr]; + bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO, + &drbd_md_io_bio_set); bio->bi_iter.bi_sector = on_disk_sector; /* bio_add_page of a single page to an empty bio will always succeed, * according to api. 
Do we want to assert that? */ From 1cf8639be186f309a4a28febc3ba6d1e63c1bef3 Mon Sep 17 00:00:00 2001 From: Bitan Biswas Date: Mon, 27 Jun 2022 12:41:19 +0100 Subject: [PATCH 0164/1250] soc/tegra: fuse: Expose Tegra production status For Tegra194 and Tegra234 devices there is a production fuse register that indicates if the device is a production device. Expose the production status of the chip via the sysfs for Tegra194 and Tegra234 devices. Note that '0' implies not production or unknown and '1' indicates the device is a production device. Signed-off-by: Bitan Biswas [ Re-worked for upstream submission by Jon Hunter ] Signed-off-by: Jon Hunter Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/fuse-tegra.c | 15 +++++++++++++++ include/soc/tegra/fuse.h | 1 + 2 files changed, 16 insertions(+) diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index b0a8405dbdb199..0d145ce9029099 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -452,10 +452,25 @@ static ssize_t platform_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR_RO(platform); +static ssize_t production_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + u32 reg = 0; + + if (tegra_is_silicon()) + if (tegra_fuse_readl(TEGRA_FUSE_PRODUCTION_MODE, &reg)) + dev_err(dev, "failed to read production fuse!\n"); + + return sprintf(buf, "%d\n", reg); +} + +static DEVICE_ATTR_RO(production); + static struct attribute *tegra194_soc_attr[] = { &dev_attr_major.attr, &dev_attr_minor.attr, &dev_attr_platform.attr, + &dev_attr_production.attr, NULL, }; diff --git a/include/soc/tegra/fuse.h b/include/soc/tegra/fuse.h index 67d2bc856fbc08..d035e04cb86998 100644 --- a/include/soc/tegra/fuse.h +++ b/include/soc/tegra/fuse.h @@ -18,6 +18,7 @@ #define TEGRA194 0x19 #define TEGRA234 0x23 +#define TEGRA_FUSE_PRODUCTION_MODE 0x0 #define TEGRA_FUSE_SKU_CALIB_0 0xf0 #define TEGRA30_FUSE_SATA_CALIB 0x124
#define TEGRA_FUSE_USB_CALIB_EXT_0 0x250 From 03995d64bc7d957eccba11b8b9b891e82525bbe4 Mon Sep 17 00:00:00 2001 From: Liang He Date: Thu, 16 Jun 2022 09:46:36 +0800 Subject: [PATCH 0165/1250] soc/tegra: fuse: Add missing of_node_put() in tegra_init_fuse() In this function, of_find_matching_node() will return a node pointer with refcount incremented. We should use of_node_put() when the "np" pointer is not used anymore. Signed-off-by: Liang He Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/fuse-tegra.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/soc/tegra/fuse/fuse-tegra.c b/drivers/soc/tegra/fuse/fuse-tegra.c index 0d145ce9029099..8ee6092753b187 100644 --- a/drivers/soc/tegra/fuse/fuse-tegra.c +++ b/drivers/soc/tegra/fuse/fuse-tegra.c @@ -583,6 +583,7 @@ static int __init tegra_init_fuse(void) np = of_find_matching_node(NULL, car_match); if (np) { void __iomem *base = of_iomap(np, 0); + of_node_put(np); if (base) { tegra_enable_fuse_clk(base); iounmap(base); From 681ec6abcd7f051f7fc318068a3ac09772ebef7e Mon Sep 17 00:00:00 2001 From: Sean Wang Date: Mon, 27 Jun 2022 07:27:36 +0800 Subject: [PATCH 0166/1250] Bluetooth: btmtksdio: Add in-band wakeup support Commit ce64b3e94919 ("Bluetooth: mt7921s: Support wake on bluetooth") adds the wake on bluetooth via a dedicated GPIO. Extend the wake-on-bluetooth to use the SDIO DAT1 pin (in-band wakeup), when supported by the SDIO host driver.
Co-developed-by: Yake Yang Signed-off-by: Yake Yang Signed-off-by: Sean Wang Reviewed-by: AngeloGioacchino Del Regno Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtksdio.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c index d6700efcfe8cd3..f9a3444753c2bb 100644 --- a/drivers/bluetooth/btmtksdio.c +++ b/drivers/bluetooth/btmtksdio.c @@ -1282,6 +1282,13 @@ static void btmtksdio_cmd_timeout(struct hci_dev *hdev) hci_reset_dev(hdev); } +static bool btmtksdio_sdio_inband_wakeup(struct hci_dev *hdev) +{ + struct btmtksdio_dev *bdev = hci_get_drvdata(hdev); + + return device_may_wakeup(bdev->dev); +} + static bool btmtksdio_sdio_wakeup(struct hci_dev *hdev) { struct btmtksdio_dev *bdev = hci_get_drvdata(hdev); @@ -1349,6 +1356,14 @@ static int btmtksdio_probe(struct sdio_func *func, hdev->shutdown = btmtksdio_shutdown; hdev->send = btmtksdio_send_frame; hdev->wakeup = btmtksdio_sdio_wakeup; + /* + * If SDIO controller supports wake on Bluetooth, sending a wakeon + * command is not necessary. + */ + if (device_can_wakeup(func->card->host->parent)) + hdev->wakeup = btmtksdio_sdio_inband_wakeup; + else + hdev->wakeup = btmtksdio_sdio_wakeup; hdev->set_bdaddr = btmtk_set_bdaddr; SET_HCIDEV_DEV(hdev, &func->dev); From 9804ebd35611dbbf584b65d512e39f8f3ecde2d3 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Mon, 20 Jun 2022 09:31:07 +0000 Subject: [PATCH 0167/1250] bus: ti-sysc: Add missing of_node_put() in sysc_add_named_clock_from_child() of_get_next_available_child() returns a node pointer with refcount incremented, we should use of_node_put() on it when not need anymore. Add missing of_node_put() to avoid refcount leak. 
Signed-off-by: Peng Wu Reported-by: Hulk Robot Message-Id: <20220620093107.73809-1-wupeng58@huawei.com> Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 9a7d12332fadb0..73fd42dd25b31f 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -333,6 +333,7 @@ static int sysc_add_named_clock_from_child(struct sysc *ddata, return -ENODEV; clock = devm_get_clk_from_child(ddata->dev, child, name); + of_node_put(child); if (IS_ERR(clock)) return PTR_ERR(clock); From 609c1fabc7c5aa37a662e5c463d1d51b47a541b9 Mon Sep 17 00:00:00 2001 From: Juerg Haefliger Date: Mon, 23 May 2022 08:28:07 +0200 Subject: [PATCH 0168/1250] ARM: omap1: Kconfig: Fix indentation The convention for indentation seems to be a single tab. Help text is further indented by an additional two whitespaces. Fix the lines that violate these rules. Signed-off-by: Juerg Haefliger Message-Id: <20220523062807.10544-1-juergh@canonical.com> Signed-off-by: Tony Lindgren --- arch/arm/mach-omap1/Kconfig | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm/mach-omap1/Kconfig b/arch/arm/mach-omap1/Kconfig index 0ac0567f721d8a..879bba36b715c4 100644 --- a/arch/arm/mach-omap1/Kconfig +++ b/arch/arm/mach-omap1/Kconfig @@ -132,8 +132,8 @@ config MACH_OMAP_INNOVATOR bool "TI Innovator" depends on ARCH_OMAP15XX || ARCH_OMAP16XX help - TI OMAP 1510 or 1610 Innovator board support. Say Y here if you - have such a board. + TI OMAP 1510 or 1610 Innovator board support. Say Y here if you + have such a board. config MACH_OMAP_H2 bool "TI H2 Support" @@ -160,7 +160,7 @@ config MACH_OMAP_OSK depends on ARCH_OMAP16XX help TI OMAP 5912 OSK (OMAP Starter Kit) board support. Say Y here - if you have such a board. + if you have such a board. 
config OMAP_OSK_MISTRAL bool "Mistral QVGA board Support" @@ -197,10 +197,10 @@ config MACH_OMAP_PALMZ71 bool "Palm Zire71" depends on ARCH_OMAP15XX help - Support for the Palm Zire71 PDA. To boot the kernel, - you'll need a PalmOS compatible bootloader; check out - http://hackndev.com/palm/z71 for more information. - Say Y here if you have such a PDA, say N otherwise. + Support for the Palm Zire71 PDA. To boot the kernel, + you'll need a PalmOS compatible bootloader; check out + http://hackndev.com/palm/z71 for more information. + Say Y here if you have such a PDA, say N otherwise. config MACH_OMAP_PALMTT bool "Palm Tungsten|T" @@ -246,10 +246,10 @@ config MACH_OMAP_GENERIC bool "Generic OMAP board" depends on ARCH_OMAP15XX || ARCH_OMAP16XX help - Support for generic OMAP-1510, 1610 or 1710 board with - no FPGA. Can be used as template for porting Linux to - custom OMAP boards. Say Y here if you have a custom - board. + Support for generic OMAP-1510, 1610 or 1710 board with + no FPGA. Can be used as template for porting Linux to + custom OMAP boards. Say Y here if you have a custom + board. endmenu From e7145cc274e9a4f03dcba4bd8c69e018ba0a3d1b Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 20 Jun 2022 10:38:42 -0700 Subject: [PATCH 0169/1250] f2fs: introduce memory mode Introduce memory mode to supports "normal" and "low" memory modes. "low" mode is to support low memory devices. Because of the nature of low memory devices, in this mode, f2fs will try to save memory sometimes by sacrificing performance. "normal" mode is the default mode and same as before. 
Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 5 +++++ fs/f2fs/f2fs.h | 13 +++++++++++++ fs/f2fs/super.c | 24 ++++++++++++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index ad8dc8c040a276..2965601e21bbe2 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -336,6 +336,11 @@ discard_unit=%s Control discard unit, the argument can be "block", "segment" default, it is helpful for large sized SMR or ZNS devices to reduce memory cost by getting rid of fs metadata supports small discard. +memory=%s Control memory mode. This supports "normal" and "low" modes. + "low" mode is introduced to support low memory devices. + Because of the nature of low memory devices, in this mode, f2fs + will try to save memory sometimes by sacrificing performance. + "normal" mode is the default mode and same as before. ======================== ============================================================ Debugfs Entries diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d9bbecd008d22a..fea97093d92712 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -159,6 +159,7 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ + int memory_mode; /* memory mode */ int discard_unit; /* * discard command's offset/size should * be aligned to this unit: block, @@ -1360,6 +1361,13 @@ enum { DISCARD_UNIT_SECTION, /* basic discard unit is section */ }; +enum { + MEMORY_MODE_NORMAL, /* memory mode for normal devices */ + MEMORY_MODE_LOW, /* memory mode for low memry devices */ +}; + + + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -4398,6 +4406,11 @@ static inline bool f2fs_lfs_mode(struct 
f2fs_sb_info *sbi) return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } +static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW; +} + static inline bool f2fs_may_compress(struct inode *inode) { if (IS_SWAPFILE(inode) || f2fs_is_pinned_file(inode) || diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3112fe92f93429..cf9cf24f9b56a0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -160,6 +160,7 @@ enum { Opt_gc_merge, Opt_nogc_merge, Opt_discard_unit, + Opt_memory_mode, Opt_err, }; @@ -236,6 +237,7 @@ static match_table_t f2fs_tokens = { {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, {Opt_discard_unit, "discard_unit=%s"}, + {Opt_memory_mode, "memory=%s"}, {Opt_err, NULL}, }; @@ -1235,6 +1237,22 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_memory_mode: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "normal")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_NORMAL; + } else if (!strcmp(name, "low")) { + F2FS_OPTION(sbi).memory_mode = + MEMORY_MODE_LOW; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -2006,6 +2024,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) seq_printf(seq, ",discard_unit=%s", "section"); + if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_NORMAL) + seq_printf(seq, ",memory=%s", "normal"); + else if (F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW) + seq_printf(seq, ",memory=%s", "low"); + return 0; } @@ -2027,6 +2050,7 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).compress_ext_cnt = 0; F2FS_OPTION(sbi).compress_mode = COMPR_MODE_FS; F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON; + F2FS_OPTION(sbi).memory_mode = MEMORY_MODE_NORMAL; 
sbi->sb->s_flags &= ~SB_INLINECRYPT; From 34c8217a23df7d498202409368c33cbeb4452c7d Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 20 Jun 2022 10:38:43 -0700 Subject: [PATCH 0170/1250] f2fs: handle decompress only post processing in softirq Now decompression is being handled in workqueue and it makes read I/O latency non-deterministic, because of the non-deterministic scheduling nature of workqueues. So, I made it handled in softirq context only if possible, not in low memory devices, since this modification will maintain decompresion related memory a little longer. Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 208 ++++++++++++++++++++++++++++----------------- fs/f2fs/data.c | 52 ++++++++---- fs/f2fs/f2fs.h | 17 ++-- 3 files changed, 172 insertions(+), 105 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index fa237e5c7173b3..2e06a301bf1270 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -729,28 +729,18 @@ static int f2fs_compress_pages(struct compress_ctx *cc) return ret; } -void f2fs_decompress_cluster(struct decompress_io_ctx *dic) +static int f2fs_prepare_decomp_mem(struct decompress_io_ctx *dic, bool end_io) { - struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); - struct f2fs_inode_info *fi = F2FS_I(dic->inode); const struct f2fs_compress_ops *cops = - f2fs_cops[fi->i_compress_algorithm]; - int ret; + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; int i; - trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, - dic->cluster_size, fi->i_compress_algorithm); - - if (dic->failed) { - ret = -EIO; - goto out_end_io; - } + if (end_io ^ f2fs_low_mem_mode(F2FS_I_SB(dic->inode))) + return 0; dic->tpages = page_array_alloc(dic->inode, dic->cluster_size); - if (!dic->tpages) { - ret = -ENOMEM; - goto out_end_io; - } + if (!dic->tpages) + return 1; for (i = 0; i < dic->cluster_size; i++) { if (dic->rpages[i]) { @@ -759,28 +749,100 @@ void f2fs_decompress_cluster(struct decompress_io_ctx 
*dic) } dic->tpages[i] = f2fs_compress_alloc_page(); - if (!dic->tpages[i]) { - ret = -ENOMEM; - goto out_end_io; - } + if (!dic->tpages[i]) + return 1; } + dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); + if (!dic->rbuf) + return 1; + + dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); + if (!dic->cbuf) + return 1; + + cops = f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; if (cops->init_decompress_ctx) { - ret = cops->init_decompress_ctx(dic); + int ret = cops->init_decompress_ctx(dic); + if (ret) - goto out_end_io; + return 1; } - dic->rbuf = f2fs_vmap(dic->tpages, dic->cluster_size); - if (!dic->rbuf) { - ret = -ENOMEM; - goto out_destroy_decompress_ctx; + return 0; +} + +static void f2fs_release_decomp_mem(struct decompress_io_ctx *dic, + bool bypass_destroy_callback, bool end_io) +{ + const struct f2fs_compress_ops *cops = + f2fs_cops[F2FS_I(dic->inode)->i_compress_algorithm]; + + if (end_io ^ f2fs_low_mem_mode(F2FS_I_SB(dic->inode))) + return; + + if (!bypass_destroy_callback && cops->destroy_decompress_ctx) + cops->destroy_decompress_ctx(dic); + + if (dic->cbuf) + vm_unmap_ram(dic->cbuf, dic->nr_cpages); + + if (dic->rbuf) + vm_unmap_ram(dic->rbuf, dic->cluster_size); +} + +static void f2fs_free_dic(struct decompress_io_ctx *dic, + bool bypass_destroy_callback) +{ + int i; + + f2fs_release_decomp_mem(dic, bypass_destroy_callback, false); + + if (dic->tpages) { + for (i = 0; i < dic->cluster_size; i++) { + if (dic->rpages[i]) + continue; + if (!dic->tpages[i]) + continue; + f2fs_compress_free_page(dic->tpages[i]); + } + page_array_free(dic->inode, dic->tpages, dic->cluster_size); } - dic->cbuf = f2fs_vmap(dic->cpages, dic->nr_cpages); - if (!dic->cbuf) { + if (dic->cpages) { + for (i = 0; i < dic->nr_cpages; i++) { + if (!dic->cpages[i]) + continue; + f2fs_compress_free_page(dic->cpages[i]); + } + page_array_free(dic->inode, dic->cpages, dic->nr_cpages); + } + + page_array_free(dic->inode, dic->rpages, dic->nr_rpages); + 
kmem_cache_free(dic_entry_slab, dic); +} + +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dic->inode); + struct f2fs_inode_info *fi = F2FS_I(dic->inode); + const struct f2fs_compress_ops *cops = + f2fs_cops[fi->i_compress_algorithm]; + bool bypass_callback = false; + int ret; + + trace_f2fs_decompress_pages_start(dic->inode, dic->cluster_idx, + dic->cluster_size, fi->i_compress_algorithm); + + if (dic->failed) { + ret = -EIO; + goto out_end_io; + } + + if (f2fs_prepare_decomp_mem(dic, true)) { + bypass_callback = true; ret = -ENOMEM; - goto out_vunmap_rbuf; + goto out_release; } dic->clen = le32_to_cpu(dic->cbuf->clen); @@ -788,7 +850,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) if (dic->clen > PAGE_SIZE * dic->nr_cpages - COMPRESS_HEADER_SIZE) { ret = -EFSCORRUPTED; - goto out_vunmap_cbuf; + goto out_release; } ret = cops->decompress_pages(dic); @@ -809,17 +871,13 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) } } -out_vunmap_cbuf: - vm_unmap_ram(dic->cbuf, dic->nr_cpages); -out_vunmap_rbuf: - vm_unmap_ram(dic->rbuf, dic->cluster_size); -out_destroy_decompress_ctx: - if (cops->destroy_decompress_ctx) - cops->destroy_decompress_ctx(dic); +out_release: + f2fs_release_decomp_mem(dic, bypass_callback, true); + out_end_io: trace_f2fs_decompress_pages_end(dic->inode, dic->cluster_idx, dic->clen, ret); - f2fs_decompress_end_io(dic, ret); + f2fs_decompress_end_io(dic, ret, in_task); } /* @@ -829,7 +887,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic) * (or in the case of a failure, cleans up without actually decompressing). 
*/ void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr) + block_t blkaddr, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); @@ -839,12 +897,12 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, if (failed) WRITE_ONCE(dic->failed, true); - else if (blkaddr) + else if (blkaddr && in_task) f2fs_cache_compressed_page(sbi, page, dic->inode->i_ino, blkaddr); if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic); + f2fs_decompress_cluster(dic, in_task); } static bool is_page_in_cluster(struct compress_ctx *cc, pgoff_t index) @@ -1552,16 +1610,14 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, return err; } -static void f2fs_free_dic(struct decompress_io_ctx *dic); - struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) { struct decompress_io_ctx *dic; pgoff_t start_idx = start_idx_of_cluster(cc); + struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); int i; - dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, - false, F2FS_I_SB(cc->inode)); + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!dic) return ERR_PTR(-ENOMEM); @@ -1602,52 +1658,43 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) dic->cpages[i] = page; } + if (f2fs_prepare_decomp_mem(dic, false)) + goto out_free; + return dic; out_free: - f2fs_free_dic(dic); + f2fs_free_dic(dic, true); return ERR_PTR(-ENOMEM); } -static void f2fs_free_dic(struct decompress_io_ctx *dic) +static void f2fs_late_free_dic(struct work_struct *work) { - int i; - - if (dic->tpages) { - for (i = 0; i < dic->cluster_size; i++) { - if (dic->rpages[i]) - continue; - if (!dic->tpages[i]) - continue; - f2fs_compress_free_page(dic->tpages[i]); - } - page_array_free(dic->inode, dic->tpages, dic->cluster_size); - } - - if (dic->cpages) { - for (i = 0; i < dic->nr_cpages; i++) { - if (!dic->cpages[i]) - continue; - f2fs_compress_free_page(dic->cpages[i]); 
- } - page_array_free(dic->inode, dic->cpages, dic->nr_cpages); - } + struct decompress_io_ctx *dic = + container_of(work, struct decompress_io_ctx, free_work); - page_array_free(dic->inode, dic->rpages, dic->nr_rpages); - kmem_cache_free(dic_entry_slab, dic); + f2fs_free_dic(dic, false); } -static void f2fs_put_dic(struct decompress_io_ctx *dic) +static void f2fs_put_dic(struct decompress_io_ctx *dic, bool in_task) { - if (refcount_dec_and_test(&dic->refcnt)) - f2fs_free_dic(dic); + if (refcount_dec_and_test(&dic->refcnt)) { + if (in_task) { + f2fs_free_dic(dic, false); + } else { + INIT_WORK(&dic->free_work, f2fs_late_free_dic); + queue_work(F2FS_I_SB(dic->inode)->post_read_wq, + &dic->free_work); + } + } } /* * Update and unlock the cluster's pagecache pages, and release the reference to * the decompress_io_ctx that was being held for I/O completion. */ -static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) { int i; @@ -1668,7 +1715,7 @@ static void __f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) unlock_page(rpage); } - f2fs_put_dic(dic); + f2fs_put_dic(dic, in_task); } static void f2fs_verify_cluster(struct work_struct *work) @@ -1685,14 +1732,15 @@ static void f2fs_verify_cluster(struct work_struct *work) SetPageError(rpage); } - __f2fs_decompress_end_io(dic, false); + __f2fs_decompress_end_io(dic, false, true); } /* * This is called when a compressed cluster has been decompressed * (or failed to be read and/or decompressed). 
*/ -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task) { if (!failed && dic->need_verity) { /* @@ -1704,7 +1752,7 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) INIT_WORK(&dic->verity_work, f2fs_verify_cluster); fsverity_enqueue_verify_work(&dic->verity_work); } else { - __f2fs_decompress_end_io(dic, failed); + __f2fs_decompress_end_io(dic, failed, in_task); } } @@ -1713,12 +1761,12 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed) * * This is called when the page is no longer needed and can be freed. */ -void f2fs_put_page_dic(struct page *page) +void f2fs_put_page_dic(struct page *page, bool in_task) { struct decompress_io_ctx *dic = (struct decompress_io_ctx *)page_private(page); - f2fs_put_dic(dic); + f2fs_put_dic(dic, in_task); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7fcbcf97973724..c448c3ee7ac345 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -119,7 +119,7 @@ struct bio_post_read_ctx { block_t fs_blkaddr; }; -static void f2fs_finish_read_bio(struct bio *bio) +static void f2fs_finish_read_bio(struct bio *bio, bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; @@ -133,8 +133,9 @@ static void f2fs_finish_read_bio(struct bio *bio) if (f2fs_is_compressed_page(page)) { if (bio->bi_status) - f2fs_end_read_compressed_page(page, true, 0); - f2fs_put_page_dic(page); + f2fs_end_read_compressed_page(page, true, 0, + in_task); + f2fs_put_page_dic(page, in_task); continue; } @@ -191,7 +192,7 @@ static void f2fs_verify_bio(struct work_struct *work) fsverity_verify_bio(bio); } - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, true); } /* @@ -203,7 +204,7 @@ static void f2fs_verify_bio(struct work_struct *work) * can involve reading verity metadata pages from the file, and these verity * metadata pages may be encrypted and/or compressed. 
*/ -static void f2fs_verify_and_finish_bio(struct bio *bio) +static void f2fs_verify_and_finish_bio(struct bio *bio, bool in_task) { struct bio_post_read_ctx *ctx = bio->bi_private; @@ -211,7 +212,7 @@ static void f2fs_verify_and_finish_bio(struct bio *bio) INIT_WORK(&ctx->work, f2fs_verify_bio); fsverity_enqueue_verify_work(&ctx->work); } else { - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, in_task); } } @@ -224,7 +225,8 @@ static void f2fs_verify_and_finish_bio(struct bio *bio) * that the bio includes at least one compressed page. The actual decompression * is done on a per-cluster basis, not a per-bio basis. */ -static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) +static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx, + bool in_task) { struct bio_vec *bv; struct bvec_iter_all iter_all; @@ -237,7 +239,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) /* PG_error was set if decryption failed. */ if (f2fs_is_compressed_page(page)) f2fs_end_read_compressed_page(page, PageError(page), - blkaddr); + blkaddr, in_task); else all_compressed = false; @@ -262,15 +264,16 @@ static void f2fs_post_read_work(struct work_struct *work) fscrypt_decrypt_bio(ctx->bio); if (ctx->enabled_steps & STEP_DECOMPRESS) - f2fs_handle_step_decompress(ctx); + f2fs_handle_step_decompress(ctx, true); - f2fs_verify_and_finish_bio(ctx->bio); + f2fs_verify_and_finish_bio(ctx->bio, true); } static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); struct bio_post_read_ctx *ctx; + bool intask = in_task(); iostat_update_and_unbind_ctx(bio, 0); ctx = bio->bi_private; @@ -281,16 +284,29 @@ static void f2fs_read_end_io(struct bio *bio) } if (bio->bi_status) { - f2fs_finish_read_bio(bio); + f2fs_finish_read_bio(bio, intask); return; } - if (ctx && (ctx->enabled_steps & (STEP_DECRYPT | STEP_DECOMPRESS))) { - INIT_WORK(&ctx->work, f2fs_post_read_work); - 
queue_work(ctx->sbi->post_read_wq, &ctx->work); - } else { - f2fs_verify_and_finish_bio(bio); + if (ctx) { + unsigned int enabled_steps = ctx->enabled_steps & + (STEP_DECRYPT | STEP_DECOMPRESS); + + /* + * If we have only decompression step between decompression and + * decrypt, we don't need post processing for this. + */ + if (enabled_steps == STEP_DECOMPRESS && + !f2fs_low_mem_mode(sbi)) { + f2fs_handle_step_decompress(ctx, intask); + } else if (enabled_steps) { + INIT_WORK(&ctx->work, f2fs_post_read_work); + queue_work(ctx->sbi->post_read_wq, &ctx->work); + return; + } } + + f2fs_verify_and_finish_bio(bio, intask); } static void f2fs_write_end_io(struct bio *bio) @@ -2222,7 +2238,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_load_compressed_page(sbi, page, blkaddr)) { if (atomic_dec_and_test(&dic->remaining_pages)) - f2fs_decompress_cluster(dic); + f2fs_decompress_cluster(dic, true); continue; } @@ -2240,7 +2256,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, page->index, for_write); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - f2fs_decompress_end_io(dic, ret); + f2fs_decompress_end_io(dic, ret, true); f2fs_put_dnode(&dn); *bio_ret = NULL; return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fea97093d92712..c9a31934b9484e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1588,6 +1588,7 @@ struct decompress_io_ctx { void *private; /* payload buffer for specified decompression algorithm */ void *private2; /* extra payload buffer */ struct work_struct verity_work; /* work to verify the decompressed pages */ + struct work_struct free_work; /* work for late free this structure itself */ }; #define NULL_CLUSTER ((unsigned int)(~0)) @@ -4166,9 +4167,9 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page); bool f2fs_is_compress_backend_ready(struct inode *inode); int f2fs_init_compress_mempool(void); void f2fs_destroy_compress_mempool(void); -void 
f2fs_decompress_cluster(struct decompress_io_ctx *dic); +void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task); void f2fs_end_read_compressed_page(struct page *page, bool failed, - block_t blkaddr); + block_t blkaddr, bool in_task); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, @@ -4187,8 +4188,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); -void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); -void f2fs_put_page_dic(struct page *page); +void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, + bool in_task); +void f2fs_put_page_dic(struct page *page, bool in_task); unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); @@ -4234,13 +4236,14 @@ static inline struct page *f2fs_compress_control_page(struct page *page) } static inline int f2fs_init_compress_mempool(void) { return 0; } static inline void f2fs_destroy_compress_mempool(void) { } -static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic) { } +static inline void f2fs_decompress_cluster(struct decompress_io_ctx *dic, + bool in_task) { } static inline void f2fs_end_read_compressed_page(struct page *page, - bool failed, block_t blkaddr) + bool failed, block_t blkaddr, bool in_task) { WARN_ON_ONCE(1); } -static inline void f2fs_put_page_dic(struct page *page) +static inline void f2fs_put_page_dic(struct page *page, bool in_task) { WARN_ON_ONCE(1); } From 56f97d2a954012cf4c25154d9d425eac97143758 Mon Sep 17 00:00:00 2001 From: duguowei Date: Mon, 20 Jun 2022 
21:39:45 +0800 Subject: [PATCH 0171/1250] f2fs: remove redundant code for gc condition Remove the redundant code and use a local variable as the argument directly. Make it more human-readable. Signed-off-by: duguowei [Jaegeuk Kim: make code neat] Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 3fe145e8e594f3..19b956c2d697a4 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -120,15 +120,13 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi) return free_blks - ovp_blks; } -static inline block_t limit_invalid_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_invalid_user_blocks(block_t user_block_count) { - return (long)(sbi->user_block_count * LIMIT_INVALID_BLOCK) / 100; + return (long)(user_block_count * LIMIT_INVALID_BLOCK) / 100; } -static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) +static inline block_t limit_free_user_blocks(block_t reclaimable_user_blocks) { - block_t reclaimable_user_blocks = sbi->user_block_count - - written_block_count(sbi); return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; } @@ -163,15 +161,16 @@ static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) { - block_t invalid_user_blocks = sbi->user_block_count - - written_block_count(sbi); + block_t user_block_count = sbi->user_block_count; + block_t invalid_user_blocks = user_block_count - + written_block_count(sbi); /* * Background GC is triggered with the following conditions. * 1. There are a number of invalid blocks. * 2. There is not enough free space.
*/ - if (invalid_user_blocks > limit_invalid_user_blocks(sbi) && - free_user_blocks(sbi) < limit_free_user_blocks(sbi)) - return true; - return false; + return (invalid_user_blocks > + limit_invalid_user_blocks(user_block_count) && + free_user_blocks(sbi) < + limit_free_user_blocks(invalid_user_blocks)); } From 8245b017033284827312b35f86584f59a0f04da5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jun 2022 10:57:24 -0700 Subject: [PATCH 0172/1250] f2fs: enforce single zone capacity In order to simplify the complicated per-zone capacity, let's support only one capacity for entire zoned device. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 19 ++++++------------- fs/f2fs/segment.h | 3 +++ fs/f2fs/super.c | 33 ++++++++++++--------------------- 4 files changed, 22 insertions(+), 35 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c9a31934b9484e..1d97d06e0d8733 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1235,7 +1235,6 @@ struct f2fs_dev_info { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_blkz; /* Total number of zones */ unsigned long *blkz_seq; /* Bitmap indicating sequential zones */ - block_t *zone_capacity_blocks; /* Array of zone capacity in blks */ #endif }; @@ -1673,6 +1672,7 @@ struct f2fs_sb_info { unsigned int meta_ino_num; /* meta inode number*/ unsigned int log_blocks_per_seg; /* log2 blocks per segment */ unsigned int blocks_per_seg; /* blocks per segment */ + unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ unsigned int segs_per_sec; /* segments per section */ unsigned int secs_per_zone; /* sections per zone */ unsigned int total_sections; /* total section count */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 874c1b9c41a2ae..447b0357904921 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -4895,7 +4895,7 @@ static unsigned int get_zone_idx(struct f2fs_sb_info *sbi, unsigned int secno, static inline unsigned int f2fs_usable_zone_segs_in_sec( struct 
f2fs_sb_info *sbi, unsigned int segno) { - unsigned int dev_idx, zone_idx, unusable_segs_in_sec; + unsigned int dev_idx, zone_idx; dev_idx = f2fs_target_device_index(sbi, START_BLOCK(sbi, segno)); zone_idx = get_zone_idx(sbi, GET_SEC_FROM_SEG(sbi, segno), dev_idx); @@ -4904,18 +4904,12 @@ static inline unsigned int f2fs_usable_zone_segs_in_sec( if (is_conv_zone(sbi, zone_idx, dev_idx)) return sbi->segs_per_sec; - /* - * If the zone_capacity_blocks array is NULL, then zone capacity - * is equal to the zone size for all zones - */ - if (!FDEV(dev_idx).zone_capacity_blocks) + if (!sbi->unusable_blocks_per_sec) return sbi->segs_per_sec; /* Get the segment count beyond zone capacity block */ - unusable_segs_in_sec = (sbi->blocks_per_blkz - - FDEV(dev_idx).zone_capacity_blocks[zone_idx]) >> - sbi->log_blocks_per_seg; - return sbi->segs_per_sec - unusable_segs_in_sec; + return sbi->segs_per_sec - (sbi->unusable_blocks_per_sec >> + sbi->log_blocks_per_seg); } /* @@ -4944,12 +4938,11 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg( if (is_conv_zone(sbi, zone_idx, dev_idx)) return sbi->blocks_per_seg; - if (!FDEV(dev_idx).zone_capacity_blocks) + if (!sbi->unusable_blocks_per_sec) return sbi->blocks_per_seg; sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); - sec_cap_blkaddr = sec_start_blkaddr + - FDEV(dev_idx).zone_capacity_blocks[zone_idx]; + sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); /* * If segment starts before zone capacity and spans beyond diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 3f277dfcb13116..813a892cd979d9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -101,6 +101,9 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, GET_SEGNO_FROM_SEG0(sbi, blk_addr))) #define BLKS_PER_SEC(sbi) \ ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define CAP_BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg - \ + (sbi)->unusable_blocks_per_sec) #define GET_SEC_FROM_SEG(sbi, 
segno) \ (((segno) == -1) ? -1: (segno) / (sbi)->segs_per_sec) #define GET_SEG_FROM_SEC(sbi, secno) \ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cf9cf24f9b56a0..faf9a767d05ae4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1522,7 +1522,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) blkdev_put(FDEV(i).bdev, FMODE_EXCL); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); - kfree(FDEV(i).zone_capacity_blocks); #endif } kvfree(sbi->devs); @@ -3673,24 +3672,29 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) #ifdef CONFIG_BLK_DEV_ZONED struct f2fs_report_zones_args { + struct f2fs_sb_info *sbi; struct f2fs_dev_info *dev; - bool zone_cap_mismatch; }; static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct f2fs_report_zones_args *rz_args = data; + block_t unusable_blocks = (zone->len - zone->capacity) >> + F2FS_LOG_SECTORS_PER_BLOCK; if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return 0; set_bit(idx, rz_args->dev->blkz_seq); - rz_args->dev->zone_capacity_blocks[idx] = zone->capacity >> - F2FS_LOG_SECTORS_PER_BLOCK; - if (zone->len != zone->capacity && !rz_args->zone_cap_mismatch) - rz_args->zone_cap_mismatch = true; - + if (!rz_args->sbi->unusable_blocks_per_sec) { + rz_args->sbi->unusable_blocks_per_sec = unusable_blocks; + return 0; + } + if (rz_args->sbi->unusable_blocks_per_sec != unusable_blocks) { + f2fs_err(rz_args->sbi, "F2FS supports single zone capacity\n"); + return -EINVAL; + } return 0; } @@ -3731,26 +3735,13 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) if (!FDEV(devi).blkz_seq) return -ENOMEM; - /* Get block zones type and zone-capacity */ - FDEV(devi).zone_capacity_blocks = f2fs_kzalloc(sbi, - FDEV(devi).nr_blkz * sizeof(block_t), - GFP_KERNEL); - if (!FDEV(devi).zone_capacity_blocks) - return -ENOMEM; - + rep_zone_arg.sbi = sbi; rep_zone_arg.dev = &FDEV(devi); - rep_zone_arg.zone_cap_mismatch = false; ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES, 
f2fs_report_zone_cb, &rep_zone_arg); if (ret < 0) return ret; - - if (!rep_zone_arg.zone_cap_mismatch) { - kfree(FDEV(devi).zone_capacity_blocks); - FDEV(devi).zone_capacity_blocks = NULL; - } - return 0; } #endif From 7a2413f4eefd43838c7914815d386dc114d33ce4 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jun 2022 11:03:57 -0700 Subject: [PATCH 0173/1250] f2fs: adjust zone capacity when considering valid block count This patch fixes counting unusable blocks set by zone capacity when checking the valid block count in a section. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- fs/f2fs/file.c | 6 +++--- fs/f2fs/gc.c | 4 ++-- fs/f2fs/segment.c | 7 +++---- fs/f2fs/segment.h | 8 ++++---- 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index c92625ef16d0bf..c01471573977ac 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -39,7 +39,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = BLKS_PER_SEC(sbi); + blks_per_sec = CAP_BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, true); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2d1114b0ceefea..0f29af7876a642 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1681,7 +1681,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, return 0; if (f2fs_is_pinned_file(inode)) { - block_t sec_blks = BLKS_PER_SEC(sbi); + block_t sec_blks = CAP_BLKS_PER_SEC(sbi); block_t sec_len = roundup(map.m_len, sec_blks); map.m_len = sec_blks; @@ -2432,7 +2432,7 @@ static int __f2fs_ioc_gc_range(struct file *filp, struct f2fs_gc_range *range) ret = -EAGAIN; goto out; } - range->start += BLKS_PER_SEC(sbi); + range->start += CAP_BLKS_PER_SEC(sbi); if (range->start <= end) goto do_more; out: @@ -2557,7 +2557,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, goto out; } - sec_num = 
DIV_ROUND_UP(total, BLKS_PER_SEC(sbi)); + sec_num = DIV_ROUND_UP(total, CAP_BLKS_PER_SEC(sbi)); /* * make sure there are enough free section for LFS allocation, this can diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d5fb426e074743..c38bdaf831af1f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -487,7 +487,7 @@ static void atgc_lookup_victim(struct f2fs_sb_info *sbi, unsigned long long age, u, accu; unsigned long long max_mtime = sit_i->dirty_max_mtime; unsigned long long min_mtime = sit_i->dirty_min_mtime; - unsigned int sec_blocks = BLKS_PER_SEC(sbi); + unsigned int sec_blocks = CAP_BLKS_PER_SEC(sbi); unsigned int vblocks; unsigned int dirty_threshold = max(am->max_candidate_count, am->candidate_ratio * @@ -1487,7 +1487,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, */ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || (!force_migrate && get_valid_blocks(sbi, segno, true) == - BLKS_PER_SEC(sbi))) + CAP_BLKS_PER_SEC(sbi))) return submitted; if (check_valid_map(sbi, segno, off) == 0) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 447b0357904921..ce571c0d712696 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -728,7 +728,7 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, get_valid_blocks(sbi, segno, true); f2fs_bug_on(sbi, unlikely(!valid_blocks || - valid_blocks == BLKS_PER_SEC(sbi))); + valid_blocks == CAP_BLKS_PER_SEC(sbi))); if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); @@ -764,7 +764,7 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || - valid_blocks == BLKS_PER_SEC(sbi)) { + valid_blocks == CAP_BLKS_PER_SEC(sbi)) { clear_bit(secno, dirty_i->dirty_secmap); return; } @@ -4483,7 +4483,6 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; 
block_t valid_blocks, usable_blks_in_seg; - block_t blks_per_sec = BLKS_PER_SEC(sbi); while (1) { /* find dirty segment based on free segmap */ @@ -4512,7 +4511,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) valid_blocks = get_valid_blocks(sbi, segno, true); secno = GET_SEC_FROM_SEG(sbi, segno); - if (!valid_blocks || valid_blocks == blks_per_sec) + if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) continue; if (IS_CURSEC(sbi, secno)) continue; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 813a892cd979d9..d1d63766f2c7e5 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -612,10 +612,10 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, get_pages(sbi, F2FS_DIRTY_DENTS) + get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS); - unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi); - unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi); - unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi); - unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi); + unsigned int node_secs = total_node_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int dent_secs = total_dent_blocks / CAP_BLKS_PER_SEC(sbi); + unsigned int node_blocks = total_node_blocks % CAP_BLKS_PER_SEC(sbi); + unsigned int dent_blocks = total_dent_blocks % CAP_BLKS_PER_SEC(sbi); unsigned int free, need_lower, need_upper; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) From 785c1904b28d928a0b7c84e05e624434746ecd0c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Jun 2022 15:49:47 -0700 Subject: [PATCH 0174/1250] f2fs: add a sysfs entry to show zone capacity This patch adds a sysfs entry showing the unusable space in a section made by zone capacity. 
Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/sysfs.c | 10 ++++++++++ 2 files changed, 16 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 9b583dd0298b79..22c1efd49773bc 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -580,3 +580,9 @@ Date: January 2022 Contact: "Jaegeuk Kim" Description: Controls max # of node block writes to be used for roll forward recovery. This can limit the roll forward recovery time. + +What: /sys/fs/f2fs//unusable_blocks_per_sec +Date: June 2022 +Contact: "Jaegeuk Kim" +Description: Shows the number of unusable blocks in a section which was defined by + the zone capacity reported by underlying zoned device. diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 4c50aedd5144e6..6eeefe60a7afec 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -713,6 +713,11 @@ static struct f2fs_attr f2fs_attr_##_name = { \ .offset = _offset \ } +#define F2FS_RO_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0444, \ + f2fs_sbi_show, NULL, \ + offsetof(struct struct_name, elname)) + #define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ F2FS_ATTR_OFFSET(struct_type, name, 0644, \ f2fs_sbi_show, f2fs_sbi_store, \ @@ -811,6 +816,8 @@ F2FS_FEATURE_RO_ATTR(encrypted_casefold); #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED F2FS_FEATURE_RO_ATTR(block_zoned); +F2FS_RO_ATTR(F2FS_SBI, f2fs_sb_info, unusable_blocks_per_sec, + unusable_blocks_per_sec); #endif F2FS_FEATURE_RO_ATTR(atomic_write); F2FS_FEATURE_RO_ATTR(extra_attr); @@ -919,6 +926,9 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(moved_blocks_background), ATTR_LIST(avg_vblocks), #endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(unusable_blocks_per_sec), +#endif #ifdef CONFIG_F2FS_FS_COMPRESSION ATTR_LIST(compr_written_block), ATTR_LIST(compr_saved_block), From 
522ae491e18a22c0440afd85eea9a1cc85b4fcdd Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 28 Jun 2022 16:34:29 +0800 Subject: [PATCH 0175/1250] virtio_pmem: initialize provider_data through nd_region_desc We used to initialize the provider_data manually after nvdimm_pmem_region_create(). This seems to be racy if the flush is issued before the initialization of provider_data[1]. Fix this by initializing the provider_data through nd_region_desc to make sure the provider_data is ready after the pmem is created. [1]: [ 80.152281] nd_pmem namespace0.0: unable to guarantee persistence of writes [ 92.393956] BUG: kernel NULL pointer dereference, address: 0000000000000318 [ 92.394551] #PF: supervisor read access in kernel mode [ 92.394955] #PF: error_code(0x0000) - not-present page [ 92.395365] PGD 0 P4D 0 [ 92.395566] Oops: 0000 [#1] PREEMPT SMP PTI [ 92.395867] CPU: 2 PID: 506 Comm: mkfs.ext4 Not tainted 5.19.0-rc1+ #453 [ 92.396365] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 [ 92.397178] RIP: 0010:virtio_pmem_flush+0x2f/0x1f0 [ 92.397521] Code: 55 41 54 55 53 48 81 ec a0 00 00 00 65 48 8b 04 25 28 00 00 00 48 89 84 24 98 00 00 00 31 c0 48 8b 87 78 03 00 00 48 89 04 24 <48> 8b 98 18 03 00 00 e8 85 bf 6b 00 ba 58 00 00 00 be c0 0c 00 00 [ 92.398982] RSP: 0018:ffff9a7380aefc88 EFLAGS: 00010246 [ 92.399349] RAX: 0000000000000000 RBX: ffff8e77c3f86f00 RCX: 0000000000000000 [ 92.399833] RDX: ffffffffad4ea720 RSI: ffff8e77c41e39c0 RDI: ffff8e77c41c5c00 [ 92.400388] RBP: ffff8e77c41e39c0 R08: ffff8e77c19f0600 R09: 0000000000000000 [ 92.400874] R10: 0000000000000000 R11: 0000000000000000 R12: ffff8e77c0814e28 [ 92.401364] R13: 0000000000000000 R14: 0000000000000000 R15: ffff8e77c41e39c0 [ 92.401849] FS: 00007f3cd75b2780(0000) GS:ffff8e7937d00000(0000) knlGS:0000000000000000 [ 92.402423] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 92.402821] CR2: 0000000000000318 CR3: 0000000103c80002 CR4:
0000000000370ee0 [ 92.403307] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 92.403793] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 92.404278] Call Trace: [ 92.404481] [ 92.404654] ? mempool_alloc+0x5d/0x160 [ 92.404939] ? terminate_walk+0x5f/0xf0 [ 92.405226] ? bio_alloc_bioset+0xbb/0x3f0 [ 92.405525] async_pmem_flush+0x17/0x80 [ 92.405806] nvdimm_flush+0x11/0x30 [ 92.406067] pmem_submit_bio+0x1e9/0x200 [ 92.406354] __submit_bio+0x80/0x120 [ 92.406621] submit_bio_noacct_nocheck+0xdc/0x2a0 [ 92.406958] submit_bio_wait+0x4e/0x80 [ 92.407234] blkdev_issue_flush+0x31/0x50 [ 92.407526] ? punt_bios_to_rescuer+0x230/0x230 [ 92.407852] blkdev_fsync+0x1e/0x30 [ 92.408112] do_fsync+0x33/0x70 [ 92.408354] __x64_sys_fsync+0xb/0x10 [ 92.408625] do_syscall_64+0x43/0x90 [ 92.408895] entry_SYSCALL_64_after_hwframe+0x46/0xb0 [ 92.409257] RIP: 0033:0x7f3cd76c6c44 Fixes 6e84200c0a29 ("virtio-pmem: Add virtio pmem driver") Acked-by: Pankaj Gupta Reviewed-by: Dan Williams Signed-off-by: Jason Wang Message-Id: <20220628083430.61856-1-jasowang@redhat.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/nvdimm/virtio_pmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c index 995b6cdc67ede8..48f8327d0431fa 100644 --- a/drivers/nvdimm/virtio_pmem.c +++ b/drivers/nvdimm/virtio_pmem.c @@ -81,6 +81,7 @@ static int virtio_pmem_probe(struct virtio_device *vdev) ndr_desc.res = &res; ndr_desc.numa_node = nid; ndr_desc.flush = async_pmem_flush; + ndr_desc.provider_data = vdev; set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); set_bit(ND_REGION_ASYNC, &ndr_desc.flags); nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); @@ -89,7 +90,6 @@ static int virtio_pmem_probe(struct virtio_device *vdev) err = -ENXIO; goto out_nd; } - nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent); return 0; out_nd: nvdimm_bus_unregister(vpmem->nvdimm_bus); From e66eb603b9c664690f29d4978b74a4664226951d Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 28 Jun 2022 16:34:30 +0800 Subject: [PATCH 0176/1250] virtio_pmem: set device ready in probe() The NVDIMM region could be available before the virtio_device_ready() that is called by virtio_dev_probe(). This means the driver tries to use the device before DRIVER_OK, which violates the spec. Fix this by setting the device ready before nvdimm_pmem_region_create(). Note that this means the virtio_pmem_host_ack() could be triggered before the creation of the nd region; this is safe since the pmem_lock has been initialized and whether or not any available buffer is added before is validated by virtio_pmem_host_ack(). Fixes: 6e84200c0a29 ("virtio-pmem: Add virtio pmem driver") Acked-by: Pankaj Gupta Signed-off-by: Jason Wang Message-Id: <20220628083430.61856-2-jasowang@redhat.com> Signed-off-by: Michael S.
Tsirkin --- drivers/nvdimm/virtio_pmem.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c index 48f8327d0431fa..20da455d2ef637 100644 --- a/drivers/nvdimm/virtio_pmem.c +++ b/drivers/nvdimm/virtio_pmem.c @@ -84,6 +84,12 @@ static int virtio_pmem_probe(struct virtio_device *vdev) ndr_desc.provider_data = vdev; set_bit(ND_REGION_PAGEMAP, &ndr_desc.flags); set_bit(ND_REGION_ASYNC, &ndr_desc.flags); + /* + * The NVDIMM region could be available before the + * virtio_device_ready() that is called by + * virtio_dev_probe(), so we set device ready here. + */ + virtio_device_ready(vdev); nd_region = nvdimm_pmem_region_create(vpmem->nvdimm_bus, &ndr_desc); if (!nd_region) { dev_err(&vdev->dev, "failed to create nvdimm region\n"); @@ -92,6 +98,7 @@ static int virtio_pmem_probe(struct virtio_device *vdev) } return 0; out_nd: + virtio_reset_device(vdev); nvdimm_bus_unregister(vpmem->nvdimm_bus); out_vq: vdev->config->del_vqs(vdev); From 964688b32d9ada55a7fce2e650d85ef24188f73f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 17 May 2022 18:03:27 -0400 Subject: [PATCH 0177/1250] btrfs: Use a folio in wait_dev_supers() Remove a use of PageError and optimise putting the page reference twice. 
Signed-off-by: Matthew Wilcox (Oracle) --- fs/btrfs/disk-io.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4ba005c4198368..dbac856565718b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4178,7 +4178,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) max_mirrors = BTRFS_SUPER_MIRROR_MAX; for (i = 0; i < max_mirrors; i++) { - struct page *page; + struct folio *folio; ret = btrfs_sb_log_location(device, i, READ, &bytenr); if (ret == -ENOENT) { @@ -4193,27 +4193,24 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) device->commit_total_bytes) break; - page = find_get_page(device->bdev->bd_inode->i_mapping, + folio = filemap_get_folio(device->bdev->bd_inode->i_mapping, bytenr >> PAGE_SHIFT); - if (!page) { + if (!folio) { errors++; if (i == 0) primary_failed = true; continue; } - /* Page is submitted locked and unlocked once the IO completes */ - wait_on_page_locked(page); - if (PageError(page)) { + /* Folio is unlocked once the IO completes */ + folio_wait_locked(folio); + if (!folio_test_uptodate(folio)) { errors++; if (i == 0) primary_failed = true; } - /* Drop our reference */ - put_page(page); - - /* Drop the reference from the writing run */ - put_page(page); + /* Drop our reference and the one from the writing run */ + folio_put_refs(folio, 2); } /* log error, force error return */ From ba457436991abc8e3cc830cfc91e9f54b53a07ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 26 May 2022 23:19:49 -0400 Subject: [PATCH 0178/1250] buffer: Don't test folio error in block_read_full_folio() We can cache this information in a local variable instead of communicating from one part of the function to another via folio flags. 
Signed-off-by: Matthew Wilcox (Oracle) --- fs/buffer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index a0214e3f90d367..ce9844d7c10fac 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2259,6 +2259,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) unsigned int blocksize, bbits; int nr, i; int fully_mapped = 1; + bool page_error = false; VM_BUG_ON_FOLIO(folio_test_large(folio), folio); @@ -2283,8 +2284,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (iblock < lblock) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, iblock, bh, 0); - if (err) + if (err) { folio_set_error(folio); + page_error = true; + } } if (!buffer_mapped(bh)) { folio_zero_range(folio, i * blocksize, @@ -2311,7 +2314,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) * All buffers are uptodate - we can set the folio uptodate * as well. But not if get_block() returned an error. */ - if (!folio_test_error(folio)) + if (!page_error) folio_mark_uptodate(folio); folio_unlock(folio); return 0; From 2c62b172802a648cb9bad9b1ad2415a8ecbb7f41 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 26 May 2022 23:33:43 -0400 Subject: [PATCH 0179/1250] squashfs: Return the actual error from squashfs_read_folio() Since we actually know what error happened, we can report it instead of having the generic code return -EIO for pages that were unlocked without being marked uptodate. Also remove a test of PageError since we have the return value at this point. Signed-off-by: Matthew Wilcox (Oracle) --- fs/squashfs/file.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index a8e495d8eb8600..7f0904b203294c 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -454,7 +454,7 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) int expected = index == file_end ? 
(i_size_read(inode) & (msblk->block_size - 1)) : msblk->block_size; - int res; + int res = 0; void *pageaddr; TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", @@ -467,14 +467,15 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) if (index < file_end || squashfs_i(inode)->fragment_block == SQUASHFS_INVALID_BLK) { u64 block = 0; - int bsize = read_blocklist(inode, index, &block); - if (bsize < 0) + + res = read_blocklist(inode, index, &block); + if (res < 0) goto error_out; - if (bsize == 0) + if (res == 0) res = squashfs_readpage_sparse(page, expected); else - res = squashfs_readpage_block(page, block, bsize, expected); + res = squashfs_readpage_block(page, block, res, expected); } else res = squashfs_readpage_fragment(page, expected); @@ -488,11 +489,11 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) memset(pageaddr, 0, PAGE_SIZE); kunmap_atomic(pageaddr); flush_dcache_page(page); - if (!PageError(page)) + if (res == 0) SetPageUptodate(page); unlock_page(page); - return 0; + return res; } From 4188e3e2860d65a43397d4658f74136bf3297808 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 27 May 2022 11:20:56 -0400 Subject: [PATCH 0180/1250] hostfs: Handle page write errors correctly If a page can't be written back, we need to call mapping_set_error(), not clear the page's Uptodate flag. Also remove the clearing of PageError on success; that flag is used for read errors, not write errors. 
Signed-off-by: Matthew Wilcox (Oracle) --- fs/hostfs/hostfs_kern.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cc1bc6f93a0101..07881b76d42f99 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -416,15 +416,15 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc) err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); if (err != count) { - ClearPageUptodate(page); + if (err >= 0) + err = -EIO; + mapping_set_error(mapping, err); goto out; } if (base > inode->i_size) inode->i_size = base; - if (PageError(page)) - ClearPageError(page); err = 0; out: From 437084d7c5b825e714d0b50ba6624e6ba840fa32 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 27 May 2022 11:29:24 -0400 Subject: [PATCH 0181/1250] ocfs2: Use filemap_write_and_wait_range() in ocfs2_cow_sync_writeback() Remove the open-coding of filemap_fdatawait_range(). Signed-off-by: Matthew Wilcox (Oracle) --- fs/ocfs2/refcounttree.c | 42 ++++++----------------------------------- 1 file changed, 6 insertions(+), 36 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e04358a46b6805..1358981e80a365 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3146,48 +3146,18 @@ int ocfs2_cow_sync_writeback(struct super_block *sb, struct inode *inode, u32 cpos, u32 num_clusters) { - int ret = 0; - loff_t offset, end, map_end; - pgoff_t page_index; - struct page *page; + int ret; + loff_t start, end; if (ocfs2_should_order_data(inode)) return 0; - offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; - end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); + start = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; + end = start + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits) - 1; - ret = filemap_fdatawrite_range(inode->i_mapping, - offset, end - 1); - if (ret < 0) { + ret = 
filemap_write_and_wait_range(inode->i_mapping, start, end); + if (ret < 0) mlog_errno(ret); - return ret; - } - - while (offset < end) { - page_index = offset >> PAGE_SHIFT; - map_end = ((loff_t)page_index + 1) << PAGE_SHIFT; - if (map_end > end) - map_end = end; - - page = find_or_create_page(inode->i_mapping, - page_index, GFP_NOFS); - BUG_ON(!page); - - wait_on_page_writeback(page); - if (PageError(page)) { - ret = -EIO; - mlog_errno(ret); - } else - mark_page_accessed(page); - - unlock_page(page); - put_page(page); - page = NULL; - offset = map_end; - if (ret) - break; - } return ret; } From 0ba02b002594cc1809da3a05a44bb5b9654448f6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 18 May 2022 08:54:42 -0400 Subject: [PATCH 0182/1250] cramfs: read_mapping_page() is synchronous Since commit 67f9fd91f93c, the code to wait for the read to complete has been dead. That commit wrongly stated that the read was synchronous already; this seems to have been a confusion about which ->readpage operation was being called. Instead of reintroducing an asynchronous version of read_mapping_page(), call the readahead code directly to submit all reads first before waiting for them in read_mapping_page(). Signed-off-by: Matthew Wilcox (Oracle) --- fs/cramfs/inode.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 7ae59a6afc5c19..61ccf7722fc3c4 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -183,6 +183,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; + struct file_ra_state ra; struct page *pages[BLKS_PER_BUF]; unsigned i, blocknr, buffer; unsigned long devsize; @@ -212,6 +213,9 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. 
*/ + file_ra_state_init(&ra, mapping); + page_cache_sync_readahead(mapping, &ra, NULL, blocknr, BLKS_PER_BUF); + for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = NULL; @@ -224,19 +228,6 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, pages[i] = page; } - for (i = 0; i < BLKS_PER_BUF; i++) { - struct page *page = pages[i]; - - if (page) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - /* asynchronous error */ - put_page(page); - pages[i] = NULL; - } - } - } - buffer = next_buffer; next_buffer = NEXT_BUFFER(buffer); buffer_blocknr[buffer] = blocknr; From 1cf29f882fa8e28e18cccc9b0a7e94e391b3291f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 17 May 2022 23:36:55 -0400 Subject: [PATCH 0183/1250] block: Simplify read_part_sector() That rather complicated expression is just trying to find the offset of this sector within a page, and there are easier ways to express that. Signed-off-by: Matthew Wilcox (Oracle) --- block/partitions/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index a9a51bac42df8b..52871fa224eeb2 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -718,8 +718,7 @@ void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) goto out; p->v = page; - return (unsigned char *)page_address(page) + - ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT); + return page_address(page) + offset_in_page(n * SECTOR_SIZE); out: p->v = NULL; return NULL; From a340b79b299109ded6928157dc24cf0fb6a90823 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 17 May 2022 23:38:37 -0400 Subject: [PATCH 0184/1250] block: Handle partition read errors more consistently Set p->v to NULL if we try to read beyond the end of the disk, just like we do if we get an error returned from trying to read the disk. 
Signed-off-by: Matthew Wilcox (Oracle) --- block/partitions/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 52871fa224eeb2..58034dd2d2155f 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -709,7 +709,7 @@ void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; - return NULL; + goto out; } page = read_mapping_page(mapping, From 4639d0da923efd6704974893ba19eb1aaf396538 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 17 May 2022 23:40:45 -0400 Subject: [PATCH 0185/1250] block: Use PAGE_SECTORS_SHIFT The bare use of '9' confuses some people. We also don't need this cast, since the compiler does exactly that cast for us. Signed-off-by: Matthew Wilcox (Oracle) --- block/partitions/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 58034dd2d2155f..269c86523e67de 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -712,8 +712,7 @@ void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) goto out; } - page = read_mapping_page(mapping, - (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL); + page = read_mapping_page(mapping, n >> PAGE_SECTORS_SHIFT, NULL); if (IS_ERR(page)) goto out; From 5b15f72a828b44a06d50ab739e89ac492298bec4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 17 May 2022 23:43:35 -0400 Subject: [PATCH 0186/1250] block: Convert read_part_sector() to use a folio This relatively straightforward conversion saves a call to compound_head() hidden inside put_page(). 
Signed-off-by: Matthew Wilcox (Oracle) --- block/partitions/check.h | 4 ++-- block/partitions/core.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/partitions/check.h b/block/partitions/check.h index 4ffa2359b1a37e..8d70a880c3720a 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -24,13 +24,13 @@ struct parsed_partitions { }; typedef struct { - struct page *v; + struct folio *v; } Sector; void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p); static inline void put_dev_sector(Sector p) { - put_page(p.v); + folio_put(p.v); } static inline void diff --git a/block/partitions/core.c b/block/partitions/core.c index 269c86523e67de..e103ad08a948d6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -705,19 +705,19 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { struct address_space *mapping = state->disk->part0->bd_inode->i_mapping; - struct page *page; + struct folio *folio; if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; goto out; } - page = read_mapping_page(mapping, n >> PAGE_SECTORS_SHIFT, NULL); - if (IS_ERR(page)) + folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL); + if (IS_ERR(folio)) goto out; - p->v = page; - return page_address(page) + offset_in_page(n * SECTOR_SIZE); + p->v = folio; + return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE); out: p->v = NULL; return NULL; From fa19fbd23186e43714bf7694ec83b7a519e618c0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 11:12:16 -0400 Subject: [PATCH 0187/1250] befs: Convert befs_symlink_read_folio() to use a folio This is a straightforward conversion from the page APIs to the folio APIs. Symlinks are not allowed to be larger than PAGE_SIZE, so there is little work to do here. 
Signed-off-by: Matthew Wilcox (Oracle) --- fs/befs/linuxvfs.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index be383fa46b12a8..32749fcee090ac 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -108,8 +108,7 @@ static const struct export_operations befs_export_operations = { * passes it the address of befs_get_block, for mapping file * positions to disk blocks. */ -static int -befs_read_folio(struct file *file, struct folio *folio) +static int befs_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, befs_get_block); } @@ -470,13 +469,12 @@ befs_destroy_inodecache(void) */ static int befs_symlink_read_folio(struct file *unused, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct befs_inode_info *befs_ino = BEFS_I(inode); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - char *link = page_address(page); + char *link = folio_address(folio); if (len == 0 || len > PAGE_SIZE) { befs_error(sb, "Long symlink with illegal length"); @@ -489,12 +487,12 @@ static int befs_symlink_read_folio(struct file *unused, struct folio *folio) goto fail; } link[len - 1] = '\0'; - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_set_error(folio); + folio_unlock(folio); return -EIO; } From 1a6b7e5cb5504bf4f0f4e63b8bedaff8aad5798b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 11:12:16 -0400 Subject: [PATCH 0188/1250] coda: Convert coda_symlink_filler() to use a folio This is a straightforward conversion from the page APIs to the folio APIs. Symlinks are not allowed to be larger than PAGE_SIZE, so there is little work to do here. 
Signed-off-by: Matthew Wilcox (Oracle) --- fs/coda/symlink.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index 8adf810424986b..ccdbec388091ab 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -22,25 +22,24 @@ static int coda_symlink_filler(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct inode *inode = folio->mapping->host; int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; - char *p = page_address(page); + char *p = folio_address(folio); cii = ITOC(inode); error = venus_readlink(inode->i_sb, &cii->c_fid, p, &len); if (error) goto fail; - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_set_error(folio); + folio_unlock(folio); return error; } From 6e2a48222bc97d4028b1ff8b8bfdfadec7a72923 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 11:12:16 -0400 Subject: [PATCH 0189/1250] freevxfs: Convert vxfs_immed_read_folio() to use a folio Reorganise the file to remove the forward declaration. Use folios throughout vxfs_immed_read_folio(). Use memcpy_to_page() instead of an open-coded kmap()/kunmap(). Remove flush_dcache_page() as this is embedded in memcpy_to_page(). Use folio_pos() instead of opencoding it. Handle multi-page folios. Signed-off-by: Matthew Wilcox (Oracle) --- fs/freevxfs/vxfs_immed.c | 43 ++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index c2ef9f0debbde7..9b49ec36e66708 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -13,16 +13,6 @@ #include "vxfs_extern.h" #include "vxfs_inode.h" - -static int vxfs_immed_read_folio(struct file *, struct folio *); - -/* - * Address space operations for immed files and directories. 
- */ -const struct address_space_operations vxfs_immed_aops = { - .read_folio = vxfs_immed_read_folio, -}; - /** * vxfs_immed_read_folio - read part of an immed inode into pagecache * @file: file context (unused) @@ -30,7 +20,7 @@ const struct address_space_operations vxfs_immed_aops = { * * Description: * vxfs_immed_read_folio reads a part of the immed area of the - * file that hosts @pp into the pagecache. + * file that hosts @folio into the pagecache. * * Returns: * Zero on success, else a negative error code. @@ -38,21 +28,26 @@ const struct address_space_operations vxfs_immed_aops = { * Locking status: * @folio is locked and will be unlocked. */ -static int -vxfs_immed_read_folio(struct file *fp, struct folio *folio) +static int vxfs_immed_read_folio(struct file *fp, struct folio *folio) { - struct page *pp = &folio->page; - struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); - u_int64_t offset = (u_int64_t)pp->index << PAGE_SHIFT; - caddr_t kaddr; + struct vxfs_inode_info *vip = VXFS_INO(folio->mapping->host); + void *src = vip->vii_immed.vi_immed + folio_pos(folio); + unsigned long i; - kaddr = kmap(pp); - memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE); - kunmap(pp); - - flush_dcache_page(pp); - SetPageUptodate(pp); - unlock_page(pp); + for (i = 0; i < folio_nr_pages(folio); i++) { + memcpy_to_page(folio_page(folio, i), 0, src, PAGE_SIZE); + src += PAGE_SIZE; + } + + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; } + +/* + * Address space operations for immed files and directories. + */ +const struct address_space_operations vxfs_immed_aops = { + .read_folio = vxfs_immed_read_folio, +}; From 71864cbf4617f06ff2434235049e9c3daa2806a6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 29 Apr 2022 11:12:16 -0400 Subject: [PATCH 0190/1250] ocfs2: Convert ocfs2_read_folio() to use a folio Use the folio API throughout. 
There are a few places where we convert back to a page to call into the rest of the filesystem, so folio usage needs to be pushed down to those functions later. Signed-off-by: Matthew Wilcox (Oracle) --- fs/ocfs2/aops.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 35d40a67204c48..767df51f8657a6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -277,16 +277,14 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) static int ocfs2_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ocfs2_inode_info *oi = OCFS2_I(inode); - loff_t start = (loff_t)page->index << PAGE_SHIFT; + loff_t start = folio_pos(folio); int ret, unlock = 1; - trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, - (page ? page->index : 0)); + trace_ocfs2_readpage((unsigned long long)oi->ip_blkno, folio->index); - ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page); + ret = ocfs2_inode_lock_with_page(inode, NULL, 0, &folio->page); if (ret != 0) { if (ret == AOP_TRUNCATED_PAGE) unlock = 0; @@ -296,11 +294,11 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) if (down_read_trylock(&oi->ip_alloc_sem) == 0) { /* - * Unlock the page and cycle ip_alloc_sem so that we don't + * Unlock the folio and cycle ip_alloc_sem so that we don't * busyloop waiting for ip_alloc_sem to unlock */ ret = AOP_TRUNCATED_PAGE; - unlock_page(page); + folio_unlock(folio); unlock = 0; down_read(&oi->ip_alloc_sem); up_read(&oi->ip_alloc_sem); @@ -313,21 +311,21 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) * block_read_full_folio->get_block freaks out if it is asked to read * beyond the end of a file, so we check here. 
Callers * (generic_file_read, vm_ops->fault) are clever enough to check i_size - * and notice that the page they just read isn't needed. + * and notice that the folio they just read isn't needed. * * XXX sys_readahead() seems to get that wrong? */ if (start >= i_size_read(inode)) { - zero_user(page, 0, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); ret = 0; goto out_alloc; } if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - ret = ocfs2_readpage_inline(inode, page); + ret = ocfs2_readpage_inline(inode, &folio->page); else - ret = block_read_full_folio(page_folio(page), ocfs2_get_block); + ret = block_read_full_folio(folio, ocfs2_get_block); unlock = 0; out_alloc: @@ -336,7 +334,7 @@ static int ocfs2_read_folio(struct file *file, struct folio *folio) ocfs2_inode_unlock(inode, 0); out: if (unlock) - unlock_page(page); + folio_unlock(folio); return ret; } From d862e2d593685f8a895202493f1a059932cdb0e0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 13 May 2022 21:21:11 -0400 Subject: [PATCH 0191/1250] gfs2: Convert gfs2_jhead_process_page() to use a folio Use folio_put_refs() to perform only one atomic operation instead of two. The other changes are straightforward conversions from page APIs to their folio equivalents. Signed-off-by: Matthew Wilcox (Oracle) --- fs/gfs2/lops.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 6ba51cbb94cf20..1f67d37cd225c8 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -452,36 +452,36 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, * @head: The journal head to start from * @done: If set, perform only cleanup, else search and set if found. * - * Find the page with 'index' in the journal's mapping. Search the page for + * Find the folio with 'index' in the journal's mapping. Search the folio for * the journal head if requested (cleanup == false). 
Release refs on the - * page so the page cache can reclaim it (put_page() twice). We grabbed a - * reference on this page two times, first when we did a find_or_create_page() - * to obtain the page to add it to the bio and second when we do a - * find_get_page() here to get the page to wait on while I/O on it is being + * folio so the page cache can reclaim it. We grabbed a + * reference on this folio twice, first when we did a find_or_create_page() + * to obtain the folio to add it to the bio and second when we do a + * filemap_get_folio() here to get the folio to wait on while I/O on it is being * completed. - * This function is also used to free up a page we might've grabbed but not + * This function is also used to free up a folio we might've grabbed but not * used. Maybe we added it to a bio, but not submitted it for I/O. Or we * submitted the I/O, but we already found the jhead so we only need to drop - * our references to the page. + * our references to the folio. */ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, struct gfs2_log_header_host *head, bool *done) { - struct page *page; + struct folio *folio; - page = find_get_page(jd->jd_inode->i_mapping, index); - wait_on_page_locked(page); + folio = filemap_get_folio(jd->jd_inode->i_mapping, index); - if (PageError(page)) + folio_wait_locked(folio); + if (folio_test_error(folio)) *done = true; if (!*done) - *done = gfs2_jhead_pg_srch(jd, head, page); + *done = gfs2_jhead_pg_srch(jd, head, &folio->page); - put_page(page); /* Once for find_get_page */ - put_page(page); /* Once more for find_or_create_page */ + /* filemap_get_folio() and the earlier find_or_create_page() */ + folio_put_refs(folio, 2); } static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) From 1662afcecae535147b1cb143fde8d99a6a98d8fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 17 May 2022 18:06:23 -0400 Subject: [PATCH 0192/1250] ext2: Use a folio in ext2_get_page() 
Remove a call to read_mapping_page(). Signed-off-by: Matthew Wilcox (Oracle) --- fs/ext2/dir.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 8326b63f0b7071..8f597753ac129d 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -200,18 +200,19 @@ static struct page * ext2_get_page(struct inode *dir, unsigned long n, int quiet, void **page_addr) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_mapping_page(mapping, n, NULL); - if (!IS_ERR(page)) { - *page_addr = kmap_local_page(page); - if (unlikely(!PageChecked(page))) { - if (!ext2_check_page(page, quiet, *page_addr)) - goto fail; - } + struct folio *folio = read_mapping_folio(mapping, n, NULL); + + if (IS_ERR(folio)) + return &folio->page; + *page_addr = kmap_local_folio(folio, n & (folio_nr_pages(folio) - 1)); + if (unlikely(!folio_test_checked(folio))) { + if (!ext2_check_page(&folio->page, quiet, *page_addr)) + goto fail; } - return page; + return &folio->page; fail: - ext2_put_page(page, *page_addr); + ext2_put_page(&folio->page, *page_addr); return ERR_PTR(-EIO); } From 1a22e12f94c9f647cdd8e4e56bc313043e9eb5c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Jun 2022 15:37:50 -0400 Subject: [PATCH 0193/1250] secretmem: Remove isolate_page The isolate_page operation is never called for filesystems, only for device drivers which call SetPageMovable. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand --- mm/secretmem.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 206ed6b40c1d0f..1c7f1775b56e7f 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -133,11 +133,6 @@ static const struct file_operations secretmem_fops = { .mmap = secretmem_mmap, }; -static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode) -{ - return false; -} - static int secretmem_migratepage(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -155,7 +150,6 @@ const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, .migratepage = secretmem_migratepage, - .isolate_page = secretmem_isolate_page, }; static int secretmem_setattr(struct user_namespace *mnt_userns, From b361f39863ed2d6ca644a2d53638c3fd87d32d14 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 7 Jun 2022 15:38:48 -0400 Subject: [PATCH 0194/1250] mm: Convert all PageMovable users to movable_operations These drivers are rather uncomfortably hammered into the address_space_operations hole. They aren't filesystems and don't behave like filesystems. They just need their own movable_operations structure, which we can point to directly from page->mapping. 
Signed-off-by: Matthew Wilcox (Oracle) --- Documentation/filesystems/locking.rst | 4 - Documentation/filesystems/vfs.rst | 12 --- Documentation/vm/page_migration.rst | 113 +++----------------------- arch/powerpc/platforms/pseries/cmm.c | 60 +------------- drivers/misc/vmw_balloon.c | 61 +------------- drivers/virtio/virtio_balloon.c | 47 +---------- include/linux/balloon_compaction.h | 6 +- include/linux/fs.h | 2 - include/linux/migrate.h | 56 +++++++++++-- include/linux/page-flags.h | 2 +- include/uapi/linux/magic.h | 4 - mm/balloon_compaction.c | 10 +-- mm/compaction.c | 29 +++---- mm/migrate.c | 24 +++--- mm/util.c | 4 +- mm/z3fold.c | 84 ++----------------- mm/zsmalloc.c | 102 +++++------------------ 17 files changed, 134 insertions(+), 486 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index c0fe711f14d3f1..9963d9600b7172 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -252,9 +252,7 @@ prototypes:: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); - bool (*isolate_page) (struct page *, isolate_mode_t); int (*migratepage)(struct address_space *, struct page *, struct page *); - void (*putback_page) (struct page *); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); @@ -280,9 +278,7 @@ invalidate_folio: yes exclusive release_folio: yes free_folio: yes direct_IO: -isolate_page: yes migratepage: yes (both) -putback_page: yes launder_folio: yes is_partially_uptodate: yes error_remove_page: yes diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index a08c652467d7c0..b51665cdabc46a 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -737,12 +737,8 @@ cache in your filesystem. 
The following members are defined: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); - /* isolate a page for migration */ - bool (*isolate_page) (struct page *, isolate_mode_t); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); - /* put migration-failed page back to right list */ - void (*putback_page) (struct page *); int (*launder_folio) (struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, @@ -930,11 +926,6 @@ cache in your filesystem. The following members are defined: data directly between the storage and the application's address space. -``isolate_page`` - Called by the VM when isolating a movable non-lru page. If page - is successfully isolated, VM marks the page as PG_isolated via - __SetPageIsolated. - ``migrate_page`` This is used to compact the physical memory usage. If the VM wants to relocate a page (maybe off a memory card that is @@ -942,9 +933,6 @@ cache in your filesystem. The following members are defined: page to this function. migrate_page should transfer any private data across and update any references that it has to the page. -``putback_page`` - Called by the VM when isolated page's migration fails. - ``launder_folio`` Called before freeing a folio - it writes back the dirty folio. To prevent redirtying the folio, it is kept locked during the diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 8c5cb8147e55e4..11493bad711252 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -152,110 +152,15 @@ Steps: Non-LRU page migration ====================== -Although migration originally aimed for reducing the latency of memory accesses -for NUMA, compaction also uses migration to create high-order pages. 
+Although migration originally aimed for reducing the latency of memory +accesses for NUMA, compaction also uses migration to create high-order +pages. For compaction purposes, it is also useful to be able to move +non-LRU pages, such as zsmalloc and virtio-balloon pages. -Current problem of the implementation is that it is designed to migrate only -*LRU* pages. However, there are potential non-LRU pages which can be migrated -in drivers, for example, zsmalloc, virtio-balloon pages. - -For virtio-balloon pages, some parts of migration code path have been hooked -up and added virtio-balloon specific functions to intercept migration logics. -It's too specific to a driver so other drivers who want to make their pages -movable would have to add their own specific hooks in the migration path. - -To overcome the problem, VM supports non-LRU page migration which provides -generic functions for non-LRU movable pages without driver specific hooks -in the migration path. - -If a driver wants to make its pages movable, it should define three functions -which are function pointers of struct address_space_operations. - -1. ``bool (*isolate_page) (struct page *page, isolate_mode_t mode);`` - - What VM expects from isolate_page() function of driver is to return *true* - if driver isolates the page successfully. On returning true, VM marks the page - as PG_isolated so concurrent isolation in several CPUs skip the page - for isolation. If a driver cannot isolate the page, it should return *false*. - - Once page is successfully isolated, VM uses page.lru fields so driver - shouldn't expect to preserve values in those fields. - -2. ``int (*migratepage) (struct address_space *mapping,`` -| ``struct page *newpage, struct page *oldpage, enum migrate_mode);`` - - After isolation, VM calls migratepage() of driver with the isolated page. - The function of migratepage() is to move the contents of the old page to the - new page - and set up fields of struct page newpage. 
Keep in mind that you should - indicate to the VM the oldpage is no longer movable via __ClearPageMovable() - under page_lock if you migrated the oldpage successfully and returned - MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver - can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time - because VM interprets -EAGAIN as "temporary migration failure". On returning - any error except -EAGAIN, VM will give up the page migration without - retrying. - - Driver shouldn't touch the page.lru field while in the migratepage() function. - -3. ``void (*putback_page)(struct page *);`` - - If migration fails on the isolated page, VM should return the isolated page - to the driver so VM calls the driver's putback_page() with the isolated page. - In this function, the driver should put the isolated page back into its own data - structure. - -Non-LRU movable page flags - - There are two page flags for supporting non-LRU movable page. - - * PG_movable - - Driver should use the function below to make page movable under page_lock:: - - void __SetPageMovable(struct page *page, struct address_space *mapping) - - It needs argument of address_space for registering migration - family functions which will be called by VM. Exactly speaking, - PG_movable is not a real flag of struct page. Rather, VM - reuses the page->mapping's lower bits to represent it:: - - #define PAGE_MAPPING_MOVABLE 0x2 - page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; - - so driver shouldn't access page->mapping directly. Instead, driver should - use page_mapping() which masks off the low two bits of page->mapping under - page lock so it can get the right struct address_space. - - For testing of non-LRU movable pages, VM supports __PageMovable() function. - However, it doesn't guarantee to identify non-LRU movable pages because - the page->mapping field is unified with other variables in struct page. 
- If the driver releases the page after isolation by VM, page->mapping - doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set - (look at __ClearPageMovable). But __PageMovable() is cheap to call whether - page is LRU or non-LRU movable once the page has been isolated because LRU - pages can never have PAGE_MAPPING_MOVABLE set in page->mapping. It is also - good for just peeking to test non-LRU movable pages before more expensive - checking with lock_page() in pfn scanning to select a victim. - - For guaranteeing non-LRU movable page, VM provides PageMovable() function. - Unlike __PageMovable(), PageMovable() validates page->mapping and - mapping->a_ops->isolate_page under lock_page(). The lock_page() prevents - sudden destroying of page->mapping. - - Drivers using __SetPageMovable() should clear the flag via - __ClearMovablePage() under page_lock() before the releasing the page. - - * PG_isolated - - To prevent concurrent isolation among several CPUs, VM marks isolated page - as PG_isolated under lock_page(). So if a CPU encounters PG_isolated - non-LRU movable page, it can skip it. Driver doesn't need to manipulate the - flag because VM will set/clear it automatically. Keep in mind that if the - driver sees a PG_isolated page, it means the page has been isolated by the - VM so it shouldn't touch the page.lru field. - The PG_isolated flag is aliased with the PG_reclaim flag so drivers - shouldn't use PG_isolated for its own purposes. +If a driver wants to make its pages movable, it should define a struct +movable_operations. It then needs to call __SetPageMovable() on each +page that it may be able to move. This uses the ``page->mapping`` field, +so this field is not available for the driver to use for other purposes. Monitoring Migration ===================== @@ -286,3 +191,5 @@ THP_MIGRATION_FAIL and PGMIGRATE_FAIL to increase. Christoph Lameter, May 8, 2006. Minchan Kim, Mar 28, 2016. + +.. 
kernel-doc:: include/linux/migrate.h diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 15ed8206c4630d..5f4037c1d7fe80 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -19,9 +19,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -500,19 +497,6 @@ static struct notifier_block cmm_mem_nb = { }; #ifdef CONFIG_BALLOON_COMPACTION -static struct vfsmount *balloon_mnt; - -static int cmm_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, PPC_CMM_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type balloon_fs = { - .name = "ppc-cmm", - .init_fs_context = cmm_init_fs_context, - .kill_sb = kill_anon_super, -}; - static int cmm_migratepage(struct balloon_dev_info *b_dev_info, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -564,47 +548,13 @@ static int cmm_migratepage(struct balloon_dev_info *b_dev_info, return MIGRATEPAGE_SUCCESS; } -static int cmm_balloon_compaction_init(void) +static void cmm_balloon_compaction_init(void) { - int rc; - balloon_devinfo_init(&b_dev_info); b_dev_info.migratepage = cmm_migratepage; - - balloon_mnt = kern_mount(&balloon_fs); - if (IS_ERR(balloon_mnt)) { - rc = PTR_ERR(balloon_mnt); - balloon_mnt = NULL; - return rc; - } - - b_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); - if (IS_ERR(b_dev_info.inode)) { - rc = PTR_ERR(b_dev_info.inode); - b_dev_info.inode = NULL; - kern_unmount(balloon_mnt); - balloon_mnt = NULL; - return rc; - } - - b_dev_info.inode->i_mapping->a_ops = &balloon_aops; - return 0; -} -static void cmm_balloon_compaction_deinit(void) -{ - if (b_dev_info.inode) - iput(b_dev_info.inode); - b_dev_info.inode = NULL; - kern_unmount(balloon_mnt); - balloon_mnt = NULL; } #else /* CONFIG_BALLOON_COMPACTION */ -static int cmm_balloon_compaction_init(void) -{ - return 0; -} - -static void cmm_balloon_compaction_deinit(void) +static void 
cmm_balloon_compaction_init(void) { } #endif /* CONFIG_BALLOON_COMPACTION */ @@ -622,9 +572,7 @@ static int cmm_init(void) if (!firmware_has_feature(FW_FEATURE_CMO) && !simulate) return -EOPNOTSUPP; - rc = cmm_balloon_compaction_init(); - if (rc) - return rc; + cmm_balloon_compaction_init(); rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) @@ -658,7 +606,6 @@ static int cmm_init(void) out_oom_notifier: unregister_oom_notifier(&cmm_oom_nb); out_balloon_compaction: - cmm_balloon_compaction_deinit(); return rc; } @@ -677,7 +624,6 @@ static void cmm_exit(void) unregister_memory_notifier(&cmm_mem_nb); cmm_free_pages(atomic_long_read(&loaned_pages)); cmm_unregister_sysfs(&cmm_dev); - cmm_balloon_compaction_deinit(); } /** diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 086ce77d9074e5..85dd6aa33df69f 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -29,8 +29,6 @@ #include #include #include -#include -#include #include #include #include @@ -1730,20 +1728,6 @@ static inline void vmballoon_debugfs_exit(struct vmballoon *b) #ifdef CONFIG_BALLOON_COMPACTION - -static int vmballoon_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, BALLOON_VMW_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type vmballoon_fs = { - .name = "balloon-vmware", - .init_fs_context = vmballoon_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static struct vfsmount *vmballoon_mnt; - /** * vmballoon_migratepage() - migrates a balloon page. * @b_dev_info: balloon device information descriptor. @@ -1862,21 +1846,6 @@ static int vmballoon_migratepage(struct balloon_dev_info *b_dev_info, return ret; } -/** - * vmballoon_compaction_deinit() - removes compaction related data. - * - * @b: pointer to the balloon. 
- */ -static void vmballoon_compaction_deinit(struct vmballoon *b) -{ - if (!IS_ERR(b->b_dev_info.inode)) - iput(b->b_dev_info.inode); - - b->b_dev_info.inode = NULL; - kern_unmount(vmballoon_mnt); - vmballoon_mnt = NULL; -} - /** * vmballoon_compaction_init() - initialized compaction for the balloon. * @@ -1888,33 +1857,15 @@ static void vmballoon_compaction_deinit(struct vmballoon *b) * * Return: zero on success or error code on failure. */ -static __init int vmballoon_compaction_init(struct vmballoon *b) +static __init void vmballoon_compaction_init(struct vmballoon *b) { - vmballoon_mnt = kern_mount(&vmballoon_fs); - if (IS_ERR(vmballoon_mnt)) - return PTR_ERR(vmballoon_mnt); - b->b_dev_info.migratepage = vmballoon_migratepage; - b->b_dev_info.inode = alloc_anon_inode(vmballoon_mnt->mnt_sb); - - if (IS_ERR(b->b_dev_info.inode)) - return PTR_ERR(b->b_dev_info.inode); - - b->b_dev_info.inode->i_mapping->a_ops = &balloon_aops; - return 0; } #else /* CONFIG_BALLOON_COMPACTION */ - -static void vmballoon_compaction_deinit(struct vmballoon *b) -{ -} - -static int vmballoon_compaction_init(struct vmballoon *b) +static inline void vmballoon_compaction_init(struct vmballoon *b) { - return 0; } - #endif /* CONFIG_BALLOON_COMPACTION */ static int __init vmballoon_init(void) @@ -1939,9 +1890,7 @@ static int __init vmballoon_init(void) * balloon_devinfo_init() . 
*/ balloon_devinfo_init(&balloon.b_dev_info); - error = vmballoon_compaction_init(&balloon); - if (error) - goto fail; + vmballoon_compaction_init(&balloon); INIT_LIST_HEAD(&balloon.huge_pages); spin_lock_init(&balloon.comm_lock); @@ -1958,7 +1907,6 @@ static int __init vmballoon_init(void) return 0; fail: vmballoon_unregister_shrinker(&balloon); - vmballoon_compaction_deinit(&balloon); return error; } @@ -1985,8 +1933,5 @@ static void __exit vmballoon_exit(void) */ vmballoon_send_start(&balloon, 0); vmballoon_pop(&balloon); - - /* Only once we popped the balloon, compaction can be deinit */ - vmballoon_compaction_deinit(&balloon); } module_exit(vmballoon_exit); diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index b9737da6c4ddbc..bd360b91e9d3ed 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -17,9 +17,6 @@ #include #include #include -#include -#include -#include #include /* @@ -42,10 +39,6 @@ (1 << (VIRTIO_BALLOON_HINT_BLOCK_ORDER + PAGE_SHIFT)) #define VIRTIO_BALLOON_HINT_BLOCK_PAGES (1 << VIRTIO_BALLOON_HINT_BLOCK_ORDER) -#ifdef CONFIG_BALLOON_COMPACTION -static struct vfsmount *balloon_mnt; -#endif - enum virtio_balloon_vq { VIRTIO_BALLOON_VQ_INFLATE, VIRTIO_BALLOON_VQ_DEFLATE, @@ -805,18 +798,6 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, return MIGRATEPAGE_SUCCESS; } - -static int balloon_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, BALLOON_KVM_MAGIC) ? 
0 : -ENOMEM; -} - -static struct file_system_type balloon_fs = { - .name = "balloon-kvm", - .init_fs_context = balloon_init_fs_context, - .kill_sb = kill_anon_super, -}; - #endif /* CONFIG_BALLOON_COMPACTION */ static unsigned long shrink_free_pages(struct virtio_balloon *vb, @@ -909,19 +890,7 @@ static int virtballoon_probe(struct virtio_device *vdev) goto out_free_vb; #ifdef CONFIG_BALLOON_COMPACTION - balloon_mnt = kern_mount(&balloon_fs); - if (IS_ERR(balloon_mnt)) { - err = PTR_ERR(balloon_mnt); - goto out_del_vqs; - } - vb->vb_dev_info.migratepage = virtballoon_migratepage; - vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); - if (IS_ERR(vb->vb_dev_info.inode)) { - err = PTR_ERR(vb->vb_dev_info.inode); - goto out_kern_unmount; - } - vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops; #endif if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) { /* @@ -930,13 +899,13 @@ static int virtballoon_probe(struct virtio_device *vdev) */ if (virtqueue_get_vring_size(vb->free_page_vq) < 2) { err = -ENOSPC; - goto out_iput; + goto out_del_vqs; } vb->balloon_wq = alloc_workqueue("balloon-wq", WQ_FREEZABLE | WQ_CPU_INTENSIVE, 0); if (!vb->balloon_wq) { err = -ENOMEM; - goto out_iput; + goto out_del_vqs; } INIT_WORK(&vb->report_free_page_work, report_free_page_func); vb->cmd_id_received_cache = VIRTIO_BALLOON_CMD_ID_STOP; @@ -1030,13 +999,7 @@ static int virtballoon_probe(struct virtio_device *vdev) out_del_balloon_wq: if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_FREE_PAGE_HINT)) destroy_workqueue(vb->balloon_wq); -out_iput: -#ifdef CONFIG_BALLOON_COMPACTION - iput(vb->vb_dev_info.inode); -out_kern_unmount: - kern_unmount(balloon_mnt); out_del_vqs: -#endif vdev->config->del_vqs(vdev); out_free_vb: kfree(vb); @@ -1083,12 +1046,6 @@ static void virtballoon_remove(struct virtio_device *vdev) } remove_common(vb); -#ifdef CONFIG_BALLOON_COMPACTION - if (vb->vb_dev_info.inode) - iput(vb->vb_dev_info.inode); - - kern_unmount(balloon_mnt); -#endif 
kfree(vb); } diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index edb7f6d41faa04..5ca2d56996201d 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -57,7 +57,6 @@ struct balloon_dev_info { struct list_head pages; /* Pages enqueued & handled to Host */ int (*migratepage)(struct balloon_dev_info *, struct page *newpage, struct page *page, enum migrate_mode mode); - struct inode *inode; }; extern struct page *balloon_page_alloc(void); @@ -75,11 +74,10 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) spin_lock_init(&balloon->pages_lock); INIT_LIST_HEAD(&balloon->pages); balloon->migratepage = NULL; - balloon->inode = NULL; } #ifdef CONFIG_BALLOON_COMPACTION -extern const struct address_space_operations balloon_aops; +extern const struct movable_operations balloon_mops; /* * balloon_page_insert - insert a page into the balloon's page list and make @@ -94,7 +92,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { __SetPageOffline(page); - __SetPageMovable(page, balloon->inode->i_mapping); + __SetPageMovable(page, &balloon_mops); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae57..5d8ee3155ca2e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -367,8 +367,6 @@ struct address_space_operations { */ int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); - bool (*isolate_page)(struct page *, isolate_mode_t); - void (*putback_page)(struct page *); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 069a89e847f345..82c735ba6109f5 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -19,6 +19,43 @@ struct 
migration_target_control; */ #define MIGRATEPAGE_SUCCESS 0 +/** + * struct movable_operations - Driver page migration + * @isolate_page: + * The VM calls this function to prepare the page to be moved. The page + * is locked and the driver should not unlock it. The driver should + * return ``true`` if the page is movable and ``false`` if it is not + * currently movable. After this function returns, the VM uses the + * page->lru field, so the driver must preserve any information which + * is usually stored here. + * + * @migrate_page: + * After isolation, the VM calls this function with the isolated + * @src page. The driver should copy the contents of the + * @src page to the @dst page and set up the fields of @dst page. + * Both pages are locked. + * If page migration is successful, the driver should call + * __ClearPageMovable(@src) and return MIGRATEPAGE_SUCCESS. + * If the driver cannot migrate the page at the moment, it can return + * -EAGAIN. The VM interprets this as a temporary migration failure and + * will retry it later. Any other error value is a permanent migration + * failure and migration will not be retried. + * The driver shouldn't touch the @src->lru field while in the + * migrate_page() function. It may write to @dst->lru. + * + * @putback_page: + * If migration fails on the isolated page, the VM informs the driver + * that the page is no longer a candidate for migration by calling + * this function. The driver should put the isolated page back into + * its own data structure. 
+ */ +struct movable_operations { + bool (*isolate_page)(struct page *, isolate_mode_t); + int (*migrate_page)(struct page *dst, struct page *src, + enum migrate_mode); + void (*putback_page)(struct page *); +}; + /* Defined in mm/debug.c: */ extern const char *migrate_reason_names[MR_TYPES]; @@ -91,13 +128,13 @@ static inline int next_demotion_node(int node) #endif #ifdef CONFIG_COMPACTION -extern int PageMovable(struct page *page); -extern void __SetPageMovable(struct page *page, struct address_space *mapping); -extern void __ClearPageMovable(struct page *page); +bool PageMovable(struct page *page); +void __SetPageMovable(struct page *page, const struct movable_operations *ops); +void __ClearPageMovable(struct page *page); #else -static inline int PageMovable(struct page *page) { return 0; } +static inline bool PageMovable(struct page *page) { return false; } static inline void __SetPageMovable(struct page *page, - struct address_space *mapping) + const struct movable_operations *ops) { } static inline void __ClearPageMovable(struct page *page) @@ -110,6 +147,15 @@ static inline bool folio_test_movable(struct folio *folio) return PageMovable(&folio->page); } +static inline +const struct movable_operations *page_movable_ops(struct page *page) +{ + VM_BUG_ON(!__PageMovable(page)); + + return (const struct movable_operations *) + ((unsigned long)page->mapping - PAGE_MAPPING_MOVABLE); +} + #ifdef CONFIG_NUMA_BALANCING extern int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, int node); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e66f7aa3191df4..3f5490f6f03857 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -639,7 +639,7 @@ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable - * page and then page->mapping points a struct address_space. 
+ * page and then page->mapping points to a struct movable_operations. * * Please note that, confusingly, "page_mapping" refers to the inode * address_space which maps the page from disk; whereas "page_mapped" diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index f724129c042556..6325d1d0e90f5d 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -98,12 +98,8 @@ /* Since UDF 2.01 is ISO 13346 based... */ #define UDF_SUPER_MAGIC 0x15013346 -#define BALLOON_KVM_MAGIC 0x13661366 -#define ZSMALLOC_MAGIC 0x58295829 #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ -#define Z3FOLD_MAGIC 0x33 -#define PPC_CMM_MAGIC 0xc7571590 #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 4b8eab4b3f456c..22c96fed70b592 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -228,10 +228,8 @@ static void balloon_page_putback(struct page *page) spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); } - /* move_to_new_page() counterpart for a ballooned page */ -static int balloon_page_migrate(struct address_space *mapping, - struct page *newpage, struct page *page, +static int balloon_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { struct balloon_dev_info *balloon = balloon_page_device(page); @@ -250,11 +248,11 @@ static int balloon_page_migrate(struct address_space *mapping, return balloon->migratepage(balloon, newpage, page, mode); } -const struct address_space_operations balloon_aops = { - .migratepage = balloon_page_migrate, +const struct movable_operations balloon_mops = { + .migrate_page = balloon_page_migrate, .isolate_page = balloon_page_isolate, .putback_page = balloon_page_putback, }; -EXPORT_SYMBOL_GPL(balloon_aops); +EXPORT_SYMBOL_GPL(balloon_mops); #endif /* CONFIG_BALLOON_COMPACTION */ diff --git a/mm/compaction.c b/mm/compaction.c index 
1f89b969c12bf5..f23efba1d118cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -110,28 +110,27 @@ static void split_map_pages(struct list_head *list) } #ifdef CONFIG_COMPACTION - -int PageMovable(struct page *page) +bool PageMovable(struct page *page) { - struct address_space *mapping; + const struct movable_operations *mops; VM_BUG_ON_PAGE(!PageLocked(page), page); if (!__PageMovable(page)) - return 0; + return false; - mapping = page_mapping(page); - if (mapping && mapping->a_ops && mapping->a_ops->isolate_page) - return 1; + mops = page_movable_ops(page); + if (mops) + return true; - return 0; + return false; } EXPORT_SYMBOL(PageMovable); -void __SetPageMovable(struct page *page, struct address_space *mapping) +void __SetPageMovable(struct page *page, const struct movable_operations *mops) { VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page); - page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE); + VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page); + page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE); } EXPORT_SYMBOL(__SetPageMovable); @@ -139,12 +138,10 @@ void __ClearPageMovable(struct page *page) { VM_BUG_ON_PAGE(!PageMovable(page), page); /* - * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE - * flag so that VM can catch up released page by driver after isolation. - * With it, VM migration doesn't try to put it back. + * This page still has the type of a movable page, but it's + * actually not movable any more. 
*/ - page->mapping = (void *)((unsigned long)page->mapping & - PAGE_MAPPING_MOVABLE); + page->mapping = (void *)PAGE_MAPPING_MOVABLE; } EXPORT_SYMBOL(__ClearPageMovable); diff --git a/mm/migrate.c b/mm/migrate.c index 6c1ea61f39d804..491f0374783247 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -59,7 +59,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) { - struct address_space *mapping; + const struct movable_operations *mops; /* * Avoid burning cycles with pages that are yet under __free_pages(), @@ -97,10 +97,10 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) if (!PageMovable(page) || PageIsolated(page)) goto out_no_isolated; - mapping = page_mapping(page); - VM_BUG_ON_PAGE(!mapping, page); + mops = page_movable_ops(page); + VM_BUG_ON_PAGE(!mops, page); - if (!mapping->a_ops->isolate_page(page, mode)) + if (!mops->isolate_page(page, mode)) goto out_no_isolated; /* Driver shouldn't use PG_isolated bit of page->flags */ @@ -120,10 +120,9 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) static void putback_movable_page(struct page *page) { - struct address_space *mapping; + const struct movable_operations *mops = page_movable_ops(page); - mapping = page_mapping(page); - mapping->a_ops->putback_page(page); + mops->putback_page(page); ClearPageIsolated(page); } @@ -846,16 +845,15 @@ static int fallback_migrate_page(struct address_space *mapping, static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { - struct address_space *mapping; int rc = -EAGAIN; bool is_lru = !__PageMovable(&src->page); VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); - mapping = folio_mapping(src); - if (likely(is_lru)) { + struct address_space *mapping = folio_mapping(src); + if (!mapping) rc = migrate_page(mapping, &dst->page, &src->page, mode); else if (mapping->a_ops->migratepage) @@ -872,6 +870,8 @@ static int move_to_new_folio(struct folio *dst, 
struct folio *src, rc = fallback_migrate_page(mapping, &dst->page, &src->page, mode); } else { + const struct movable_operations *mops; + /* * In case of non-lru page, it could be released after * isolation step. In that case, we shouldn't try migration. @@ -883,8 +883,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, goto out; } - rc = mapping->a_ops->migratepage(mapping, &dst->page, - &src->page, mode); + mops = page_movable_ops(&src->page); + rc = mops->migrate_page(&dst->page, &src->page, mode); WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && !folio_test_isolated(src)); } diff --git a/mm/util.c b/mm/util.c index 0837570c922513..53af0e79d3e47a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -804,10 +804,10 @@ struct address_space *folio_mapping(struct folio *folio) return swap_address_space(folio_swap_entry(folio)); mapping = folio->mapping; - if ((unsigned long)mapping & PAGE_MAPPING_ANON) + if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) return NULL; - return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); + return mapping; } EXPORT_SYMBOL(folio_mapping); diff --git a/mm/z3fold.c b/mm/z3fold.c index f41f8b0d9e9a06..cf71da10d04e73 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -34,15 +34,11 @@ #include #include #include -#include -#include -#include #include #include #include #include #include -#include #include /* @@ -149,7 +145,6 @@ struct z3fold_header { * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release - * @inode: inode for z3fold pseudo filesystem * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. 
@@ -169,7 +164,6 @@ struct z3fold_pool { struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; struct work_struct work; - struct inode *inode; }; /* @@ -334,54 +328,6 @@ static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) } } -static int z3fold_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type z3fold_fs = { - .name = "z3fold", - .init_fs_context = z3fold_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static struct vfsmount *z3fold_mnt; -static int __init z3fold_mount(void) -{ - int ret = 0; - - z3fold_mnt = kern_mount(&z3fold_fs); - if (IS_ERR(z3fold_mnt)) - ret = PTR_ERR(z3fold_mnt); - - return ret; -} - -static void z3fold_unmount(void) -{ - kern_unmount(z3fold_mnt); -} - -static const struct address_space_operations z3fold_aops; -static int z3fold_register_migration(struct z3fold_pool *pool) -{ - pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb); - if (IS_ERR(pool->inode)) { - pool->inode = NULL; - return 1; - } - - pool->inode->i_mapping->private_data = pool; - pool->inode->i_mapping->a_ops = &z3fold_aops; - return 0; -} - -static void z3fold_unregister_migration(struct z3fold_pool *pool) -{ - if (pool->inode) - iput(pool->inode); -} - /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, struct z3fold_pool *pool, gfp_t gfp) @@ -1002,14 +948,10 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; - if (z3fold_register_migration(pool)) - goto out_rwq; INIT_WORK(&pool->work, free_pages_work); pool->ops = ops; return pool; -out_rwq: - destroy_workqueue(pool->release_wq); out_wq: destroy_workqueue(pool->compact_wq); out_unbuddied: @@ -1043,11 +985,12 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) 
destroy_workqueue(pool->compact_wq); destroy_workqueue(pool->release_wq); - z3fold_unregister_migration(pool); free_percpu(pool->unbuddied); kfree(pool); } +static const struct movable_operations z3fold_mops; + /** * z3fold_alloc() - allocates a region of a given size * @pool: z3fold pool from which to allocate @@ -1117,11 +1060,11 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, } if (can_sleep) { lock_page(page); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &z3fold_mops); unlock_page(page); } else { WARN_ON(!trylock_page(page)); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &z3fold_mops); unlock_page(page); } z3fold_page_lock(zhdr); @@ -1554,12 +1497,11 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) return false; } -static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +static int z3fold_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) { struct z3fold_header *zhdr, *new_zhdr; struct z3fold_pool *pool; - struct address_space *new_mapping; VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); @@ -1592,7 +1534,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa * so we only have to reinitialize it. 
*/ INIT_LIST_HEAD(&new_zhdr->buddy); - new_mapping = page_mapping(page); __ClearPageMovable(page); get_page(newpage); @@ -1608,7 +1549,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa spin_lock(&pool->lock); list_add(&newpage->lru, &pool->lru); spin_unlock(&pool->lock); - __SetPageMovable(newpage, new_mapping); + __SetPageMovable(newpage, &z3fold_mops); z3fold_page_unlock(new_zhdr); queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); @@ -1642,9 +1583,9 @@ static void z3fold_page_putback(struct page *page) z3fold_page_unlock(zhdr); } -static const struct address_space_operations z3fold_aops = { +static const struct movable_operations z3fold_mops = { .isolate_page = z3fold_page_isolate, - .migratepage = z3fold_page_migrate, + .migrate_page = z3fold_page_migrate, .putback_page = z3fold_page_putback, }; @@ -1746,17 +1687,11 @@ MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { - int ret; - /* * Make sure the z3fold header is not larger than the page size and * there has remaining spaces for its buddy. 
*/ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE); - ret = z3fold_mount(); - if (ret) - return ret; - zpool_register_driver(&z3fold_zpool_driver); return 0; @@ -1764,7 +1699,6 @@ static int __init init_z3fold(void) static void __exit exit_z3fold(void) { - z3fold_unmount(); zpool_unregister_driver(&z3fold_zpool_driver); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d5fc04385b8d7..71d6edcbea4887 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -59,8 +58,6 @@ #include #include #include -#include -#include #include #include #include @@ -177,10 +174,6 @@ struct zs_size_stat { static struct dentry *zs_stat_root; #endif -#ifdef CONFIG_COMPACTION -static struct vfsmount *zsmalloc_mnt; -#endif - /* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where @@ -252,7 +245,6 @@ struct zs_pool { struct dentry *stat_dentry; #endif #ifdef CONFIG_COMPACTION - struct inode *inode; struct work_struct free_work; #endif /* protect page/zspage migration */ @@ -271,6 +263,7 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ + struct zs_pool *pool; #ifdef CONFIG_COMPACTION rwlock_t lock; #endif @@ -295,8 +288,6 @@ static bool ZsHugePage(struct zspage *zspage) } #ifdef CONFIG_COMPACTION -static int zs_register_migration(struct zs_pool *pool); -static void zs_unregister_migration(struct zs_pool *pool); static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); @@ -307,10 +298,6 @@ static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static int zsmalloc_mount(void) { return 0; } -static void zsmalloc_unmount(void) {} -static int zs_register_migration(struct zs_pool *pool) { 
return 0; } -static void zs_unregister_migration(struct zs_pool *pool) {} static void migrate_lock_init(struct zspage *zspage) {} static void migrate_read_lock(struct zspage *zspage) {} static void migrate_read_unlock(struct zspage *zspage) {} @@ -1083,6 +1070,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, create_page_chain(class, zspage, pages); init_zspage(class, zspage); + zspage->pool = pool; return zspage; } @@ -1754,33 +1742,6 @@ static void lock_zspage(struct zspage *zspage) migrate_read_unlock(zspage); } -static int zs_init_fs_context(struct fs_context *fc) -{ - return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM; -} - -static struct file_system_type zsmalloc_fs = { - .name = "zsmalloc", - .init_fs_context = zs_init_fs_context, - .kill_sb = kill_anon_super, -}; - -static int zsmalloc_mount(void) -{ - int ret = 0; - - zsmalloc_mnt = kern_mount(&zsmalloc_fs); - if (IS_ERR(zsmalloc_mnt)) - ret = PTR_ERR(zsmalloc_mnt); - - return ret; -} - -static void zsmalloc_unmount(void) -{ - kern_unmount(zsmalloc_mnt); -} - static void migrate_lock_init(struct zspage *zspage) { rwlock_init(&zspage->lock); @@ -1823,6 +1784,8 @@ static void dec_zspage_isolation(struct zspage *zspage) zspage->isolated--; } +static const struct movable_operations zsmalloc_mops; + static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1843,7 +1806,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; - __SetPageMovable(newpage, page_mapping(oldpage)); + __SetPageMovable(newpage, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) @@ -1865,8 +1828,8 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) return true; } -static int zs_page_migrate(struct address_space *mapping, struct page *newpage, - 
struct page *page, enum migrate_mode mode) +static int zs_page_migrate(struct page *newpage, struct page *page, + enum migrate_mode mode) { struct zs_pool *pool; struct size_class *class; @@ -1889,14 +1852,15 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); - pool = mapping->private_data; + /* The page is locked, so this pointer must remain valid */ + zspage = get_zspage(page); + pool = zspage->pool; /* * The pool migrate_lock protects the race between zpage migration * and zs_free. */ write_lock(&pool->migrate_lock); - zspage = get_zspage(page); class = zspage_class(pool, zspage); /* @@ -1964,31 +1928,12 @@ static void zs_page_putback(struct page *page) migrate_write_unlock(zspage); } -static const struct address_space_operations zsmalloc_aops = { +static const struct movable_operations zsmalloc_mops = { .isolate_page = zs_page_isolate, - .migratepage = zs_page_migrate, + .migrate_page = zs_page_migrate, .putback_page = zs_page_putback, }; -static int zs_register_migration(struct zs_pool *pool) -{ - pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb); - if (IS_ERR(pool->inode)) { - pool->inode = NULL; - return 1; - } - - pool->inode->i_mapping->private_data = pool; - pool->inode->i_mapping->a_ops = &zsmalloc_aops; - return 0; -} - -static void zs_unregister_migration(struct zs_pool *pool) -{ - flush_work(&pool->free_work); - iput(pool->inode); -} - /* * Caller should hold page_lock of all pages in the zspage * In here, we cannot use zspage meta data. 
@@ -2032,6 +1977,11 @@ static void kick_deferred_free(struct zs_pool *pool) schedule_work(&pool->free_work); } +static void zs_flush_migration(struct zs_pool *pool) +{ + flush_work(&pool->free_work); +} + static void init_deferred_free(struct zs_pool *pool) { INIT_WORK(&pool->free_work, async_free_zspage); @@ -2043,10 +1993,12 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) do { WARN_ON(!trylock_page(page)); - __SetPageMovable(page, pool->inode->i_mapping); + __SetPageMovable(page, &zsmalloc_mops); unlock_page(page); } while ((page = get_next_page(page)) != NULL); } +#else +static inline void zs_flush_migration(struct zs_pool *pool) { } #endif /* @@ -2324,9 +2276,6 @@ struct zs_pool *zs_create_pool(const char *name) /* debug only, don't abort if it fails */ zs_pool_stat_create(pool, name); - if (zs_register_migration(pool)) - goto err; - /* * Not critical since shrinker is only used to trigger internal * defragmentation of the pool which is pretty optional thing. 
If @@ -2348,7 +2297,7 @@ void zs_destroy_pool(struct zs_pool *pool) int i; zs_unregister_shrinker(pool); - zs_unregister_migration(pool); + zs_flush_migration(pool); zs_pool_stat_destroy(pool); for (i = 0; i < ZS_SIZE_CLASSES; i++) { @@ -2380,14 +2329,10 @@ static int __init zs_init(void) { int ret; - ret = zsmalloc_mount(); - if (ret) - goto out; - ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", zs_cpu_prepare, zs_cpu_dead); if (ret) - goto hp_setup_fail; + goto out; #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); @@ -2397,8 +2342,6 @@ static int __init zs_init(void) return 0; -hp_setup_fail: - zsmalloc_unmount(); out: return ret; } @@ -2408,7 +2351,6 @@ static void __exit zs_exit(void) #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif - zsmalloc_unmount(); cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); zs_stat_exit(); From 662389777689c17a77849af822bac1677be56e37 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 09:00:16 -0400 Subject: [PATCH 0195/1250] fs: Add aops->migrate_folio Provide a folio-based replacement for aops->migratepage. Update the documentation to document migrate_folio instead of migratepage. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- Documentation/filesystems/locking.rst | 5 +++-- Documentation/filesystems/vfs.rst | 14 +++++++------- include/linux/fs.h | 4 +++- mm/compaction.c | 4 +++- mm/migrate.c | 11 +++++++---- 5 files changed, 23 insertions(+), 15 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 9963d9600b7172..4bb2627026ec83 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -252,7 +252,8 @@ prototypes:: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); int (*direct_IO)(struct kiocb *, struct iov_iter *iter); - int (*migratepage)(struct address_space *, struct page *, struct page *); + int (*migrate_folio)(struct address_space *, struct folio *dst, + struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count); int (*error_remove_page)(struct address_space *, struct page *); @@ -278,7 +279,7 @@ invalidate_folio: yes exclusive release_folio: yes free_folio: yes direct_IO: -migratepage: yes (both) +migrate_folio: yes (both) launder_folio: yes is_partially_uptodate: yes error_remove_page: yes diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index b51665cdabc46a..6cd6953e175b36 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -737,8 +737,8 @@ cache in your filesystem. 
The following members are defined: bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); - /* migrate the contents of a page to the specified target */ - int (*migratepage) (struct page *, struct page *); + int (*migrate_folio)(struct address_space *, struct folio *dst, + struct folio *src, enum migrate_mode); int (*launder_folio) (struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, @@ -926,12 +926,12 @@ cache in your filesystem. The following members are defined: data directly between the storage and the application's address space. -``migrate_page`` +``migrate_folio`` This is used to compact the physical memory usage. If the VM - wants to relocate a page (maybe off a memory card that is - signalling imminent failure) it will pass a new page and an old - page to this function. migrate_page should transfer any private - data across and update any references that it has to the page. + wants to relocate a folio (maybe from a memory device that is + signalling imminent failure) it will pass a new folio and an old + folio to this function. migrate_folio should transfer any private + data across and update any references that it has to the folio. ``launder_folio`` Called before freeing a folio - it writes back the dirty folio. diff --git a/include/linux/fs.h b/include/linux/fs.h index 5d8ee3155ca2e5..47431cf8fbb3d4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -362,9 +362,11 @@ struct address_space_operations { void (*free_folio)(struct folio *folio); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* - * migrate the contents of a page to the specified target. If + * migrate the contents of a folio to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block.
*/ + int (*migrate_folio)(struct address_space *, struct folio *dst, + struct folio *src, enum migrate_mode); int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); int (*launder_folio)(struct folio *); diff --git a/mm/compaction.c b/mm/compaction.c index f23efba1d118cb..458f49f9ab09b5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1042,7 +1042,9 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail_put; mapping = page_mapping(page); - migrate_dirty = !mapping || mapping->a_ops->migratepage; + migrate_dirty = !mapping || + mapping->a_ops->migrate_folio || + mapping->a_ops->migratepage; unlock_page(page); if (!migrate_dirty) goto isolate_fail_put; diff --git a/mm/migrate.c b/mm/migrate.c index 491f0374783247..3c3c168097dd50 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -856,14 +856,17 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, if (!mapping) rc = migrate_page(mapping, &dst->page, &src->page, mode); - else if (mapping->a_ops->migratepage) + else if (mapping->a_ops->migrate_folio) /* - * Most pages have a mapping and most filesystems - * provide a migratepage callback. Anonymous pages + * Most folios have a mapping and most filesystems + * provide a migrate_folio callback. Anonymous folios * are part of swap space which also has its own - * migratepage callback. This is the most common path + * migrate_folio callback. This is the most common path * for page migration. */ + rc = mapping->a_ops->migrate_folio(mapping, dst, src, + mode); + else if (mapping->a_ops->migratepage) rc = mapping->a_ops->migratepage(mapping, &dst->page, &src->page, mode); else From 765acf9085a1188244ec6294e3039637685c209e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 09:34:36 -0400 Subject: [PATCH 0196/1250] mm/migrate: Convert fallback_migrate_page() to fallback_migrate_folio() Use a folio throughout. 
migrate_page() will be converted to migrate_folio() later. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- mm/migrate.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 3c3c168097dd50..c5278440f74dc6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -805,11 +805,11 @@ static int writeout(struct address_space *mapping, struct page *page) /* * Default handling if a filesystem does not provide a migration function. */ -static int fallback_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +static int fallback_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - if (PageDirty(page)) { - /* Only writeback pages in full synchronous migration */ + if (folio_test_dirty(src)) { + /* Only writeback folios in full synchronous migration */ switch (mode) { case MIGRATE_SYNC: case MIGRATE_SYNC_NO_COPY: @@ -817,18 +817,18 @@ static int fallback_migrate_page(struct address_space *mapping, default: return -EBUSY; } - return writeout(mapping, page); + return writeout(mapping, &src->page); } /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) + if (folio_test_private(src) && + !filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? 
-EAGAIN : -EBUSY; - return migrate_page(mapping, newpage, page, mode); + return migrate_page(mapping, &dst->page, &src->page, mode); } /* @@ -870,8 +870,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, rc = mapping->a_ops->migratepage(mapping, &dst->page, &src->page, mode); else - rc = fallback_migrate_page(mapping, &dst->page, - &src->page, mode); + rc = fallback_migrate_folio(mapping, dst, src, mode); } else { const struct movable_operations *mops; From 7d474706ff4fb035f896710fa1274e3050afb461 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 09:41:03 -0400 Subject: [PATCH 0197/1250] mm/migrate: Convert writeout() to take a folio Use a folio throughout this function. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- mm/migrate.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index c5278440f74dc6..75b171425c4585 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -761,11 +761,10 @@ int buffer_migrate_page_norefs(struct address_space *mapping, #endif /* - * Writeback a page to clean the dirty state + * Writeback a folio to clean the dirty state */ -static int writeout(struct address_space *mapping, struct page *page) +static int writeout(struct address_space *mapping, struct folio *folio) { - struct folio *folio = page_folio(page); struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = 1, @@ -779,25 +778,25 @@ static int writeout(struct address_space *mapping, struct page *page) /* No write method for the address space */ return -EINVAL; - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) /* Someone else already triggered a write */ return -EAGAIN; /* - * A dirty page may imply that the underlying filesystem has - * the page on some queue. So the page must be clean for - * migration. 
Writeout may mean we loose the lock and the - * page state is no longer what we checked for earlier. + * A dirty folio may imply that the underlying filesystem has + * the folio on some queue. So the folio must be clean for + * migration. Writeout may mean we lose the lock and the + * folio state is no longer what we checked for earlier. * At this point we know that the migration attempt cannot * be successful. */ remove_migration_ptes(folio, folio, false); - rc = mapping->a_ops->writepage(page, &wbc); + rc = mapping->a_ops->writepage(&folio->page, &wbc); if (rc != AOP_WRITEPAGE_ACTIVATE) /* unlocked. Relock */ - lock_page(page); + folio_lock(folio); return (rc < 0) ? -EIO : -EAGAIN; } @@ -817,7 +816,7 @@ static int fallback_migrate_folio(struct address_space *mapping, default: return -EBUSY; } - return writeout(mapping, &src->page); + return writeout(mapping, src); } /* From e267a3198014ec93389b1930b776047082bb271f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 10:20:31 -0400 Subject: [PATCH 0198/1250] mm/migrate: Convert buffer_migrate_page() to buffer_migrate_folio() Use a folio throughout __buffer_migrate_folio(), add kernel-doc for buffer_migrate_folio() and buffer_migrate_folio_norefs(), move their declarations to buffer.h and switch all filesystems that have wired them up. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- block/fops.c | 2 +- fs/ext2/inode.c | 4 +- fs/ext4/inode.c | 4 +- fs/ntfs/aops.c | 6 +-- fs/ocfs2/aops.c | 2 +- include/linux/buffer_head.h | 10 +++++ include/linux/fs.h | 12 ------ mm/migrate.c | 76 ++++++++++++++++++++++--------------- 8 files changed, 65 insertions(+), 51 deletions(-) diff --git a/block/fops.c b/block/fops.c index d6b3276a6c6808..743fc46d0aad7c 100644 --- a/block/fops.c +++ b/block/fops.c @@ -417,7 +417,7 @@ const struct address_space_operations def_blk_aops = { .write_end = blkdev_write_end, .writepages = blkdev_writepages, .direct_IO = blkdev_direct_IO, - .migratepage = buffer_migrate_page_norefs, + .migrate_folio = buffer_migrate_folio_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e6b932219803ed..58a9d061f17d14 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -973,7 +973,7 @@ const struct address_space_operations ext2_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -989,7 +989,7 @@ const struct address_space_operations ext2_nobh_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 06cc688781766b..87a8b4382bce08 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3633,7 +3633,7 @@ static const struct address_space_operations ext4_aops = { .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = 
block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, @@ -3668,7 +3668,7 @@ static const struct address_space_operations ext4_da_aops = { .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 9e3964ea2ea030..5f4fb6ca6f2e93 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1659,7 +1659,7 @@ const struct address_space_operations ntfs_normal_aops = { .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ .bmap = ntfs_bmap, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_compressed_aops = { .writepage = ntfs_writepage, .dirty_folio = block_dirty_folio, #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -1688,7 +1688,7 @@ const struct address_space_operations ntfs_mst_aops = { .writepage = ntfs_writepage, /* Write dirty page to disk. 
*/ .dirty_folio = filemap_dirty_folio, #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 767df51f8657a6..1d489003f99dcc 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2462,7 +2462,7 @@ const struct address_space_operations ocfs2_aops = { .direct_IO = ocfs2_direct_IO, .invalidate_folio = block_invalidate_folio, .release_folio = ocfs2_release_folio, - .migratepage = buffer_migrate_page, + .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c9d1463bb20f31..b0366c89d6a4d1 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -267,6 +267,16 @@ int nobh_truncate_page(struct address_space *, loff_t, get_block_t *); int nobh_writepage(struct page *page, get_block_t *get_block, struct writeback_control *wbc); +#ifdef CONFIG_MIGRATION +extern int buffer_migrate_folio(struct address_space *, + struct folio *dst, struct folio *src, enum migrate_mode); +extern int buffer_migrate_folio_norefs(struct address_space *, + struct folio *dst, struct folio *src, enum migrate_mode); +#else +#define buffer_migrate_folio NULL +#define buffer_migrate_folio_norefs NULL +#endif + void buffer_init(void); /* diff --git a/include/linux/fs.h b/include/linux/fs.h index 47431cf8fbb3d4..9e6b17da4e11b1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3215,18 +3215,6 @@ extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); -#ifdef CONFIG_MIGRATION -extern int buffer_migrate_page(struct address_space *, - struct page *, struct page *, - enum migrate_mode); -extern int buffer_migrate_page_norefs(struct address_space *, - 
struct page *, struct page *, - enum migrate_mode); -#else -#define buffer_migrate_page NULL -#define buffer_migrate_page_norefs NULL -#endif - int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct user_namespace *, struct dentry *, struct iattr *); diff --git a/mm/migrate.c b/mm/migrate.c index 75b171425c4585..ea5398d0f7f1f0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -656,23 +656,23 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head, return true; } -static int __buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode, +static int __buffer_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode, bool check_refs) { struct buffer_head *bh, *head; int rc; int expected_count; - if (!page_has_buffers(page)) - return migrate_page(mapping, newpage, page, mode); + head = folio_buffers(src); + if (!head) + return migrate_page(mapping, &dst->page, &src->page, mode); /* Check whether page does not have extra refs before we do more work */ - expected_count = expected_page_refs(mapping, page); - if (page_count(page) != expected_count) + expected_count = expected_page_refs(mapping, &src->page); + if (folio_ref_count(src) != expected_count) return -EAGAIN; - head = page_buffers(page); if (!buffer_migrate_lock_buffers(head, mode)) return -EAGAIN; @@ -703,23 +703,22 @@ static int __buffer_migrate_page(struct address_space *mapping, } } - rc = migrate_page_move_mapping(mapping, newpage, page, 0); + rc = folio_migrate_mapping(mapping, dst, src, 0); if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; - attach_page_private(newpage, detach_page_private(page)); + folio_attach_private(dst, folio_detach_private(src)); bh = head; do { - set_bh_page(bh, newpage, bh_offset(bh)); + set_bh_page(bh, &dst->page, bh_offset(bh)); bh = bh->b_this_page; - } while (bh != head); if (mode != 
MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(dst, src); else - migrate_page_states(newpage, page); + folio_migrate_flags(dst, src); rc = MIGRATEPAGE_SUCCESS; unlock_buffers: @@ -729,34 +728,51 @@ static int __buffer_migrate_page(struct address_space *mapping, do { unlock_buffer(bh); bh = bh->b_this_page; - } while (bh != head); return rc; } -/* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. For example attached buffer heads are accessed only under page lock. +/** + * buffer_migrate_folio() - Migration function for folios with buffers. + * @mapping: The address space containing @src. + * @dst: The folio to migrate to. + * @src: The folio to migrate from. + * @mode: How to migrate the folio. + * + * This function can only be used if the underlying filesystem guarantees + * that no other references to @src exist. For example attached buffer + * heads are accessed only under the folio lock. If your filesystem cannot + * provide this guarantee, buffer_migrate_folio_norefs() may be more + * appropriate. + * + * Return: 0 on success or a negative errno on failure. */ -int buffer_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +int buffer_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - return __buffer_migrate_page(mapping, newpage, page, mode, false); + return __buffer_migrate_folio(mapping, dst, src, mode, false); } -EXPORT_SYMBOL(buffer_migrate_page); +EXPORT_SYMBOL(buffer_migrate_folio); -/* - * Same as above except that this variant is more careful and checks that there - * are also no buffer head references. This function is the right one for - * mappings where buffer heads are directly looked up and referenced (such as - * block device mappings). 
+/** + * buffer_migrate_folio_norefs() - Migration function for folios with buffers. + * @mapping: The address space containing @src. + * @dst: The folio to migrate to. + * @src: The folio to migrate from. + * @mode: How to migrate the folio. + * + * Like buffer_migrate_folio() except that this variant is more careful + * and checks that there are also no buffer head references. This function + * is the right one for mappings where buffer heads are directly looked + * up and referenced (such as block device mappings). + * + * Return: 0 on success or a negative errno on failure. */ -int buffer_migrate_page_norefs(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) +int buffer_migrate_folio_norefs(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - return __buffer_migrate_page(mapping, newpage, page, mode, true); + return __buffer_migrate_folio(mapping, dst, src, mode, true); } #endif From 1c3d54037780df2128024b64b7543136a90472b3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Wed, 1 Jun 2022 16:12:37 -0700 Subject: [PATCH 0199/1250] MAINTAINERS: add patchwork link to linux-raid project Add link to patchwork: https://patchwork.kernel.org/project/linux-raid/list/ Signed-off-by: Song Liu --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index fe5daf14150136..68147e52544001 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18540,6 +18540,7 @@ SOFTWARE RAID (Multiple Disks) SUPPORT M: Song Liu L: linux-raid@vger.kernel.org S: Supported +Q: https://patchwork.kernel.org/project/linux-raid/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/song/md.git F: drivers/md/Kconfig F: drivers/md/Makefile From 59c0b25533f0734962462b559ddc49378d2ffb65 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:47 -0600 Subject: [PATCH 0200/1250] md/raid5-log: Drop extern decorators for function prototypes extern is not necessary and 
recommended against when defining prototype functions in headers. checkpatch.pl complains about these. So remove them. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-log.h | 75 ++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 43c714a8798c5f..270ced4f770f29 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -2,49 +2,46 @@ #ifndef _RAID5_LOG_H #define _RAID5_LOG_H -extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); -extern void r5l_exit_log(struct r5conf *conf); -extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); -extern void r5l_write_stripe_run(struct r5l_log *log); -extern void r5l_flush_stripe_to_raid(struct r5l_log *log); -extern void r5l_stripe_write_finished(struct stripe_head *sh); -extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int quiesce); -extern bool r5l_log_disk_error(struct r5conf *conf); -extern bool r5c_is_writeback(struct r5l_log *log); -extern int -r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s, int disks); -extern void -r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, - struct stripe_head_state *s); -extern void r5c_release_extra_page(struct stripe_head *sh); -extern void r5c_use_extra_page(struct stripe_head *sh); -extern void r5l_wake_reclaim(struct r5l_log *log, sector_t space); -extern void r5c_handle_cached_data_endio(struct r5conf *conf, - struct stripe_head *sh, int disks); -extern int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); -extern void r5c_make_stripe_write_out(struct stripe_head *sh); -extern void r5c_flush_cache(struct r5conf *conf, int num); -extern void r5c_check_stripe_cache_usage(struct r5conf *conf); -extern void 
r5c_check_cached_full_stripe(struct r5conf *conf); +int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev); +void r5l_exit_log(struct r5conf *conf); +int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh); +void r5l_write_stripe_run(struct r5l_log *log); +void r5l_flush_stripe_to_raid(struct r5l_log *log); +void r5l_stripe_write_finished(struct stripe_head *sh); +int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); +void r5l_quiesce(struct r5l_log *log, int quiesce); +bool r5l_log_disk_error(struct r5conf *conf); +bool r5c_is_writeback(struct r5l_log *log); +int r5c_try_caching_write(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s, int disks); +void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, + struct stripe_head_state *s); +void r5c_release_extra_page(struct stripe_head *sh); +void r5c_use_extra_page(struct stripe_head *sh); +void r5l_wake_reclaim(struct r5l_log *log, sector_t space); +void r5c_handle_cached_data_endio(struct r5conf *conf, + struct stripe_head *sh, int disks); +int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh); +void r5c_make_stripe_write_out(struct stripe_head *sh); +void r5c_flush_cache(struct r5conf *conf, int num); +void r5c_check_stripe_cache_usage(struct r5conf *conf); +void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; -extern void r5c_update_on_rdev_error(struct mddev *mddev, - struct md_rdev *rdev); -extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); -extern int r5l_start(struct r5l_log *log); +void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev); +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); +int r5l_start(struct r5l_log *log); -extern struct dma_async_tx_descriptor * +struct dma_async_tx_descriptor * ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx); -extern int 
ppl_init_log(struct r5conf *conf); -extern void ppl_exit_log(struct r5conf *conf); -extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); -extern void ppl_write_stripe_run(struct r5conf *conf); -extern void ppl_stripe_write_finished(struct stripe_head *sh); -extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); -extern void ppl_quiesce(struct r5conf *conf, int quiesce); -extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio); +int ppl_init_log(struct r5conf *conf); +void ppl_exit_log(struct r5conf *conf); +int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); +void ppl_write_stripe_run(struct r5conf *conf); +void ppl_stripe_write_finished(struct stripe_head *sh); +int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); +void ppl_quiesce(struct r5conf *conf, int quiesce); +int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio); extern struct md_sysfs_entry ppl_write_hint; static inline bool raid5_has_log(struct r5conf *conf) From 26becbe46497e4161dd486bec66a8e444056e0d8 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:48 -0600 Subject: [PATCH 0201/1250] md/raid5-ppl: Drop unused argument from ppl_handle_flush_request() ppl_handle_flush_request() takes a struct r5l_log argument but doesn't use it. It has no business taking this argument as it is only used by raid5-cache and has no way to dereference it anyway. Remove the argument. No functional changes intended.
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-log.h | 4 ++-- drivers/md/raid5-ppl.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 270ced4f770f29..c8332502669e56 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -41,7 +41,7 @@ void ppl_write_stripe_run(struct r5conf *conf); void ppl_stripe_write_finished(struct stripe_head *sh); int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); void ppl_quiesce(struct r5conf *conf, int quiesce); -int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio); +int ppl_handle_flush_request(struct bio *bio); extern struct md_sysfs_entry ppl_write_hint; static inline bool raid5_has_log(struct r5conf *conf) @@ -108,7 +108,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio) if (conf->log) ret = r5l_handle_flush_request(conf->log, bio); else if (raid5_has_ppl(conf)) - ret = ppl_handle_flush_request(conf->log, bio); + ret = ppl_handle_flush_request(bio); return ret; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 0a2e4806b1ece1..db49edec362aff 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -679,7 +679,7 @@ void ppl_quiesce(struct r5conf *conf, int quiesce) } } -int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio) +int ppl_handle_flush_request(struct bio *bio) { if (bio->bi_iter.bi_size == 0) { bio_endio(bio); From f4eb3eb3a9d522ec7073cdfa3dc87facca585b08 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:49 -0600 Subject: [PATCH 0202/1250] md/raid5: suspend the array for calls to log_exit() The raid5-cache code relies on there being no IO in flight when log_exit() is called. There are two places where this is not guaranteed so add mddev_suspend() and mddev_resume() calls to these sites. 
The site in raid5_change_consistency_policy() is in the error path, and another similar call site already has suspend/resume calls just below it; so it should be equally safe to make that change here. There is one remaining site in raid5_remove_disk() that we call log_exit() without suspending the array. Unfortunately, as the comment stated, we cannot call mddev_suspend from raid5d. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5d09256d7f8185..5d84bad8b854d7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8697,8 +8697,11 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) err = log_init(conf, NULL, true); if (!err) { err = resize_stripes(conf, conf->pool_size); - if (err) + if (err) { + mddev_suspend(mddev); log_exit(conf); + mddev_resume(mddev); + } } } else err = -EINVAL; From c2270aede3a1059c43b5af0561374bde82b7d313 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:50 -0600 Subject: [PATCH 0203/1250] md/raid5-cache: Take mddev_lock in r5c_journal_mode_show() The mddev->lock spinlock doesn't protect against the removal of conf->log in r5l_exit_log() so conf->log may be freed before it is used. To fix this, take the mddev_lock() instead of the mddev->lock spinlock.
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 83c184eddbda23..ca8fc317da950b 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2534,12 +2534,13 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) struct r5conf *conf; int ret; - spin_lock(&mddev->lock); + ret = mddev_lock(mddev); + if (ret) + return ret; + conf = mddev->private; - if (!conf || !conf->log) { - spin_unlock(&mddev->lock); - return 0; - } + if (!conf || !conf->log) + goto out_unlock; switch (conf->log->r5c_journal_mode) { case R5C_JOURNAL_MODE_WRITE_THROUGH: @@ -2557,7 +2558,9 @@ static ssize_t r5c_journal_mode_show(struct mddev *mddev, char *page) default: ret = 0; } - spin_unlock(&mddev->lock); + +out_unlock: + mddev_unlock(mddev); return ret; } From 33fa9f7c08f7e2258f117a21a4c0bfc5bc146e20 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:51 -0600 Subject: [PATCH 0204/1250] md/raid5-cache: Drop RCU usage of conf->log The only place that uses RCU to access conf->log is in r5l_log_disk_error(). This function is mostly used in the IO path and once with mddev_lock() held in raid5_change_consistency_policy(). It is known that the IO will be suspended before the log is freed and r5l_log_exit() is called with the mddev_lock() held. This should mean that conf->log can not be freed while the function is being called, so the RCU protection is not necessary. Drop the rcu_read_lock() as well as the synchronize_rcu() and rcu_assign_pointer() usage. 
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index ca8fc317da950b..b1fc65e113f899 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1590,18 +1590,13 @@ void r5l_quiesce(struct r5l_log *log, int quiesce) bool r5l_log_disk_error(struct r5conf *conf) { - struct r5l_log *log; - bool ret; - /* don't allow write if journal disk is missing */ - rcu_read_lock(); - log = rcu_dereference(conf->log); + struct r5l_log *log = conf->log; + /* don't allow write if journal disk is missing */ if (!log) - ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); + return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags); else - ret = test_bit(Faulty, &log->rdev->flags); - rcu_read_unlock(); - return ret; + return test_bit(Faulty, &log->rdev->flags); } #define R5L_RECOVERY_PAGE_POOL_SIZE 256 @@ -3148,7 +3143,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) spin_lock_init(&log->stripe_in_journal_lock); atomic_set(&log->stripe_in_journal_count, 0); - rcu_assign_pointer(conf->log, log); + conf->log = log; set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; @@ -3171,7 +3166,6 @@ void r5l_exit_log(struct r5conf *conf) struct r5l_log *log = conf->log; conf->log = NULL; - synchronize_rcu(); /* Ensure disable_writeback_work wakes up and exits */ wake_up(&conf->mddev->sb_wait); From 0d79ac61090f92f8e94d459715b2fa2cd5c14199 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:52 -0600 Subject: [PATCH 0205/1250] md/raid5-cache: Clear conf->log after finishing work A NULL pointer dereference on conf->log is seen randomly with the mdadm test 21raid5cache.
KASAN reports: BUG: KASAN: null-ptr-deref in r5l_reclaimable_space+0xf5/0x140 Read of size 8 at addr 0000000000000860 by task md0_reclaim/3086 Call Trace: dump_stack_lvl+0x5a/0x74 kasan_report.cold+0x5f/0x1a9 __asan_load8+0x69/0x90 r5l_reclaimable_space+0xf5/0x140 r5l_do_reclaim+0xf4/0x5e0 r5l_reclaim_thread+0x69/0x3b0 md_thread+0x1a2/0x2c0 kthread+0x177/0x1b0 ret_from_fork+0x22/0x30 This is caused by conf->log being cleared in r5l_exit_log() before stopping the reclaim thread. To fix this, clear conf->log after the reclaim_thread is unregistered and after flushing disable_writeback_work. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index b1fc65e113f899..ca57b5fac59958 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -3165,12 +3165,13 @@ void r5l_exit_log(struct r5conf *conf) { struct r5l_log *log = conf->log; - conf->log = NULL; - /* Ensure disable_writeback_work wakes up and exits */ wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); + + conf->log = NULL; + mempool_exit(&log->meta_pool); bioset_exit(&log->bs); mempool_exit(&log->io_pool); From 13a5bd4c6b10491376d5ed95e88892c9a129dac3 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:53 -0600 Subject: [PATCH 0206/1250] md/raid5-cache: Annotate pslot with __rcu notation radix_tree_lookup_slot() and radix_tree_replace_slot() API expect the slot returned and looked up to be marked with __rcu.
Otherwise sparse warnings are generated: drivers/md/raid5-cache.c:2939:23: warning: incorrect type in assignment (different address spaces) drivers/md/raid5-cache.c:2939:23: expected void **pslot drivers/md/raid5-cache.c:2939:23: got void [noderef] __rcu ** Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index ca57b5fac59958..b2e6016934eea6 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2637,7 +2637,7 @@ int r5c_try_caching_write(struct r5conf *conf, int i; struct r5dev *dev; int to_cache = 0; - void **pslot; + void __rcu **pslot; sector_t tree_index; int ret; uintptr_t refcount; @@ -2804,7 +2804,7 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, int i; int do_wakeup = 0; sector_t tree_index; - void **pslot; + void __rcu **pslot; uintptr_t refcount; if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) From ff3a0996abb9e45d31bb58022fd64f28eebefb63 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:54 -0600 Subject: [PATCH 0207/1250] md: Use enum for overloaded magic numbers used by mddev->curr_resync Comments in the code document special values used for mddev->curr_resync. Make this clearer by using an enum to label these values. The only functional change is a couple places use the wrong comparison operator that implied 3 is another special value. They are all fixed to imply that 3 or greater is an active resync. 
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 40 ++++++++++++++++++---------------------- drivers/md/md.h | 15 +++++++++++++++ 2 files changed, 33 insertions(+), 22 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index c7ecb0bffda0d5..03c9ba219f744d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5001,7 +5001,7 @@ static ssize_t sync_speed_show(struct mddev *mddev, char *page) { unsigned long resync, dt, db; - if (mddev->curr_resync == 0) + if (mddev->curr_resync == MD_RESYNC_NONE) return sprintf(page, "none\n"); resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); dt = (jiffies - mddev->resync_mark) / HZ; @@ -5020,8 +5020,8 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); - if (mddev->curr_resync == 1 || - mddev->curr_resync == 2) + if (mddev->curr_resync == MD_RESYNC_YIELDED || + mddev->curr_resync == MD_RESYNC_DELAYED) return sprintf(page, "delayed\n"); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || @@ -8018,7 +8018,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) max_sectors = mddev->dev_sectors; resync = mddev->curr_resync; - if (resync <= 3) { + if (resync < MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) /* Still cleaning up */ resync = max_sectors; @@ -8027,7 +8027,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) else resync -= atomic_read(&mddev->recovery_active); - if (resync == 0) { + if (resync == MD_RESYNC_NONE) { if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { struct md_rdev *rdev; @@ -8051,7 +8051,7 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) } return 0; } - if (resync < 3) { + if (resync < MD_RESYNC_ACTIVE) { seq_printf(seq, "\tresync=DELAYED"); return 1; } @@ -8729,13 +8729,7 @@ void md_do_sync(struct md_thread *thread) mddev->last_sync_action = action 
?: desc; - /* we overload curr_resync somewhat here. - * 0 == not engaged in resync at all - * 2 == checking that there is no conflict with another sync - * 1 == like 2, but have yielded to allow conflicting resync to - * commence - * other == active in resync - this many blocks - * + /* * Before starting a resync we must have set curr_resync to * 2, and then checked that every "conflicting" array has curr_resync * less than ours. When we find one that is the same or higher @@ -8747,7 +8741,7 @@ void md_do_sync(struct md_thread *thread) do { int mddev2_minor = -1; - mddev->curr_resync = 2; + mddev->curr_resync = MD_RESYNC_DELAYED; try_again: if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) @@ -8759,12 +8753,14 @@ void md_do_sync(struct md_thread *thread) && mddev2->curr_resync && match_mddev_units(mddev, mddev2)) { DEFINE_WAIT(wq); - if (mddev < mddev2 && mddev->curr_resync == 2) { + if (mddev < mddev2 && + mddev->curr_resync == MD_RESYNC_DELAYED) { /* arbitrarily yield */ - mddev->curr_resync = 1; + mddev->curr_resync = MD_RESYNC_YIELDED; wake_up(&resync_wait); } - if (mddev > mddev2 && mddev->curr_resync == 1) + if (mddev > mddev2 && + mddev->curr_resync == MD_RESYNC_YIELDED) /* no need to wait here, we can wait the next * time 'round when curr_resync == 2 */ @@ -8792,7 +8788,7 @@ void md_do_sync(struct md_thread *thread) finish_wait(&resync_wait, &wq); } } - } while (mddev->curr_resync < 2); + } while (mddev->curr_resync < MD_RESYNC_DELAYED); j = 0; if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { @@ -8876,7 +8872,7 @@ void md_do_sync(struct md_thread *thread) desc, mdname(mddev)); mddev->curr_resync = j; } else - mddev->curr_resync = 3; /* no longer delayed */ + mddev->curr_resync = MD_RESYNC_ACTIVE; /* no longer delayed */ mddev->curr_resync_completed = j; sysfs_notify_dirent_safe(mddev->sysfs_completed); md_new_event(); @@ -9011,14 +9007,14 @@ void md_do_sync(struct md_thread *thread) if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && - mddev->curr_resync > 3) { + mddev->curr_resync >= MD_RESYNC_ACTIVE) { mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify_dirent_safe(mddev->sysfs_completed); } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && - mddev->curr_resync > 3) { + mddev->curr_resync >= MD_RESYNC_ACTIVE) { if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (mddev->curr_resync >= mddev->recovery_cp) { @@ -9082,7 +9078,7 @@ void md_do_sync(struct md_thread *thread) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; set_bit(MD_RECOVERY_DONE, &mddev->recovery); - mddev->curr_resync = 0; + mddev->curr_resync = MD_RESYNC_NONE; spin_unlock(&mddev->lock); wake_up(&resync_wait); diff --git a/drivers/md/md.h b/drivers/md/md.h index cf2cbb17acbd42..e3361f6a734178 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -288,6 +288,21 @@ struct serial_info { sector_t _subtree_last; /* highest sector in subtree of rb node */ }; +/* + * mddev->curr_resync stores the current sector of the resync but + * also has some overloaded values. + */ +enum { + /* No resync in progress */ + MD_RESYNC_NONE = 0, + /* Yielded to allow another conflicting resync to commence */ + MD_RESYNC_YIELDED = 1, + /* Delayed to check that there is no conflict with another sync */ + MD_RESYNC_DELAYED = 2, + /* Any value greater than or equal to this is in an active resync */ + MD_RESYNC_ACTIVE = 3, +}; + struct mddev { void *private; struct md_personality *pers; From c74008cf2d48c8614acd13932841556e99da67b6 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:55 -0600 Subject: [PATCH 0208/1250] md: Ensure resync is reported after it starts The 07layouts test in mdadm fails on some systems. 
The failure presents itself as the backup file not being removed before the next layout is grown into: mdadm: /dev/md0: cannot create backup file /tmp/md-test-backup: File exists This is because the background mdadm process, which is responsible for cleaning up this backup file, gets into an infinite loop waiting for the reshape to start. mdadm checks the mdstat file if a reshape is going and, if it is not, it waits for an event on the file or times out in 5 seconds. On faster machines, the reshape may complete before the 5 seconds times out, and thus the background mdadm process loops waiting for a reshape to start that has already occurred. mdadm reads the mdstat file to start, but mdstat does not report that the reshape has begun, even though it has indeed begun. So the mdstat_wait() call (in mdadm) which polls on the mdstat file won't ever return until timing out. The reason mdstat does not report that the reshape has started is an issue in status_resync(). recovery_active is subtracted from curr_resync which will result in a value of zero for the first chunk of reshaped data, and the resulting read will report no reshape in progress. To fix this, if "resync - recovery_active" is an overloaded value, force the value to be MD_RESYNC_ACTIVE so the code reports a resync in progress.
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 03c9ba219f744d..6a76cb0d48766a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8022,10 +8022,20 @@ static int status_resync(struct seq_file *seq, struct mddev *mddev) if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) /* Still cleaning up */ resync = max_sectors; - } else if (resync > max_sectors) + } else if (resync > max_sectors) { resync = max_sectors; - else + } else { resync -= atomic_read(&mddev->recovery_active); + if (resync < MD_RESYNC_ACTIVE) { + /* + * Resync has started, but the subtraction has + * yielded one of the special values. Force it + * to active to ensure the status reports an + * active resync. + */ + resync = MD_RESYNC_ACTIVE; + } + } if (resync == MD_RESYNC_NONE) { if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) { From 92a2748dc3c52a5484cede626fe152a48d81782d Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Wed, 8 Jun 2022 10:27:56 -0600 Subject: [PATCH 0209/1250] md: Notify sysfs sync_completed in md_reap_sync_thread() The mdadm test 07layouts randomly produces a kernel hung task deadlock. The deadlock is caused by the suspend_lo/suspend_hi files being set by the mdadm background process during reshape and not being cleared because the process hangs. (Leaving aside the issue of the fragility of freezing kernel tasks by buggy userspace processes...) When the background mdadm process hangs, it is waiting (without a timeout) on a change to the sync_completed file signalling that the reshape has completed. The process is woken up a couple times when the reshape finishes but it is woken up before MD_RECOVERY_RUNNING is cleared so sync_completed_show() reports 0 instead of "none". To fix this, notify the sysfs file in md_reap_sync_thread() after MD_RECOVERY_RUNNING has been cleared.
This wakes up mdadm and causes it to continue and write to suspend_lo/suspend_hi to allow IO to continue. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 6a76cb0d48766a..cbb53e53c5425c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9472,6 +9472,7 @@ void md_reap_sync_thread(struct mddev *mddev) wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + sysfs_notify_dirent_safe(mddev->sysfs_completed); sysfs_notify_dirent_safe(mddev->sysfs_action); md_new_event(); if (mddev->event_work.func) From 467f738d0e821f36bf6316aaab4ef5bdef9b003e Mon Sep 17 00:00:00 2001 From: Chris Webb Date: Wed, 1 Jun 2022 12:03:07 +0100 Subject: [PATCH 0210/1250] md: Explicitly create command-line configured devices Boot-time assembly of arrays with md= command-line arguments breaks when CONFIG_BLOCK_LEGACY_AUTOLOAD is unset. md_setup_drive() in md-autodetect.c calls blkdev_get_by_dev(), assuming this implicitly creates the block device. Fix this by attempting to md_alloc() the array first. As in the probe path, ignore any error as failure is caught by blkdev_get_by_dev() anyway. 
Signed-off-by: Chris Webb Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/md-autodetect.c | 1 + drivers/md/md.c | 2 +- drivers/md/md.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c index 2cf973722f5962..344910ba435c53 100644 --- a/drivers/md/md-autodetect.c +++ b/drivers/md/md-autodetect.c @@ -169,6 +169,7 @@ static void __init md_setup_drive(struct md_setup_args *args) pr_info("md: Loading %s: %s\n", name, args->device_names); + md_alloc(mdev, name); bdev = blkdev_get_by_dev(mdev, FMODE_READ, NULL); if (IS_ERR(bdev)) { pr_err("md: open failed - cannot start array %s\n", name); diff --git a/drivers/md/md.c b/drivers/md/md.c index cbb53e53c5425c..ce9d2845d3ace1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5623,7 +5623,7 @@ int mddev_init_writes_pending(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_init_writes_pending); -static int md_alloc(dev_t dev, char *name) +int md_alloc(dev_t dev, char *name) { /* * If dev is zero, name is the name of a device to allocate with diff --git a/drivers/md/md.h b/drivers/md/md.h index e3361f6a734178..1a85dbe78a71c1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -766,6 +766,7 @@ extern int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); extern void mddev_init(struct mddev *mddev); +int md_alloc(dev_t dev, char *name); extern int md_run(struct mddev *mddev); extern int md_start(struct mddev *mddev); extern void md_stop(struct mddev *mddev); From 0c48c3be92add65a95c6dd037292fb1647ba0044 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 21 Jun 2022 11:11:29 +0800 Subject: [PATCH 0211/1250] md: unlock mddev before reap sync_thread in action_store Since the bug which commit 8b48ec23cc51a ("md: don't unregister sync_thread with reconfig_mutex held") fixed is related with action_store path, other callers which reap 
sync_thread didn't need to be changed. Let's pull md_unregister_thread from md_reap_sync_thread, then fix the previous bug as follows: 1. unlock mddev before md_reap_sync_thread in action_store. 2. save reshape_position before unlock, then restore it to ensure the position is not changed accidentally by others. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/dm-raid.c | 1 + drivers/md/md.c | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9526ccbedafbac..d43b8075c0552e 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3725,6 +3725,7 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) diff --git a/drivers/md/md.c b/drivers/md/md.c index ce9d2845d3ace1..b64de313838f29 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4830,6 +4830,19 @@ action_store(struct mddev *mddev, const char *page, size_t len) if (work_pending(&mddev->del_work)) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { + sector_t save_rp = mddev->reshape_position; + + mddev_unlock(mddev); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); + mddev_lock_nointr(mddev); + /* + * set RECOVERY_INTR again and restore reshape + * position in case others changed them after + * got lock, eg, reshape_position_store and + * md_check_recovery.
+ */ + mddev->reshape_position = save_rp; set_bit(MD_RECOVERY_INTR, &mddev->recovery); md_reap_sync_thread(mddev); } @@ -6197,6 +6210,7 @@ static void __md_stop_writes(struct mddev *mddev) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); } @@ -9309,6 +9323,7 @@ void md_check_recovery(struct mddev *mddev) * ->spare_active and clear saved_raid_disk */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -9344,6 +9359,7 @@ void md_check_recovery(struct mddev *mddev) goto unlock; } if (mddev->sync_thread) { + md_unregister_thread(&mddev->sync_thread); md_reap_sync_thread(mddev); goto unlock; } @@ -9423,8 +9439,7 @@ void md_reap_sync_thread(struct mddev *mddev) sector_t old_dev_sectors = mddev->dev_sectors; bool is_reshaped = false; - /* resync has finished, collect result */ - md_unregister_thread(&mddev->sync_thread); + /* sync_thread should be unregistered, collect result */ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && mddev->degraded != mddev->raid_disks) { From cc69f703f7af795981c09e1b08bba01f257405fa Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:31 -0600 Subject: [PATCH 0212/1250] md/raid5: Make logic blocking check consistent with logic that blocks The check in raid5_make_request differs very slightly from the logic that causes it to block lower down. This likely does not cause a bug as the check is fuzzy anyway (as reshape may move on between the first check and the subsequent check). However, make it consistent so it can be cleaned up in a subsequent patch. The condition which causes the schedule is: !(mddev->reshape_backwards ? 
logical_sector < conf->reshape_progress : logical_sector >= conf->reshape_progress) && (mddev->reshape_backwards ? logical_sector < conf->reshape_safe : logical_sector >= conf->reshape_safe) The condition that causes the early bailout is made to match this. Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5d84bad8b854d7..b3d1f894f15474 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5841,7 +5841,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && (mddev->reshape_backwards - ? (logical_sector > conf->reshape_progress && logical_sector <= conf->reshape_safe) + ? (logical_sector >= conf->reshape_progress && logical_sector < conf->reshape_safe) : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) { bio_wouldblock_error(bi); if (rw == WRITE) From 7b19410260fb76f3febac943132710a4ea74c96d Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:32 -0600 Subject: [PATCH 0213/1250] md/raid5: Factor out ahead_of_reshape() function There are a few uses of an ugly ternary operator in raid5_make_request() to check if a sector is ahead of a reshape sector. Factor this out into a simple helper called ahead_of_reshape(). No functional changes intended.
Suggested-by: Christoph Hellwig Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b3d1f894f15474..6e53b8490fff93 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5784,6 +5784,13 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) bio_endio(bi); } +static bool ahead_of_reshape(struct mddev *mddev, sector_t sector, + sector_t reshape_sector) +{ + return mddev->reshape_backwards ? sector < reshape_sector : + sector >= reshape_sector; +} + static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; @@ -5840,9 +5847,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && - (mddev->reshape_backwards - ? (logical_sector >= conf->reshape_progress && logical_sector < conf->reshape_safe) - : (logical_sector >= conf->reshape_safe && logical_sector < conf->reshape_progress))) { + !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) && + ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { bio_wouldblock_error(bi); if (rw == WRITE) md_write_end(mddev); @@ -5871,14 +5877,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * to check again. */ spin_lock_irq(&conf->device_lock); - if (mddev->reshape_backwards - ? logical_sector < conf->reshape_progress - : logical_sector >= conf->reshape_progress) { + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_progress)) { previous = 1; } else { - if (mddev->reshape_backwards - ? 
logical_sector < conf->reshape_safe - : logical_sector >= conf->reshape_safe) { + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); schedule(); do_prepare = true; @@ -5909,9 +5913,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) */ int must_retry = 0; spin_lock_irq(&conf->device_lock); - if (mddev->reshape_backwards - ? logical_sector >= conf->reshape_progress - : logical_sector < conf->reshape_progress) + if (!ahead_of_reshape(mddev, logical_sector, + conf->reshape_progress)) /* mismatch, need to try again */ must_retry = 1; spin_unlock_irq(&conf->device_lock); From e07b250b9da96faf1c4a4b06b786c16243d2436e Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:33 -0600 Subject: [PATCH 0214/1250] md/raid5: Refactor raid5_make_request loop Break immediately if raid5_get_active_stripe() returns NULL and deindent the rest of the loop. Annotate this check with an unlikely(). This makes the code easier to read and reduces the indentation level. No functional changes intended. Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 109 +++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6e53b8490fff93..25db747c5856f8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5901,68 +5901,69 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) sh = raid5_get_active_stripe(conf, new_sector, previous, (bi->bi_opf & REQ_RAHEAD), 0); - if (sh) { - if (unlikely(previous)) { - /* expansion might have moved on while waiting for a - * stripe, so we must do the range check again. 
- * Expansion could still move past after this - * test, but as we are holding a reference to - * 'sh', we know that if that happens, - * STRIPE_EXPANDING will get set and the expansion - * won't proceed until we finish with the stripe. - */ - int must_retry = 0; - spin_lock_irq(&conf->device_lock); - if (!ahead_of_reshape(mddev, logical_sector, - conf->reshape_progress)) - /* mismatch, need to try again */ - must_retry = 1; - spin_unlock_irq(&conf->device_lock); - if (must_retry) { - raid5_release_stripe(sh); - schedule(); - do_prepare = true; - goto retry; - } - } - if (read_seqcount_retry(&conf->gen_lock, seq)) { - /* Might have got the wrong stripe_head - * by accident - */ - raid5_release_stripe(sh); - goto retry; - } + if (unlikely(!sh)) { + /* cannot get stripe, just give-up */ + bi->bi_status = BLK_STS_IOERR; + break; + } - if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { - /* Stripe is busy expanding or - * add failed due to overlap. Flush everything - * and wait a while - */ - md_wakeup_thread(mddev->thread); + if (unlikely(previous)) { + /* expansion might have moved on while waiting for a + * stripe, so we must do the range check again. + * Expansion could still move past after this + * test, but as we are holding a reference to + * 'sh', we know that if that happens, + * STRIPE_EXPANDING will get set and the expansion + * won't proceed until we finish with the stripe. 
+ */ + int must_retry = 0; + spin_lock_irq(&conf->device_lock); + if (!ahead_of_reshape(mddev, logical_sector, + conf->reshape_progress)) + /* mismatch, need to try again */ + must_retry = 1; + spin_unlock_irq(&conf->device_lock); + if (must_retry) { raid5_release_stripe(sh); schedule(); do_prepare = true; goto retry; } - if (do_flush) { - set_bit(STRIPE_R5C_PREFLUSH, &sh->state); - /* we only need flush for one stripe */ - do_flush = false; - } + } - set_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - if ((!sh->batch_head || sh == sh->batch_head) && - (bi->bi_opf & REQ_SYNC) && - !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - atomic_inc(&conf->preread_active_stripes); - release_stripe_plug(mddev, sh); - } else { - /* cannot get stripe for read-ahead, just give-up */ - bi->bi_status = BLK_STS_IOERR; - break; + if (read_seqcount_retry(&conf->gen_lock, seq)) { + /* Might have got the wrong stripe_head by accident */ + raid5_release_stripe(sh); + goto retry; + } + + if (test_bit(STRIPE_EXPANDING, &sh->state) || + !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { + /* + * Stripe is busy expanding or add failed due to + * overlap. Flush everything and wait a while. 
+ */ + md_wakeup_thread(mddev->thread); + raid5_release_stripe(sh); + schedule(); + do_prepare = true; + goto retry; } + + if (do_flush) { + set_bit(STRIPE_R5C_PREFLUSH, &sh->state); + /* we only need flush for one stripe */ + do_flush = false; + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if ((!sh->batch_head || sh == sh->batch_head) && + (bi->bi_opf & REQ_SYNC) && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + + release_stripe_plug(mddev, sh); } finish_wait(&conf->wait_for_overlap, &w); From ec877594ad5775c0fed459e15f43907e37fcf1e5 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:34 -0600 Subject: [PATCH 0215/1250] md/raid5: Move stripe_add_to_batch_list() call out of add_stripe_bio() stripe_add_to_batch_list() is better done in the loop in make_request instead of inside add_stripe_bio(). This is clearer and allows for storing the batch_head state outside the loop in a subsequent patch. The call to add_stripe_bio() in retry_aligned_read() is for read and batching only applies to write. So it's impossible for batching to happen at that call site. No functional changes intended. 
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 25db747c5856f8..969609b7114b29 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3531,8 +3531,6 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, } spin_unlock_irq(&sh->stripe_lock); - if (stripe_can_batch(sh)) - stripe_add_to_batch_list(conf, sh); return 1; overlap: @@ -5950,6 +5948,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) goto retry; } + if (stripe_can_batch(sh)) + stripe_add_to_batch_list(conf, sh); + if (do_flush) { set_bit(STRIPE_R5C_PREFLUSH, &sh->state); /* we only need flush for one stripe */ From 6ce9728905c2760c5de37b75338ce23ad4b3bd13 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:35 -0600 Subject: [PATCH 0216/1250] md/raid5: Move common stripe get code into new find_get_stripe() helper Both uses of find_stripe() require a fairly complicated dance to increment the reference count. Move this into a common find_get_stripe() helper. No functional changes intended. 
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 131 ++++++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 67 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 969609b7114b29..1bbf87d15bc8ca 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -624,6 +624,49 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, return NULL; } +static struct stripe_head *find_get_stripe(struct r5conf *conf, + sector_t sector, short generation, int hash) +{ + int inc_empty_inactive_list_flag; + struct stripe_head *sh; + + sh = __find_stripe(conf, sector, generation); + if (!sh) + return NULL; + + if (atomic_inc_not_zero(&sh->count)) + return sh; + + /* + * Slow path. The reference count is zero which means the stripe must + * be on a list (sh->lru). Must remove the stripe from the list that + * references it with the device_lock held. + */ + + spin_lock(&conf->device_lock); + if (!atomic_read(&sh->count)) { + if (!test_bit(STRIPE_HANDLE, &sh->state)) + atomic_inc(&conf->active_stripes); + BUG_ON(list_empty(&sh->lru) && + !test_bit(STRIPE_EXPANDING, &sh->state)); + inc_empty_inactive_list_flag = 0; + if (!list_empty(conf->inactive_list + hash)) + inc_empty_inactive_list_flag = 1; + list_del_init(&sh->lru); + if (list_empty(conf->inactive_list + hash) && + inc_empty_inactive_list_flag) + atomic_inc(&conf->empty_inactive_list_nr); + if (sh->group) { + sh->group->stripes_cnt--; + sh->group = NULL; + } + } + atomic_inc(&sh->count); + spin_unlock(&conf->device_lock); + + return sh; +} + /* * Need to check if array has failed when deciding whether to: * - start an array @@ -716,7 +759,6 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, { struct stripe_head *sh; int hash = stripe_hash_locks_hash(conf, sector); - int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 
@@ -726,57 +768,34 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0 || noquiesce, *(conf->hash_locks + hash)); - sh = __find_stripe(conf, sector, conf->generation - previous); - if (!sh) { - if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { - sh = get_free_stripe(conf, hash); - if (!sh && !test_bit(R5_DID_ALLOC, - &conf->cache_state)) - set_bit(R5_ALLOC_MORE, - &conf->cache_state); - } - if (noblock && sh == NULL) - break; + sh = find_get_stripe(conf, sector, conf->generation - previous, + hash); + if (sh) + break; - r5c_check_stripe_cache_usage(conf); - if (!sh) { - set_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state); - r5l_wake_reclaim(conf->log, 0); - wait_event_lock_irq( - conf->wait_for_stripe, + if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) { + sh = get_free_stripe(conf, hash); + if (!sh && !test_bit(R5_DID_ALLOC, &conf->cache_state)) + set_bit(R5_ALLOC_MORE, &conf->cache_state); + } + if (noblock && !sh) + break; + + r5c_check_stripe_cache_usage(conf); + if (!sh) { + set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + r5l_wake_reclaim(conf->log, 0); + wait_event_lock_irq(conf->wait_for_stripe, !list_empty(conf->inactive_list + hash) && (atomic_read(&conf->active_stripes) < (conf->max_nr_stripes * 3 / 4) || !test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)), *(conf->hash_locks + hash)); - clear_bit(R5_INACTIVE_BLOCKED, - &conf->cache_state); - } else { - init_stripe(sh, sector, previous); - atomic_inc(&sh->count); - } - } else if (!atomic_inc_not_zero(&sh->count)) { - spin_lock(&conf->device_lock); - if (!atomic_read(&sh->count)) { - if (!test_bit(STRIPE_HANDLE, &sh->state)) - atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&sh->lru) && - !test_bit(STRIPE_EXPANDING, &sh->state)); - inc_empty_inactive_list_flag = 0; - if (!list_empty(conf->inactive_list + hash)) - inc_empty_inactive_list_flag = 1; - list_del_init(&sh->lru); - if 
(list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) - atomic_inc(&conf->empty_inactive_list_nr); - if (sh->group) { - sh->group->stripes_cnt--; - sh->group = NULL; - } - } + clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state); + } else { + init_stripe(sh, sector, previous); atomic_inc(&sh->count); - spin_unlock(&conf->device_lock); } } while (sh == NULL); @@ -830,7 +849,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh sector_t head_sector, tmp_sec; int hash; int dd_idx; - int inc_empty_inactive_list_flag; /* Don't cross chunks, so stripe pd_idx/qd_idx is the same */ tmp_sec = sh->sector; @@ -840,28 +858,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh hash = stripe_hash_locks_hash(conf, head_sector); spin_lock_irq(conf->hash_locks + hash); - head = __find_stripe(conf, head_sector, conf->generation); - if (head && !atomic_inc_not_zero(&head->count)) { - spin_lock(&conf->device_lock); - if (!atomic_read(&head->count)) { - if (!test_bit(STRIPE_HANDLE, &head->state)) - atomic_inc(&conf->active_stripes); - BUG_ON(list_empty(&head->lru) && - !test_bit(STRIPE_EXPANDING, &head->state)); - inc_empty_inactive_list_flag = 0; - if (!list_empty(conf->inactive_list + hash)) - inc_empty_inactive_list_flag = 1; - list_del_init(&head->lru); - if (list_empty(conf->inactive_list + hash) && inc_empty_inactive_list_flag) - atomic_inc(&conf->empty_inactive_list_nr); - if (head->group) { - head->group->stripes_cnt--; - head->group = NULL; - } - } - atomic_inc(&head->count); - spin_unlock(&conf->device_lock); - } + head = find_get_stripe(conf, head_sector, conf->generation, hash); spin_unlock_irq(conf->hash_locks + hash); if (!head) From 09cf533457791bf2ee15e7e370f44032ca64101a Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:36 -0600 Subject: [PATCH 0217/1250] md/raid5: Factor out helper from raid5_make_request() loop Factor out the inner loop of raid5_make_request() 
into its own helper called make_stripe_request(). The helper returns a number of statuses: SUCCESS, RETRY, SCHEDULE_AND_RETRY and FAIL. This makes the code a bit easier to understand and allows the SCHEDULE_AND_RETRY path to be made common. A context structure is added to contain do_flush. It will be used more in subsequent patches for state that needs to be kept outside the loop. No functional changes intended. This will be cleaned up further in subsequent patches to untangle the gen_lock and do_prepare logic further. Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 231 ++++++++++++++++++++++++++------------------- 1 file changed, 133 insertions(+), 98 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1bbf87d15bc8ca..26ef292842de0f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5786,17 +5786,139 @@ static bool ahead_of_reshape(struct mddev *mddev, sector_t sector, sector >= reshape_sector; } +enum stripe_result { + STRIPE_SUCCESS = 0, + STRIPE_RETRY, + STRIPE_SCHEDULE_AND_RETRY, + STRIPE_FAIL, +}; + +struct stripe_request_ctx { + /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ + bool do_flush; +}; + +static enum stripe_result make_stripe_request(struct mddev *mddev, + struct r5conf *conf, struct stripe_request_ctx *ctx, + sector_t logical_sector, struct bio *bi, int seq) +{ + const int rw = bio_data_dir(bi); + enum stripe_result ret; + struct stripe_head *sh; + sector_t new_sector; + int previous = 0; + int dd_idx; + + if (unlikely(conf->reshape_progress != MaxSector)) { + /* + * Spinlock is needed as reshape_progress may be + * 64bit on a 32bit platform, and so it might be + * possible to see a half-updated value + * Of course reshape_progress could change after + * the lock is dropped, so once we get a reference + * to the stripe that we think it is, we will have + * to check again.
+ */ + spin_lock_irq(&conf->device_lock); + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_progress)) { + previous = 1; + } else { + if (ahead_of_reshape(mddev, logical_sector, + conf->reshape_safe)) { + spin_unlock_irq(&conf->device_lock); + return STRIPE_SCHEDULE_AND_RETRY; + } + } + spin_unlock_irq(&conf->device_lock); + } + + new_sector = raid5_compute_sector(conf, logical_sector, previous, + &dd_idx, NULL); + pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, + new_sector, logical_sector); + + sh = raid5_get_active_stripe(conf, new_sector, previous, + (bi->bi_opf & REQ_RAHEAD), 0); + if (unlikely(!sh)) { + /* cannot get stripe, just give-up */ + bi->bi_status = BLK_STS_IOERR; + return STRIPE_FAIL; + } + + if (unlikely(previous)) { + /* + * Expansion might have moved on while waiting for a + * stripe, so we must do the range check again. + * Expansion could still move past after this + * test, but as we are holding a reference to + * 'sh', we know that if that happens, + * STRIPE_EXPANDING will get set and the expansion + * won't proceed until we finish with the stripe. + */ + int must_retry = 0; + spin_lock_irq(&conf->device_lock); + if (!ahead_of_reshape(mddev, logical_sector, + conf->reshape_progress)) + /* mismatch, need to try again */ + must_retry = 1; + spin_unlock_irq(&conf->device_lock); + if (must_retry) { + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + } + + if (read_seqcount_retry(&conf->gen_lock, seq)) { + /* Might have got the wrong stripe_head by accident */ + ret = STRIPE_RETRY; + goto out_release; + } + + if (test_bit(STRIPE_EXPANDING, &sh->state) || + !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { + /* + * Stripe is busy expanding or add failed due to + * overlap. Flush everything and wait a while. 
+ */ + md_wakeup_thread(mddev->thread); + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; + } + + if (stripe_can_batch(sh)) + stripe_add_to_batch_list(conf, sh); + + if (ctx->do_flush) { + set_bit(STRIPE_R5C_PREFLUSH, &sh->state); + /* we only need flush for one stripe */ + ctx->do_flush = false; + } + + set_bit(STRIPE_HANDLE, &sh->state); + clear_bit(STRIPE_DELAYED, &sh->state); + if ((!sh->batch_head || sh == sh->batch_head) && + (bi->bi_opf & REQ_SYNC) && + !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + atomic_inc(&conf->preread_active_stripes); + + release_stripe_plug(mddev, sh); + return STRIPE_SUCCESS; + +out_release: + raid5_release_stripe(sh); + return ret; +} + static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; - int dd_idx; - sector_t new_sector; sector_t logical_sector, last_sector; - struct stripe_head *sh; + struct stripe_request_ctx ctx = {}; const int rw = bio_data_dir(bi); + enum stripe_result res; DEFINE_WAIT(w); bool do_prepare; - bool do_flush = false; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -5812,7 +5934,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, * we need to flush journal device */ - do_flush = bi->bi_opf & REQ_PREFLUSH; + ctx.do_flush = bi->bi_opf & REQ_PREFLUSH; } if (!md_write_start(mddev, bi)) @@ -5852,117 +5974,30 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) md_account_bio(mddev, &bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { - int previous; int seq; do_prepare = false; retry: seq = read_seqcount_begin(&conf->gen_lock); - previous = 0; if (do_prepare) prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - if (unlikely(conf->reshape_progress != MaxSector)) { - /* spinlock is 
needed as reshape_progress may be - * 64bit on a 32bit platform, and so it might be - * possible to see a half-updated value - * Of course reshape_progress could change after - * the lock is dropped, so once we get a reference - * to the stripe that we think it is, we will have - * to check again. - */ - spin_lock_irq(&conf->device_lock); - if (ahead_of_reshape(mddev, logical_sector, - conf->reshape_progress)) { - previous = 1; - } else { - if (ahead_of_reshape(mddev, logical_sector, - conf->reshape_safe)) { - spin_unlock_irq(&conf->device_lock); - schedule(); - do_prepare = true; - goto retry; - } - } - spin_unlock_irq(&conf->device_lock); - } - - new_sector = raid5_compute_sector(conf, logical_sector, - previous, - &dd_idx, NULL); - pr_debug("raid456: raid5_make_request, sector %llu logical %llu\n", - (unsigned long long)new_sector, - (unsigned long long)logical_sector); - sh = raid5_get_active_stripe(conf, new_sector, previous, - (bi->bi_opf & REQ_RAHEAD), 0); - if (unlikely(!sh)) { - /* cannot get stripe, just give-up */ - bi->bi_status = BLK_STS_IOERR; + res = make_stripe_request(mddev, conf, &ctx, logical_sector, + bi, seq); + if (res == STRIPE_FAIL) break; - } - - if (unlikely(previous)) { - /* expansion might have moved on while waiting for a - * stripe, so we must do the range check again. - * Expansion could still move past after this - * test, but as we are holding a reference to - * 'sh', we know that if that happens, - * STRIPE_EXPANDING will get set and the expansion - * won't proceed until we finish with the stripe. 
- */ - int must_retry = 0; - spin_lock_irq(&conf->device_lock); - if (!ahead_of_reshape(mddev, logical_sector, - conf->reshape_progress)) - /* mismatch, need to try again */ - must_retry = 1; - spin_unlock_irq(&conf->device_lock); - if (must_retry) { - raid5_release_stripe(sh); - schedule(); - do_prepare = true; - goto retry; - } - } - if (read_seqcount_retry(&conf->gen_lock, seq)) { - /* Might have got the wrong stripe_head by accident */ - raid5_release_stripe(sh); + if (res == STRIPE_RETRY) goto retry; - } - if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { - /* - * Stripe is busy expanding or add failed due to - * overlap. Flush everything and wait a while. - */ - md_wakeup_thread(mddev->thread); - raid5_release_stripe(sh); + if (res == STRIPE_SCHEDULE_AND_RETRY) { schedule(); do_prepare = true; goto retry; } - - if (stripe_can_batch(sh)) - stripe_add_to_batch_list(conf, sh); - - if (do_flush) { - set_bit(STRIPE_R5C_PREFLUSH, &sh->state); - /* we only need flush for one stripe */ - do_flush = false; - } - - set_bit(STRIPE_HANDLE, &sh->state); - clear_bit(STRIPE_DELAYED, &sh->state); - if ((!sh->batch_head || sh == sh->batch_head) && - (bi->bi_opf & REQ_SYNC) && - !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) - atomic_inc(&conf->preread_active_stripes); - - release_stripe_plug(mddev, sh); } + finish_wait(&conf->wait_for_overlap, &w); if (rw == WRITE) From 7f181f7bfeb86dccc62ef9747a859fd3144087fd Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:37 -0600 Subject: [PATCH 0218/1250] md/raid5: Drop the do_prepare flag in raid5_make_request() prepare_to_wait() can be reasonably called after schedule instead of setting a flag and preparing in the next loop iteration. This means that prepare_to_wait() will be called before read_seqcount_begin(), but there shouldn't be any reason that the order matters here. On the first iteration of the loop prepare_to_wait() is already called first. 
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 26ef292842de0f..c58e70db204ada 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5918,7 +5918,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) const int rw = bio_data_dir(bi); enum stripe_result res; DEFINE_WAIT(w); - bool do_prepare; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -5976,12 +5975,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int seq; - do_prepare = false; retry: seq = read_seqcount_begin(&conf->gen_lock); - if (do_prepare) - prepare_to_wait(&conf->wait_for_overlap, &w, - TASK_UNINTERRUPTIBLE); res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi, seq); @@ -5993,7 +5988,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (res == STRIPE_SCHEDULE_AND_RETRY) { schedule(); - do_prepare = true; + prepare_to_wait(&conf->wait_for_overlap, &w, + TASK_UNINTERRUPTIBLE); goto retry; } } From 0647deafbf6362b2cbf5fa7fb0cbc2fdcb85a915 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:38 -0600 Subject: [PATCH 0219/1250] md/raid5: Move read_seqcount_begin() into make_stripe_request() Now that prepare_to_wait() isn't in the way, move read_seqcount_begin() into make_stripe_request(). No functional changes intended.
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Reviewed-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c58e70db204ada..345350d34623fc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5800,14 +5800,16 @@ struct stripe_request_ctx { static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, - sector_t logical_sector, struct bio *bi, int seq) + sector_t logical_sector, struct bio *bi) { const int rw = bio_data_dir(bi); enum stripe_result ret; struct stripe_head *sh; sector_t new_sector; int previous = 0; - int dd_idx; + int seq, dd_idx; + + seq = read_seqcount_begin(&conf->gen_lock); if (unlikely(conf->reshape_progress != MaxSector)) { /* @@ -5973,13 +5975,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) md_account_bio(mddev, &bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { - int seq; - retry: - seq = read_seqcount_begin(&conf->gen_lock); - res = make_stripe_request(mddev, conf, &ctx, logical_sector, - bi, seq); + bi); if (res == STRIPE_FAIL) break; From cb775de29f28b36219bde44b6fdef39642e4633d Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:39 -0600 Subject: [PATCH 0220/1250] md/raid5: Refactor for loop in raid5_make_request() into while loop The for loop with retry label can be more cleanly expressed as a while loop by moving the logical_sector increment into the success path. No functional changes intended. 
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 345350d34623fc..17ddaa41147c1f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5974,22 +5974,23 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } md_account_bio(mddev, &bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { - retry: + while (logical_sector < last_sector) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi); if (res == STRIPE_FAIL) break; if (res == STRIPE_RETRY) - goto retry; + continue; if (res == STRIPE_SCHEDULE_AND_RETRY) { schedule(); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - goto retry; + continue; } + + logical_sector += RAID5_STRIPE_SECTORS(conf); } finish_wait(&conf->wait_for_overlap, &w); From 4fcbd9abb6f2a13017b5125539fb79b6b63fb570 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:40 -0600 Subject: [PATCH 0221/1250] md/raid5: Keep a reference to last stripe_head for batch When batching, every stripe head has to find the previous stripe head to add to the batch list. This involves taking the hash lock which is highly contended during IO. Instead of finding the previous stripe_head each time, store a reference to the previous stripe_head in a pointer so that it doesn't require taking the contended lock another time. The reference to the previous stripe must be released before scheduling and waiting for work to get done. Otherwise, it can hold up raid5_activate_delayed() and deadlock. 
Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Acked-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 52 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 17ddaa41147c1f..34f8d6c18bd35c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -843,7 +843,8 @@ static bool stripe_can_batch(struct stripe_head *sh) } /* we only do back search */ -static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh) +static void stripe_add_to_batch_list(struct r5conf *conf, + struct stripe_head *sh, struct stripe_head *last_sh) { struct stripe_head *head; sector_t head_sector, tmp_sec; @@ -856,15 +857,20 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh return; head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); - hash = stripe_hash_locks_hash(conf, head_sector); - spin_lock_irq(conf->hash_locks + hash); - head = find_get_stripe(conf, head_sector, conf->generation, hash); - spin_unlock_irq(conf->hash_locks + hash); - - if (!head) - return; - if (!stripe_can_batch(head)) - goto out; + if (last_sh && head_sector == last_sh->sector) { + head = last_sh; + atomic_inc(&head->count); + } else { + hash = stripe_hash_locks_hash(conf, head_sector); + spin_lock_irq(conf->hash_locks + hash); + head = find_get_stripe(conf, head_sector, conf->generation, + hash); + spin_unlock_irq(conf->hash_locks + hash); + if (!head) + return; + if (!stripe_can_batch(head)) + goto out; + } lock_two_stripes(head, sh); /* clear_batch_ready clear the flag */ @@ -5794,6 +5800,8 @@ enum stripe_result { }; struct stripe_request_ctx { + /* a reference to the last stripe_head for batching */ + struct stripe_head *batch_last; /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ bool do_flush; }; @@ -5888,8 +5896,13 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, goto out_release; 
} - if (stripe_can_batch(sh)) - stripe_add_to_batch_list(conf, sh); + if (stripe_can_batch(sh)) { + stripe_add_to_batch_list(conf, sh, ctx->batch_last); + if (ctx->batch_last) + raid5_release_stripe(ctx->batch_last); + atomic_inc(&sh->count); + ctx->batch_last = sh; + } if (ctx->do_flush) { set_bit(STRIPE_R5C_PREFLUSH, &sh->state); @@ -5984,6 +5997,18 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) continue; if (res == STRIPE_SCHEDULE_AND_RETRY) { + /* + * Must release the reference to batch_last before + * scheduling and waiting for work to be done, + * otherwise the batch_last stripe head could prevent + * raid5_activate_delayed() from making progress + * and thus deadlocking. + */ + if (ctx.batch_last) { + raid5_release_stripe(ctx.batch_last); + ctx.batch_last = NULL; + } + schedule(); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); @@ -5995,6 +6020,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) finish_wait(&conf->wait_for_overlap, &w); + if (ctx.batch_last) + raid5_release_stripe(ctx.batch_last); + if (rw == WRITE) md_write_end(mddev); bio_endio(bi); From b16d91f9b7ea286cff87173a3c4284a61f12f096 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:41 -0600 Subject: [PATCH 0222/1250] md/raid5: Refactor add_stripe_bio() Factor out two helper functions from add_stripe_bio(): one to check for overlap (stripe_bio_overlaps()), and one to actually add the bio to the stripe (__add_stripe_bio()). The latter function will always succeed. 
This will be useful in the next patch so that overlap can be checked for multiple disks before adding any Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 86 ++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 30 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 34f8d6c18bd35c..f12773c2387a72 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3415,39 +3415,32 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, s->locked, s->ops_request); } -/* - * Each stripe/dev can have one or more bion attached. - * toread/towrite point to the first in a chain. - * The bi_next chain must be in order. - */ -static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, - int forwrite, int previous) +static bool stripe_bio_overlaps(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite) { - struct bio **bip; struct r5conf *conf = sh->raid_conf; - int firstwrite=0; + struct bio **bip; - pr_debug("adding bi b#%llu to stripe s#%llu\n", - (unsigned long long)bi->bi_iter.bi_sector, - (unsigned long long)sh->sector); + pr_debug("checking bi b#%llu to stripe s#%llu\n", + bi->bi_iter.bi_sector, sh->sector); - spin_lock_irq(&sh->stripe_lock); /* Don't allow new IO added to stripes in batch list */ if (sh->batch_head) - goto overlap; - if (forwrite) { + return true; + + if (forwrite) bip = &sh->dev[dd_idx].towrite; - if (*bip == NULL) - firstwrite = 1; - } else + else bip = &sh->dev[dd_idx].toread; + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) { if (bio_end_sector(*bip) > bi->bi_iter.bi_sector) - goto overlap; - bip = & (*bip)->bi_next; + return true; + bip = &(*bip)->bi_next; } + if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi)) - goto overlap; + return true; if (forwrite && raid5_has_ppl(conf)) { /* @@ -3476,9 +3469,30 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio 
*bi, int dd_idx, } if (first + conf->chunk_sectors * (count - 1) != last) - goto overlap; + return true; } + return false; +} + +static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + struct r5conf *conf = sh->raid_conf; + struct bio **bip; + int firstwrite = 0; + + if (forwrite) { + bip = &sh->dev[dd_idx].towrite; + if (!*bip) + firstwrite = 1; + } else { + bip = &sh->dev[dd_idx].toread; + } + + while (*bip && (*bip)->bi_iter.bi_sector < bi->bi_iter.bi_sector) + bip = &(*bip)->bi_next; + if (!forwrite || previous) clear_bit(STRIPE_BATCH_READY, &sh->state); @@ -3505,8 +3519,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, } pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (unsigned long long)(*bip)->bi_iter.bi_sector, - (unsigned long long)sh->sector, dd_idx); + (*bip)->bi_iter.bi_sector, sh->sector, dd_idx); if (conf->mddev->bitmap && firstwrite) { /* Cannot hold spinlock over bitmap_startwrite, @@ -3514,7 +3527,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, * we have added to the bitmap and set bm_seq. * So set STRIPE_BITMAP_PENDING to prevent * batching. - * If multiple add_stripe_bio() calls race here they + * If multiple __add_stripe_bio() calls race here they * much all set STRIPE_BITMAP_PENDING. So only the first one * to complete "bitmap_startwrite" gets to set * STRIPE_BIT_DELAY. This is important as once a stripe @@ -3532,14 +3545,27 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, set_bit(STRIPE_BIT_DELAY, &sh->state); } } - spin_unlock_irq(&sh->stripe_lock); +} - return 1; +/* + * Each stripe/dev can have one or more bios attached. + * toread/towrite point to the first in a chain. + * The bi_next chain must be in order. 
+ */ +static bool add_stripe_bio(struct stripe_head *sh, struct bio *bi, + int dd_idx, int forwrite, int previous) +{ + spin_lock_irq(&sh->stripe_lock); - overlap: - set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &sh->dev[dd_idx].flags); + spin_unlock_irq(&sh->stripe_lock); + return false; + } + + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); spin_unlock_irq(&sh->stripe_lock); - return 0; + return true; } static void end_reshape(struct r5conf *conf); From 76816a91b3d24670e5f0e8b5ab49aa6f9c497147 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:42 -0600 Subject: [PATCH 0223/1250] md/raid5: Check all disks in a stripe_head for reshape progress When testing if a previous stripe has had reshape expand past it, use the earliest or latest logical sector in all the disks for that stripe head. This will allow adding multiple disks at a time in a subsequent patch. To do this cleaner, refactor the check into a helper function called stripe_ahead_of_reshape(). Signed-off-by: Logan Gunthorpe Reviewed-by: Christoph Hellwig Signed-off-by: Song Liu --- drivers/md/raid5.c | 53 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f12773c2387a72..b27b754ee18cda 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5818,6 +5818,40 @@ static bool ahead_of_reshape(struct mddev *mddev, sector_t sector, sector >= reshape_sector; } +static bool range_ahead_of_reshape(struct mddev *mddev, sector_t min, + sector_t max, sector_t reshape_sector) +{ + return mddev->reshape_backwards ?
max < reshape_sector : + min >= reshape_sector; +} + +static bool stripe_ahead_of_reshape(struct mddev *mddev, struct r5conf *conf, + struct stripe_head *sh) +{ + sector_t max_sector = 0, min_sector = MaxSector; + bool ret = false; + int dd_idx; + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + if (dd_idx == sh->pd_idx) + continue; + + min_sector = min(min_sector, sh->dev[dd_idx].sector); + max_sector = min(max_sector, sh->dev[dd_idx].sector); + } + + spin_lock_irq(&conf->device_lock); + + if (!range_ahead_of_reshape(mddev, min_sector, max_sector, + conf->reshape_progress)) + /* mismatch, need to try again */ + ret = true; + + spin_unlock_irq(&conf->device_lock); + + return ret; +} + enum stripe_result { STRIPE_SUCCESS = 0, STRIPE_RETRY, @@ -5882,27 +5916,18 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, return STRIPE_FAIL; } - if (unlikely(previous)) { + if (unlikely(previous) && + stripe_ahead_of_reshape(mddev, conf, sh)) { /* - * Expansion might have moved on while waiting for a - * stripe, so we must do the range check again. + * Expansion moved on while waiting for a stripe. * Expansion could still move past after this * test, but as we are holding a reference to * 'sh', we know that if that happens, * STRIPE_EXPANDING will get set and the expansion * won't proceed until we finish with the stripe. 
*/ - int must_retry = 0; - spin_lock_irq(&conf->device_lock); - if (!ahead_of_reshape(mddev, logical_sector, - conf->reshape_progress)) - /* mismatch, need to try again */ - must_retry = 1; - spin_unlock_irq(&conf->device_lock); - if (must_retry) { - ret = STRIPE_SCHEDULE_AND_RETRY; - goto out_release; - } + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out_release; } if (read_seqcount_retry(&conf->gen_lock, seq)) { From a5b9c6a653fb608220f68069635a265132172400 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:43 -0600 Subject: [PATCH 0224/1250] md/raid5: Pivot raid5_make_request() raid5_make_request() loops through every page in the request, finds the appropriate stripe and adds the bio for that page in the disk. This causes a great deal of contention on the hash_lock and extra work seeing each stripe must be found once for every data disk. The number of times a stripe must be found can be reduced by pivoting raid5_make_request() so that it loops through every stripe and then loops through every disk in that stripe to see if the bio must be added. This reduces the number of times the hash lock must be taken by a factor equal to the number of data disks. To accomplish this, the logical sectors that have already been added must be tracked. Tracking them is done with a bitmap: the bits for all pages are set at the start of the request and each bit is cleared once the bio is added to a stripe. Finding the next sector to be done is then just a call to find_first_bit() so that sectors that have been done can simply be skipped. One minor downside is that the maximum sectors for a request must be limited so that the bitmap can be appropriately sized on the stack. This limit is arbitrarily chosen to be 256 stripe pages which works out to 1MB if PAGE_SIZE == DEFAULT_STRIPE_SIZE. 
This doesn't actually restrict the maximum request further seeing the default block queue settings are used which restricts the number of segments to 128 (which results in request sizes that are approximately 512KB). Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 89 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b27b754ee18cda..ca7dbd281ad256 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -61,6 +61,8 @@ #define cpu_to_group(cpu) cpu_to_node(cpu) #define ANY_GROUP NUMA_NO_NODE +#define RAID5_MAX_REQ_STRIPES 256 + static bool devices_handle_discard_safely = false; module_param(devices_handle_discard_safely, bool, 0644); MODULE_PARM_DESC(devices_handle_discard_safely, @@ -5862,10 +5864,69 @@ enum stripe_result { struct stripe_request_ctx { /* a reference to the last stripe_head for batching */ struct stripe_head *batch_last; + + /* first sector in the request */ + sector_t first_sector; + + /* last sector in the request */ + sector_t last_sector; + + /* bitmap to track stripe sectors that have been added to stripes */ + DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES); + /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ bool do_flush; }; +static int add_all_stripe_bios(struct r5conf *conf, + struct stripe_request_ctx *ctx, struct stripe_head *sh, + struct bio *bi, int forwrite, int previous) +{ + int dd_idx; + int ret = 1; + + spin_lock_irq(&sh->stripe_lock); + + for (dd_idx = 0; dd_idx < sh->disks; dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + if (stripe_bio_overlaps(sh, bi, dd_idx, forwrite)) { + set_bit(R5_Overlap, &dev->flags); + ret = 0; + continue; + } + } + + if (!ret) + goto out; + + for (dd_idx = 0; dd_idx < sh->disks; 
dd_idx++) { + struct r5dev *dev = &sh->dev[dd_idx]; + + if (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx) + continue; + + if (dev->sector < ctx->first_sector || + dev->sector >= ctx->last_sector) + continue; + + __add_stripe_bio(sh, bi, dd_idx, forwrite, previous); + clear_bit((dev->sector - ctx->first_sector) >> + RAID5_STRIPE_SHIFT(conf), ctx->sectors_to_do); + } + +out: + spin_unlock_irq(&sh->stripe_lock); + return ret; +} + static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5937,7 +5998,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, } if (test_bit(STRIPE_EXPANDING, &sh->state) || - !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { + !add_all_stripe_bios(conf, ctx, sh, bi, rw, previous)) { /* * Stripe is busy expanding or add failed due to * overlap. Flush everything and wait a while. @@ -5979,11 +6040,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { struct r5conf *conf = mddev->private; - sector_t logical_sector, last_sector; + sector_t logical_sector; struct stripe_request_ctx ctx = {}; const int rw = bio_data_dir(bi); enum stripe_result res; DEFINE_WAIT(w); + int s; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -6023,9 +6085,14 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); - last_sector = bio_end_sector(bi); + ctx.first_sector = logical_sector; + ctx.last_sector = bio_end_sector(bi); bi->bi_next = NULL; + bitmap_set(ctx.sectors_to_do, 0, + DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, + RAID5_STRIPE_SECTORS(conf))); + /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && @@ -6038,7 
+6105,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) } md_account_bio(mddev, &bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - while (logical_sector < last_sector) { + while (1) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi); if (res == STRIPE_FAIL) @@ -6066,7 +6133,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) continue; } - logical_sector += RAID5_STRIPE_SECTORS(conf); + s = find_first_bit(ctx.sectors_to_do, RAID5_MAX_REQ_STRIPES); + if (s == RAID5_MAX_REQ_STRIPES) + break; + + logical_sector = ctx.first_sector + + (s << RAID5_STRIPE_SHIFT(conf)); } finish_wait(&conf->wait_for_overlap, &w); @@ -7923,7 +7995,12 @@ static int raid5_run(struct mddev *mddev) mddev->queue->limits.discard_granularity < stripe) blk_queue_max_discard_sectors(mddev->queue, 0); - blk_queue_max_hw_sectors(mddev->queue, UINT_MAX); + /* + * Requests require having a bitmap for each stripe. + * Limit the max sectors based on this. + */ + blk_queue_max_hw_sectors(mddev->queue, + RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); } if (log_init(conf, journal_dev, raid5_has_ppl(conf))) From a3906874f1f5fbfbc02a18bc653070d5056b9b44 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:44 -0600 Subject: [PATCH 0225/1250] md/raid5: Improve debug prints Add a debug print for raid5_make_request() so that each request is printed and add the logical sector number to the debug print in __add_stripe_bio(). 
Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ca7dbd281ad256..e01f6211c947f8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3520,8 +3520,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi, sh->overwrite_disks++; } - pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", - (*bip)->bi_iter.bi_sector, sh->sector, dd_idx); + pr_debug("added bi b#%llu to stripe s#%llu, disk %d, logical %llu\n", + (*bip)->bi_iter.bi_sector, sh->sector, dd_idx, + sh->dev[dd_idx].sector); if (conf->mddev->bitmap && firstwrite) { /* Cannot hold spinlock over bitmap_startwrite, @@ -6093,6 +6094,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, RAID5_STRIPE_SECTORS(conf))); + pr_debug("raid456: %s, logical %llu to %llu\n", __func__, + bi->bi_iter.bi_sector, ctx.last_sector); + /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && (conf->reshape_progress != MaxSector) && From e969c8949e9b4b5a45345fc73e998480edcc42b7 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 16 Jun 2022 13:19:45 -0600 Subject: [PATCH 0226/1250] md/raid5: Increase restriction on max segments per request The block layer defaults the maximum segments to 128, which means requests tend to get split around the 512KB depending on how many pages can be merged. There's no such restriction in the raid5 code so increase the limit to USHRT_MAX so that larger requests can be sent as one. 
Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e01f6211c947f8..184145b49b7c04 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8005,6 +8005,9 @@ static int raid5_run(struct mddev *mddev) */ blk_queue_max_hw_sectors(mddev->queue, RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); + + /* No restrictions on the number of segments in the request */ + blk_queue_max_segments(mddev->queue, USHRT_MAX); } if (log_init(conf, journal_dev, raid5_has_ppl(conf))) From ff4ec5f79108cf82fe7168547c76fe754c4ade0a Mon Sep 17 00:00:00 2001 From: Zhang Jiaming Date: Sat, 2 Jul 2022 09:54:11 +0800 Subject: [PATCH 0227/1250] md: Fix spelling mistake in comments There are 2 spelling mistakes in comments. Fix it. Signed-off-by: Zhang Jiaming Signed-off-by: Song Liu --- drivers/md/md-cluster.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 37cbcce3cc66bc..742b2349fea34a 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -40,7 +40,7 @@ struct resync_info { /* Lock the send communication. This is done through * bit manipulation as opposed to a mutex in order to - * accomodate lock and hold. See next comment. + * accommodate lock and hold. See next comment. */ #define MD_CLUSTER_SEND_LOCK 4 /* If cluster operations (such as adding a disk) must lock the @@ -689,7 +689,7 @@ static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked) /* * If resync thread run after raid1d thread, then process_metadata_update * could not continue if raid1d held reconfig_mutex (and raid1d is blocked - * since another node already got EX on Token and waitting the EX of Ack), + * since another node already got EX on Token and waiting the EX of Ack), * so let resync wake up thread in case flag is set. 
*/ if (mddev_locked && !test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, From aa516a92584eabad397f0a47597b20754521c876 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 3 Jul 2022 18:05:43 +0200 Subject: [PATCH 0228/1250] block: null_blk: Use the bitmap API to allocate bitmaps Use bitmap_zalloc()/bitmap_free() instead of hand-writing them. It is less verbose and it improves the semantic. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/7c4d3116ba843fc4a8ae557dd6176352a6cd0985.1656864320.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 6b67088f4ea71c..3778df206b0136 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1656,7 +1656,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, static void cleanup_queue(struct nullb_queue *nq) { - kfree(nq->tag_map); + bitmap_free(nq->tag_map); kfree(nq->cmds); } @@ -1783,14 +1783,13 @@ static const struct block_device_operations null_rq_ops = { static int setup_commands(struct nullb_queue *nq) { struct nullb_cmd *cmd; - int i, tag_size; + int i; nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); if (!nq->cmds) return -ENOMEM; - tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG; - nq->tag_map = kcalloc(tag_size, sizeof(unsigned long), GFP_KERNEL); + nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL); if (!nq->tag_map) { kfree(nq->cmds); return -ENOMEM; From 84537fa46387bc8276f0db57015ca564d970a359 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 1 Jul 2022 17:46:02 +0800 Subject: [PATCH 0229/1250] ARM: omap1: call platform_device_put() in error case in omap1_dm_timer_init() If platform_device_add() is not called or failed, it should call platform_device_put() in error case. 
Fixes: 97933d6ced60 ("ARM: OMAP1: dmtimer: conversion to platform devices") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Message-Id: <20220701094602.2365099-1-yangyingliang@huawei.com> Signed-off-by: Tony Lindgren --- arch/arm/mach-omap1/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mach-omap1/timer.c b/arch/arm/mach-omap1/timer.c index 9ed64345f06e99..2b282e7b9418a3 100644 --- a/arch/arm/mach-omap1/timer.c +++ b/arch/arm/mach-omap1/timer.c @@ -166,7 +166,7 @@ static int __init omap1_dm_timer_init(void) kfree(pdata); err_free_pdev: - platform_device_unregister(pdev); + platform_device_put(pdev); return ret; } From 61480dc07401e1e6587f7f4a888d6a87c0abaaab Mon Sep 17 00:00:00 2001 From: Liang He Date: Mon, 20 Jun 2022 22:56:19 +0800 Subject: [PATCH 0230/1250] bus: ti-sysc: Fix refcount leak bugs In sysc_init_stdout_path(), there is only one of_node_put() for the second of_find_node_by_path(). However, we need to add one of_node_put() for the first of_find_node_by_path(). In sysc_init_static_data(), we need one of_node_put() for the of_find_node_by_path() to keep refcount balance. 
Signed-off-by: Liang He Message-Id: <20220620145619.4074665-1-windhl@126.com> Signed-off-by: Tony Lindgren --- drivers/bus/ti-sysc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bus/ti-sysc.c b/drivers/bus/ti-sysc.c index 73fd42dd25b31f..51d772bb80461b 100644 --- a/drivers/bus/ti-sysc.c +++ b/drivers/bus/ti-sysc.c @@ -752,6 +752,7 @@ static void sysc_init_stdout_path(struct sysc *ddata) goto err; uart = of_get_property(np, "stdout-path", NULL); + of_node_put(np); if (!uart) goto err; @@ -3139,6 +3140,7 @@ static int sysc_init_static_data(struct sysc *ddata) np = of_find_node_by_path("/ocp"); WARN_ONCE(np && of_device_is_compatible(np, "simple-bus"), "ti-sysc: Incomplete old dtb, please update\n"); + of_node_put(np); break; default: break; From 7fdc6a2af1ae2e7ef8740aa395fb6a9162e970e8 Mon Sep 17 00:00:00 2001 From: Liang He Date: Tue, 28 Jun 2022 19:29:39 +0800 Subject: [PATCH 0231/1250] ARM: OMAP2+: omap4-common: Fix refcount leak bug In omap4_sram_init(), of_find_compatible_node() will return a node pointer with refcount incremented. We should use of_node_put() when it is not used anymore. Signed-off-by: Liang He Message-Id: <20220628112939.160737-1-windhl@126.com> Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/omap4-common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-omap2/omap4-common.c b/arch/arm/mach-omap2/omap4-common.c index 6d1eb4eefefe58..d9ed2a5dcd5efb 100644 --- a/arch/arm/mach-omap2/omap4-common.c +++ b/arch/arm/mach-omap2/omap4-common.c @@ -140,6 +140,7 @@ static int __init omap4_sram_init(void) __func__); else sram_sync = (void __iomem *)gen_pool_alloc(sram_pool, PAGE_SIZE); + of_node_put(np); return 0; } From 6a9720576cd00d30722c5f755bd17d4cfa9df636 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 30 Jun 2022 15:10:57 -0400 Subject: [PATCH 0232/1250] virtio: VIRTIO_HARDEN_NOTIFICATION is broken This option doesn't really work and breaks too many drivers. 
Not yet sure what's the right thing to do, for now let's make sure randconfig isn't broken by this. Fixes: c346dae4f3fb ("virtio: disable notification hardening by default") Cc: "Jason Wang" Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang --- drivers/virtio/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index e1556d2a355ae0..afb9051e01253f 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -31,11 +31,12 @@ if VIRTIO_MENU config VIRTIO_HARDEN_NOTIFICATION bool "Harden virtio notification" + depends on BROKEN help Enable this to harden the device notifications and suppress those that happen at a time where notifications are illegal. - Experimental: Note that several drivers still have bugs that + Experimental: Note that several drivers still have issues that may cause crashes or hangs when correct handling of notifications is enforced; depending on the subset of drivers and devices you use, this may or may not work. From 69b03d7ec72cba04550c0634bb64dc63968363fa Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 4 Jul 2022 21:06:46 +0200 Subject: [PATCH 0233/1250] csky: Use the bitmap API to allocate bitmaps Use bitmap_zalloc()/bitmap_free() instead of hand-writing them. It is less verbose and it improves the semantic. While at it, turn a bitmap_clear() into an equivalent bitmap_zero(). It is also less verbose. Signed-off-by: Christophe JAILLET Signed-off-by: Guo Ren --- arch/csky/mm/asid.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/csky/mm/asid.c b/arch/csky/mm/asid.c index b2e914745c1d0e..7fb6c417bbacdc 100644 --- a/arch/csky/mm/asid.c +++ b/arch/csky/mm/asid.c @@ -27,7 +27,7 @@ static void flush_context(struct asid_info *info) u64 asid; /* Update the list of reserved ASIDs and the ASID bitmap. 
*/ - bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info)); + bitmap_zero(info->map, NUM_CTXT_ASIDS(info)); for_each_possible_cpu(i) { asid = atomic64_xchg_relaxed(&active_asid(info, i), 0); @@ -178,8 +178,7 @@ int asid_allocator_init(struct asid_info *info, */ WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus()); atomic64_set(&info->generation, ASID_FIRST_VERSION(info)); - info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)), - sizeof(*info->map), GFP_KERNEL); + info->map = bitmap_zalloc(NUM_CTXT_ASIDS(info), GFP_KERNEL); if (!info->map) return -ENOMEM; From a2ea44f19d28aff9a65ef208156642c8ddf9da9c Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Jul 2022 14:00:15 +0800 Subject: [PATCH 0234/1250] csky: Correct position of _stext Correct position of _stext to prevent check_kernel_text_object warning [1]. [1] https://lore.kernel.org/linux-csky/YfLpNkmlvoR8iPcq@ls3530/ Signed-off-by: Guo Ren Signed-off-by: Guo Ren Cc: Helge Deller --- arch/csky/include/asm/sections.h | 10 ++++++++++ arch/csky/kernel/setup.c | 4 ++-- arch/csky/kernel/vmlinux.lds.S | 3 ++- 3 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 arch/csky/include/asm/sections.h diff --git a/arch/csky/include/asm/sections.h b/arch/csky/include/asm/sections.h new file mode 100644 index 00000000000000..4192cba8445dc4 --- /dev/null +++ b/arch/csky/include/asm/sections.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __ASM_SECTIONS_H +#define __ASM_SECTIONS_H + +#include + +extern char _start[]; + +#endif /* __ASM_SECTIONS_H */ diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index c64e7be2045b5d..106fbf0b6f3b40 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -31,7 +31,7 @@ static void __init csky_memblock_init(void) unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; signed long size; - memblock_reserve(__pa(_stext), _end - _stext); + memblock_reserve(__pa(_start), _end - _start); early_init_fdt_reserve_self(); 
early_init_fdt_scan_reserved_mem(); @@ -78,7 +78,7 @@ void __init setup_arch(char **cmdline_p) pr_info("Phys. mem: %ldMB\n", (unsigned long) memblock_phys_mem_size()/1024/1024); - setup_initial_init_mm(_stext, _etext, _edata, _end); + setup_initial_init_mm(_start, _etext, _edata, _end); parse_early_param(); diff --git a/arch/csky/kernel/vmlinux.lds.S b/arch/csky/kernel/vmlinux.lds.S index e8b1a4a497980a..163a8cd8b9a6ff 100644 --- a/arch/csky/kernel/vmlinux.lds.S +++ b/arch/csky/kernel/vmlinux.lds.S @@ -22,7 +22,7 @@ SECTIONS { . = PAGE_OFFSET + PHYS_OFFSET_OFFSET; - _stext = .; + _start = .; __init_begin = .; HEAD_TEXT_SECTION INIT_TEXT_SECTION(PAGE_SIZE) @@ -33,6 +33,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + _stext = .; VBR_BASE IRQENTRY_TEXT SOFTIRQENTRY_TEXT From f5ac0fb05b2334040591ecbbe85b6fa73d75736e Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 5 Jul 2022 14:16:21 +0800 Subject: [PATCH 0235/1250] csky: Move HEAD_TEXT_SECTION out of __init_begin-end Prevent HEAD_TEXT_SECTION back into the buddy system. Signed-off-by: Guo Ren Signed-off-by: Guo Ren --- arch/csky/kernel/vmlinux.lds.S | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/csky/kernel/vmlinux.lds.S b/arch/csky/kernel/vmlinux.lds.S index 163a8cd8b9a6ff..68c980d084829a 100644 --- a/arch/csky/kernel/vmlinux.lds.S +++ b/arch/csky/kernel/vmlinux.lds.S @@ -23,13 +23,8 @@ SECTIONS . = PAGE_OFFSET + PHYS_OFFSET_OFFSET; _start = .; - __init_begin = .; HEAD_TEXT_SECTION - INIT_TEXT_SECTION(PAGE_SIZE) - INIT_DATA_SECTION(PAGE_SIZE) - PERCPU_SECTION(L1_CACHE_BYTES) . = ALIGN(PAGE_SIZE); - __init_end = .; .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; @@ -49,7 +44,12 @@ SECTIONS /* __init_begin __init_end must be page aligned for free_initmem */ . = ALIGN(PAGE_SIZE); - + __init_begin = .; + INIT_TEXT_SECTION(PAGE_SIZE) + INIT_DATA_SECTION(PAGE_SIZE) + PERCPU_SECTION(L1_CACHE_BYTES) + . 
= ALIGN(PAGE_SIZE); + __init_end = .; _sdata = .; RO_DATA(PAGE_SIZE) From e655852c1275350c11ffa0a2fd1aac310ba0e252 Mon Sep 17 00:00:00 2001 From: Alexandre Torgue Date: Mon, 20 Jun 2022 14:30:18 +0200 Subject: [PATCH 0236/1250] ARM: dts: stm32: add fake interrupt property for ASync notif - TEMP/TO REMOVE Current internal optee version enables async notif and in such a case interrupt is mandatory in optee node. Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32mp131.dtsi | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/boot/dts/stm32mp131.dtsi b/arch/arm/boot/dts/stm32mp131.dtsi index 3a921db23e9f62..1dd647b9a79148 100644 --- a/arch/arm/boot/dts/stm32mp131.dtsi +++ b/arch/arm/boot/dts/stm32mp131.dtsi @@ -33,6 +33,8 @@ optee { method = "smc"; compatible = "linaro,optee-tz"; + interrupt-parent = <&intc>; + interrupts = ; }; scmi: scmi { From 12459acb849c46aeb3ae5cd82ac44808ff04f7d5 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 26 Jun 2022 02:15:59 +0200 Subject: [PATCH 0237/1250] ARM: dts: stm32: Fix SPI2 pinmux pin comments on stm32mp15 Those pin comments refer to SPI2 pins, not SPI1 pins, update the comments. No functional change.
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32mp15-pinctrl.dtsi | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi index 6052243ad81c53..658162e2f42099 100644 --- a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi +++ b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi @@ -1794,15 +1794,15 @@ spi2_pins_a: spi2-0 { pins1 { - pinmux = , /* SPI1_SCK */ - ; /* SPI1_MOSI */ + pinmux = , /* SPI2_SCK */ + ; /* SPI2_MOSI */ bias-disable; drive-push-pull; slew-rate = <1>; }; pins2 { - pinmux = ; /* SPI1_MISO */ + pinmux = ; /* SPI2_MISO */ bias-disable; }; }; From bd085611fa5c64f7dc64a5bfc6405b9c4fafae93 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 26 Jun 2022 02:20:59 +0200 Subject: [PATCH 0238/1250] dt-bindings: arm: stm32: Add compatible string for DH electronics DHCOR DRC Compact Add DT compatible string for DH electronics STM32MP15xx DHCOR on DRC Compact carrier board into YAML DT binding document. This system is a general purpose DIN Rail Controller design. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: Maxime Coquelin Cc: Patrice Chotard Cc: Patrick Delaunay Cc: Rob Herring Cc: devicetree@vger.kernel.org Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Reviewed-by: Rob Herring Signed-off-by: Alexandre Torgue --- Documentation/devicetree/bindings/arm/stm32/stm32.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/arm/stm32/stm32.yaml b/Documentation/devicetree/bindings/arm/stm32/stm32.yaml index 8b31565fee591f..4c605bccc474ed 100644 --- a/Documentation/devicetree/bindings/arm/stm32/stm32.yaml +++ b/Documentation/devicetree/bindings/arm/stm32/stm32.yaml @@ -59,12 +59,18 @@ properties: - prt,prtt1s # Protonic PRTT1S - const: st,stm32mp151 - - description: DH STM32MP153 SoM based Boards + - description: DH STM32MP153 DHCOM SoM based Boards items: - const: dh,stm32mp153c-dhcom-drc02 - const: dh,stm32mp153c-dhcom-som - const: st,stm32mp153 + - description: DH STM32MP153 DHCOR SoM based Boards + items: + - const: dh,stm32mp153c-dhcor-drc-compact + - const: dh,stm32mp153c-dhcor-som + - const: st,stm32mp153 + - items: - enum: - shiratech,stm32mp157a-iot-box # IoT Box From 10506cfb0811bea5690d8863d8a529df4de54ad6 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 26 Jun 2022 02:21:00 +0200 Subject: [PATCH 0239/1250] ARM: dts: stm32: Add alternate pinmux for CAN1 pins Add another mux option for CAN1 pins, this is used on DRC Compact board. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32mp15-pinctrl.dtsi | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi index 658162e2f42099..667d8b74a8a406 100644 --- a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi +++ b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi @@ -978,6 +978,26 @@ }; }; + m_can1_pins_c: m-can1-2 { + pins1 { + pinmux = ; /* CAN1_TX */ + slew-rate = <1>; + drive-push-pull; + bias-disable; + }; + pins2 { + pinmux = ; /* CAN1_RX */ + bias-disable; + }; + }; + + m_can1_sleep_pins_c: m_can1-sleep-2 { + pins { + pinmux = , /* CAN1_TX */ + ; /* CAN1_RX */ + }; + }; + m_can2_pins_a: m-can2-0 { pins1 { pinmux = ; /* CAN2_TX */ From a5eeda7b716078f3ce42e17682424881ddb1efaa Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 26 Jun 2022 02:21:01 +0200 Subject: [PATCH 0240/1250] ARM: dts: stm32: Add alternate pinmux for SPI2 pins Add another mux option for SPI2 pins, this is used on DRC Compact board. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32mp15-pinctrl.dtsi | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi index 667d8b74a8a406..01242cb005b60d 100644 --- a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi +++ b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi @@ -1827,6 +1827,21 @@ }; }; + spi2_pins_b: spi2-1 { + pins1 { + pinmux = , /* SPI2_SCK */ + ; /* SPI2_MOSI */ + bias-disable; + drive-push-pull; + slew-rate = <1>; + }; + + pins2 { + pinmux = ; /* SPI2_MISO */ + bias-disable; + }; + }; + spi4_pins_a: spi4-0 { pins { pinmux = , /* SPI4_SCK */ From 1254697266ea310c886d3049f1d53c78596adc51 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 26 Jun 2022 02:21:02 +0200 Subject: [PATCH 0241/1250] ARM: dts: stm32: Add alternate pinmux for UART3 pins Add another mux option for UART3 pins, this is used on DRC Compact board. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32mp15-pinctrl.dtsi | 41 ++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi index 01242cb005b60d..95781bd8c3099d 100644 --- a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi +++ b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi @@ -2218,6 +2218,47 @@ }; }; + usart3_pins_e: usart3-4 { + pins1 { + pinmux = , /* USART3_TX */ + ; /* USART3_RTS */ + bias-disable; + drive-push-pull; + slew-rate = <0>; + }; + pins2 { + pinmux = , /* USART3_RX */ + ; /* USART3_CTS_NSS */ + bias-pull-up; + }; + }; + + usart3_idle_pins_e: usart3-idle-4 { + pins1 { + pinmux = , /* USART3_TX */ + ; /* USART3_CTS_NSS */ + }; + pins2 { + pinmux = ; /* USART3_RTS */ + bias-disable; + drive-push-pull; + slew-rate = <0>; + }; + pins3 { + pinmux = ; /* USART3_RX */ + bias-pull-up; + }; + }; + + usart3_sleep_pins_e: usart3-sleep-4 { + pins { + pinmux = , /* USART3_TX */ + , /* USART3_RTS */ + , /* USART3_CTS_NSS */ + ; /* USART3_RX */ + }; + }; + usbotg_hs_pins_a: usbotg-hs-0 { pins { pinmux = ; /* OTG_ID */ From ed44f72e55e6bc9d1f4284e47554b703286d05ac Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 26 Jun 2022 02:21:03 +0200 Subject: [PATCH 0242/1250] ARM: dts: stm32: Add alternate pinmux for UART4 pins Add another mux option for UART4 pins, this is used on DRC Compact board. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32mp15-pinctrl.dtsi | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi index 95781bd8c3099d..adcb8683184959 100644 --- a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi +++ b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi @@ -1919,6 +1919,36 @@ }; }; + uart4_pins_d: uart4-3 { + pins1 { + pinmux = ; /* UART4_TX */ + bias-disable; + drive-push-pull; + slew-rate = <0>; + }; + pins2 { + pinmux = ; /* UART4_RX */ + bias-disable; + }; + }; + + uart4_idle_pins_d: uart4-idle-3 { + pins1 { + pinmux = ; /* UART4_TX */ + }; + pins2 { + pinmux = ; /* UART4_RX */ + bias-disable; + }; + }; + + uart4_sleep_pins_d: uart4-sleep-3 { + pins { + pinmux = , /* UART4_TX */ + ; /* UART4_RX */ + }; + }; + uart7_pins_a: uart7-0 { pins1 { pinmux = ; /* UART7_TX */ From c0ea0221644d82de7c2de380a253c9cdc3889f89 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 26 Jun 2022 02:21:04 +0200 Subject: [PATCH 0243/1250] ARM: dts: stm32: Add alternate pinmux for UART5 pins Add another mux option for UART5 pins, this is used on DRC Compact board. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32mp15-pinctrl.dtsi | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi index adcb8683184959..214563a4e56b86 100644 --- a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi +++ b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi @@ -1949,6 +1949,19 @@ }; }; + uart5_pins_a: uart5-0 { + pins1 { + pinmux = ; /* UART5_TX */ + bias-disable; + drive-push-pull; + slew-rate = <0>; + }; + pins2 { + pinmux = ; /* UART5_RX */ + bias-disable; + }; + }; + uart7_pins_a: uart7-0 { pins1 { pinmux = ; /* UART7_TX */ From 775c229299590b761eed7901b2e667ebf536ef0e Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 26 Jun 2022 02:21:05 +0200 Subject: [PATCH 0244/1250] ARM: dts: stm32: Add DHCOR based DRC Compact board Add DT for DH DRC Compact unit, which is a universal controller device. The system has two ethernet ports, one CAN, RS485 and RS232, USB, uSD card slot, eMMC and SDIO Wi-Fi. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/Makefile | 1 + .../dts/stm32mp153c-dhcor-drc-compact.dts | 30 ++ .../dts/stm32mp15xx-dhcor-drc-compact.dtsi | 322 ++++++++++++++++++ 3 files changed, 353 insertions(+) create mode 100644 arch/arm/boot/dts/stm32mp153c-dhcor-drc-compact.dts create mode 100644 arch/arm/boot/dts/stm32mp15xx-dhcor-drc-compact.dtsi diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile index 5112f493f49462..e3e9af031bf589 100644 --- a/arch/arm/boot/dts/Makefile +++ b/arch/arm/boot/dts/Makefile @@ -1192,6 +1192,7 @@ dtb-$(CONFIG_ARCH_STM32) += \ stm32mp151a-prtt1c.dtb \ stm32mp151a-prtt1s.dtb \ stm32mp153c-dhcom-drc02.dtb \ + stm32mp153c-dhcor-drc-compact.dtb \ stm32mp157a-avenger96.dtb \ stm32mp157a-dhcor-avenger96.dtb \ stm32mp157a-dk1.dtb \ diff --git a/arch/arm/boot/dts/stm32mp153c-dhcor-drc-compact.dts b/arch/arm/boot/dts/stm32mp153c-dhcor-drc-compact.dts new file mode 100644 index 00000000000000..c8b9818499ea0e --- /dev/null +++ b/arch/arm/boot/dts/stm32mp153c-dhcor-drc-compact.dts @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright (C) 2022 Marek Vasut + * + * DHCOR STM32MP1 variant: + * DHCR-STM32MP153C-C065-R051-V33-SPI-I-01LG + * DHCOR PCB number: 586-100 or newer + * DRC Compact PCB number: 627-100 or newer + */ + +/dts-v1/; + +#include "stm32mp153.dtsi" +#include "stm32mp15xc.dtsi" +#include "stm32mp15xx-dhcor-som.dtsi" +#include "stm32mp15xx-dhcor-drc-compact.dtsi" + +/ { + model = "DH electronics STM32MP153C DHCOR DRC Compact"; + compatible = "dh,stm32mp153c-dhcor-drc-compact", + "dh,stm32mp153c-dhcor-som", + "st,stm32mp153"; +}; + +&m_can1 { + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&m_can1_pins_c>; + pinctrl-1 = <&m_can1_sleep_pins_c>; + status = "okay"; +}; diff --git a/arch/arm/boot/dts/stm32mp15xx-dhcor-drc-compact.dtsi 
b/arch/arm/boot/dts/stm32mp15xx-dhcor-drc-compact.dtsi new file mode 100644 index 00000000000000..27477bb219dedf --- /dev/null +++ b/arch/arm/boot/dts/stm32mp15xx-dhcor-drc-compact.dtsi @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright (C) 2022 Marek Vasut + */ + +/ { + aliases { + ethernet0 = &ethernet0; + ethernet1 = &ksz8851; + mmc0 = &sdmmc1; + rtc0 = &hwrtc; + rtc1 = &rtc; + serial0 = &uart4; + serial1 = &uart8; + serial2 = &usart3; + serial3 = &uart5; + spi0 = &qspi; + }; + + chosen { + stdout-path = "serial0:115200n8"; + }; + + led { + compatible = "gpio-leds"; + led1 { + label = "yellow:user0"; + gpios = <&gpioz 6 GPIO_ACTIVE_LOW>; + default-state = "off"; + }; + + led2 { + label = "red:user1"; + gpios = <&gpioz 3 GPIO_ACTIVE_LOW>; + default-state = "off"; + }; + }; + + ethernet_vio: vioregulator { + compatible = "regulator-fixed"; + regulator-name = "vio"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + gpio = <&gpioh 2 GPIO_ACTIVE_LOW>; + regulator-always-on; + regulator-boot-on; + vin-supply = <&vdd>; + }; +}; + +&adc { /* X11 ADC inputs */ + pinctrl-names = "default"; + pinctrl-0 = <&adc12_ain_pins_b>; + vdd-supply = <&vdd>; + vdda-supply = <&vdda>; + vref-supply = <&vdda>; + status = "okay"; + + adc1: adc@0 { + st,adc-channels = <0 1 6>; + st,min-sample-time-nsecs = <5000>; + status = "okay"; + }; + + adc2: adc@100 { + st,adc-channels = <0 1 2>; + st,min-sample-time-nsecs = <5000>; + status = "okay"; + }; +}; + +&ethernet0 { + status = "okay"; + pinctrl-0 = <&ethernet0_rgmii_pins_c>; + pinctrl-1 = <&ethernet0_rgmii_sleep_pins_c>; + pinctrl-names = "default", "sleep"; + phy-mode = "rgmii"; + max-speed = <1000>; + phy-handle = <&phy0>; + + mdio0 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "snps,dwmac-mdio"; + reset-gpios = <&gpioz 2 GPIO_ACTIVE_LOW>; + reset-delay-us = <1000>; + reset-post-delay-us = <1000>; + + phy0: ethernet-phy@7 { + reg = <7>; + + rxc-skew-ps = <1500>; +
rxdv-skew-ps = <540>; + rxd0-skew-ps = <420>; + rxd1-skew-ps = <420>; + rxd2-skew-ps = <420>; + rxd3-skew-ps = <420>; + + txc-skew-ps = <1440>; + txen-skew-ps = <540>; + txd0-skew-ps = <420>; + txd1-skew-ps = <420>; + txd2-skew-ps = <420>; + txd3-skew-ps = <420>; + }; + }; +}; + +&fmc { + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&fmc_pins_b>; + pinctrl-1 = <&fmc_sleep_pins_b>; + status = "okay"; + + ksz8851: ethernet@1,0 { + compatible = "micrel,ks8851-mll"; + reg = <1 0x0 0x2>, <1 0x2 0x20000>; + interrupt-parent = <&gpioc>; + interrupts = <3 IRQ_TYPE_LEVEL_LOW>; + bank-width = <2>; + + /* Timing values are in nS */ + st,fmc2-ebi-cs-mux-enable; + st,fmc2-ebi-cs-transaction-type = <4>; + st,fmc2-ebi-cs-buswidth = <16>; + st,fmc2-ebi-cs-address-setup-ns = <5>; + st,fmc2-ebi-cs-address-hold-ns = <5>; + st,fmc2-ebi-cs-bus-turnaround-ns = <5>; + st,fmc2-ebi-cs-data-setup-ns = <45>; + st,fmc2-ebi-cs-data-hold-ns = <1>; + st,fmc2-ebi-cs-write-address-setup-ns = <5>; + st,fmc2-ebi-cs-write-address-hold-ns = <5>; + st,fmc2-ebi-cs-write-bus-turnaround-ns = <5>; + st,fmc2-ebi-cs-write-data-setup-ns = <45>; + st,fmc2-ebi-cs-write-data-hold-ns = <1>; + }; +}; + +&gpioa { + gpio-line-names = "", "", "", "", + "DRCC-VAR2", "", "", "", + "", "", "", "", + "", "", "", ""; +}; + +&gpioe { + gpio-line-names = "", "", "", "", + "", "DRCC-GPIO0", "", "", + "", "", "", "", + "", "", "", ""; +}; + +&gpiog { + gpio-line-names = "", "", "", "", + "", "", "", "", + "", "", "", "", + "DRCC-GPIO5", "", "", ""; +}; + +&gpioh { + gpio-line-names = "", "", "", "DRCC-HW2", + "DRCC-GPIO4", "", "", "", + "DRCC-HW1", "DRCC-HW0", "", "DRCC-VAR1", + "DRCC-VAR0", "", "", "DRCC-GPIO6"; +}; + +&gpioi { + gpio-line-names = "", "", "", "", + "", "", "", "DRCC-GPIO2", + "", "DRCC-GPIO1", "", "", + "", "", "", ""; +}; + +&i2c1 { /* X11 I2C1 */ + pinctrl-names = "default"; + pinctrl-0 = <&i2c1_pins_b>; + i2c-scl-rising-time-ns = <185>; + i2c-scl-falling-time-ns = <20>; + status = "okay"; + 
/delete-property/dmas; + /delete-property/dma-names; +}; + +&i2c4 { + hwrtc: rtc@32 { + compatible = "microcrystal,rv8803"; + reg = <0x32>; + }; + + eeprom@50 { + compatible = "atmel,24c04"; + reg = <0x50>; + pagesize = <16>; + }; +}; + +&sdmmc1 { /* MicroSD */ + pinctrl-names = "default", "opendrain", "sleep"; + pinctrl-0 = <&sdmmc1_b4_pins_a>; + pinctrl-1 = <&sdmmc1_b4_od_pins_a>; + pinctrl-2 = <&sdmmc1_b4_sleep_pins_a>; + cd-gpios = <&gpioi 8 (GPIO_ACTIVE_LOW | GPIO_PULL_UP)>; + disable-wp; + st,neg-edge; + bus-width = <4>; + vmmc-supply = <&vdd>; + vqmmc-supply = <&vdd>; + status = "okay"; +}; + +&sdmmc2 { /* eMMC */ + pinctrl-names = "default", "opendrain", "sleep"; + pinctrl-0 = <&sdmmc2_b4_pins_a &sdmmc2_d47_pins_c>; + pinctrl-1 = <&sdmmc2_b4_od_pins_a &sdmmc2_d47_pins_c>; + pinctrl-2 = <&sdmmc2_b4_sleep_pins_a &sdmmc2_d47_sleep_pins_c>; + bus-width = <8>; + no-sd; + no-sdio; + non-removable; + st,neg-edge; + vmmc-supply = <&v3v3>; + vqmmc-supply = <&vdd>; + status = "okay"; +}; + +&sdmmc3 { /* SDIO Wi-Fi */ + pinctrl-names = "default", "opendrain", "sleep"; + pinctrl-0 = <&sdmmc3_b4_pins_a>; + pinctrl-1 = <&sdmmc3_b4_od_pins_a>; + pinctrl-2 = <&sdmmc3_b4_sleep_pins_a>; + broken-cd; + bus-width = <4>; + mmc-ddr-3_3v; + st,neg-edge; + vmmc-supply = <&v3v3>; + vqmmc-supply = <&v3v3>; + status = "okay"; +}; + +&spi2 { /* X11 SPI */ + pinctrl-names = "default"; + pinctrl-0 = <&spi2_pins_b>; + cs-gpios = <&gpioi 0 0>; + status = "disabled"; + /delete-property/dmas; + /delete-property/dma-names; +}; + +&uart4 { + label = "UART0"; + pinctrl-names = "default"; + pinctrl-0 = <&uart4_pins_d>; + /delete-property/dmas; + /delete-property/dma-names; + status = "okay"; +}; + +&uart5 { /* X11 UART */ + label = "X11-UART5"; + pinctrl-names = "default"; + pinctrl-0 = <&uart5_pins_a>; + /delete-property/dmas; + /delete-property/dma-names; + status = "okay"; +}; + +&uart8 { + label = "RS485-1"; + pinctrl-names = "default"; + pinctrl-0 = <&uart8_pins_a &uart8_rtscts_pins_a>; + 
uart-has-rtscts; + /delete-property/dmas; + /delete-property/dma-names; + status = "okay"; +}; + +&usart3 { /* RS485 or RS232 */ + label = "RS485-2"; + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&usart3_pins_e>; + pinctrl-1 = <&usart3_sleep_pins_e>; + uart-has-rtscts; + /delete-property/dmas; + /delete-property/dma-names; + status = "okay"; +}; + +&usbh_ehci { + phys = <&usbphyc_port0>; + status = "okay"; +}; + +&usbh_ohci { + phys = <&usbphyc_port0>; + status = "okay"; +}; + +&usbotg_hs { + dr_mode = "otg"; + pinctrl-0 = <&usbotg_hs_pins_a>; + pinctrl-names = "default"; + phy-names = "usb2-phy"; + phys = <&usbphyc_port1 0>; + vbus-supply = <&vbus_otg>; + status = "okay"; +}; + +&usbphyc { + status = "okay"; +}; + +&usbphyc_port0 { + phy-supply = <&vdd_usb>; + connector { + compatible = "usb-a-connector"; + vbus-supply = <&vbus_sw>; + }; +}; + +&usbphyc_port1 { + phy-supply = <&vdd_usb>; +}; From 59cd3db262102232a19105bbe1e18bbfab152bda Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 22 May 2022 22:24:02 +0200 Subject: [PATCH 0245/1250] ARM: dts: stm32: Add alternate pinmux for DCMI pins Add another mux option for DCMI pins, this is used on AV96 board. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: Manivannan Sadhasivam Cc: Maxime Coquelin Cc: Patrice Chotard Cc: Patrick Delaunay Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32mp15-pinctrl.dtsi | 37 ++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi index 214563a4e56b86..94db065eada584 100644 --- a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi +++ b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi @@ -151,6 +151,43 @@ }; }; + dcmi_pins_c: dcmi-2 { + pins { + pinmux = ,/* DCMI_HSYNC */ + ,/* DCMI_VSYNC */ + ,/* DCMI_PIXCLK */ + ,/* DCMI_D0 */ + ,/* DCMI_D1 */ + ,/* DCMI_D2 */ + ,/* DCMI_D3 */ + ,/* DCMI_D4 */ + ,/* DCMI_D5 */ + ,/* DCMI_D6 */ + ,/* DCMI_D7 */ + ,/* DCMI_D8 */ + ;/* DCMI_D9 */ + bias-pull-up; + }; + }; + + dcmi_sleep_pins_c: dcmi-sleep-2 { + pins { + pinmux = ,/* DCMI_HSYNC */ + ,/* DCMI_VSYNC */ + ,/* DCMI_PIXCLK */ + ,/* DCMI_D0 */ + ,/* DCMI_D1 */ + ,/* DCMI_D2 */ + ,/* DCMI_D3 */ + ,/* DCMI_D4 */ + ,/* DCMI_D5 */ + ,/* DCMI_D6 */ + ,/* DCMI_D7 */ + ,/* DCMI_D8 */ + ;/* DCMI_D9 */ + }; + }; + ethernet0_rgmii_pins_a: rgmii-0 { pins1 { pinmux = , /* ETH_RGMII_CLK125 */ From 98aa0ceb72d43f4b5838fb6ea1ef4a972afe3eda Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 22 May 2022 22:24:03 +0200 Subject: [PATCH 0246/1250] ARM: dts: stm32: Add alternate pinmux for RCC pin Add another mux option for RCC pin, this is used on AV96 board for e.g. sensor clock supply. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: Manivannan Sadhasivam Cc: Maxime Coquelin Cc: Patrice Chotard Cc: Patrick Delaunay Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- arch/arm/boot/dts/stm32mp15-pinctrl.dtsi | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi index 94db065eada584..2cc9341d43d295 100644 --- a/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi +++ b/arch/arm/boot/dts/stm32mp15-pinctrl.dtsi @@ -960,6 +960,21 @@ }; }; + mco1_pins_a: mco1-0 { + pins { + pinmux = ; /* MCO1 */ + bias-disable; + drive-push-pull; + slew-rate = <1>; + }; + }; + + mco1_sleep_pins_a: mco1-sleep-0 { + pins { + pinmux = ; /* MCO1 */ + }; + }; + mco2_pins_a: mco2-0 { pins { pinmux = ; /* MCO2 */ From 4b43ff02d2a4b809a2e8cfa628a54f9e39a87381 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sun, 22 May 2022 22:24:04 +0200 Subject: [PATCH 0247/1250] ARM: dts: stm32: Add ST MIPID02 bindings to AV96 Add DT bindings for ST MIPID02 and DCMI to Avenger96 base DT. Both the ST MIPID02 and DCMI are disabled by default, as the AV96 camera module is optional. 
Signed-off-by: Marek Vasut Cc: Alexandre Torgue Cc: Manivannan Sadhasivam Cc: linux-stm32@st-md-mailman.stormreply.com To: linux-arm-kernel@lists.infradead.org Signed-off-by: Alexandre Torgue --- .../boot/dts/stm32mp15xx-dhcor-avenger96.dtsi | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/arch/arm/boot/dts/stm32mp15xx-dhcor-avenger96.dtsi b/arch/arm/boot/dts/stm32mp15xx-dhcor-avenger96.dtsi index 76c54b006d8712..90933077d66dec 100644 --- a/arch/arm/boot/dts/stm32mp15xx-dhcor-avenger96.dtsi +++ b/arch/arm/boot/dts/stm32mp15xx-dhcor-avenger96.dtsi @@ -126,6 +126,22 @@ }; }; +&dcmi { + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&dcmi_pins_c>; + pinctrl-1 = <&dcmi_sleep_pins_c>; + status = "disabled"; + + port { + dcmi_0: endpoint { + remote-endpoint = <&stmipi_2>; + bus-type = <5>; + bus-width = <8>; + pclk-sample = <0>; + }; + }; +}; + &ethernet0 { status = "okay"; pinctrl-0 = <&ethernet0_rgmii_pins_c>; @@ -219,6 +235,45 @@ }; &i2c4 { + stmipi: stmipi@14 { + compatible = "st,st-mipid02"; + pinctrl-names = "default", "sleep"; + pinctrl-0 = <&mco1_pins_a>; + pinctrl-1 = <&mco1_sleep_pins_a>; + reg = <0x14>; + clocks = <&rcc CK_MCO1>; + clock-names = "xclk"; + assigned-clocks = <&rcc CK_MCO1>; + assigned-clock-parents = <&rcc CK_HSE>; + assigned-clock-rates = <24000000>; + VDDE-supply = <&v1v8>; + VDDIN-supply = <&v1v8>; + reset-gpios = <&gpioz 0 GPIO_ACTIVE_LOW>; + status = "disabled"; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + stmipi_0: endpoint { + }; + }; + + port@2 { + reg = <2>; + stmipi_2: endpoint { + bus-width = <8>; + hsync-active = <0>; + vsync-active = <0>; + pclk-sample = <0>; + remote-endpoint = <&dcmi_0>; + }; + }; + }; + }; + hdmi-transmitter@3d { compatible = "adi,adv7513"; reg = <0x3d>, <0x4d>, <0x2d>, <0x5d>; From faa4daef55dd9196854c6fe5d1d2dd3324806e01 Mon Sep 17 00:00:00 2001 From: Allen-KH Cheng Date: Fri, 1 Jul 2022 18:34:28 +0800 Subject: [PATCH 0248/1250] dt-bindings: nvmem:
mediatek: efuse: add support for mt8186 Add compatible for mt8186 SoC. Signed-off-by: Allen-KH Cheng Acked-by: Rob Herring Signed-off-by: Srinivas Kandagatla --- Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml b/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml index 7c7233e29ecf16..b5a1109f2ee122 100644 --- a/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml +++ b/Documentation/devicetree/bindings/nvmem/mediatek,efuse.yaml @@ -29,6 +29,7 @@ properties: - mediatek,mt7623-efuse - mediatek,mt8173-efuse - mediatek,mt8183-efuse + - mediatek,mt8186-efuse - mediatek,mt8192-efuse - mediatek,mt8195-efuse - mediatek,mt8516-efuse From 432ee5a3cfcf8676448b0e11f1e800fc49c008d0 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Thu, 23 Jun 2022 14:15:58 +0200 Subject: [PATCH 0249/1250] nvmem: mtk-efuse: Simplify with devm_platform_get_and_ioremap_resource() Convert platform_get_resource(), devm_ioremap_resource() to a single call to devm_platform_get_and_ioremap_resource(), as this is exactly what this function does. No functional changes. 
Signed-off-by: AngeloGioacchino Del Regno Signed-off-by: Srinivas Kandagatla --- drivers/nvmem/mtk-efuse.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/nvmem/mtk-efuse.c b/drivers/nvmem/mtk-efuse.c index e9a375dd84af81..a08e0aedd21c83 100644 --- a/drivers/nvmem/mtk-efuse.c +++ b/drivers/nvmem/mtk-efuse.c @@ -41,8 +41,7 @@ static int mtk_efuse_probe(struct platform_device *pdev) if (!priv) return -ENOMEM; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - priv->base = devm_ioremap_resource(dev, res); + priv->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res); if (IS_ERR(priv->base)) return PTR_ERR(priv->base); From 3bcc2c1eade4e95ee494c30684d0157fba27c824 Mon Sep 17 00:00:00 2001 From: Nicolas Dufresne Date: Fri, 10 Jun 2022 13:52:11 +0100 Subject: [PATCH 0250/1250] media: rkvdec: Disable H.264 error detection Quite often, the HW gets stuck in an error condition if a stream error was detected. As documented, the HW should stop immediately and self-reset. There is likely a problem or a misunderstanding of the self-reset mechanism, as unless we make a long pause, the next command will then report an error even if there is no error in it. Disabling error detection fixes the issue, and lets the decoder continue after an error. This patch is safe for backport into older kernels.
Fixes: cd33c830448b ("media: rkvdec: Add the rkvdec driver") Signed-off-by: Nicolas Dufresne Reviewed-by: Brian Norris Tested-by: Brian Norris Reviewed-by: Ezequiel Garcia Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- drivers/staging/media/rkvdec/rkvdec-h264.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/media/rkvdec/rkvdec-h264.c b/drivers/staging/media/rkvdec/rkvdec-h264.c index 2992fb87cf723e..55596ce6bb6e42 100644 --- a/drivers/staging/media/rkvdec/rkvdec-h264.c +++ b/drivers/staging/media/rkvdec/rkvdec-h264.c @@ -1175,8 +1175,8 @@ static int rkvdec_h264_run(struct rkvdec_ctx *ctx) schedule_delayed_work(&rkvdec->watchdog_work, msecs_to_jiffies(2000)); - writel(0xffffffff, rkvdec->regs + RKVDEC_REG_STRMD_ERR_EN); - writel(0xffffffff, rkvdec->regs + RKVDEC_REG_H264_ERR_E); + writel(0, rkvdec->regs + RKVDEC_REG_STRMD_ERR_EN); + writel(0, rkvdec->regs + RKVDEC_REG_H264_ERR_E); writel(1, rkvdec->regs + RKVDEC_REG_PREF_LUMA_CACHE_COMMAND); writel(1, rkvdec->regs + RKVDEC_REG_PREF_CHR_CACHE_COMMAND); From bf909caec3902d174aea2ef3f4326e4715a56499 Mon Sep 17 00:00:00 2001 From: Vasyl Vavrychuk Date: Tue, 5 Jul 2022 15:59:31 +0300 Subject: [PATCH 0251/1250] Bluetooth: core: Fix deadlock on hci_power_on_sync. `cancel_work_sync(&hdev->power_on)` was moved to hci_dev_close_sync in commit [1] to ensure that power_on work is canceled after HCI interface down. But, in certain cases power_on work function may call hci_dev_close_sync itself: hci_power_on -> hci_dev_do_close -> hci_dev_close_sync -> cancel_work_sync(&hdev->power_on), causing deadlock. In particular, this happens when device is rfkilled on boot. To avoid deadlock, move power_on work canceling out of hci_dev_do_close/hci_dev_close_sync. Deadlock introduced by commit [1] was reported in [2,3] as broken suspend. Suspend did not work because `hdev->req_lock` held as result of `power_on` work deadlock. In fact, other BT features were not working. 
It was not observed when testing [1] since it was verified without rfkill in place. NOTE: It is not needed to cancel power_on work from other places where hci_dev_do_close/hci_dev_close_sync is called in case: * Requests were serialized due to `hdev->req_workqueue`. The power_on work is first in that workqueue. * hci_rfkill_set_block which won't close device anyway until HCI_SETUP is on. * hci_sock_release which runs after hci_sock_bind which ensures HCI_SETUP was cleared. As a result, behaviour is the same as in pre-dd06ed7 commit, except power_on work cancel added to hci_dev_close. [1]: commit ff7f2926114d ("Bluetooth: core: Fix missing power_on work cancel on HCI close") [2]: https://lore.kernel.org/lkml/20220614181706.26513-1-max.oss.09@gmail.com/ [3]: https://lore.kernel.org/lkml/1236061d-95dd-c3ad-a38f-2dae7aae51ef@o2.pl/ Fixes: ff7f2926114d ("Bluetooth: core: Fix missing power_on work cancel on HCI close") Signed-off-by: Vasyl Vavrychuk Reported-by: Max Krummenacher Reported-by: Mateusz Jonczyk Tested-by: Max Krummenacher Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 3 +++ net/bluetooth/hci_sync.c | 1 - 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 05c13f639b9475..27e90eb4bf4c30 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -571,6 +571,7 @@ int hci_dev_close(__u16 dev) goto done; } + cancel_work_sync(&hdev->power_on); if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); @@ -2677,6 +2678,8 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); + cancel_work_sync(&hdev->power_on); + hci_cmd_sync_clear(hdev); hci_unregister_suspend_notifier(hdev); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e5602e209b637c..7cb31005187992 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4112,7 +4112,6 @@ int
hci_dev_close_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); - cancel_work_sync(&hdev->power_on); cancel_delayed_work(&hdev->power_off); cancel_delayed_work(&hdev->ncmd_timer); From 1c183b4f5038617c0d20d9e6cf24a1ba9d14b0d8 Mon Sep 17 00:00:00 2001 From: Will McVicker Date: Mon, 13 Jun 2022 17:26:11 -0500 Subject: [PATCH 0252/1250] PCI: dwc: Fix MSI msi_msg DMA mapping As of 07940c369a6b ("PCI: dwc: Fix MSI page leakage in suspend/resume"), the PCIe designware host driver has been using the driver data allocation for the msi_msg DMA mapping which can result in a DMA_MAPPING_ERROR due to the DMA overflow check in dma_direct_map_page() when the address is greater than 32 bits (reported in [1]). The commit was trying to address a memory leak on suspend/resume by moving the MSI mapping to dw_pcie_host_init(), but subsequently dropped the page allocation thinking it wasn't needed. To fix the DMA mapping issue as well as make msi_msg DMA'able, switch back to allocating a 32-bit page for the msi_msg. To avoid the suspend/resume leak, allocate the page in dw_pcie_host_init() since that shouldn't be called during suspend/resume. 
[1] https://lore.kernel.org/all/Yo0soniFborDl7+C@google.com/ Signed-off-by: Will McVicker Signed-off-by: Bjorn Helgaas Reviewed-by: Rob Herring --- drivers/pci/controller/dwc/pcie-designware-host.c | 14 ++++++++------ drivers/pci/controller/dwc/pcie-designware.h | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-host.c b/drivers/pci/controller/dwc/pcie-designware-host.c index 1314bb12812815..79ade8b79b6d1e 100644 --- a/drivers/pci/controller/dwc/pcie-designware-host.c +++ b/drivers/pci/controller/dwc/pcie-designware-host.c @@ -267,8 +267,9 @@ static void dw_pcie_free_msi(struct dw_pcie_rp *pp) struct dw_pcie *pci = to_dw_pcie_from_pp(pp); struct device *dev = pci->dev; - dma_unmap_single_attrs(dev, pp->msi_data, sizeof(pp->msi_msg), - DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); + dma_unmap_page(dev, pp->msi_data, PAGE_SIZE, DMA_FROM_DEVICE); + if (pp->msi_page) + __free_page(pp->msi_page); } } @@ -392,13 +393,14 @@ int dw_pcie_host_init(struct dw_pcie_rp *pp) if (ret) dev_warn(dev, "Failed to set DMA mask to 32-bit. 
Devices with only 32-bit MSI support may not work properly\n"); - pp->msi_data = dma_map_single_attrs(dev, &pp->msi_msg, - sizeof(pp->msi_msg), - DMA_FROM_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); + pp->msi_page = alloc_page(GFP_DMA32); + pp->msi_data = dma_map_page(dev, pp->msi_page, 0, + PAGE_SIZE, DMA_FROM_DEVICE); ret = dma_mapping_error(dev, pp->msi_data); if (ret) { dev_err(pci->dev, "Failed to map MSI data\n"); + __free_page(pp->msi_page); + pp->msi_page = NULL; pp->msi_data = 0; goto err_free_msi; } diff --git a/drivers/pci/controller/dwc/pcie-designware.h b/drivers/pci/controller/dwc/pcie-designware.h index 32df3ebccf19f4..258244da5ff174 100644 --- a/drivers/pci/controller/dwc/pcie-designware.h +++ b/drivers/pci/controller/dwc/pcie-designware.h @@ -191,8 +191,8 @@ struct dw_pcie_rp { int msi_irq; struct irq_domain *irq_domain; struct irq_domain *msi_domain; - u16 msi_msg; dma_addr_t msi_data; + struct page *msi_page; struct irq_chip *msi_irq_chip; u32 num_vectors; u32 irq_mask[MAX_MSI_CTRLS]; From 0c9fd52f0f95ba4abd0240b39d0fa54c206820d3 Mon Sep 17 00:00:00 2001 From: Luo Xueqin Date: Tue, 5 Jul 2022 23:27:57 +0800 Subject: [PATCH 0253/1250] fsi: Fix typo in comment Spelling mistake in comment. 
Reported-by: k2ci Signed-off-by: Luo Xueqin Link: https://lore.kernel.org/r/20220705152757.27843-1-luoxueqin66@gmail.com Signed-off-by: Joel Stanley --- drivers/fsi/fsi-master.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/fsi/fsi-master.h b/drivers/fsi/fsi-master.h index cd6bee5e12a712..4762315a46ba36 100644 --- a/drivers/fsi/fsi-master.h +++ b/drivers/fsi/fsi-master.h @@ -51,7 +51,7 @@ #define FSI_MMODE_CRS1SHFT 8 /* Clk rate selection 1 shift */ #define FSI_MMODE_CRS1MASK 0x3ff /* Clk rate selection 1 mask */ -/* MRESB: Reset brindge */ +/* MRESB: Reset bridge */ #define FSI_MRESB_RST_GEN 0x80000000 /* General reset */ #define FSI_MRESB_RST_ERR 0x40000000 /* Error Reset */ From d72bea37e1531afff147a82fe262c2452ea1dd65 Mon Sep 17 00:00:00 2001 From: Eddie James Date: Tue, 26 Apr 2022 10:49:55 -0500 Subject: [PATCH 0254/1250] fsi: occ: Fix checksum failure mode Change the checksum errno to something different than the errno used for a bad SBE message. In addition, don't set the user's response length to the data length in this case, since it's not SBE FFDC. 
Signed-off-by: Eddie James Link: https://lore.kernel.org/r/20220426154956.27205-2-eajames@linux.ibm.com Signed-off-by: Joel Stanley --- drivers/fsi/fsi-occ.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c index c9cc75fbdfb9d4..3d04e8baecbbd6 100644 --- a/drivers/fsi/fsi-occ.c +++ b/drivers/fsi/fsi-occ.c @@ -246,7 +246,7 @@ static int occ_verify_checksum(struct occ *occ, struct occ_response *resp, if (checksum != checksum_resp) { dev_err(occ->dev, "Bad checksum: %04x!=%04x\n", checksum, checksum_resp); - return -EBADMSG; + return -EBADE; } return 0; @@ -575,8 +575,11 @@ int fsi_occ_submit(struct device *dev, const void *request, size_t req_len, dev_dbg(dev, "resp_status=%02x resp_data_len=%d\n", resp->return_status, resp_data_length); - occ->client_response_size = resp_data_length + 7; rc = occ_verify_checksum(occ, resp, resp_data_length); + if (rc) + goto done; + + occ->client_response_size = resp_data_length + 7; done: *resp_len = occ->client_response_size; From c27b98ca0edb0656134133bf84918e7d520c6b4d Mon Sep 17 00:00:00 2001 From: Eddie James Date: Tue, 26 Apr 2022 10:49:56 -0500 Subject: [PATCH 0255/1250] hwmon (occ): Retry for checksum failure Due to the OCC communication design with a shared SRAM area, checksum errors are expected due to corrupted buffer from OCC communications with other system components. Therefore, retry the command twice in the event of a checksum failure.
Signed-off-by: Eddie James Acked-by: Guenter Roeck Link: https://lore.kernel.org/r/20220426154956.27205-3-eajames@linux.ibm.com Signed-off-by: Joel Stanley --- drivers/hwmon/occ/p9_sbe.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/hwmon/occ/p9_sbe.c b/drivers/hwmon/occ/p9_sbe.c index a91937e28e12bf..7d4702c845034e 100644 --- a/drivers/hwmon/occ/p9_sbe.c +++ b/drivers/hwmon/occ/p9_sbe.c @@ -14,6 +14,8 @@ #include "common.h" +#define OCC_CHECKSUM_RETRIES 3 + struct p9_sbe_occ { struct occ occ; bool sbe_error; @@ -82,17 +84,20 @@ static int p9_sbe_occ_send_cmd(struct occ *occ, u8 *cmd, size_t len, void *resp, size_t resp_len) { struct p9_sbe_occ *ctx = to_p9_sbe_occ(occ); - int rc; + int rc, i; - rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len); - if (rc < 0) { + for (i = 0; i < OCC_CHECKSUM_RETRIES; ++i) { + rc = fsi_occ_submit(ctx->sbe, cmd, len, resp, &resp_len); + if (rc >= 0) + break; if (resp_len) { if (p9_sbe_occ_save_ffdc(ctx, resp, resp_len)) sysfs_notify(&occ->bus_dev->kobj, NULL, bin_attr_ffdc.attr.name); + return rc; } - - return rc; + if (rc != -EBADE) + return rc; } switch (((struct occ_response *)resp)->return_status) { From e7a66dc2125e54dbc9d9f084b45829368b0f0335 Mon Sep 17 00:00:00 2001 From: Eddie James Date: Fri, 13 May 2022 14:44:24 -0500 Subject: [PATCH 0256/1250] fsi: occ: Prevent use after free Use get_device and put_device in the open and close functions to make sure the device doesn't get freed while a file descriptor is open. Also, lock around the freeing of the device buffer and check the buffer before using it in the submit function. 
Signed-off-by: Eddie James Reviewed-by: Guenter Roeck Link: https://lore.kernel.org/r/20220513194424.53468-1-eajames@linux.ibm.com Signed-off-by: Joel Stanley --- drivers/fsi/fsi-occ.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/fsi/fsi-occ.c b/drivers/fsi/fsi-occ.c index 3d04e8baecbbd6..8f7f602b909da1 100644 --- a/drivers/fsi/fsi-occ.c +++ b/drivers/fsi/fsi-occ.c @@ -94,6 +94,7 @@ static int occ_open(struct inode *inode, struct file *file) client->occ = occ; mutex_init(&client->lock); file->private_data = client; + get_device(occ->dev); /* We allocate a 1-page buffer, make sure it all fits */ BUILD_BUG_ON((OCC_CMD_DATA_BYTES + 3) > PAGE_SIZE); @@ -197,6 +198,7 @@ static int occ_release(struct inode *inode, struct file *file) { struct occ_client *client = file->private_data; + put_device(client->occ->dev); free_page((unsigned long)client->buffer); kfree(client); @@ -493,12 +495,19 @@ int fsi_occ_submit(struct device *dev, const void *request, size_t req_len, for (i = 1; i < req_len - 2; ++i) checksum += byte_request[i]; - mutex_lock(&occ->occ_lock); + rc = mutex_lock_interruptible(&occ->occ_lock); + if (rc) + return rc; occ->client_buffer = response; occ->client_buffer_size = user_resp_len; occ->client_response_size = 0; + if (!occ->buffer) { + rc = -ENOENT; + goto done; + } + /* * Get a sequence number and update the counter. 
Avoid a sequence * number of 0 which would pass the response check below even if the @@ -674,10 +683,13 @@ static int occ_remove(struct platform_device *pdev) { struct occ *occ = platform_get_drvdata(pdev); - kvfree(occ->buffer); - misc_deregister(&occ->mdev); + mutex_lock(&occ->occ_lock); + kvfree(occ->buffer); + occ->buffer = NULL; + mutex_unlock(&occ->occ_lock); + device_for_each_child(&pdev->dev, NULL, occ_unregister_child); ida_simple_remove(&occ_ida, occ->idx); From e1b9895b42b095bf174cc5bb5f8077c9b4582cb3 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Sun, 3 Apr 2022 10:09:37 -0400 Subject: [PATCH 0257/1250] fsi: cleanup extern usage in function definition Smatch reports these issues fsi-core.c:395:12: warning: function 'fsi_slave_claim_range' with external linkage has definition fsi-core.c:409:13: warning: function 'fsi_slave_release_range' with external linkage has definition The storage-class-specifier extern is not needed in a definition, so remove it. Signed-off-by: Tom Rix Link: https://lore.kernel.org/r/20220403140937.3833578-1-trix@redhat.com Signed-off-by: Joel Stanley --- drivers/fsi/fsi-core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/fsi/fsi-core.c b/drivers/fsi/fsi-core.c index 3a7b78e3670118..0d11a17c53a0d2 100644 --- a/drivers/fsi/fsi-core.c +++ b/drivers/fsi/fsi-core.c @@ -392,8 +392,8 @@ int fsi_slave_write(struct fsi_slave *slave, uint32_t addr, } EXPORT_SYMBOL_GPL(fsi_slave_write); -extern int fsi_slave_claim_range(struct fsi_slave *slave, - uint32_t addr, uint32_t size) +int fsi_slave_claim_range(struct fsi_slave *slave, + uint32_t addr, uint32_t size) { if (addr + size < addr) return -EINVAL; @@ -406,8 +406,8 @@ extern int fsi_slave_claim_range(struct fsi_slave *slave, } EXPORT_SYMBOL_GPL(fsi_slave_claim_range); -extern void fsi_slave_release_range(struct fsi_slave *slave, - uint32_t addr, uint32_t size) +void fsi_slave_release_range(struct fsi_slave *slave, + uint32_t addr, uint32_t size) { } 
EXPORT_SYMBOL_GPL(fsi_slave_release_range); From becdb3b8e4f925eee333c4ba6f495c3fca463f77 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Fri, 15 Apr 2022 14:37:57 +0930 Subject: [PATCH 0258/1250] fsi: sbefifo: Add detailed debugging information Provide more output on the timeout status, and make some vdbg calls into dbg calls so they can be enabled at runtime. Signed-off-by: Joel Stanley Link: https://lore.kernel.org/r/20220415050757.281158-1-joel@jms.id.au Signed-off-by: Joel Stanley --- drivers/fsi/fsi-sbefifo.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/fsi/fsi-sbefifo.c b/drivers/fsi/fsi-sbefifo.c index f52a912cdf16bc..5f93a53846aad7 100644 --- a/drivers/fsi/fsi-sbefifo.c +++ b/drivers/fsi/fsi-sbefifo.c @@ -477,7 +477,8 @@ static int sbefifo_wait(struct sbefifo *sbefifo, bool up, if (!ready) { sysfs_notify(&sbefifo->dev.kobj, NULL, dev_attr_timeout.attr.name); sbefifo->timed_out = true; - dev_err(dev, "%s FIFO Timeout ! status=%08x\n", up ? "UP" : "DOWN", sts); + dev_err(dev, "%s FIFO Timeout (%u ms)! status=%08x\n", + up ? 
"UP" : "DOWN", jiffies_to_msecs(timeout), sts); return -ETIMEDOUT; } dev_vdbg(dev, "End of wait status: %08x\n", sts); @@ -497,8 +498,8 @@ static int sbefifo_send_command(struct sbefifo *sbefifo, u32 status; int rc; - dev_vdbg(dev, "sending command (%zd words, cmd=%04x)\n", - cmd_len, be32_to_cpu(command[1])); + dev_dbg(dev, "sending command (%zd words, cmd=%04x)\n", + cmd_len, be32_to_cpu(command[1])); /* As long as there's something to send */ timeout = msecs_to_jiffies(SBEFIFO_TIMEOUT_START_CMD); @@ -551,21 +552,23 @@ static int sbefifo_read_response(struct sbefifo *sbefifo, struct iov_iter *respo size_t len; int rc; - dev_vdbg(dev, "reading response, buflen = %zd\n", iov_iter_count(response)); + dev_dbg(dev, "reading response, buflen = %zd\n", iov_iter_count(response)); timeout = msecs_to_jiffies(sbefifo->timeout_start_rsp_ms); for (;;) { /* Grab FIFO status (this will handle parity errors) */ rc = sbefifo_wait(sbefifo, false, &status, timeout); - if (rc < 0) + if (rc < 0) { + dev_dbg(dev, "timeout waiting (%u ms)\n", jiffies_to_msecs(timeout)); return rc; + } timeout = msecs_to_jiffies(SBEFIFO_TIMEOUT_IN_RSP); /* Decode status */ len = sbefifo_populated(status); eot_set = sbefifo_eot_set(status); - dev_vdbg(dev, " chunk size %zd eot_set=0x%x\n", len, eot_set); + dev_dbg(dev, " chunk size %zd eot_set=0x%x\n", len, eot_set); /* Go through the chunk */ while(len--) { From 04823dd2af87341ddf58e9903874e8db6e212670 Mon Sep 17 00:00:00 2001 From: Lv Ruyi Date: Thu, 7 Apr 2022 08:59:11 +0000 Subject: [PATCH 0259/1250] fsi: master-ast-cf: Fix missing of_node_put in fsi_master_acf_probe of_parse_phandle returns node pointer with refcount incremented, use of_node_put() on it when done. 
Reported-by: Zeal Robot Signed-off-by: Lv Ruyi Link: https://lore.kernel.org/r/20220407085911.2491719-1-lv.ruyi@zte.com.cn Signed-off-by: Joel Stanley --- drivers/fsi/fsi-master-ast-cf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/fsi/fsi-master-ast-cf.c b/drivers/fsi/fsi-master-ast-cf.c index 24292acdbaf84d..5f608ef8b53cad 100644 --- a/drivers/fsi/fsi-master-ast-cf.c +++ b/drivers/fsi/fsi-master-ast-cf.c @@ -1324,12 +1324,14 @@ static int fsi_master_acf_probe(struct platform_device *pdev) } master->cvic = devm_of_iomap(&pdev->dev, np, 0, NULL); if (IS_ERR(master->cvic)) { + of_node_put(np); rc = PTR_ERR(master->cvic); dev_err(&pdev->dev, "Error %d mapping CVIC\n", rc); goto err_free; } rc = of_property_read_u32(np, "copro-sw-interrupts", &master->cvic_sw_irq); + of_node_put(np); if (rc) { dev_err(&pdev->dev, "Can't find coprocessor SW interrupt\n"); goto err_free; From 0026f88ae840b8542af22c6951bd7dfb0c526a38 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 16 Jun 2022 22:25:35 +0200 Subject: [PATCH 0260/1250] thermal/drivers/qcom: Remove get_trend function There is a get_trend function which is a wrapper to call a private get_trend function. However, this private get_trend function is not assigned anywhere. Remove this dead code. 
Signed-off-by: Daniel Lezcano Acked-by: Amit Kucheria Link: https://lore.kernel.org/r/20220616202537.303655-1-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano --- drivers/thermal/qcom/tsens.c | 12 ------------ drivers/thermal/qcom/tsens.h | 2 -- 2 files changed, 14 deletions(-) diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c index 7963ee33bf75b7..e49f58e8351375 100644 --- a/drivers/thermal/qcom/tsens.c +++ b/drivers/thermal/qcom/tsens.c @@ -933,17 +933,6 @@ static int tsens_get_temp(void *data, int *temp) return priv->ops->get_temp(s, temp); } -static int tsens_get_trend(void *data, int trip, enum thermal_trend *trend) -{ - struct tsens_sensor *s = data; - struct tsens_priv *priv = s->priv; - - if (priv->ops->get_trend) - return priv->ops->get_trend(s, trend); - - return -ENOTSUPP; -} - static int __maybe_unused tsens_suspend(struct device *dev) { struct tsens_priv *priv = dev_get_drvdata(dev); @@ -1004,7 +993,6 @@ MODULE_DEVICE_TABLE(of, tsens_table); static const struct thermal_zone_of_device_ops tsens_of_ops = { .get_temp = tsens_get_temp, - .get_trend = tsens_get_trend, .set_trips = tsens_set_trips, }; diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h index 1471a2c00f1584..ba05c82333565b 100644 --- a/drivers/thermal/qcom/tsens.h +++ b/drivers/thermal/qcom/tsens.h @@ -65,7 +65,6 @@ struct tsens_sensor { * @disable: Function to disable the tsens device * @suspend: Function to suspend the tsens device * @resume: Function to resume the tsens device - * @get_trend: Function to get the thermal/temp trend */ struct tsens_ops { /* mandatory callbacks */ @@ -77,7 +76,6 @@ struct tsens_ops { void (*disable)(struct tsens_priv *priv); int (*suspend)(struct tsens_priv *priv); int (*resume)(struct tsens_priv *priv); - int (*get_trend)(struct tsens_sensor *s, enum thermal_trend *trend); }; #define REG_FIELD_FOR_EACH_SENSOR11(_name, _offset, _startbit, _stopbit) \ From 9307ea4ae7c50d4b1a1e768ffb1a5bf3dcea9347 Mon Sep 17 
00:00:00 2001 From: Daniel Lezcano Date: Thu, 16 Jun 2022 22:25:36 +0200 Subject: [PATCH 0261/1250] thermal/drivers/tegra: Remove get_trend function The get_trend function does already what the generic framework does. Remove it. Signed-off-by: Daniel Lezcano Tested-by: Dmitry Osipenko Link: https://lore.kernel.org/r/20220616202537.303655-2-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano --- drivers/thermal/tegra/soctherm.c | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c index 210325f92559e3..825eab52661967 100644 --- a/drivers/thermal/tegra/soctherm.c +++ b/drivers/thermal/tegra/soctherm.c @@ -633,37 +633,6 @@ static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp) return 0; } -static int tegra_thermctl_get_trend(void *data, int trip, - enum thermal_trend *trend) -{ - struct tegra_thermctl_zone *zone = data; - struct thermal_zone_device *tz = zone->tz; - int trip_temp, temp, last_temp, ret; - - if (!tz) - return -EINVAL; - - ret = tz->ops->get_trip_temp(zone->tz, trip, &trip_temp); - if (ret) - return ret; - - temp = READ_ONCE(tz->temperature); - last_temp = READ_ONCE(tz->last_temperature); - - if (temp > trip_temp) { - if (temp >= last_temp) - *trend = THERMAL_TREND_RAISING; - else - *trend = THERMAL_TREND_STABLE; - } else if (temp < trip_temp) { - *trend = THERMAL_TREND_DROPPING; - } else { - *trend = THERMAL_TREND_STABLE; - } - - return 0; -} - static void thermal_irq_enable(struct tegra_thermctl_zone *zn) { u32 r; @@ -716,7 +685,6 @@ static int tegra_thermctl_set_trips(void *data, int lo, int hi) static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = { .get_temp = tegra_thermctl_get_temp, .set_trip_temp = tegra_thermctl_set_trip_temp, - .get_trend = tegra_thermctl_get_trend, .set_trips = tegra_thermctl_set_trips, }; From 0e1f2573e9f8b6f7cd270a5cb50f11ab02feef06 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 16 
Jun 2022 22:25:37 +0200 Subject: [PATCH 0262/1250] thermal/drivers/u8500: Remove the get_trend function The get_trend function relies on the interrupt to set the raising or dropping trend. However the interpolated temperature is already giving the temperature information to the thermal framework which is able to deduce the trend. Remove the trend code. Signed-off-by: Daniel Lezcano Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20220616202537.303655-3-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano --- drivers/thermal/db8500_thermal.c | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c index 21d4d6e6409a97..ed40cfd9ab7deb 100644 --- a/drivers/thermal/db8500_thermal.c +++ b/drivers/thermal/db8500_thermal.c @@ -53,7 +53,6 @@ static const unsigned long db8500_thermal_points[] = { struct db8500_thermal_zone { struct thermal_zone_device *tz; - enum thermal_trend trend; unsigned long interpolated_temp; unsigned int cur_index; }; @@ -73,24 +72,12 @@ static int db8500_thermal_get_temp(void *data, int *temp) return 0; } -/* Callback to get temperature changing trend */ -static int db8500_thermal_get_trend(void *data, int trip, enum thermal_trend *trend) -{ - struct db8500_thermal_zone *th = data; - - *trend = th->trend; - - return 0; -} - static struct thermal_zone_of_device_ops thdev_ops = { .get_temp = db8500_thermal_get_temp, - .get_trend = db8500_thermal_get_trend, }; static void db8500_thermal_update_config(struct db8500_thermal_zone *th, unsigned int idx, - enum thermal_trend trend, unsigned long next_low, unsigned long next_high) { @@ -98,7 +85,6 @@ static void db8500_thermal_update_config(struct db8500_thermal_zone *th, th->cur_index = idx; th->interpolated_temp = (next_low + next_high)/2; - th->trend = trend; /* * The PRCMU accept absolute temperatures in celsius so divide @@ -127,8 +113,7 @@ static irqreturn_t prcmu_low_irq_handler(int 
irq, void *irq_data) } idx -= 1; - db8500_thermal_update_config(th, idx, THERMAL_TREND_DROPPING, - next_low, next_high); + db8500_thermal_update_config(th, idx, next_low, next_high); dev_dbg(&th->tz->device, "PRCMU set max %ld, min %ld\n", next_high, next_low); @@ -149,8 +134,7 @@ static irqreturn_t prcmu_high_irq_handler(int irq, void *irq_data) next_low = db8500_thermal_points[idx]; idx += 1; - db8500_thermal_update_config(th, idx, THERMAL_TREND_RAISING, - next_low, next_high); + db8500_thermal_update_config(th, idx, next_low, next_high); dev_dbg(&th->tz->device, "PRCMU set max %ld, min %ld\n", next_high, next_low); @@ -210,8 +194,7 @@ static int db8500_thermal_probe(struct platform_device *pdev) dev_info(dev, "thermal zone sensor registered\n"); /* Start measuring at the lowest point */ - db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE, - PRCMU_DEFAULT_LOW_TEMP, + db8500_thermal_update_config(th, 0, PRCMU_DEFAULT_LOW_TEMP, db8500_thermal_points[0]); platform_set_drvdata(pdev, th); @@ -232,8 +215,7 @@ static int db8500_thermal_resume(struct platform_device *pdev) struct db8500_thermal_zone *th = platform_get_drvdata(pdev); /* Resume and start measuring at the lowest point */ - db8500_thermal_update_config(th, 0, THERMAL_TREND_STABLE, - PRCMU_DEFAULT_LOW_TEMP, + db8500_thermal_update_config(th, 0, PRCMU_DEFAULT_LOW_TEMP, db8500_thermal_points[0]); return 0; From 3cdacdb345d8f32decfc4a76340c8c13ca682b6d Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 29 Jun 2022 17:10:11 +0200 Subject: [PATCH 0263/1250] thermal/core: Use clamp() helper in the stepwise governor The code is actually clamping the next cooling device state using the lowest and highest states of the thermal instance. That code can be replaced by the clamp() macro which does exactly the same. It results in a simpler routine to read.
Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220629151012.3115773-1-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano --- drivers/thermal/gov_step_wise.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c index 12acb12aac50d7..6efbfaf014dafc 100644 --- a/drivers/thermal/gov_step_wise.c +++ b/drivers/thermal/gov_step_wise.c @@ -11,6 +11,7 @@ */ #include +#include #include #include "thermal_core.h" @@ -52,10 +53,7 @@ static unsigned long get_target_state(struct thermal_instance *instance, if (!instance->initialized) { if (throttle) { - next_target = (cur_state + 1) >= instance->upper ? - instance->upper : - ((cur_state + 1) < instance->lower ? - instance->lower : (cur_state + 1)); + next_target = clamp((cur_state + 1), instance->lower, instance->upper); } else { next_target = THERMAL_NO_TARGET; } @@ -66,10 +64,7 @@ static unsigned long get_target_state(struct thermal_instance *instance, switch (trend) { case THERMAL_TREND_RAISING: if (throttle) { - next_target = cur_state < instance->upper ? - (cur_state + 1) : instance->upper; - if (next_target < instance->lower) - next_target = instance->lower; + next_target = clamp((cur_state + 1), instance->lower, instance->upper); } break; case THERMAL_TREND_RAISE_FULL: @@ -82,9 +77,7 @@ static unsigned long get_target_state(struct thermal_instance *instance, next_target = THERMAL_NO_TARGET; } else { if (!throttle) { - next_target = cur_state - 1; - if (next_target > instance->upper) - next_target = instance->upper; + next_target = clamp((cur_state - 1), instance->lower, instance->upper); } } break; From 5665ce4c60d0eecc37b953637659a5384aec77bd Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 29 Jun 2022 17:10:12 +0200 Subject: [PATCH 0264/1250] thermal/core: Remove DROP_FULL and RAISE_FULL The trends DROP_FULL and RAISE_FULL are not used and were never used in the past AFAICT. 
Remove these conditions as they seem not to be handled anywhere. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220629151012.3115773-2-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano --- drivers/thermal/gov_step_wise.c | 11 ----------- include/linux/thermal.h | 2 -- 2 files changed, 13 deletions(-) diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c index 6efbfaf014dafc..9729b46d0258aa 100644 --- a/drivers/thermal/gov_step_wise.c +++ b/drivers/thermal/gov_step_wise.c @@ -67,10 +67,6 @@ static unsigned long get_target_state(struct thermal_instance *instance, next_target = clamp((cur_state + 1), instance->lower, instance->upper); } break; - case THERMAL_TREND_RAISE_FULL: - if (throttle) - next_target = instance->upper; - break; case THERMAL_TREND_DROPPING: if (cur_state <= instance->lower) { if (!throttle) @@ -81,13 +77,6 @@ static unsigned long get_target_state(struct thermal_instance *instance, } } break; - case THERMAL_TREND_DROP_FULL: - if (cur_state == instance->lower) { - if (!throttle) - next_target = THERMAL_NO_TARGET; - } else - next_target = instance->lower; - break; default: break; } diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 365733b428d8f2..231bac2768fb79 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -40,8 +40,6 @@ enum thermal_trend { THERMAL_TREND_STABLE, /* temperature is stable */ THERMAL_TREND_RAISING, /* temperature is raising */ THERMAL_TREND_DROPPING, /* temperature is dropping */ - THERMAL_TREND_RAISE_FULL, /* apply highest cooling action */ - THERMAL_TREND_DROP_FULL, /* apply lowest cooling action */ }; /* Thermal notification reason */ From 43a20e93310ec218550dcdda0ae79dcd91dbd880 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 6 Jul 2022 21:31:45 +0800 Subject: [PATCH 0265/1250] rnbd-clt: open code send_msg_open in rnbd_clt_map_device Let's open code it in rnbd_clt_map_device, then we can use information from rsp to setup gendisk and
request_queue in next commits. After that, we can remove some members (wc, fua and max_hw_sectors etc) from struct rnbd_clt_dev. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220706133152.12058-2-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 43 +++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 409c76b81aed46..9e9aeba86d337e 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1562,7 +1562,14 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, { struct rnbd_clt_session *sess; struct rnbd_clt_dev *dev; - int ret; + int ret, errno; + struct rnbd_msg_open_rsp *rsp; + struct rnbd_msg_open msg; + struct rnbd_iu *iu; + struct kvec vec = { + .iov_base = &msg, + .iov_len = sizeof(msg) + }; if (exists_devpath(pathname, sessname)) return ERR_PTR(-EEXIST); @@ -1582,7 +1589,39 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, ret = -EEXIST; goto put_dev; } - ret = send_msg_open(dev, RTRS_PERMIT_WAIT); + + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); + if (!rsp) { + ret = -ENOMEM; + goto del_dev; + } + + iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT); + if (!iu) { + ret = -ENOMEM; + kfree(rsp); + goto del_dev; + } + iu->buf = rsp; + iu->dev = dev; + sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp)); + + msg.hdr.type = cpu_to_le16(RNBD_MSG_OPEN); + msg.access_mode = dev->access_mode; + strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name)); + + WARN_ON(!rnbd_clt_get_dev(dev)); + ret = send_usr_msg(sess->rtrs, READ, iu, + &vec, sizeof(*rsp), iu->sgt.sgl, 1, + msg_open_conf, &errno, RTRS_PERMIT_WAIT); + if (ret) { + rnbd_clt_put_dev(dev); + rnbd_put_iu(sess, iu); + kfree(rsp); + } else { + ret = errno; + } + rnbd_put_iu(sess, iu); if (ret) { rnbd_clt_err(dev, "map_device: failed, can't open remote device, err: %d\n", From 
953d0c1b1d29c75221ae2d14a0f937f2e0f90592 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 6 Jul 2022 21:31:46 +0800 Subject: [PATCH 0266/1250] rnbd-clt: don't free rsp in msg_open_conf for map scenario For map scenario, rsp is freed in two places: 1. msg_open_conf frees rsp if rtrs_clt_request returns 0. 2. Otherwise, rsp is freed by the call sites of rtrs_clt_request. Now, We'd like to control full lifecycle of rsp in rnbd_clt_map_device, with that, it is feasible to pass rsp to rnbd_client_setup_device in next commit. For 1, it is possible to free rsp from the caller of send_usr_msg because of the synchronization of iu->comp.wait. And we put iu later in rnbd_clt_map_device to ensure order of release rsp and iu. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220706133152.12058-3-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 9e9aeba86d337e..ef3e561faf61b5 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -507,6 +507,11 @@ static void msg_open_conf(struct work_struct *work) struct rnbd_msg_open_rsp *rsp = iu->buf; struct rnbd_clt_dev *dev = iu->dev; int errno = iu->errno; + bool from_map = false; + + /* INIT state is only triggered from rnbd_clt_map_device */ + if (dev->dev_state == DEV_STATE_INIT) + from_map = true; if (errno) { rnbd_clt_err(dev, @@ -523,7 +528,9 @@ static void msg_open_conf(struct work_struct *work) send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT); } } - kfree(rsp); + /* We free rsp in rnbd_clt_map_device for map scenario */ + if (!from_map) + kfree(rsp); wake_up_iu_comp(iu, errno); rnbd_put_iu(dev->sess, iu); rnbd_clt_put_dev(dev); @@ -1617,16 +1624,14 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, if (ret) { rnbd_clt_put_dev(dev); rnbd_put_iu(sess, iu); - 
kfree(rsp); } else { ret = errno; } - rnbd_put_iu(sess, iu); if (ret) { rnbd_clt_err(dev, "map_device: failed, can't open remote device, err: %d\n", ret); - goto del_dev; + goto put_iu; } mutex_lock(&dev->lock); pr_debug("Opened remote device: session=%s, path='%s'\n", @@ -1650,12 +1655,17 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, dev->max_hw_sectors, dev->wc, dev->fua); mutex_unlock(&dev->lock); + kfree(rsp); + rnbd_put_iu(sess, iu); rnbd_clt_put_sess(sess); return dev; send_close: send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT); +put_iu: + kfree(rsp); + rnbd_put_iu(sess, iu); del_dev: delete_dev(dev); put_dev: From e8d5be284d3089aa5c3d957e53f71d2eca72b574 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 6 Jul 2022 21:31:47 +0800 Subject: [PATCH 0267/1250] rnbd-clt: kill read_only from struct rnbd_clt_dev The member is not needed since we can call get_disk_ro to achieve the same goal. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220706133152.12058-4-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 8 ++------ drivers/block/rnbd/rnbd-clt.h | 1 - 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index ef3e561faf61b5..0e93e529dd824d 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -949,7 +949,7 @@ static int rnbd_client_open(struct block_device *block_device, fmode_t mode) { struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; - if (dev->read_only && (mode & FMODE_WRITE)) + if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE)) return -EPERM; if (dev->dev_state == DEV_STATE_UNMAPPED || @@ -1402,12 +1402,8 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) set_capacity(dev->gd, dev->nsectors); - if (dev->access_mode == RNBD_ACCESS_RO) { - dev->read_only = true; + if (dev->access_mode == RNBD_ACCESS_RO) set_disk_ro(dev->gd, 
true); - } else { - dev->read_only = false; - } /* * Network device does not need rotational diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index 2e2e8c4a85c170..26fb91d800e31e 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -117,7 +117,6 @@ struct rnbd_clt_dev { char *pathname; enum rnbd_access_mode access_mode; u32 nr_poll_queues; - bool read_only; bool wc; bool fua; u32 max_hw_sectors; From 7e6c34c6ca2282ea1a9c3a2d06db3e3578f272dd Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 6 Jul 2022 21:31:48 +0800 Subject: [PATCH 0268/1250] rnbd-clt: reduce the size of struct rnbd_clt_dev Previously, both map and remap trigger rnbd_clt_set_dev_attr to set some members in rnbd_clt_dev such as wc, fua and logical_block_size etc, but those members are only useful for map scenario given the setup_request_queue is only called from the path: rnbd_clt_map_device -> rnbd_client_setup_device Since rnbd_clt_map_device frees rsp after rnbd_client_setup_device, we can pass rsp to rnbd_client_setup_device and it's callees, which means queue's attributes can be set directly from relevant members of rsp instead from rnbd_clt_dev. After that, we can kill 11 members from rnbd_clt_dev, and we don't need rnbd_clt_set_dev_attr either. 
Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220706133152.12058-5-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 118 ++++++++++++++++------------------ drivers/block/rnbd/rnbd-clt.h | 11 ---- 2 files changed, 55 insertions(+), 74 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 0e93e529dd824d..da2ba9477b1e57 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -68,38 +68,12 @@ static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) return refcount_inc_not_zero(&dev->refcount); } -static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev, - const struct rnbd_msg_open_rsp *rsp) -{ - struct rnbd_clt_session *sess = dev->sess; - - if (!rsp->logical_block_size) - return -EINVAL; - - dev->device_id = le32_to_cpu(rsp->device_id); - dev->nsectors = le64_to_cpu(rsp->nsectors); - dev->logical_block_size = le16_to_cpu(rsp->logical_block_size); - dev->physical_block_size = le16_to_cpu(rsp->physical_block_size); - dev->max_discard_sectors = le32_to_cpu(rsp->max_discard_sectors); - dev->discard_granularity = le32_to_cpu(rsp->discard_granularity); - dev->discard_alignment = le32_to_cpu(rsp->discard_alignment); - dev->secure_discard = le16_to_cpu(rsp->secure_discard); - dev->wc = !!(rsp->cache_policy & RNBD_WRITEBACK); - dev->fua = !!(rsp->cache_policy & RNBD_FUA); - - dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE; - dev->max_segments = sess->max_segments; - - return 0; -} - static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, size_t new_nsectors) { - rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n", - dev->nsectors, new_nsectors); - dev->nsectors = new_nsectors; - set_capacity_and_notify(dev->gd, dev->nsectors); + rnbd_clt_info(dev, "Device size changed from %llu to %zu sectors\n", + get_capacity(dev->gd), new_nsectors); + set_capacity_and_notify(dev->gd, new_nsectors); return 0; } @@ 
-123,15 +97,17 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev, * If the device was remapped and the size changed in the * meantime we need to revalidate it */ - if (dev->nsectors != nsectors) + if (get_capacity(dev->gd) != nsectors) rnbd_clt_change_capacity(dev, nsectors); gd_kobj = &disk_to_dev(dev->gd)->kobj; kobject_uevent(gd_kobj, KOBJ_ONLINE); rnbd_clt_info(dev, "Device online, device remapped successfully\n"); } - err = rnbd_clt_set_dev_attr(dev, rsp); - if (err) + if (!rsp->logical_block_size) { + err = -EINVAL; goto out; + } + dev->device_id = le32_to_cpu(rsp->device_id); dev->dev_state = DEV_STATE_MAPPED; out: @@ -970,10 +946,10 @@ static int rnbd_client_getgeo(struct block_device *block_device, struct hd_geometry *geo) { u64 size; - struct rnbd_clt_dev *dev; + struct rnbd_clt_dev *dev = block_device->bd_disk->private_data; + struct queue_limits *limit = &dev->queue->limits; - dev = block_device->bd_disk->private_data; - size = dev->size * (dev->logical_block_size / SECTOR_SIZE); + size = dev->size * (limit->logical_block_size / SECTOR_SIZE); geo->cylinders = size >> 6; /* size/64 */ geo->heads = 4; geo->sectors = 16; @@ -1357,11 +1333,15 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev) } } -static void setup_request_queue(struct rnbd_clt_dev *dev) +static void setup_request_queue(struct rnbd_clt_dev *dev, + struct rnbd_msg_open_rsp *rsp) { - blk_queue_logical_block_size(dev->queue, dev->logical_block_size); - blk_queue_physical_block_size(dev->queue, dev->physical_block_size); - blk_queue_max_hw_sectors(dev->queue, dev->max_hw_sectors); + blk_queue_logical_block_size(dev->queue, + le16_to_cpu(rsp->logical_block_size)); + blk_queue_physical_block_size(dev->queue, + le16_to_cpu(rsp->physical_block_size)); + blk_queue_max_hw_sectors(dev->queue, + dev->sess->max_io_size / SECTOR_SIZE); /* * we don't support discards to "discontiguous" segments @@ -1369,21 +1349,27 @@ static void setup_request_queue(struct rnbd_clt_dev *dev) */ 
blk_queue_max_discard_segments(dev->queue, 1); - blk_queue_max_discard_sectors(dev->queue, dev->max_discard_sectors); - dev->queue->limits.discard_granularity = dev->discard_granularity; - dev->queue->limits.discard_alignment = dev->discard_alignment; - if (dev->secure_discard) + blk_queue_max_discard_sectors(dev->queue, + le32_to_cpu(rsp->max_discard_sectors)); + dev->queue->limits.discard_granularity = + le32_to_cpu(rsp->discard_granularity); + dev->queue->limits.discard_alignment = + le32_to_cpu(rsp->discard_alignment); + if (le16_to_cpu(rsp->secure_discard)) blk_queue_max_secure_erase_sectors(dev->queue, - dev->max_discard_sectors); + le32_to_cpu(rsp->max_discard_sectors)); blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); - blk_queue_max_segments(dev->queue, dev->max_segments); + blk_queue_max_segments(dev->queue, dev->sess->max_segments); blk_queue_io_opt(dev->queue, dev->sess->max_io_size); blk_queue_virt_boundary(dev->queue, SZ_4K - 1); - blk_queue_write_cache(dev->queue, dev->wc, dev->fua); + blk_queue_write_cache(dev->queue, + !!(rsp->cache_policy & RNBD_WRITEBACK), + !!(rsp->cache_policy & RNBD_FUA)); } -static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) +static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, + struct rnbd_msg_open_rsp *rsp, int idx) { int err; @@ -1395,12 +1381,12 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) dev->gd->private_data = dev; snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d", idx); - pr_debug("disk_name=%s, capacity=%zu\n", + pr_debug("disk_name=%s, capacity=%llu\n", dev->gd->disk_name, - dev->nsectors * (dev->logical_block_size / SECTOR_SIZE) - ); + le64_to_cpu(rsp->nsectors) * + (le16_to_cpu(rsp->logical_block_size) / SECTOR_SIZE)); - set_capacity(dev->gd, dev->nsectors); + set_capacity(dev->gd, le64_to_cpu(rsp->nsectors)); if (dev->access_mode == RNBD_ACCESS_RO) set_disk_ro(dev->gd, true); @@ 
-1416,11 +1402,13 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) return err; } -static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) +static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, + struct rnbd_msg_open_rsp *rsp) { int idx = dev->clt_device_id; - dev->size = dev->nsectors * dev->logical_block_size; + dev->size = le64_to_cpu(rsp->nsectors) * + le16_to_cpu(rsp->logical_block_size); dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev); if (IS_ERR(dev->gd)) @@ -1428,8 +1416,8 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev) dev->queue = dev->gd->queue; rnbd_init_mq_hw_queues(dev); - setup_request_queue(dev); - return rnbd_clt_setup_gen_disk(dev, idx); + setup_request_queue(dev, rsp); + return rnbd_clt_setup_gen_disk(dev, rsp, idx); } static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess, @@ -1632,7 +1620,7 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, mutex_lock(&dev->lock); pr_debug("Opened remote device: session=%s, path='%s'\n", sess->sessname, pathname); - ret = rnbd_client_setup_device(dev); + ret = rnbd_client_setup_device(dev, rsp); if (ret) { rnbd_clt_err(dev, "map_device: Failed to configure device, err: %d\n", @@ -1642,13 +1630,17 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname, } rnbd_clt_info(dev, - "map_device: Device mapped as %s (nsectors: %zu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n", - dev->gd->disk_name, dev->nsectors, - dev->logical_block_size, dev->physical_block_size, - dev->max_discard_sectors, - dev->discard_granularity, dev->discard_alignment, - dev->secure_discard, dev->max_segments, - dev->max_hw_sectors, dev->wc, dev->fua); + "map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: 
%d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n", + dev->gd->disk_name, le64_to_cpu(rsp->nsectors), + le16_to_cpu(rsp->logical_block_size), + le16_to_cpu(rsp->physical_block_size), + le32_to_cpu(rsp->max_discard_sectors), + le32_to_cpu(rsp->discard_granularity), + le32_to_cpu(rsp->discard_alignment), + le16_to_cpu(rsp->secure_discard), + sess->max_segments, sess->max_io_size / SECTOR_SIZE, + !!(rsp->cache_policy & RNBD_WRITEBACK), + !!(rsp->cache_policy & RNBD_FUA)); mutex_unlock(&dev->lock); kfree(rsp); diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index 26fb91d800e31e..7520272541b1c1 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -117,17 +117,6 @@ struct rnbd_clt_dev { char *pathname; enum rnbd_access_mode access_mode; u32 nr_poll_queues; - bool wc; - bool fua; - u32 max_hw_sectors; - u32 max_discard_sectors; - u32 discard_granularity; - u32 discard_alignment; - u16 secure_discard; - u16 physical_block_size; - u16 logical_block_size; - u16 max_segments; - size_t nsectors; u64 size; /* device size in bytes */ struct list_head list; struct gendisk *gd; From 50aff97483b6afe4a9796154e9f6a5ca0a4f55c2 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 6 Jul 2022 21:31:49 +0800 Subject: [PATCH 0269/1250] rnbd-clt: adjust the layout of struct rnbd_clt_dev While at it, let re-arrange the struct to remove holes. 
Before, pahole reports /* size: 232, cachelines: 4, members: 17 */ /* sum members: 224, holes: 2, sum holes: 8 */ /* last cacheline: 40 bytes */ After the change, the report changes to /* size: 224, cachelines: 4, members: 17 */ /* last cacheline: 32 bytes */ Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220706133152.12058-6-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index 7520272541b1c1..df237d2ea0d980 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ b/drivers/block/rnbd/rnbd-clt.h @@ -106,6 +106,7 @@ struct rnbd_queue { }; struct rnbd_clt_dev { + struct kobject kobj; struct rnbd_clt_session *sess; struct request_queue *queue; struct rnbd_queue *hw_queues; @@ -114,15 +115,14 @@ struct rnbd_clt_dev { u32 clt_device_id; struct mutex lock; enum rnbd_clt_dev_state dev_state; + refcount_t refcount; char *pathname; enum rnbd_access_mode access_mode; u32 nr_poll_queues; u64 size; /* device size in bytes */ struct list_head list; struct gendisk *gd; - struct kobject kobj; char *blk_symlink_name; - refcount_t refcount; struct work_struct unmap_on_rmmod_work; }; From ffa41a71702493b9145a5ae4f4b1b8a4bab1b8f7 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 6 Jul 2022 21:31:50 +0800 Subject: [PATCH 0270/1250] rnbd-clt: check capacity inside rnbd_clt_change_capacity Currently, process_msg_open_rsp checks if capacity changed or not before calling rnbd_clt_change_capacity, while the check also makes sense for rnbd_clt_resize_dev_store, so let's move the check into the function. 
Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220706133152.12058-7-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index da2ba9477b1e57..a9bfab53bbf7c6 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -71,6 +71,12 @@ static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, size_t new_nsectors) { + if (get_capacity(dev->gd) == new_nsectors) + return 0; + + /* + * If the size changed, we need to revalidate it + */ rnbd_clt_info(dev, "Device size changed from %llu to %zu sectors\n", get_capacity(dev->gd), new_nsectors); set_capacity_and_notify(dev->gd, new_nsectors); @@ -93,12 +99,7 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev, if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) { u64 nsectors = le64_to_cpu(rsp->nsectors); - /* - * If the device was remapped and the size changed in the - * meantime we need to revalidate it - */ - if (get_capacity(dev->gd) != nsectors) - rnbd_clt_change_capacity(dev, nsectors); + rnbd_clt_change_capacity(dev, nsectors); gd_kobj = &disk_to_dev(dev->gd)->kobj; kobject_uevent(gd_kobj, KOBJ_ONLINE); rnbd_clt_info(dev, "Device online, device remapped successfully\n"); From 59498516e707ed6b6a5c01ae28fc816382d9698f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 6 Jul 2022 21:31:51 +0800 Subject: [PATCH 0271/1250] rnbd-clt: pass sector_t type for resize capacity Let's change the parameter type to 'sector_t' then we don't need to cast it from rnbd_clt_resize_dev_store, and update rnbd_clt_resize_disk too. 
Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220706133152.12058-8-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt-sysfs.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 6 +++--- drivers/block/rnbd/rnbd-clt.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c index 2be5d87a3ca605..e7c7d9a68168de 100644 --- a/drivers/block/rnbd/rnbd-clt-sysfs.c +++ b/drivers/block/rnbd/rnbd-clt-sysfs.c @@ -376,7 +376,7 @@ static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj, if (ret) return ret; - ret = rnbd_clt_resize_disk(dev, (size_t)sectors); + ret = rnbd_clt_resize_disk(dev, sectors); if (ret) return ret; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index a9bfab53bbf7c6..c77da3d0317e81 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -69,7 +69,7 @@ static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) } static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, - size_t new_nsectors) + sector_t new_nsectors) { if (get_capacity(dev->gd) == new_nsectors) return 0; @@ -77,7 +77,7 @@ static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, /* * If the size changed, we need to revalidate it */ - rnbd_clt_info(dev, "Device size changed from %llu to %zu sectors\n", + rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n", get_capacity(dev->gd), new_nsectors); set_capacity_and_notify(dev->gd, new_nsectors); return 0; @@ -117,7 +117,7 @@ static int process_msg_open_rsp(struct rnbd_clt_dev *dev, return err; } -int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize) +int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize) { int ret = 0; diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h index df237d2ea0d980..a48e040abe639d 100644 --- a/drivers/block/rnbd/rnbd-clt.h +++ 
b/drivers/block/rnbd/rnbd-clt.h @@ -138,7 +138,7 @@ int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force, const struct attribute *sysfs_self); int rnbd_clt_remap_device(struct rnbd_clt_dev *dev); -int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize); +int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize); /* rnbd-clt-sysfs.c */ From 3b56590b1715b998cb5c73a5bd2e9d340ccb42dc Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 6 Jul 2022 21:31:52 +0800 Subject: [PATCH 0272/1250] rnbd-clt: make rnbd_clt_change_capacity return void No need to check the return value, make it return void. Acked-by: Jack Wang Signed-off-by: Guoqing Jiang Link: https://lore.kernel.org/r/20220706133152.12058-9-guoqing.jiang@linux.dev Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-clt.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index c77da3d0317e81..7a418c4d47e4b2 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -68,11 +68,11 @@ static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev) return refcount_inc_not_zero(&dev->refcount); } -static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, +static void rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, sector_t new_nsectors) { if (get_capacity(dev->gd) == new_nsectors) - return 0; + return; /* * If the size changed, we need to revalidate it @@ -80,7 +80,6 @@ static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev, rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n", get_capacity(dev->gd), new_nsectors); set_capacity_and_notify(dev->gd, new_nsectors); - return 0; } static int process_msg_open_rsp(struct rnbd_clt_dev *dev, @@ -127,7 +126,7 @@ int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize) ret = -ENOENT; goto out; } - ret = rnbd_clt_change_capacity(dev, newsize); + rnbd_clt_change_capacity(dev, newsize); out: 
mutex_unlock(&dev->lock); From 12c6870bf7efbbc275972edaab86071e21cfc2f1 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Sat, 4 Jun 2022 22:32:54 +0800 Subject: [PATCH 0273/1250] nvme: remove a double word in a comment Delete the redundant word 'be'. Signed-off-by: Xiang wangx Signed-off-by: Christoph Hellwig --- drivers/nvme/host/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 0da94b233feda8..75d0a730e0bb2d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -140,7 +140,7 @@ enum nvme_quirks { NVME_QUIRK_DMA_ADDRESS_BITS_48 = (1 << 16), /* - * The controller requires the command_id value be be limited, so skip + * The controller requires the command_id value be limited, so skip * encoding the generation sequence number. */ NVME_QUIRK_SKIP_CID_GEN = (1 << 17), From 48e1bc03b7983b3ad2f920ca70805bbc6b55609d Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 8 Jun 2022 11:52:21 -0700 Subject: [PATCH 0274/1250] nvme: handle the persistent internal error AER In the NVM Express Revision 1.4 spec, Figure 145 describes possible values for an AER with event type "Error" (value 000b). For a Persistent Internal Error (value 03h), the host should perform a controller reset. Add support for this error using code that already exists for doing a controller reset. As part of this support, introduce two utility functions for parsing the AER type and subtype. This new support was tested in a lab environment where we can generate the persistent internal error on demand, and observe both the Linux side and NVMe controller side to see that the controller reset has been done. 
Signed-off-by: Michael Kelley Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 31 +++++++++++++++++++++++++++++-- include/linux/nvme.h | 4 ++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b3d9c29aba1e6a..6eb42f696fc9f5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4526,9 +4526,19 @@ static void nvme_fw_act_work(struct work_struct *work) nvme_get_fw_slot_info(ctrl); } +static u32 nvme_aer_type(u32 result) +{ + return result & 0x7; +} + +static u32 nvme_aer_subtype(u32 result) +{ + return (result & 0xff00) >> 8; +} + static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) { - u32 aer_notice_type = (result & 0xff00) >> 8; + u32 aer_notice_type = nvme_aer_subtype(result); trace_nvme_async_event(ctrl, aer_notice_type); @@ -4561,11 +4571,19 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) } } +static void nvme_handle_aer_persistent_error(struct nvme_ctrl *ctrl) +{ + trace_nvme_async_event(ctrl, NVME_AER_ERROR); + dev_warn(ctrl->device, "resetting controller due to AER\n"); + nvme_reset_ctrl(ctrl); +} + void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, volatile union nvme_result *res) { u32 result = le32_to_cpu(res->u32); - u32 aer_type = result & 0x07; + u32 aer_type = nvme_aer_type(result); + u32 aer_subtype = nvme_aer_subtype(result); if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) return; @@ -4575,6 +4593,15 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, nvme_handle_aen_notice(ctrl, result); break; case NVME_AER_ERROR: + /* + * For a persistent internal error, don't run async_event_work + * to submit a new AER. The controller reset will do it. 
+ */ + if (aer_subtype == NVME_AER_ERROR_PERSIST_INT_ERR) { + nvme_handle_aer_persistent_error(ctrl); + return; + } + fallthrough; case NVME_AER_SMART: case NVME_AER_CSS: case NVME_AER_VS: diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e3934003f2397e..e167cdad78442b 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -711,6 +711,10 @@ enum { NVME_AER_VS = 7, }; +enum { + NVME_AER_ERROR_PERSIST_INT_ERR = 0x03, +}; + enum { NVME_AER_NOTICE_NS_CHANGED = 0x00, NVME_AER_NOTICE_FW_ACT_STARTING = 0x01, From 38c8467732d2ab4ddef33bc2a9f9b90137a4258c Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 6 Jun 2022 18:16:42 -0700 Subject: [PATCH 0275/1250] nvme: remove unused timeout parameter The function __nvme_submit_sync_cmd() has following list of callers that sets the timeout value to 0 :- Callers | Timeout value ------------------------------------------------ nvme_submit_sync_cmd() | 0 nvme_features() | 0 nvme_sec_submit() | 0 nvmf_reg_read32() | 0 nvmf_reg_read64() | 0 nvmf_reg_write32() | 0 nvmf_connect_admin_queue() | 0 nvmf_connect_io_queue() | 0 Remove the timeout function parameter from __nvme_submit_sync_cmd() and adjust the rest of code accordingly. 
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 12 ++++-------- drivers/nvme/host/fabrics.c | 10 +++++----- drivers/nvme/host/nvme.h | 2 +- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6eb42f696fc9f5..10c3fa5b0d6988 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -990,8 +990,7 @@ static int nvme_execute_rq(struct request *rq, bool at_head) */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, - blk_mq_req_flags_t flags) + int qid, int at_head, blk_mq_req_flags_t flags) { struct request *req; int ret; @@ -1006,9 +1005,6 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, return PTR_ERR(req); nvme_init_request(req, cmd); - if (timeout) - req->timeout = timeout; - if (buffer && bufflen) { ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); if (ret) @@ -1028,7 +1024,7 @@ EXPORT_SYMBOL_GPL(__nvme_submit_sync_cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buffer, unsigned bufflen) { - return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, 0, + return __nvme_submit_sync_cmd(q, cmd, NULL, buffer, bufflen, NVME_QID_ANY, 0, 0); } EXPORT_SYMBOL_GPL(nvme_submit_sync_cmd); @@ -1466,7 +1462,7 @@ static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, c.features.dword11 = cpu_to_le32(dword11); ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res, - buffer, buflen, 0, NVME_QID_ANY, 0, 0); + buffer, buflen, NVME_QID_ANY, 0, 0); if (ret >= 0 && result) *result = le32_to_cpu(res.u32); return ret; @@ -2103,7 +2099,7 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8); cmd.common.cdw11 = cpu_to_le32(len); - return 
__nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, 0, + return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len, NVME_QID_ANY, 1, 0); } EXPORT_SYMBOL_GPL(nvme_sec_submit); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index ee79a6d639b4b9..0a0512300f1bba 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -152,7 +152,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) cmd.prop_get.fctype = nvme_fabrics_type_property_get; cmd.prop_get.offset = cpu_to_le32(off); - ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, NVME_QID_ANY, 0, 0); if (ret >= 0) @@ -198,7 +198,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) cmd.prop_get.attrib = 1; cmd.prop_get.offset = cpu_to_le32(off); - ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, NVME_QID_ANY, 0, 0); if (ret >= 0) @@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) cmd.prop_set.offset = cpu_to_le32(off); cmd.prop_set.value = cpu_to_le64(val); - ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, NVME_QID_ANY, 0, 0); if (unlikely(ret)) dev_err(ctrl->device, @@ -389,7 +389,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, - data, sizeof(*data), 0, NVME_QID_ANY, 1, + data, sizeof(*data), NVME_QID_ANY, 1, BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), @@ -450,7 +450,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); ret = __nvme_submit_sync_cmd(ctrl->connect_q, &cmd, 
&res, - data, sizeof(*data), 0, qid, 1, + data, sizeof(*data), qid, 1, BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT); if (ret) { nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 75d0a730e0bb2d..e4612dd0b4206e 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -780,7 +780,7 @@ int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, + int qid, int at_head, blk_mq_req_flags_t flags); int nvme_set_features(struct nvme_ctrl *dev, unsigned int fid, unsigned int dword11, void *buffer, size_t buflen, From 76a11e3e1d2cf9df939a92d1a6efa9ef08ae2bff Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 6 Jun 2022 18:16:43 -0700 Subject: [PATCH 0276/1250] nvme: fix qid param blk_mq_alloc_request_hctx Only caller of the __nvme_submit_sync_cmd() with qid value not equal to NVME_QID_ANY is nvmf_connect_io_queue(), where qid value is always set to > 0. 
[1] __nvme_submit_sync_cmd() callers with qid parameter from :- Caller | qid parameter ------------------------------------------------------ * nvme_fc_connect_io_queues() | nvmf_connect_io_queue() | qid > 0 * nvme_rdma_start_io_queues() | nvme_rdma_start_queue() | nvmf_connect_io_queue() | qid > 0 * nvme_tcp_start_io_queues() | nvme_tcp_start_queue() | nvmf_connect_io_queue() | qid > 0 * nvme_loop_connect_io_queues() | nvmf_connect_io_queue() | qid > 0 When qid value of the function parameter __nvme_submit_sync_cmd() is > 0 from above callers, we use blk_mq_alloc_request_hctx(), where we pass last parameter as 0 if qid functional parameter value is set to 0 with conditional operators, see 1002 :- 991 int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, 992 union nvme_result *result, void *buffer, unsigned bufflen, 993 int qid, int at_head, blk_mq_req_flags_t flags) 994 { 995 struct request *req; 996 int ret; 997 998 if (qid == NVME_QID_ANY) 999 req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); 1000 else 1001 req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, 1002 qid ? qid - 1 : 0); 1003 But qid function parameter value of the __nvme_submit_sync_cmd() will never be 0 from above caller list see [1], and all the other callers of __nvme_submit_sync_cmd() use NVME_QID_ANY as qid value :- 1. nvme_submit_sync_cmd() 2. nvme_features() 3. nvme_sec_submit() 4. nvmf_reg_read32() 5. nvmf_reg_read64() 6. nvmf_reg_write32() 7. nvmf_connect_admin_queue() Remove the conditional operator to pass the qid as 0 in the call to blk_mq_alloc_request_hctx(). 
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 10c3fa5b0d6988..f56e58a975dd49 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -999,7 +999,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, req = blk_mq_alloc_request(q, nvme_req_op(cmd), flags); else req = blk_mq_alloc_request_hctx(q, nvme_req_op(cmd), flags, - qid ? qid - 1 : 0); + qid - 1); if (IS_ERR(req)) return PTR_ERR(req); From d109f3ccfc8f3fcdbd6f4c557a56d6237ca209d4 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 26 Jun 2022 17:06:00 +0300 Subject: [PATCH 0277/1250] nvme-loop: use nvme core helpers to cancel all requests in a tagset A helper now exists, no need to open-code the same functionality. Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 59024af2da2e3d..9c9c428ae53800 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -424,9 +424,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) { if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); - blk_mq_tagset_busy_iter(&ctrl->tag_set, - nvme_cancel_request, &ctrl->ctrl); - blk_mq_tagset_wait_completed_request(&ctrl->tag_set); + nvme_cancel_tagset(&ctrl->ctrl); nvme_loop_destroy_io_queues(ctrl); } @@ -434,9 +432,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); - blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, - nvme_cancel_request, &ctrl->ctrl); - blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); + nvme_cancel_admin_tagset(&ctrl->ctrl); 
nvme_loop_destroy_admin_queue(ctrl); } From 1bc60dc50469db7c02968196c0b8c4f93d4e06ee Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:51:57 +0200 Subject: [PATCH 0278/1250] crypto: add crypto_has_shash() Add helper function to determine if a given synchronous hash is supported. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Acked-by: Herbert Xu Signed-off-by: Christoph Hellwig --- crypto/shash.c | 6 ++++++ include/crypto/hash.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/crypto/shash.c b/crypto/shash.c index 0a0a50cb694f0e..4c88e63b3350fc 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -521,6 +521,12 @@ struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, } EXPORT_SYMBOL_GPL(crypto_alloc_shash); +int crypto_has_shash(const char *alg_name, u32 type, u32 mask) +{ + return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask); +} +EXPORT_SYMBOL_GPL(crypto_has_shash); + static int shash_prepare_alg(struct shash_alg *alg) { struct crypto_alg *base = &alg->base; diff --git a/include/crypto/hash.h b/include/crypto/hash.h index f140e4643949b9..f5841992dc9b16 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -718,6 +718,8 @@ static inline void ahash_request_set_crypt(struct ahash_request *req, struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask); +int crypto_has_shash(const char *alg_name, u32 type, u32 mask); + static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm) { return &tfm->base; From de83cc791efb12edc33e47399d968ba8e4d214f3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:51:58 +0200 Subject: [PATCH 0279/1250] crypto: add crypto_has_kpp() Add helper function to determine if a given key-agreement protocol primitive is supported. 
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Acked-by: Herbert Xu Signed-off-by: Christoph Hellwig --- crypto/kpp.c | 6 ++++++ include/crypto/kpp.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/crypto/kpp.c b/crypto/kpp.c index 7aa6ba4b60a4db..678e871ce418c0 100644 --- a/crypto/kpp.c +++ b/crypto/kpp.c @@ -104,6 +104,12 @@ int crypto_grab_kpp(struct crypto_kpp_spawn *spawn, } EXPORT_SYMBOL_GPL(crypto_grab_kpp); +int crypto_has_kpp(const char *alg_name, u32 type, u32 mask) +{ + return crypto_type_has_alg(alg_name, &crypto_kpp_type, type, mask); +} +EXPORT_SYMBOL_GPL(crypto_has_kpp); + static void kpp_prepare_alg(struct kpp_alg *alg) { struct crypto_alg *base = &alg->base; diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h index cccceadc164b9e..24d01e9877c12b 100644 --- a/include/crypto/kpp.h +++ b/include/crypto/kpp.h @@ -104,6 +104,8 @@ struct kpp_alg { */ struct crypto_kpp *crypto_alloc_kpp(const char *alg_name, u32 type, u32 mask); +int crypto_has_kpp(const char *alg_name, u32 type, u32 mask); + static inline struct crypto_tfm *crypto_kpp_tfm(struct crypto_kpp *tfm) { return &tfm->base; From c603e3e98393e73a55a14379b382cb99208f372a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:51:59 +0200 Subject: [PATCH 0280/1250] lib/base64: RFC4648-compliant base64 encoding Add RFC4648-compliant base64 encoding and decoding routines, based on the base64url encoding in fs/crypto/fname.c. 
Signed-off-by: Hannes Reinecke Reviewed-by: Himanshu Madhani Reviewed-by: Sagi Grimberg Cc: Eric Biggers Signed-off-by: Christoph Hellwig --- include/linux/base64.h | 16 +++++++ lib/Makefile | 2 +- lib/base64.c | 103 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 include/linux/base64.h create mode 100644 lib/base64.c diff --git a/include/linux/base64.h b/include/linux/base64.h new file mode 100644 index 00000000000000..660d4cb1ef31f8 --- /dev/null +++ b/include/linux/base64.h @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * base64 encoding, lifted from fs/crypto/fname.c. + */ + +#ifndef _LINUX_BASE64_H +#define _LINUX_BASE64_H + +#include + +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + +int base64_encode(const u8 *src, int len, char *dst); +int base64_decode(const char *src, int len, u8 *dst); + +#endif /* _LINUX_BASE64_H */ diff --git a/lib/Makefile b/lib/Makefile index f99bf61f8bbc67..20d2caf6577d6d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -46,7 +46,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ list_sort.o uuid.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ - percpu-refcount.o rhashtable.o \ + percpu-refcount.o rhashtable.o base64.o \ once.o refcount.o usercopy.o errseq.o bucket_locks.o \ generic-radix-tree.o obj-$(CONFIG_STRING_SELFTEST) += test_string.o diff --git a/lib/base64.c b/lib/base64.c new file mode 100644 index 00000000000000..b736a7a431c50e --- /dev/null +++ b/lib/base64.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * base64.c - RFC4648-compliant base64 encoding + * + * Copyright (c) 2020 Hannes Reinecke, SUSE + * + * Based on the base64url routines from fs/crypto/fname.c + * (which are using the URL-safe base64 encoding), + * modified to use the standard coding table from RFC4648 section 4. 
+ */ + +#include +#include +#include +#include +#include + +static const char base64_table[65] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +/** + * base64_encode() - base64-encode some binary data + * @src: the binary data to encode + * @srclen: the length of @src in bytes + * @dst: (output) the base64-encoded string. Not NUL-terminated. + * + * Encodes data using base64 encoding, i.e. the "Base 64 Encoding" specified + * by RFC 4648, including the '='-padding. + * + * Return: the length of the resulting base64-encoded string in bytes. + */ +int base64_encode(const u8 *src, int srclen, char *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + char *cp = dst; + + for (i = 0; i < srclen; i++) { + ac = (ac << 8) | src[i]; + bits += 8; + do { + bits -= 6; + *cp++ = base64_table[(ac >> bits) & 0x3f]; + } while (bits >= 6); + } + if (bits) { + *cp++ = base64_table[(ac << (6 - bits)) & 0x3f]; + bits -= 6; + } + while (bits < 0) { + *cp++ = '='; + bits += 2; + } + return cp - dst; +} +EXPORT_SYMBOL_GPL(base64_encode); + +/** + * base64_decode() - base64-decode a string + * @src: the string to decode. Doesn't need to be NUL-terminated. + * @srclen: the length of @src in bytes + * @dst: (output) the decoded binary data + * + * Decodes a string using base64 encoding, i.e. the "Base 64 Encoding" + * specified by RFC 4648, including the '='-padding. + * + * This implementation hasn't been optimized for performance. + * + * Return: the length of the resulting decoded binary data in bytes, + * or -1 if the string isn't a valid base64 string. 
+ */ +int base64_decode(const char *src, int srclen, u8 *dst) +{ + u32 ac = 0; + int bits = 0; + int i; + u8 *bp = dst; + + for (i = 0; i < srclen; i++) { + const char *p = strchr(base64_table, src[i]); + + if (src[i] == '=') { + ac = (ac << 6); + bits += 6; + if (bits >= 8) + bits -= 8; + continue; + } + if (p == NULL || src[i] == 0) + return -1; + ac = (ac << 6) | (p - base64_table); + bits += 6; + if (bits >= 8) { + bits -= 8; + *bp++ = (u8)(ac >> bits); + } + } + if (ac & ((1 << bits) - 1)) + return -1; + return bp - dst; +} +EXPORT_SYMBOL_GPL(base64_decode); From a0516233e6b3f04622cb1014f25fa9b08763d2a9 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:52:00 +0200 Subject: [PATCH 0281/1250] nvme: add definitions for NVMe In-Band authentication Add new definitions for NVMe In-band authentication as defined in the NVMe Base Specification v2.0. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Himanshu Madhani Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- include/linux/nvme.h | 209 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 208 insertions(+), 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index e167cdad78442b..10eab53f5a8a54 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -19,6 +19,7 @@ #define NVMF_TRSVCID_SIZE 32 #define NVMF_TRADDR_SIZE 256 #define NVMF_TSAS_SIZE 256 +#define NVMF_AUTH_HASH_LEN 64 #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" @@ -1371,6 +1372,8 @@ enum nvmf_capsule_command { nvme_fabrics_type_property_set = 0x00, nvme_fabrics_type_connect = 0x01, nvme_fabrics_type_property_get = 0x04, + nvme_fabrics_type_auth_send = 0x05, + nvme_fabrics_type_auth_receive = 0x06, }; #define nvme_fabrics_type_name(type) { type, #type } @@ -1378,7 +1381,9 @@ enum nvmf_capsule_command { __print_symbolic(type, \ nvme_fabrics_type_name(nvme_fabrics_type_property_set), \ nvme_fabrics_type_name(nvme_fabrics_type_connect), \ 
- nvme_fabrics_type_name(nvme_fabrics_type_property_get)) + nvme_fabrics_type_name(nvme_fabrics_type_property_get), \ + nvme_fabrics_type_name(nvme_fabrics_type_auth_send), \ + nvme_fabrics_type_name(nvme_fabrics_type_auth_receive)) /* * If not fabrics command, fctype will be ignored. @@ -1474,6 +1479,11 @@ struct nvmf_connect_command { __u8 resv4[12]; }; +enum { + NVME_CONNECT_AUTHREQ_ASCR = (1 << 2), + NVME_CONNECT_AUTHREQ_ATR = (1 << 1), +}; + struct nvmf_connect_data { uuid_t hostid; __le16 cntlid; @@ -1508,6 +1518,200 @@ struct nvmf_property_get_command { __u8 resv4[16]; }; +struct nvmf_auth_common_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __u8 resv3; + __u8 spsp0; + __u8 spsp1; + __u8 secp; + __le32 al_tl; + __u8 resv4[16]; +}; + +struct nvmf_auth_send_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __u8 resv3; + __u8 spsp0; + __u8 spsp1; + __u8 secp; + __le32 tl; + __u8 resv4[16]; +}; + +struct nvmf_auth_receive_command { + __u8 opcode; + __u8 resv1; + __u16 command_id; + __u8 fctype; + __u8 resv2[19]; + union nvme_data_ptr dptr; + __u8 resv3; + __u8 spsp0; + __u8 spsp1; + __u8 secp; + __le32 al; + __u8 resv4[16]; +}; + +/* Value for secp */ +enum { + NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER = 0xe9, +}; + +/* Defined value for auth_type */ +enum { + NVME_AUTH_COMMON_MESSAGES = 0x00, + NVME_AUTH_DHCHAP_MESSAGES = 0x01, +}; + +/* Defined messages for auth_id */ +enum { + NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE = 0x00, + NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE = 0x01, + NVME_AUTH_DHCHAP_MESSAGE_REPLY = 0x02, + NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1 = 0x03, + NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 = 0x04, + NVME_AUTH_DHCHAP_MESSAGE_FAILURE2 = 0xf0, + NVME_AUTH_DHCHAP_MESSAGE_FAILURE1 = 0xf1, +}; + +struct nvmf_auth_dhchap_protocol_descriptor { + __u8 authid; + __u8 rsvd; + __u8 halen; + __u8 dhlen; + __u8 idlist[60]; +}; + +enum { + 
NVME_AUTH_DHCHAP_AUTH_ID = 0x01, +}; + +/* Defined hash functions for DH-HMAC-CHAP authentication */ +enum { + NVME_AUTH_HASH_SHA256 = 0x01, + NVME_AUTH_HASH_SHA384 = 0x02, + NVME_AUTH_HASH_SHA512 = 0x03, + NVME_AUTH_HASH_INVALID = 0xff, +}; + +/* Defined Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication */ +enum { + NVME_AUTH_DHGROUP_NULL = 0x00, + NVME_AUTH_DHGROUP_2048 = 0x01, + NVME_AUTH_DHGROUP_3072 = 0x02, + NVME_AUTH_DHGROUP_4096 = 0x03, + NVME_AUTH_DHGROUP_6144 = 0x04, + NVME_AUTH_DHGROUP_8192 = 0x05, + NVME_AUTH_DHGROUP_INVALID = 0xff, +}; + +union nvmf_auth_protocol { + struct nvmf_auth_dhchap_protocol_descriptor dhchap; +}; + +struct nvmf_auth_dhchap_negotiate_data { + __u8 auth_type; + __u8 auth_id; + __le16 rsvd; + __le16 t_id; + __u8 sc_c; + __u8 napd; + union nvmf_auth_protocol auth_protocol[]; +}; + +struct nvmf_auth_dhchap_challenge_data { + __u8 auth_type; + __u8 auth_id; + __u16 rsvd1; + __le16 t_id; + __u8 hl; + __u8 rsvd2; + __u8 hashid; + __u8 dhgid; + __le16 dhvlen; + __le32 seqnum; + /* 'hl' bytes of challenge value */ + __u8 cval[]; + /* followed by 'dhvlen' bytes of DH value */ +}; + +struct nvmf_auth_dhchap_reply_data { + __u8 auth_type; + __u8 auth_id; + __le16 rsvd1; + __le16 t_id; + __u8 hl; + __u8 rsvd2; + __u8 cvalid; + __u8 rsvd3; + __le16 dhvlen; + __le32 seqnum; + /* 'hl' bytes of response data */ + __u8 rval[]; + /* followed by 'hl' bytes of Challenge value */ + /* followed by 'dhvlen' bytes of DH value */ +}; + +enum { + NVME_AUTH_DHCHAP_RESPONSE_VALID = (1 << 0), +}; + +struct nvmf_auth_dhchap_success1_data { + __u8 auth_type; + __u8 auth_id; + __le16 rsvd1; + __le16 t_id; + __u8 hl; + __u8 rsvd2; + __u8 rvalid; + __u8 rsvd3[7]; + /* 'hl' bytes of response value if 'rvalid' is set */ + __u8 rval[]; +}; + +struct nvmf_auth_dhchap_success2_data { + __u8 auth_type; + __u8 auth_id; + __le16 rsvd1; + __le16 t_id; + __u8 rsvd2[10]; +}; + +struct nvmf_auth_dhchap_failure_data { + __u8 auth_type; + __u8 auth_id; + __le16 
rsvd1; + __le16 t_id; + __u8 rescode; + __u8 rescode_exp; +}; + +enum { + NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED = 0x01, +}; + +enum { + NVME_AUTH_DHCHAP_FAILURE_FAILED = 0x01, + NVME_AUTH_DHCHAP_FAILURE_NOT_USABLE = 0x02, + NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH = 0x03, + NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE = 0x04, + NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE = 0x05, + NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD = 0x06, + NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE = 0x07, +}; + + struct nvme_dbbuf { __u8 opcode; __u8 flags; @@ -1551,6 +1755,9 @@ struct nvme_command { struct nvmf_connect_command connect; struct nvmf_property_set_command prop_set; struct nvmf_property_get_command prop_get; + struct nvmf_auth_common_command auth_common; + struct nvmf_auth_send_command auth_send; + struct nvmf_auth_receive_command auth_receive; struct nvme_dbbuf dbbuf; struct nvme_directive_cmd directive; }; From b3107f1658bcb7759802d406dc9c07fe8d1b6349 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:52:01 +0200 Subject: [PATCH 0282/1250] nvme-fabrics: decode 'authentication required' connect error The 'connect' command might fail with NVME_SC_AUTH_REQUIRED, so we should be decoding this error, too. 
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Himanshu Madhani Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 0a0512300f1bba..e4b1520862d874 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -331,6 +331,10 @@ static void nvmf_log_connect_error(struct nvme_ctrl *ctrl, dev_err(ctrl->device, "Connect command failed: host path error\n"); break; + case NVME_SC_AUTH_REQUIRED: + dev_err(ctrl->device, + "Connect command failed: authentication required\n"); + break; default: dev_err(ctrl->device, "Connect command failed, error wo/DNR bit: %d\n", From a476416bb57b183aa5851e5e1516260c864dc47a Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:52:02 +0200 Subject: [PATCH 0283/1250] nvme: implement In-Band authentication Implement NVMe-oF In-Band authentication according to NVMe TPAR 8006. This patch adds two new fabric options 'dhchap_secret' to specify the pre-shared key (in ASCII representation according to NVMe 2.0 section 8.13.5.8 'Secret representation') and 'dhchap_ctrl_secret' to specify the pre-shared controller key for bi-directional authentication of both the host and the controller. Re-authentication can be triggered by writing the PSK into the new controller sysfs attribute 'dhchap_secret' or 'dhchap_ctrl_secret'. 
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/Kconfig | 1 + drivers/nvme/Makefile | 1 + drivers/nvme/common/Kconfig | 4 + drivers/nvme/common/Makefile | 7 + drivers/nvme/common/auth.c | 329 ++++++++++++++ drivers/nvme/host/Kconfig | 13 + drivers/nvme/host/Makefile | 1 + drivers/nvme/host/auth.c | 828 +++++++++++++++++++++++++++++++++++ drivers/nvme/host/core.c | 143 +++++- drivers/nvme/host/fabrics.c | 80 +++- drivers/nvme/host/fabrics.h | 7 + drivers/nvme/host/nvme.h | 30 ++ drivers/nvme/host/rdma.c | 1 + drivers/nvme/host/tcp.c | 1 + drivers/nvme/host/trace.c | 32 ++ include/linux/nvme-auth.h | 33 ++ 16 files changed, 1504 insertions(+), 7 deletions(-) create mode 100644 drivers/nvme/common/Kconfig create mode 100644 drivers/nvme/common/Makefile create mode 100644 drivers/nvme/common/auth.c create mode 100644 drivers/nvme/host/auth.c create mode 100644 include/linux/nvme-auth.h diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig index 87ae409a32b981..656e46d938dabc 100644 --- a/drivers/nvme/Kconfig +++ b/drivers/nvme/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only menu "NVME Support" +source "drivers/nvme/common/Kconfig" source "drivers/nvme/host/Kconfig" source "drivers/nvme/target/Kconfig" diff --git a/drivers/nvme/Makefile b/drivers/nvme/Makefile index fb42c44609a8cd..eedca8c720983c 100644 --- a/drivers/nvme/Makefile +++ b/drivers/nvme/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_NVME_COMMON) += common/ obj-y += host/ obj-y += target/ diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig new file mode 100644 index 00000000000000..4514f44362dd21 --- /dev/null +++ b/drivers/nvme/common/Kconfig @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only + +config NVME_COMMON + tristate diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile new file mode 100644 index 00000000000000..720c625b8a522c --- /dev/null +++ 
b/drivers/nvme/common/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y += -I$(src) + +obj-$(CONFIG_NVME_COMMON) += nvme-common.o + +nvme-common-y += auth.o diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c new file mode 100644 index 00000000000000..01adb29947d49c --- /dev/null +++ b/drivers/nvme/common/auth.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Hannes Reinecke, SUSE Linux + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static u32 nvme_dhchap_seqnum; +static DEFINE_MUTEX(nvme_dhchap_mutex); + +u32 nvme_auth_get_seqnum(void) +{ + u32 seqnum; + + mutex_lock(&nvme_dhchap_mutex); + if (!nvme_dhchap_seqnum) + nvme_dhchap_seqnum = prandom_u32(); + else { + nvme_dhchap_seqnum++; + if (!nvme_dhchap_seqnum) + nvme_dhchap_seqnum++; + } + seqnum = nvme_dhchap_seqnum; + mutex_unlock(&nvme_dhchap_mutex); + return seqnum; +} +EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum); + +static struct nvme_auth_dhgroup_map { + const char name[16]; + const char kpp[16]; +} dhgroup_map[] = { + [NVME_AUTH_DHGROUP_NULL] = { + .name = "null", .kpp = "null" }, + [NVME_AUTH_DHGROUP_2048] = { + .name = "ffdhe2048", .kpp = "ffdhe2048(dh)" }, + [NVME_AUTH_DHGROUP_3072] = { + .name = "ffdhe3072", .kpp = "ffdhe3072(dh)" }, + [NVME_AUTH_DHGROUP_4096] = { + .name = "ffdhe4096", .kpp = "ffdhe4096(dh)" }, + [NVME_AUTH_DHGROUP_6144] = { + .name = "ffdhe6144", .kpp = "ffdhe6144(dh)" }, + [NVME_AUTH_DHGROUP_8192] = { + .name = "ffdhe8192", .kpp = "ffdhe8192(dh)" }, +}; + +const char *nvme_auth_dhgroup_name(u8 dhgroup_id) +{ + if ((dhgroup_id >= ARRAY_SIZE(dhgroup_map)) || + !dhgroup_map[dhgroup_id].name || + !strlen(dhgroup_map[dhgroup_id].name)) + return NULL; + return dhgroup_map[dhgroup_id].name; +} +EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_name); + +const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id) +{ + if ((dhgroup_id >= ARRAY_SIZE(dhgroup_map)) || + 
!dhgroup_map[dhgroup_id].kpp || + !strlen(dhgroup_map[dhgroup_id].kpp)) + return NULL; + return dhgroup_map[dhgroup_id].kpp; +} +EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_kpp); + +u8 nvme_auth_dhgroup_id(const char *dhgroup_name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(dhgroup_map); i++) { + if (!dhgroup_map[i].name || + !strlen(dhgroup_map[i].name)) + continue; + if (!strncmp(dhgroup_map[i].name, dhgroup_name, + strlen(dhgroup_map[i].name))) + return i; + } + return NVME_AUTH_DHGROUP_INVALID; +} +EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id); + +static struct nvme_dhchap_hash_map { + int len; + const char hmac[15]; + const char digest[8]; +} hash_map[] = { + [NVME_AUTH_HASH_SHA256] = { + .len = 32, + .hmac = "hmac(sha256)", + .digest = "sha256", + }, + [NVME_AUTH_HASH_SHA384] = { + .len = 48, + .hmac = "hmac(sha384)", + .digest = "sha384", + }, + [NVME_AUTH_HASH_SHA512] = { + .len = 64, + .hmac = "hmac(sha512)", + .digest = "sha512", + }, +}; + +const char *nvme_auth_hmac_name(u8 hmac_id) +{ + if ((hmac_id >= ARRAY_SIZE(hash_map)) || + !hash_map[hmac_id].hmac || + !strlen(hash_map[hmac_id].hmac)) + return NULL; + return hash_map[hmac_id].hmac; +} +EXPORT_SYMBOL_GPL(nvme_auth_hmac_name); + +const char *nvme_auth_digest_name(u8 hmac_id) +{ + if ((hmac_id >= ARRAY_SIZE(hash_map)) || + !hash_map[hmac_id].digest || + !strlen(hash_map[hmac_id].digest)) + return NULL; + return hash_map[hmac_id].digest; +} +EXPORT_SYMBOL_GPL(nvme_auth_digest_name); + +u8 nvme_auth_hmac_id(const char *hmac_name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(hash_map); i++) { + if (!hash_map[i].hmac || !strlen(hash_map[i].hmac)) + continue; + if (!strncmp(hash_map[i].hmac, hmac_name, + strlen(hash_map[i].hmac))) + return i; + } + return NVME_AUTH_HASH_INVALID; +} +EXPORT_SYMBOL_GPL(nvme_auth_hmac_id); + +size_t nvme_auth_hmac_hash_len(u8 hmac_id) +{ + if ((hmac_id >= ARRAY_SIZE(hash_map)) || + !hash_map[hmac_id].hmac || + !strlen(hash_map[hmac_id].hmac)) + return 0; + return hash_map[hmac_id].len; +} 
+EXPORT_SYMBOL_GPL(nvme_auth_hmac_hash_len); + +struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, + u8 key_hash) +{ + struct nvme_dhchap_key *key; + unsigned char *p; + u32 crc; + int ret, key_len; + size_t allocated_len = strlen(secret); + + /* Secret might be affixed with a ':' */ + p = strrchr(secret, ':'); + if (p) + allocated_len = p - secret; + key = kzalloc(sizeof(*key), GFP_KERNEL); + if (!key) + return ERR_PTR(-ENOMEM); + key->key = kzalloc(allocated_len, GFP_KERNEL); + if (!key->key) { + ret = -ENOMEM; + goto out_free_key; + } + + key_len = base64_decode(secret, allocated_len, key->key); + if (key_len < 0) { + pr_debug("base64 key decoding error %d\n", + key_len); + ret = key_len; + goto out_free_secret; + } + + if (key_len != 36 && key_len != 52 && + key_len != 68) { + pr_err("Invalid key len %d\n", key_len); + ret = -EINVAL; + goto out_free_secret; + } + + if (key_hash > 0 && + (key_len - 4) != nvme_auth_hmac_hash_len(key_hash)) { + pr_err("Mismatched key len %d for %s\n", key_len, + nvme_auth_hmac_name(key_hash)); + ret = -EINVAL; + goto out_free_secret; + } + + /* The last four bytes is the CRC in little-endian format */ + key_len -= 4; + /* + * The linux implementation doesn't do pre- and post-increments, + * so we have to do it manually. 
+ */ + crc = ~crc32(~0, key->key, key_len); + + if (get_unaligned_le32(key->key + key_len) != crc) { + pr_err("key crc mismatch (key %08x, crc %08x)\n", + get_unaligned_le32(key->key + key_len), crc); + ret = -EKEYREJECTED; + goto out_free_secret; + } + key->len = key_len; + key->hash = key_hash; + return key; +out_free_secret: + kfree_sensitive(key->key); +out_free_key: + kfree(key); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(nvme_auth_extract_key); + +void nvme_auth_free_key(struct nvme_dhchap_key *key) +{ + if (!key) + return; + kfree_sensitive(key->key); + kfree(key); +} +EXPORT_SYMBOL_GPL(nvme_auth_free_key); + +u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn) +{ + const char *hmac_name; + struct crypto_shash *key_tfm; + struct shash_desc *shash; + u8 *transformed_key = NULL; + int ret; + + if (!key || !key->key) { + pr_warn("No key specified\n"); + return ERR_PTR(-ENOKEY); + } + if (key->hash == 0) { + transformed_key = kmemdup(key->key, key->len, GFP_KERNEL); + return transformed_key ? 
transformed_key : ERR_PTR(-ENOMEM); + } + hmac_name = nvme_auth_hmac_name(key->hash); + if (!hmac_name) { + pr_warn("Invalid key hash id %d\n", key->hash); + return ERR_PTR(-EINVAL); + } + + key_tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(key_tfm)) + return (u8 *)key_tfm; + + shash = kmalloc(sizeof(struct shash_desc) + + crypto_shash_descsize(key_tfm), + GFP_KERNEL); + if (!shash) { + ret = -ENOMEM; + goto out_free_key; + } + + transformed_key = kzalloc(crypto_shash_digestsize(key_tfm), GFP_KERNEL); + if (!transformed_key) { + ret = -ENOMEM; + goto out_free_shash; + } + + shash->tfm = key_tfm; + ret = crypto_shash_setkey(key_tfm, key->key, key->len); + if (ret < 0) + goto out_free_shash; + ret = crypto_shash_init(shash); + if (ret < 0) + goto out_free_shash; + ret = crypto_shash_update(shash, nqn, strlen(nqn)); + if (ret < 0) + goto out_free_shash; + ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17); + if (ret < 0) + goto out_free_shash; + ret = crypto_shash_final(shash, transformed_key); +out_free_shash: + kfree(shash); +out_free_key: + crypto_free_shash(key_tfm); + if (ret < 0) { + kfree_sensitive(transformed_key); + return ERR_PTR(ret); + } + return transformed_key; +} +EXPORT_SYMBOL_GPL(nvme_auth_transform_key); + +int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key) +{ + struct nvme_dhchap_key *key; + u8 key_hash; + + if (!secret) { + *ret_key = NULL; + return 0; + } + + if (sscanf(secret, "DHHC-1:%hhd:%*s:", &key_hash) != 1) + return -EINVAL; + + /* Pass in the secret without the 'DHHC-1:XX:' prefix */ + key = nvme_auth_extract_key(secret + 10, key_hash); + if (IS_ERR(key)) { + *ret_key = NULL; + return PTR_ERR(key); + } + + *ret_key = key; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_auth_generate_key); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 877d2ec4ea9fc4..6c503f42f3c680 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -92,6 +92,19 @@ 
config NVME_TCP If unsure, say N. +config NVME_AUTH + bool "NVM Express over Fabrics In-Band Authentication" + depends on NVME_CORE + select NVME_COMMON + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_SHA256 + select CRYPTO_SHA512 + help + This provides support for NVMe over Fabrics In-Band Authentication. + + If unsure, say N. + config NVME_APPLE tristate "Apple ANS2 NVM Express host driver" depends on OF && BLOCK diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index a36ae16120597a..a3e88f32f560b4 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -16,6 +16,7 @@ nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_BLK_DEV_ZONED) += zns.o nvme-core-$(CONFIG_FAULT_INJECTION_DEBUG_FS) += fault_inject.o nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o +nvme-core-$(CONFIG_NVME_AUTH) += auth.o nvme-y += pci.o diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c new file mode 100644 index 00000000000000..9766bfffecac6e --- /dev/null +++ b/drivers/nvme/host/auth.c @@ -0,0 +1,828 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Hannes Reinecke, SUSE Linux + */ + +#include +#include +#include +#include +#include +#include +#include "nvme.h" +#include "fabrics.h" +#include + +struct nvme_dhchap_queue_context { + struct list_head entry; + struct work_struct auth_work; + struct nvme_ctrl *ctrl; + struct crypto_shash *shash_tfm; + void *buf; + size_t buf_size; + int qid; + int error; + u32 s1; + u32 s2; + u16 transaction; + u8 status; + u8 hash_id; + size_t hash_len; + u8 dhgroup_id; + u8 c1[64]; + u8 c2[64]; + u8 response[64]; + u8 *host_response; +}; + +#define nvme_auth_flags_from_qid(qid) \ + (qid == 0) ? 0 : BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED +#define nvme_auth_queue_from_qid(ctrl, qid) \ + (qid == 0) ? 
(ctrl)->fabrics_q : (ctrl)->connect_q + +static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid, + void *data, size_t data_len, bool auth_send) +{ + struct nvme_command cmd = {}; + blk_mq_req_flags_t flags = nvme_auth_flags_from_qid(qid); + struct request_queue *q = nvme_auth_queue_from_qid(ctrl, qid); + int ret; + + cmd.auth_common.opcode = nvme_fabrics_command; + cmd.auth_common.secp = NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER; + cmd.auth_common.spsp0 = 0x01; + cmd.auth_common.spsp1 = 0x01; + if (auth_send) { + cmd.auth_send.fctype = nvme_fabrics_type_auth_send; + cmd.auth_send.tl = cpu_to_le32(data_len); + } else { + cmd.auth_receive.fctype = nvme_fabrics_type_auth_receive; + cmd.auth_receive.al = cpu_to_le32(data_len); + } + + ret = __nvme_submit_sync_cmd(q, &cmd, NULL, data, data_len, + qid == 0 ? NVME_QID_ANY : qid, + 0, flags); + if (ret > 0) + dev_warn(ctrl->device, + "qid %d auth_send failed with status %d\n", qid, ret); + else if (ret < 0) + dev_err(ctrl->device, + "qid %d auth_send failed with error %d\n", qid, ret); + return ret; +} + +static int nvme_auth_receive_validate(struct nvme_ctrl *ctrl, int qid, + struct nvmf_auth_dhchap_failure_data *data, + u16 transaction, u8 expected_msg) +{ + dev_dbg(ctrl->device, "%s: qid %d auth_type %d auth_id %x\n", + __func__, qid, data->auth_type, data->auth_id); + + if (data->auth_type == NVME_AUTH_COMMON_MESSAGES && + data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { + return data->rescode_exp; + } + if (data->auth_type != NVME_AUTH_DHCHAP_MESSAGES || + data->auth_id != expected_msg) { + dev_warn(ctrl->device, + "qid %d invalid message %02x/%02x\n", + qid, data->auth_type, data->auth_id); + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; + } + if (le16_to_cpu(data->t_id) != transaction) { + dev_warn(ctrl->device, + "qid %d invalid transaction ID %d\n", + qid, le16_to_cpu(data->t_id)); + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; + } + return 0; +} + +static int 
nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + struct nvmf_auth_dhchap_negotiate_data *data = chap->buf; + size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol); + + if (chap->buf_size < size) { + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + return -EINVAL; + } + memset((u8 *)chap->buf, 0, size); + data->auth_type = NVME_AUTH_COMMON_MESSAGES; + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; + data->t_id = cpu_to_le16(chap->transaction); + data->sc_c = 0; /* No secure channel concatenation */ + data->napd = 1; + data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID; + data->auth_protocol[0].dhchap.halen = 3; + data->auth_protocol[0].dhchap.dhlen = 6; + data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256; + data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384; + data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512; + data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL; + data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048; + data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072; + data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096; + data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144; + data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192; + + return size; +} + +static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + struct nvmf_auth_dhchap_challenge_data *data = chap->buf; + u16 dhvlen = le16_to_cpu(data->dhvlen); + size_t size = sizeof(*data) + data->hl + dhvlen; + const char *hmac_name, *kpp_name; + + if (chap->buf_size < size) { + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + return NVME_SC_INVALID_FIELD; + } + + hmac_name = nvme_auth_hmac_name(data->hashid); + if (!hmac_name) { + dev_warn(ctrl->device, + "qid %d: invalid HASH ID %d\n", + chap->qid, data->hashid); + 
chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; + return NVME_SC_INVALID_FIELD; + } + + if (chap->hash_id == data->hashid && chap->shash_tfm && + !strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) && + crypto_shash_digestsize(chap->shash_tfm) == data->hl) { + dev_dbg(ctrl->device, + "qid %d: reuse existing hash %s\n", + chap->qid, hmac_name); + goto select_kpp; + } + + /* Reset if hash cannot be reused */ + if (chap->shash_tfm) { + crypto_free_shash(chap->shash_tfm); + chap->hash_id = 0; + chap->hash_len = 0; + } + chap->shash_tfm = crypto_alloc_shash(hmac_name, 0, + CRYPTO_ALG_ALLOCATES_MEMORY); + if (IS_ERR(chap->shash_tfm)) { + dev_warn(ctrl->device, + "qid %d: failed to allocate hash %s, error %ld\n", + chap->qid, hmac_name, PTR_ERR(chap->shash_tfm)); + chap->shash_tfm = NULL; + chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; + return NVME_SC_AUTH_REQUIRED; + } + + if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) { + dev_warn(ctrl->device, + "qid %d: invalid hash length %d\n", + chap->qid, data->hl); + crypto_free_shash(chap->shash_tfm); + chap->shash_tfm = NULL; + chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; + return NVME_SC_AUTH_REQUIRED; + } + + /* Reset host response if the hash had been changed */ + if (chap->hash_id != data->hashid) { + kfree(chap->host_response); + chap->host_response = NULL; + } + + chap->hash_id = data->hashid; + chap->hash_len = data->hl; + dev_dbg(ctrl->device, "qid %d: selected hash %s\n", + chap->qid, hmac_name); + +select_kpp: + kpp_name = nvme_auth_dhgroup_kpp(data->dhgid); + if (!kpp_name) { + dev_warn(ctrl->device, + "qid %d: invalid DH group id %d\n", + chap->qid, data->dhgid); + chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; + return NVME_SC_AUTH_REQUIRED; + } + + if (data->dhgid != NVME_AUTH_DHGROUP_NULL) { + dev_warn(ctrl->device, + "qid %d: unsupported DH group %s\n", + chap->qid, kpp_name); + chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; + return 
NVME_SC_AUTH_REQUIRED; + } else if (dhvlen != 0) { + dev_warn(ctrl->device, + "qid %d: invalid DH value for NULL DH\n", + chap->qid); + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + return NVME_SC_INVALID_FIELD; + } + chap->dhgroup_id = data->dhgid; + + chap->s1 = le32_to_cpu(data->seqnum); + memcpy(chap->c1, data->cval, chap->hash_len); + + return 0; +} + +static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + struct nvmf_auth_dhchap_reply_data *data = chap->buf; + size_t size = sizeof(*data); + + size += 2 * chap->hash_len; + + if (chap->buf_size < size) { + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + return -EINVAL; + } + + memset(chap->buf, 0, size); + data->auth_type = NVME_AUTH_DHCHAP_MESSAGES; + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_REPLY; + data->t_id = cpu_to_le16(chap->transaction); + data->hl = chap->hash_len; + data->dhvlen = 0; + memcpy(data->rval, chap->response, chap->hash_len); + if (ctrl->ctrl_key) { + get_random_bytes(chap->c2, chap->hash_len); + data->cvalid = 1; + chap->s2 = nvme_auth_get_seqnum(); + memcpy(data->rval + chap->hash_len, chap->c2, + chap->hash_len); + dev_dbg(ctrl->device, "%s: qid %d ctrl challenge %*ph\n", + __func__, chap->qid, (int)chap->hash_len, chap->c2); + } else { + memset(chap->c2, 0, chap->hash_len); + chap->s2 = 0; + } + data->seqnum = cpu_to_le32(chap->s2); + return size; +} + +static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + struct nvmf_auth_dhchap_success1_data *data = chap->buf; + size_t size = sizeof(*data); + + if (ctrl->ctrl_key) + size += chap->hash_len; + + if (chap->buf_size < size) { + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + return NVME_SC_INVALID_FIELD; + } + + if (data->hl != chap->hash_len) { + dev_warn(ctrl->device, + "qid %d: invalid hash length %u\n", + chap->qid, data->hl); + chap->status = 
NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; + return NVME_SC_INVALID_FIELD; + } + + /* Just print out information for the admin queue */ + if (chap->qid == 0) + dev_info(ctrl->device, + "qid 0: authenticated with hash %s dhgroup %s\n", + nvme_auth_hmac_name(chap->hash_id), + nvme_auth_dhgroup_name(chap->dhgroup_id)); + + if (!data->rvalid) + return 0; + + /* Validate controller response */ + if (memcmp(chap->response, data->rval, data->hl)) { + dev_dbg(ctrl->device, "%s: qid %d ctrl response %*ph\n", + __func__, chap->qid, (int)chap->hash_len, data->rval); + dev_dbg(ctrl->device, "%s: qid %d host response %*ph\n", + __func__, chap->qid, (int)chap->hash_len, + chap->response); + dev_warn(ctrl->device, + "qid %d: controller authentication failed\n", + chap->qid); + chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; + return NVME_SC_AUTH_REQUIRED; + } + + /* Just print out information for the admin queue */ + if (chap->qid == 0) + dev_info(ctrl->device, + "qid 0: controller authenticated\n"); + return 0; +} + +static int nvme_auth_set_dhchap_success2_data(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + struct nvmf_auth_dhchap_success2_data *data = chap->buf; + size_t size = sizeof(*data); + + memset(chap->buf, 0, size); + data->auth_type = NVME_AUTH_DHCHAP_MESSAGES; + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2; + data->t_id = cpu_to_le16(chap->transaction); + + return size; +} + +static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + struct nvmf_auth_dhchap_failure_data *data = chap->buf; + size_t size = sizeof(*data); + + memset(chap->buf, 0, size); + data->auth_type = NVME_AUTH_COMMON_MESSAGES; + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2; + data->t_id = cpu_to_le16(chap->transaction); + data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED; + data->rescode_exp = chap->status; + + return size; +} + +static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, + 
struct nvme_dhchap_queue_context *chap) +{ + SHASH_DESC_ON_STACK(shash, chap->shash_tfm); + u8 buf[4], *challenge = chap->c1; + int ret; + + dev_dbg(ctrl->device, "%s: qid %d host response seq %u transaction %d\n", + __func__, chap->qid, chap->s1, chap->transaction); + + if (!chap->host_response) { + chap->host_response = nvme_auth_transform_key(ctrl->host_key, + ctrl->opts->host->nqn); + if (IS_ERR(chap->host_response)) { + ret = PTR_ERR(chap->host_response); + chap->host_response = NULL; + return ret; + } + } else { + dev_dbg(ctrl->device, "%s: qid %d re-using host response\n", + __func__, chap->qid); + } + + ret = crypto_shash_setkey(chap->shash_tfm, + chap->host_response, ctrl->host_key->len); + if (ret) { + dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n", + chap->qid, ret); + goto out; + } + + shash->tfm = chap->shash_tfm; + ret = crypto_shash_init(shash); + if (ret) + goto out; + ret = crypto_shash_update(shash, challenge, chap->hash_len); + if (ret) + goto out; + put_unaligned_le32(chap->s1, buf); + ret = crypto_shash_update(shash, buf, 4); + if (ret) + goto out; + put_unaligned_le16(chap->transaction, buf); + ret = crypto_shash_update(shash, buf, 2); + if (ret) + goto out; + memset(buf, 0, sizeof(buf)); + ret = crypto_shash_update(shash, buf, 1); + if (ret) + goto out; + ret = crypto_shash_update(shash, "HostHost", 8); + if (ret) + goto out; + ret = crypto_shash_update(shash, ctrl->opts->host->nqn, + strlen(ctrl->opts->host->nqn)); + if (ret) + goto out; + ret = crypto_shash_update(shash, buf, 1); + if (ret) + goto out; + ret = crypto_shash_update(shash, ctrl->opts->subsysnqn, + strlen(ctrl->opts->subsysnqn)); + if (ret) + goto out; + ret = crypto_shash_final(shash, chap->response); +out: + if (challenge != chap->c1) + kfree(challenge); + return ret; +} + +static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + SHASH_DESC_ON_STACK(shash, chap->shash_tfm); + u8 *ctrl_response; + u8 
buf[4], *challenge = chap->c2; + int ret; + + ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key, + ctrl->opts->subsysnqn); + if (IS_ERR(ctrl_response)) { + ret = PTR_ERR(ctrl_response); + return ret; + } + ret = crypto_shash_setkey(chap->shash_tfm, + ctrl_response, ctrl->ctrl_key->len); + if (ret) { + dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n", + chap->qid, ret); + goto out; + } + + dev_dbg(ctrl->device, "%s: qid %d ctrl response seq %u transaction %d\n", + __func__, chap->qid, chap->s2, chap->transaction); + dev_dbg(ctrl->device, "%s: qid %d challenge %*ph\n", + __func__, chap->qid, (int)chap->hash_len, challenge); + dev_dbg(ctrl->device, "%s: qid %d subsysnqn %s\n", + __func__, chap->qid, ctrl->opts->subsysnqn); + dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n", + __func__, chap->qid, ctrl->opts->host->nqn); + shash->tfm = chap->shash_tfm; + ret = crypto_shash_init(shash); + if (ret) + goto out; + ret = crypto_shash_update(shash, challenge, chap->hash_len); + if (ret) + goto out; + put_unaligned_le32(chap->s2, buf); + ret = crypto_shash_update(shash, buf, 4); + if (ret) + goto out; + put_unaligned_le16(chap->transaction, buf); + ret = crypto_shash_update(shash, buf, 2); + if (ret) + goto out; + memset(buf, 0, 4); + ret = crypto_shash_update(shash, buf, 1); + if (ret) + goto out; + ret = crypto_shash_update(shash, "Controller", 10); + if (ret) + goto out; + ret = crypto_shash_update(shash, ctrl->opts->subsysnqn, + strlen(ctrl->opts->subsysnqn)); + if (ret) + goto out; + ret = crypto_shash_update(shash, buf, 1); + if (ret) + goto out; + ret = crypto_shash_update(shash, ctrl->opts->host->nqn, + strlen(ctrl->opts->host->nqn)); + if (ret) + goto out; + ret = crypto_shash_final(shash, chap->response); +out: + if (challenge != chap->c2) + kfree(challenge); + kfree(ctrl_response); + return ret; +} + +static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) +{ + chap->status = 0; + chap->error = 0; + chap->s1 = 0; + chap->s2 = 0; + 
chap->transaction = 0; + memset(chap->c1, 0, sizeof(chap->c1)); + memset(chap->c2, 0, sizeof(chap->c2)); +} + +static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap) +{ + __nvme_auth_reset(chap); + if (chap->shash_tfm) + crypto_free_shash(chap->shash_tfm); + kfree_sensitive(chap->host_response); + kfree(chap->buf); + kfree(chap); +} + +static void __nvme_auth_work(struct work_struct *work) +{ + struct nvme_dhchap_queue_context *chap = + container_of(work, struct nvme_dhchap_queue_context, auth_work); + struct nvme_ctrl *ctrl = chap->ctrl; + size_t tl; + int ret = 0; + + chap->transaction = ctrl->transaction++; + + /* DH-HMAC-CHAP Step 1: send negotiate */ + dev_dbg(ctrl->device, "%s: qid %d send negotiate\n", + __func__, chap->qid); + ret = nvme_auth_set_dhchap_negotiate_data(ctrl, chap); + if (ret < 0) { + chap->error = ret; + return; + } + tl = ret; + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true); + if (ret) { + chap->error = ret; + return; + } + + /* DH-HMAC-CHAP Step 2: receive challenge */ + dev_dbg(ctrl->device, "%s: qid %d receive challenge\n", + __func__, chap->qid); + + memset(chap->buf, 0, chap->buf_size); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + if (ret) { + dev_warn(ctrl->device, + "qid %d failed to receive challenge, %s %d\n", + chap->qid, ret < 0 ? 
"error" : "nvme status", ret); + chap->error = ret; + return; + } + ret = nvme_auth_receive_validate(ctrl, chap->qid, chap->buf, chap->transaction, + NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE); + if (ret) { + chap->status = ret; + chap->error = NVME_SC_AUTH_REQUIRED; + return; + } + + ret = nvme_auth_process_dhchap_challenge(ctrl, chap); + if (ret) { + /* Invalid challenge parameters */ + chap->error = ret; + goto fail2; + } + + dev_dbg(ctrl->device, "%s: qid %d host response\n", + __func__, chap->qid); + ret = nvme_auth_dhchap_setup_host_response(ctrl, chap); + if (ret) { + chap->error = ret; + goto fail2; + } + + /* DH-HMAC-CHAP Step 3: send reply */ + dev_dbg(ctrl->device, "%s: qid %d send reply\n", + __func__, chap->qid); + ret = nvme_auth_set_dhchap_reply_data(ctrl, chap); + if (ret < 0) { + chap->error = ret; + goto fail2; + } + + tl = ret; + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true); + if (ret) { + chap->error = ret; + goto fail2; + } + + /* DH-HMAC-CHAP Step 4: receive success1 */ + dev_dbg(ctrl->device, "%s: qid %d receive success1\n", + __func__, chap->qid); + + memset(chap->buf, 0, chap->buf_size); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false); + if (ret) { + dev_warn(ctrl->device, + "qid %d failed to receive success1, %s %d\n", + chap->qid, ret < 0 ? 
"error" : "nvme status", ret); + chap->error = ret; + return; + } + ret = nvme_auth_receive_validate(ctrl, chap->qid, + chap->buf, chap->transaction, + NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1); + if (ret) { + chap->status = ret; + chap->error = NVME_SC_AUTH_REQUIRED; + return; + } + + if (ctrl->ctrl_key) { + dev_dbg(ctrl->device, + "%s: qid %d controller response\n", + __func__, chap->qid); + ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap); + if (ret) { + chap->error = ret; + goto fail2; + } + } + + ret = nvme_auth_process_dhchap_success1(ctrl, chap); + if (ret) { + /* Controller authentication failed */ + chap->error = NVME_SC_AUTH_REQUIRED; + goto fail2; + } + + if (ctrl->ctrl_key) { + /* DH-HMAC-CHAP Step 5: send success2 */ + dev_dbg(ctrl->device, "%s: qid %d send success2\n", + __func__, chap->qid); + tl = nvme_auth_set_dhchap_success2_data(ctrl, chap); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true); + if (ret) + chap->error = ret; + } + if (!ret) { + chap->error = 0; + return; + } + +fail2: + dev_dbg(ctrl->device, "%s: qid %d send failure2, status %x\n", + __func__, chap->qid, chap->status); + tl = nvme_auth_set_dhchap_failure2_data(ctrl, chap); + ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, tl, true); + /* + * only update error if send failure2 failed and no other + * error had been set during authentication. 
+ */ + if (ret && !chap->error) + chap->error = ret; +} + +int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) +{ + struct nvme_dhchap_queue_context *chap; + + if (!ctrl->host_key) { + dev_warn(ctrl->device, "qid %d: no key\n", qid); + return -ENOKEY; + } + + if (ctrl->opts->dhchap_ctrl_secret && !ctrl->ctrl_key) { + dev_warn(ctrl->device, "qid %d: invalid ctrl key\n", qid); + return -ENOKEY; + } + + mutex_lock(&ctrl->dhchap_auth_mutex); + /* Check if the context is already queued */ + list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { + WARN_ON(!chap->buf); + if (chap->qid == qid) { + dev_dbg(ctrl->device, "qid %d: re-using context\n", qid); + mutex_unlock(&ctrl->dhchap_auth_mutex); + flush_work(&chap->auth_work); + __nvme_auth_reset(chap); + queue_work(nvme_wq, &chap->auth_work); + return 0; + } + } + chap = kzalloc(sizeof(*chap), GFP_KERNEL); + if (!chap) { + mutex_unlock(&ctrl->dhchap_auth_mutex); + return -ENOMEM; + } + chap->qid = (qid == NVME_QID_ANY) ? 0 : qid; + chap->ctrl = ctrl; + + /* + * Allocate a large enough buffer for the entire negotiation: + * 4k should be enough to ffdhe8192. 
+ */ + chap->buf_size = 4096; + chap->buf = kzalloc(chap->buf_size, GFP_KERNEL); + if (!chap->buf) { + mutex_unlock(&ctrl->dhchap_auth_mutex); + kfree(chap); + return -ENOMEM; + } + + INIT_WORK(&chap->auth_work, __nvme_auth_work); + list_add(&chap->entry, &ctrl->dhchap_auth_list); + mutex_unlock(&ctrl->dhchap_auth_mutex); + queue_work(nvme_wq, &chap->auth_work); + return 0; +} +EXPORT_SYMBOL_GPL(nvme_auth_negotiate); + +int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) +{ + struct nvme_dhchap_queue_context *chap; + int ret; + + mutex_lock(&ctrl->dhchap_auth_mutex); + list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { + if (chap->qid != qid) + continue; + mutex_unlock(&ctrl->dhchap_auth_mutex); + flush_work(&chap->auth_work); + ret = chap->error; + return ret; + } + mutex_unlock(&ctrl->dhchap_auth_mutex); + return -ENXIO; +} +EXPORT_SYMBOL_GPL(nvme_auth_wait); + +void nvme_auth_reset(struct nvme_ctrl *ctrl) +{ + struct nvme_dhchap_queue_context *chap; + + mutex_lock(&ctrl->dhchap_auth_mutex); + list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) { + mutex_unlock(&ctrl->dhchap_auth_mutex); + flush_work(&chap->auth_work); + __nvme_auth_reset(chap); + } + mutex_unlock(&ctrl->dhchap_auth_mutex); +} +EXPORT_SYMBOL_GPL(nvme_auth_reset); + +static void nvme_dhchap_auth_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, dhchap_auth_work); + int ret, q; + + /* Authenticate admin queue first */ + ret = nvme_auth_negotiate(ctrl, 0); + if (ret) { + dev_warn(ctrl->device, + "qid 0: error %d setting up authentication\n", ret); + return; + } + ret = nvme_auth_wait(ctrl, 0); + if (ret) { + dev_warn(ctrl->device, + "qid 0: authentication failed\n"); + return; + } + + for (q = 1; q < ctrl->queue_count; q++) { + ret = nvme_auth_negotiate(ctrl, q); + if (ret) { + dev_warn(ctrl->device, + "qid %d: error %d setting up authentication\n", + q, ret); + break; + } + } + + /* + * Failure is a soft-state; credentials remain 
valid until + * the controller terminates the connection. + */ +} + +void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) +{ + INIT_LIST_HEAD(&ctrl->dhchap_auth_list); + INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work); + mutex_init(&ctrl->dhchap_auth_mutex); + if (!ctrl->opts) + return; + nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key); + nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key); +} +EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl); + +void nvme_auth_stop(struct nvme_ctrl *ctrl) +{ + struct nvme_dhchap_queue_context *chap = NULL, *tmp; + + cancel_work_sync(&ctrl->dhchap_auth_work); + mutex_lock(&ctrl->dhchap_auth_mutex); + list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) + cancel_work_sync(&chap->auth_work); + mutex_unlock(&ctrl->dhchap_auth_mutex); +} +EXPORT_SYMBOL_GPL(nvme_auth_stop); + +void nvme_auth_free(struct nvme_ctrl *ctrl) +{ + struct nvme_dhchap_queue_context *chap = NULL, *tmp; + + mutex_lock(&ctrl->dhchap_auth_mutex); + list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) { + list_del_init(&chap->entry); + flush_work(&chap->auth_work); + __nvme_auth_free(chap); + } + mutex_unlock(&ctrl->dhchap_auth_mutex); + if (ctrl->host_key) { + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = NULL; + } + if (ctrl->ctrl_key) { + nvme_auth_free_key(ctrl->ctrl_key); + ctrl->ctrl_key = NULL; + } +} +EXPORT_SYMBOL_GPL(nvme_auth_free); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f56e58a975dd49..8d0089f83cbdc1 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -24,6 +24,7 @@ #include "nvme.h" #include "fabrics.h" +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -330,6 +331,7 @@ enum nvme_disposition { COMPLETE, RETRY, FAILOVER, + AUTHENTICATE, }; static inline enum nvme_disposition nvme_decide_disposition(struct request *req) @@ -337,6 +339,9 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req) if 
(likely(nvme_req(req)->status == 0)) return COMPLETE; + if ((nvme_req(req)->status & 0x7ff) == NVME_SC_AUTH_REQUIRED) + return AUTHENTICATE; + if (blk_noretry_request(req) || (nvme_req(req)->status & NVME_SC_DNR) || nvme_req(req)->retries >= nvme_max_retries) @@ -375,11 +380,13 @@ static inline void nvme_end_req(struct request *req) void nvme_complete_rq(struct request *req) { + struct nvme_ctrl *ctrl = nvme_req(req)->ctrl; + trace_nvme_complete_rq(req); nvme_cleanup_cmd(req); - if (nvme_req(req)->ctrl->kas) - nvme_req(req)->ctrl->comp_seen = true; + if (ctrl->kas) + ctrl->comp_seen = true; switch (nvme_decide_disposition(req)) { case COMPLETE: @@ -391,6 +398,14 @@ void nvme_complete_rq(struct request *req) case FAILOVER: nvme_failover_req(req); return; + case AUTHENTICATE: +#ifdef CONFIG_NVME_AUTH + queue_work(nvme_wq, &ctrl->dhchap_auth_work); + nvme_retry_req(req); +#else + nvme_end_req(req); +#endif + return; } } EXPORT_SYMBOL_GPL(nvme_complete_rq); @@ -702,7 +717,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq, switch (ctrl->state) { case NVME_CTRL_CONNECTING: if (blk_rq_is_passthrough(rq) && nvme_is_fabrics(req->cmd) && - req->cmd->fabrics.fctype == nvme_fabrics_type_connect) + (req->cmd->fabrics.fctype == nvme_fabrics_type_connect || + req->cmd->fabrics.fctype == nvme_fabrics_type_auth_send || + req->cmd->fabrics.fctype == nvme_fabrics_type_auth_receive)) return true; break; default: @@ -3609,6 +3626,108 @@ static ssize_t dctype_show(struct device *dev, } static DEVICE_ATTR_RO(dctype); +#ifdef CONFIG_NVME_AUTH +static ssize_t nvme_ctrl_dhchap_secret_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (!opts->dhchap_secret) + return sysfs_emit(buf, "none\n"); + return sysfs_emit(buf, "%s\n", opts->dhchap_secret); +} + +static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, + struct device_attribute *attr, 
const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + char *dhchap_secret; + + if (!ctrl->opts->dhchap_secret) + return -EINVAL; + if (count < 7) + return -EINVAL; + if (memcmp(buf, "DHHC-1:", 7)) + return -EINVAL; + + dhchap_secret = kzalloc(count + 1, GFP_KERNEL); + if (!dhchap_secret) + return -ENOMEM; + memcpy(dhchap_secret, buf, count); + nvme_auth_stop(ctrl); + if (strcmp(dhchap_secret, opts->dhchap_secret)) { + int ret; + + ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key); + if (ret) + return ret; + kfree(opts->dhchap_secret); + opts->dhchap_secret = dhchap_secret; + /* Key has changed; re-authentication with new key */ + nvme_auth_reset(ctrl); + } + /* Start re-authentication */ + dev_info(ctrl->device, "re-authenticating controller\n"); + queue_work(nvme_wq, &ctrl->dhchap_auth_work); + + return count; +} +static DEVICE_ATTR(dhchap_secret, S_IRUGO | S_IWUSR, + nvme_ctrl_dhchap_secret_show, nvme_ctrl_dhchap_secret_store); + +static ssize_t nvme_ctrl_dhchap_ctrl_secret_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + + if (!opts->dhchap_ctrl_secret) + return sysfs_emit(buf, "none\n"); + return sysfs_emit(buf, "%s\n", opts->dhchap_ctrl_secret); +} + +static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + struct nvmf_ctrl_options *opts = ctrl->opts; + char *dhchap_secret; + + if (!ctrl->opts->dhchap_ctrl_secret) + return -EINVAL; + if (count < 7) + return -EINVAL; + if (memcmp(buf, "DHHC-1:", 7)) + return -EINVAL; + + dhchap_secret = kzalloc(count + 1, GFP_KERNEL); + if (!dhchap_secret) + return -ENOMEM; + memcpy(dhchap_secret, buf, count); + nvme_auth_stop(ctrl); + if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) { + 
int ret; + + ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key); + if (ret) + return ret; + kfree(opts->dhchap_ctrl_secret); + opts->dhchap_ctrl_secret = dhchap_secret; + /* Key has changed; re-authentication with new key */ + nvme_auth_reset(ctrl); + } + /* Start re-authentication */ + dev_info(ctrl->device, "re-authenticating controller\n"); + queue_work(nvme_wq, &ctrl->dhchap_auth_work); + + return count; +} +static DEVICE_ATTR(dhchap_ctrl_secret, S_IRUGO | S_IWUSR, + nvme_ctrl_dhchap_ctrl_secret_show, nvme_ctrl_dhchap_ctrl_secret_store); +#endif + static struct attribute *nvme_dev_attrs[] = { &dev_attr_reset_controller.attr, &dev_attr_rescan_controller.attr, @@ -3632,6 +3751,10 @@ static struct attribute *nvme_dev_attrs[] = { &dev_attr_kato.attr, &dev_attr_cntrltype.attr, &dev_attr_dctype.attr, +#ifdef CONFIG_NVME_AUTH + &dev_attr_dhchap_secret.attr, + &dev_attr_dhchap_ctrl_secret.attr, +#endif NULL }; @@ -3655,6 +3778,12 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj, return 0; if (a == &dev_attr_fast_io_fail_tmo.attr && !ctrl->opts) return 0; +#ifdef CONFIG_NVME_AUTH + if (a == &dev_attr_dhchap_secret.attr && !ctrl->opts) + return 0; + if (a == &dev_attr_dhchap_ctrl_secret.attr && !ctrl->opts) + return 0; +#endif return a->mode; } @@ -4549,8 +4678,10 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) * recovery actions from interfering with the controller's * firmware activation. 
*/ - if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) { + nvme_auth_stop(ctrl); queue_work(nvme_wq, &ctrl->fw_act_work); + } break; #ifdef CONFIG_NVME_MULTIPATH case NVME_AER_NOTICE_ANA: @@ -4614,6 +4745,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_mpath_stop(ctrl); + nvme_auth_stop(ctrl); nvme_stop_keep_alive(ctrl); nvme_stop_failfast_work(ctrl); flush_work(&ctrl->async_event_work); @@ -4671,6 +4803,8 @@ static void nvme_free_ctrl(struct device *dev) nvme_free_cels(ctrl); nvme_mpath_uninit(ctrl); + nvme_auth_stop(ctrl); + nvme_auth_free(ctrl); __free_page(ctrl->discard_page); if (subsys) { @@ -4761,6 +4895,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device)); nvme_mpath_init_ctrl(ctrl); + nvme_auth_init_ctrl(ctrl); return 0; out_free_name: diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index e4b1520862d874..5207a234825763 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -369,6 +369,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) union nvme_result res; struct nvmf_connect_data *data; int ret; + u32 result; cmd.connect.opcode = nvme_fabrics_command; cmd.connect.fctype = nvme_fabrics_type_connect; @@ -401,8 +402,25 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) goto out_free_data; } - ctrl->cntlid = le16_to_cpu(res.u16); - + result = le32_to_cpu(res.u32); + ctrl->cntlid = result & 0xFFFF; + if ((result >> 16) & 0x3) { + /* Authentication required */ + ret = nvme_auth_negotiate(ctrl, 0); + if (ret) { + dev_warn(ctrl->device, + "qid 0: authentication setup failed\n"); + ret = NVME_SC_AUTH_REQUIRED; + goto out_free_data; + } + ret = nvme_auth_wait(ctrl, 0); + if (ret) + dev_warn(ctrl->device, + "qid 0: authentication failed\n"); + else + dev_info(ctrl->device, + "qid 0: authenticated\n"); + } 
out_free_data: kfree(data); return ret; @@ -435,6 +453,7 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) struct nvmf_connect_data *data; union nvme_result res; int ret; + u32 result; cmd.connect.opcode = nvme_fabrics_command; cmd.connect.fctype = nvme_fabrics_type_connect; @@ -460,6 +479,21 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid) nvmf_log_connect_error(ctrl, ret, le32_to_cpu(res.u32), &cmd, data); } + result = le32_to_cpu(res.u32); + if ((result >> 16) & 2) { + /* Authentication required */ + ret = nvme_auth_negotiate(ctrl, qid); + if (ret) { + dev_warn(ctrl->device, + "qid %d: authentication setup failed\n", qid); + ret = NVME_SC_AUTH_REQUIRED; + } else { + ret = nvme_auth_wait(ctrl, qid); + if (ret) + dev_warn(ctrl->device, + "qid %u: authentication failed\n", qid); + } + } kfree(data); return ret; } @@ -552,6 +586,8 @@ static const match_table_t opt_tokens = { { NVMF_OPT_TOS, "tos=%d" }, { NVMF_OPT_FAIL_FAST_TMO, "fast_io_fail_tmo=%d" }, { NVMF_OPT_DISCOVERY, "discovery" }, + { NVMF_OPT_DHCHAP_SECRET, "dhchap_secret=%s" }, + { NVMF_OPT_DHCHAP_CTRL_SECRET, "dhchap_ctrl_secret=%s" }, { NVMF_OPT_ERR, NULL } }; @@ -833,6 +869,34 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, case NVMF_OPT_DISCOVERY: opts->discovery_nqn = true; break; + case NVMF_OPT_DHCHAP_SECRET: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) { + pr_err("Invalid DH-CHAP secret %s\n", p); + ret = -EINVAL; + goto out; + } + kfree(opts->dhchap_secret); + opts->dhchap_secret = p; + break; + case NVMF_OPT_DHCHAP_CTRL_SECRET: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + if (strlen(p) < 11 || strncmp(p, "DHHC-1:", 7)) { + pr_err("Invalid DH-CHAP secret %s\n", p); + ret = -EINVAL; + goto out; + } + kfree(opts->dhchap_ctrl_secret); + opts->dhchap_ctrl_secret = p; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation 
request\n", p); @@ -951,6 +1015,8 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts) kfree(opts->subsysnqn); kfree(opts->host_traddr); kfree(opts->host_iface); + kfree(opts->dhchap_secret); + kfree(opts->dhchap_ctrl_secret); kfree(opts); } EXPORT_SYMBOL_GPL(nvmf_free_options); @@ -960,7 +1026,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\ NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\ - NVMF_OPT_FAIL_FAST_TMO) + NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_DHCHAP_SECRET |\ + NVMF_OPT_DHCHAP_CTRL_SECRET) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf) @@ -1196,7 +1263,14 @@ static void __exit nvmf_exit(void) BUILD_BUG_ON(sizeof(struct nvmf_connect_command) != 64); BUILD_BUG_ON(sizeof(struct nvmf_property_get_command) != 64); BUILD_BUG_ON(sizeof(struct nvmf_property_set_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_auth_send_command) != 64); + BUILD_BUG_ON(sizeof(struct nvmf_auth_receive_command) != 64); BUILD_BUG_ON(sizeof(struct nvmf_connect_data) != 1024); + BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_negotiate_data) != 8); + BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_challenge_data) != 16); + BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_reply_data) != 16); + BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success1_data) != 16); + BUILD_BUG_ON(sizeof(struct nvmf_auth_dhchap_success2_data) != 16); } MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 46d6e194ac2be5..a6e22116e1396a 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -68,6 +68,8 @@ enum { NVMF_OPT_FAIL_FAST_TMO = 1 << 20, NVMF_OPT_HOST_IFACE = 1 << 21, NVMF_OPT_DISCOVERY = 1 << 22, + NVMF_OPT_DHCHAP_SECRET = 1 << 23, + NVMF_OPT_DHCHAP_CTRL_SECRET = 1 << 24, }; /** @@ -97,6 +99,9 @@ enum { * @max_reconnects: maximum number of allowed reconnect attempts before removing * the controller, (-1) means reconnect forever, zero 
means remove * immediately; + * @dhchap_secret: DH-HMAC-CHAP secret + * @dhchap_ctrl_secret: DH-HMAC-CHAP controller secret for bi-directional + * authentication * @disable_sqflow: disable controller sq flow control * @hdr_digest: generate/verify header digest (TCP) * @data_digest: generate/verify data digest (TCP) @@ -121,6 +126,8 @@ struct nvmf_ctrl_options { unsigned int kato; struct nvmf_host *host; int max_reconnects; + char *dhchap_secret; + char *dhchap_ctrl_secret; bool disable_sqflow; bool hdr_digest; bool data_digest; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e4612dd0b4206e..e9350bf7b2d19f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -328,6 +328,15 @@ struct nvme_ctrl { struct work_struct ana_work; #endif +#ifdef CONFIG_NVME_AUTH + struct work_struct dhchap_auth_work; + struct list_head dhchap_auth_list; + struct mutex dhchap_auth_mutex; + struct nvme_dhchap_key *host_key; + struct nvme_dhchap_key *ctrl_key; + u16 transaction; +#endif + /* Power saving configuration */ u64 ps_max_latency_us; bool apst_enabled; @@ -991,6 +1000,27 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl) return ctrl->sgls & ((1 << 0) | (1 << 1)); } +#ifdef CONFIG_NVME_AUTH +void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl); +void nvme_auth_stop(struct nvme_ctrl *ctrl); +int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid); +int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid); +void nvme_auth_reset(struct nvme_ctrl *ctrl); +void nvme_auth_free(struct nvme_ctrl *ctrl); +#else +static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {}; +static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {}; +static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid) +{ + return -EPROTONOSUPPORT; +} +static inline int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid) +{ + return NVME_SC_AUTH_REQUIRED; +} +static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {}; +#endif + u32 
nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u8 opcode); int nvme_execute_passthru_rq(struct request *rq); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index f2a5e1ea508a7a..84ce3347d15834 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1197,6 +1197,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl *ctrl = container_of(work, struct nvme_rdma_ctrl, err_work); + nvme_auth_stop(&ctrl->ctrl); nvme_stop_keep_alive(&ctrl->ctrl); flush_work(&ctrl->ctrl.async_event_work); nvme_rdma_teardown_io_queues(ctrl, false); diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index bb67538d241b65..a7848e430a5c53 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2174,6 +2174,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + nvme_auth_stop(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 2a89c5aa0790b5..1c36fcedea2008 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -287,6 +287,34 @@ static const char *nvme_trace_fabrics_property_get(struct trace_seq *p, u8 *spc) return ret; } +static const char *nvme_trace_fabrics_auth_send(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 spsp0 = spc[1]; + u8 spsp1 = spc[2]; + u8 secp = spc[3]; + u32 tl = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, tl=%u", + spsp0, spsp1, secp, tl); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvme_trace_fabrics_auth_receive(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 spsp0 = spc[1]; + u8 spsp1 = spc[2]; + u8 secp = spc[3]; + u32 al = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, 
"spsp0=%02x, spsp1=%02x, secp=%02x, al=%u", + spsp0, spsp1, secp, al); + trace_seq_putc(p, 0); + return ret; +} + static const char *nvme_trace_fabrics_common(struct trace_seq *p, u8 *spc) { const char *ret = trace_seq_buffer_ptr(p); @@ -306,6 +334,10 @@ const char *nvme_trace_parse_fabrics_cmd(struct trace_seq *p, return nvme_trace_fabrics_connect(p, spc); case nvme_fabrics_type_property_get: return nvme_trace_fabrics_property_get(p, spc); + case nvme_fabrics_type_auth_send: + return nvme_trace_fabrics_auth_send(p, spc); + case nvme_fabrics_type_auth_receive: + return nvme_trace_fabrics_auth_receive(p, spc); default: return nvme_trace_fabrics_common(p, spc); } diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h new file mode 100644 index 00000000000000..35445682622187 --- /dev/null +++ b/include/linux/nvme-auth.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021 Hannes Reinecke, SUSE Software Solutions + */ + +#ifndef _NVME_AUTH_H +#define _NVME_AUTH_H + +#include + +struct nvme_dhchap_key { + u8 *key; + size_t len; + u8 hash; +}; + +u32 nvme_auth_get_seqnum(void); +const char *nvme_auth_dhgroup_name(u8 dhgroup_id); +const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id); +u8 nvme_auth_dhgroup_id(const char *dhgroup_name); + +const char *nvme_auth_hmac_name(u8 hmac_id); +const char *nvme_auth_digest_name(u8 hmac_id); +size_t nvme_auth_hmac_hash_len(u8 hmac_id); +u8 nvme_auth_hmac_id(const char *hmac_name); + +struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, + u8 key_hash); +void nvme_auth_free_key(struct nvme_dhchap_key *key); +u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn); +int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key); + +#endif /* _NVME_AUTH_H */ From cd88aa2b89c936571aa7d7d5aa17bc95601690bc Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:52:03 +0200 Subject: [PATCH 0284/1250] nvme-auth: Diffie-Hellman key exchange support 
Implement Diffie-Hellman key exchange using FFDHE groups for NVMe In-Band Authentication. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/common/auth.c | 153 ++++++++++++++++++++++++++++ drivers/nvme/host/Kconfig | 2 + drivers/nvme/host/auth.c | 201 +++++++++++++++++++++++++++++++++++-- include/linux/nvme-auth.h | 8 ++ 4 files changed, 358 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 01adb29947d49c..0c86ebce59d255 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -301,6 +301,159 @@ u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn) } EXPORT_SYMBOL_GPL(nvme_auth_transform_key); +static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey) +{ + const char *digest_name; + struct crypto_shash *tfm; + int ret; + + digest_name = nvme_auth_digest_name(hmac_id); + if (!digest_name) { + pr_debug("%s: failed to get digest for %d\n", __func__, + hmac_id); + return -EINVAL; + } + tfm = crypto_alloc_shash(digest_name, 0, 0); + if (IS_ERR(tfm)) + return -ENOMEM; + + ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey); + if (ret < 0) + pr_debug("%s: Failed to hash digest len %zu\n", __func__, + skey_len); + + crypto_free_shash(tfm); + return ret; +} + +int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *challenge, u8 *aug, size_t hlen) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + u8 *hashed_key; + const char *hmac_name; + int ret; + + hashed_key = kmalloc(hlen, GFP_KERNEL); + if (!hashed_key) + return -ENOMEM; + + ret = nvme_auth_hash_skey(hmac_id, skey, + skey_len, hashed_key); + if (ret < 0) + goto out_free_key; + + hmac_name = nvme_auth_hmac_name(hmac_id); + if (!hmac_name) { + pr_warn("%s: invalid hash algoritm %d\n", + __func__, hmac_id); + ret = -EINVAL; + goto out_free_key; + } + + tfm = crypto_alloc_shash(hmac_name, 0, 0); + if (IS_ERR(tfm)) { + 
ret = PTR_ERR(tfm); + goto out_free_key; + } + + desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_hash; + } + desc->tfm = tfm; + + ret = crypto_shash_setkey(tfm, hashed_key, hlen); + if (ret) + goto out_free_desc; + + ret = crypto_shash_init(desc); + if (ret) + goto out_free_desc; + + ret = crypto_shash_update(desc, challenge, hlen); + if (ret) + goto out_free_desc; + + ret = crypto_shash_final(desc, aug); +out_free_desc: + kfree_sensitive(desc); +out_free_hash: + crypto_free_shash(tfm); +out_free_key: + kfree_sensitive(hashed_key); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge); + +int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid) +{ + int ret; + + ret = crypto_kpp_set_secret(dh_tfm, NULL, 0); + if (ret) + pr_debug("failed to set private key, error %d\n", ret); + + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_gen_privkey); + +int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, + u8 *host_key, size_t host_key_len) +{ + struct kpp_request *req; + struct crypto_wait wait; + struct scatterlist dst; + int ret; + + req = kpp_request_alloc(dh_tfm, GFP_KERNEL); + if (!req) + return -ENOMEM; + + crypto_init_wait(&wait); + kpp_request_set_input(req, NULL, 0); + sg_init_one(&dst, host_key, host_key_len); + kpp_request_set_output(req, &dst, host_key_len); + kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + + ret = crypto_wait_req(crypto_kpp_generate_public_key(req), &wait); + kpp_request_free(req); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey); + +int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, + u8 *ctrl_key, size_t ctrl_key_len, + u8 *sess_key, size_t sess_key_len) +{ + struct kpp_request *req; + struct crypto_wait wait; + struct scatterlist src, dst; + int ret; + + req = kpp_request_alloc(dh_tfm, GFP_KERNEL); + if (!req) + return -ENOMEM; + + crypto_init_wait(&wait); + sg_init_one(&src, ctrl_key, 
ctrl_key_len); + kpp_request_set_input(req, &src, ctrl_key_len); + sg_init_one(&dst, sess_key, sess_key_len); + kpp_request_set_output(req, &dst, sess_key_len); + kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + + ret = crypto_wait_req(crypto_kpp_compute_shared_secret(req), &wait); + + kpp_request_free(req); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret); + int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key) { struct nvme_dhchap_key *key; diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 6c503f42f3c680..2f6a7f8c94e8e0 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -100,6 +100,8 @@ config NVME_AUTH select CRYPTO_HMAC select CRYPTO_SHA256 select CRYPTO_SHA512 + select CRYPTO_DH + select CRYPTO_DH_RFC7919_GROUPS help This provides support for NVMe over Fabrics In-Band Authentication. diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 9766bfffecac6e..c8a6db7c449805 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -18,6 +18,7 @@ struct nvme_dhchap_queue_context { struct work_struct auth_work; struct nvme_ctrl *ctrl; struct crypto_shash *shash_tfm; + struct crypto_kpp *dh_tfm; void *buf; size_t buf_size; int qid; @@ -33,6 +34,12 @@ struct nvme_dhchap_queue_context { u8 c2[64]; u8 response[64]; u8 *host_response; + u8 *ctrl_key; + int ctrl_key_len; + u8 *host_key; + int host_key_len; + u8 *sess_key; + int sess_key_len; }; #define nvme_auth_flags_from_qid(qid) \ @@ -137,6 +144,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, struct nvmf_auth_dhchap_challenge_data *data = chap->buf; u16 dhvlen = le16_to_cpu(data->dhvlen); size_t size = sizeof(*data) + data->hl + dhvlen; + const char *gid_name = nvme_auth_dhgroup_name(data->dhgid); const char *hmac_name, *kpp_name; if (chap->buf_size < size) { @@ -207,15 +215,54 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl 
*ctrl, "qid %d: invalid DH group id %d\n", chap->qid, data->dhgid); chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; + /* Leave previous dh_tfm intact */ return NVME_SC_AUTH_REQUIRED; } + /* Clear host and controller key to avoid accidental reuse */ + kfree_sensitive(chap->host_key); + chap->host_key = NULL; + chap->host_key_len = 0; + kfree_sensitive(chap->ctrl_key); + chap->ctrl_key = NULL; + chap->ctrl_key_len = 0; + + if (chap->dhgroup_id == data->dhgid && + (data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) { + dev_dbg(ctrl->device, + "qid %d: reuse existing DH group %s\n", + chap->qid, gid_name); + goto skip_kpp; + } + + /* Reset dh_tfm if it can't be reused */ + if (chap->dh_tfm) { + crypto_free_kpp(chap->dh_tfm); + chap->dh_tfm = NULL; + } + if (data->dhgid != NVME_AUTH_DHGROUP_NULL) { - dev_warn(ctrl->device, - "qid %d: unsupported DH group %s\n", - chap->qid, kpp_name); - chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; - return NVME_SC_AUTH_REQUIRED; + if (dhvlen == 0) { + dev_warn(ctrl->device, + "qid %d: empty DH value\n", + chap->qid); + chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; + return NVME_SC_INVALID_FIELD; + } + + chap->dh_tfm = crypto_alloc_kpp(kpp_name, 0, 0); + if (IS_ERR(chap->dh_tfm)) { + int ret = PTR_ERR(chap->dh_tfm); + + dev_warn(ctrl->device, + "qid %d: error %d initializing DH group %s\n", + chap->qid, ret, gid_name); + chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; + chap->dh_tfm = NULL; + return NVME_SC_AUTH_REQUIRED; + } + dev_dbg(ctrl->device, "qid %d: selected DH group %s\n", + chap->qid, gid_name); } else if (dhvlen != 0) { dev_warn(ctrl->device, "qid %d: invalid DH value for NULL DH\n", @@ -225,8 +272,21 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, } chap->dhgroup_id = data->dhgid; +skip_kpp: chap->s1 = le32_to_cpu(data->seqnum); memcpy(chap->c1, data->cval, chap->hash_len); + if (dhvlen) { + chap->ctrl_key = kmalloc(dhvlen, GFP_KERNEL); + if 
(!chap->ctrl_key) { + chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; + return NVME_SC_AUTH_REQUIRED; + } + chap->ctrl_key_len = dhvlen; + memcpy(chap->ctrl_key, data->cval + chap->hash_len, + dhvlen); + dev_dbg(ctrl->device, "ctrl public key %*ph\n", + (int)chap->ctrl_key_len, chap->ctrl_key); + } return 0; } @@ -239,6 +299,9 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, size += 2 * chap->hash_len; + if (chap->host_key_len) + size += chap->host_key_len; + if (chap->buf_size < size) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; return -EINVAL; @@ -249,7 +312,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_REPLY; data->t_id = cpu_to_le16(chap->transaction); data->hl = chap->hash_len; - data->dhvlen = 0; + data->dhvlen = cpu_to_le16(chap->host_key_len); memcpy(data->rval, chap->response, chap->hash_len); if (ctrl->ctrl_key) { get_random_bytes(chap->c2, chap->hash_len); @@ -264,6 +327,14 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl, chap->s2 = 0; } data->seqnum = cpu_to_le32(chap->s2); + if (chap->host_key_len) { + dev_dbg(ctrl->device, "%s: qid %d host public key %*ph\n", + __func__, chap->qid, + chap->host_key_len, chap->host_key); + memcpy(data->rval + 2 * chap->hash_len, chap->host_key, + chap->host_key_len); + } + return size; } @@ -381,6 +452,21 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, goto out; } + if (chap->dh_tfm) { + challenge = kmalloc(chap->hash_len, GFP_KERNEL); + if (!challenge) { + ret = -ENOMEM; + goto out; + } + ret = nvme_auth_augmented_challenge(chap->hash_id, + chap->sess_key, + chap->sess_key_len, + chap->c1, challenge, + chap->hash_len); + if (ret) + goto out; + } + shash->tfm = chap->shash_tfm; ret = crypto_shash_init(shash); if (ret) @@ -443,6 +529,20 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, goto out; } + if (chap->dh_tfm) { + challenge = 
kmalloc(chap->hash_len, GFP_KERNEL); + if (!challenge) { + ret = -ENOMEM; + goto out; + } + ret = nvme_auth_augmented_challenge(chap->hash_id, + chap->sess_key, + chap->sess_key_len, + chap->c2, challenge, + chap->hash_len); + if (ret) + goto out; + } dev_dbg(ctrl->device, "%s: qid %d ctrl response seq %u transaction %d\n", __func__, chap->qid, chap->s2, chap->transaction); dev_dbg(ctrl->device, "%s: qid %d challenge %*ph\n", @@ -492,8 +592,81 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, return ret; } +static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl, + struct nvme_dhchap_queue_context *chap) +{ + int ret; + + if (chap->host_key && chap->host_key_len) { + dev_dbg(ctrl->device, + "qid %d: reusing host key\n", chap->qid); + goto gen_sesskey; + } + ret = nvme_auth_gen_privkey(chap->dh_tfm, chap->dhgroup_id); + if (ret < 0) { + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + return ret; + } + + chap->host_key_len = crypto_kpp_maxsize(chap->dh_tfm); + + chap->host_key = kzalloc(chap->host_key_len, GFP_KERNEL); + if (!chap->host_key) { + chap->host_key_len = 0; + chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; + return -ENOMEM; + } + ret = nvme_auth_gen_pubkey(chap->dh_tfm, + chap->host_key, chap->host_key_len); + if (ret) { + dev_dbg(ctrl->device, + "failed to generate public key, error %d\n", ret); + kfree(chap->host_key); + chap->host_key = NULL; + chap->host_key_len = 0; + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + return ret; + } + +gen_sesskey: + chap->sess_key_len = chap->host_key_len; + chap->sess_key = kmalloc(chap->sess_key_len, GFP_KERNEL); + if (!chap->sess_key) { + chap->sess_key_len = 0; + chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; + return -ENOMEM; + } + + ret = nvme_auth_gen_shared_secret(chap->dh_tfm, + chap->ctrl_key, chap->ctrl_key_len, + chap->sess_key, chap->sess_key_len); + if (ret) { + dev_dbg(ctrl->device, + "failed to generate shared secret, error %d\n", ret); + 
kfree_sensitive(chap->sess_key); + chap->sess_key = NULL; + chap->sess_key_len = 0; + chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + return ret; + } + dev_dbg(ctrl->device, "shared secret %*ph\n", + (int)chap->sess_key_len, chap->sess_key); + return 0; +} + static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap) { + kfree_sensitive(chap->host_response); + chap->host_response = NULL; + kfree_sensitive(chap->host_key); + chap->host_key = NULL; + chap->host_key_len = 0; + kfree_sensitive(chap->ctrl_key); + chap->ctrl_key = NULL; + chap->ctrl_key_len = 0; + kfree_sensitive(chap->sess_key); + chap->sess_key = NULL; + chap->sess_key_len = 0; chap->status = 0; chap->error = 0; chap->s1 = 0; @@ -508,6 +681,11 @@ static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap) __nvme_auth_reset(chap); if (chap->shash_tfm) crypto_free_shash(chap->shash_tfm); + if (chap->dh_tfm) + crypto_free_kpp(chap->dh_tfm); + kfree_sensitive(chap->ctrl_key); + kfree_sensitive(chap->host_key); + kfree_sensitive(chap->sess_key); kfree_sensitive(chap->host_response); kfree(chap->buf); kfree(chap); @@ -566,6 +744,17 @@ static void __nvme_auth_work(struct work_struct *work) goto fail2; } + if (chap->ctrl_key_len) { + dev_dbg(ctrl->device, + "%s: qid %d DH exponential\n", + __func__, chap->qid); + ret = nvme_auth_dhchap_exponential(ctrl, chap); + if (ret) { + chap->error = ret; + goto fail2; + } + } + dev_dbg(ctrl->device, "%s: qid %d host response\n", __func__, chap->qid); ret = nvme_auth_dhchap_setup_host_response(ctrl, chap); diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 35445682622187..dcb8030062ddaf 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -29,5 +29,13 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, void nvme_auth_free_key(struct nvme_dhchap_key *key); u8 *nvme_auth_transform_key(struct nvme_dhchap_key *key, char *nqn); int nvme_auth_generate_key(u8 *secret, struct 
nvme_dhchap_key **ret_key); +int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len, + u8 *challenge, u8 *aug, size_t hlen); +int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); +int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, + u8 *host_key, size_t host_key_len); +int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, + u8 *ctrl_key, size_t ctrl_key_len, + u8 *sess_key, size_t sess_key_len); #endif /* _NVME_AUTH_H */ From 7f03b47ecc492b05d60b543b8ea1d640a1093e25 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:52:04 +0200 Subject: [PATCH 0285/1250] nvmet: parse fabrics commands on io queues Some fabrics commands can be sent via io queues, so add a new function nvmet_parse_fabrics_io_cmd() and rename the existing nvmet_parse_fabrics_cmd() to nvmet_parse_fabrics_admin_cmd(). Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/core.c | 4 ++++ drivers/nvme/target/fabrics-cmd.c | 17 ++++++++++++++++- drivers/nvme/target/nvmet.h | 3 ++- 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 397daaf51f1baf..31df40ac828fa0 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1017,7 +1017,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) u16 ret; if (nvme_is_fabrics(cmd)) - return nvmet_parse_fabrics_cmd(req); + return nvmet_parse_fabrics_admin_cmd(req); if (nvmet_is_disc_subsys(nvmet_req_subsys(req))) return nvmet_parse_discovery_cmd(req); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 90e75324dae056..792f1562117345 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -865,8 +865,12 @@ static inline u16 nvmet_io_cmd_check_access(struct nvmet_req *req) static u16 nvmet_parse_io_cmd(struct nvmet_req *req) { + struct nvme_command *cmd = req->cmd; 
u16 ret; + if (nvme_is_fabrics(cmd)) + return nvmet_parse_fabrics_io_cmd(req); + ret = nvmet_check_ctrl_status(req); if (unlikely(ret)) return ret; diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 70fb587e941366..f23c2872990822 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -82,7 +82,7 @@ static void nvmet_execute_prop_get(struct nvmet_req *req) nvmet_req_complete(req, status); } -u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) +u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req) { struct nvme_command *cmd = req->cmd; @@ -103,6 +103,21 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req) return 0; } +u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + switch (cmd->fabrics.fctype) { + default: + pr_debug("received unknown capsule type 0x%x\n", + cmd->fabrics.fctype); + req->error_loc = offsetof(struct nvmf_common_command, fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return 0; +} + static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) { struct nvmf_connect_command *c = &req->cmd->connect; diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 69818752a33a58..c37f41eafc2f31 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -419,7 +419,8 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req); u16 nvmet_parse_admin_cmd(struct nvmet_req *req); u16 nvmet_parse_discovery_cmd(struct nvmet_req *req); -u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); +u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req); +u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req); bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops); From b281bec759460e37428204557663a60f6d13e8d3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 
Jun 2022 11:52:05 +0200 Subject: [PATCH 0286/1250] nvmet: implement basic In-Band Authentication Implement NVMe-oF In-Band authentication according to NVMe TPAR 8006. This patch adds three additional configfs entries 'dhchap_key', 'dhchap_ctrl_key', and 'dhchap_hash' to the 'host' configfs directory. The 'dhchap_key' and 'dhchap_ctrl_key' entries need to be in the ASCII format as specified in NVMe Base Specification v2.0 section 8.13.5.8 'Secret representation'. 'dhchap_hash' defaults to 'hmac(sha256)', and can be written to to switch to a different HMAC algorithm. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Kconfig | 13 + drivers/nvme/target/Makefile | 1 + drivers/nvme/target/admin-cmd.c | 2 + drivers/nvme/target/auth.c | 367 ++++++++++++++++++ drivers/nvme/target/configfs.c | 107 +++++- drivers/nvme/target/core.c | 11 + drivers/nvme/target/fabrics-cmd-auth.c | 502 +++++++++++++++++++++++++ drivers/nvme/target/fabrics-cmd.c | 38 +- drivers/nvme/target/nvmet.h | 62 +++ 9 files changed, 1100 insertions(+), 3 deletions(-) create mode 100644 drivers/nvme/target/auth.c create mode 100644 drivers/nvme/target/fabrics-cmd-auth.c diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index 973561c93888f6..df526b59b509fd 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -83,3 +83,16 @@ config NVME_TARGET_TCP devices over TCP. If unsure, say N. + +config NVME_TARGET_AUTH + bool "NVMe over Fabrics In-band Authentication support" + depends on NVME_TARGET + select NVME_COMMON + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_SHA256 + select CRYPTO_SHA512 + help + This enables support for NVMe over Fabrics In-band Authentication + + If unsure, say N. 
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile index 9837e580fa7ee9..c6682010249395 100644 --- a/drivers/nvme/target/Makefile +++ b/drivers/nvme/target/Makefile @@ -13,6 +13,7 @@ nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \ discovery.o io-cmd-file.o io-cmd-bdev.o nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o nvmet-$(CONFIG_BLK_DEV_ZONED) += zns.o +nvmet-$(CONFIG_NVME_TARGET_AUTH) += fabrics-cmd-auth.o auth.o nvme-loop-y += loop.o nvmet-rdma-y += rdma.o nvmet-fc-y += fc.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 31df40ac828fa0..fc8a957fad0ac0 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1018,6 +1018,8 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (nvme_is_fabrics(cmd)) return nvmet_parse_fabrics_admin_cmd(req); + if (unlikely(!nvmet_check_auth_status(req))) + return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR; if (nvmet_is_disc_subsys(nvmet_req_subsys(req))) return nvmet_parse_discovery_cmd(req); diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c new file mode 100644 index 00000000000000..5cdd23c3418530 --- /dev/null +++ b/drivers/nvme/target/auth.c @@ -0,0 +1,367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics DH-HMAC-CHAP authentication. + * Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions. + * All rights reserved. 
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nvmet.h" + +int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, + bool set_ctrl) +{ + unsigned char key_hash; + char *dhchap_secret; + + if (sscanf(secret, "DHHC-1:%hhd:%*s", &key_hash) != 1) + return -EINVAL; + if (key_hash > 3) { + pr_warn("Invalid DH-HMAC-CHAP hash id %d\n", + key_hash); + return -EINVAL; + } + if (key_hash > 0) { + /* Validate selected hash algorithm */ + const char *hmac = nvme_auth_hmac_name(key_hash); + + if (!crypto_has_shash(hmac, 0, 0)) { + pr_err("DH-HMAC-CHAP hash %s unsupported\n", hmac); + return -ENOTSUPP; + } + } + dhchap_secret = kstrdup(secret, GFP_KERNEL); + if (!dhchap_secret) + return -ENOMEM; + if (set_ctrl) { + host->dhchap_ctrl_secret = strim(dhchap_secret); + host->dhchap_ctrl_key_hash = key_hash; + } else { + host->dhchap_secret = strim(dhchap_secret); + host->dhchap_key_hash = key_hash; + } + return 0; +} + +int nvmet_setup_auth(struct nvmet_ctrl *ctrl) +{ + int ret = 0; + struct nvmet_host_link *p; + struct nvmet_host *host = NULL; + const char *hash_name; + + down_read(&nvmet_config_sem); + if (nvmet_is_disc_subsys(ctrl->subsys)) + goto out_unlock; + + if (ctrl->subsys->allow_any_host) + goto out_unlock; + + list_for_each_entry(p, &ctrl->subsys->hosts, entry) { + pr_debug("check %s\n", nvmet_host_name(p->host)); + if (strcmp(nvmet_host_name(p->host), ctrl->hostnqn)) + continue; + host = p->host; + break; + } + if (!host) { + pr_debug("host %s not found\n", ctrl->hostnqn); + ret = -EPERM; + goto out_unlock; + } + + if (!host->dhchap_secret) { + pr_debug("No authentication provided\n"); + goto out_unlock; + } + + if (host->dhchap_hash_id == ctrl->shash_id) { + pr_debug("Re-use existing hash ID %d\n", + ctrl->shash_id); + } else { + hash_name = nvme_auth_hmac_name(host->dhchap_hash_id); + if (!hash_name) { + pr_warn("Hash ID %d 
invalid\n", host->dhchap_hash_id); + ret = -EINVAL; + goto out_unlock; + } + ctrl->shash_id = host->dhchap_hash_id; + } + + /* Skip the 'DHHC-1:XX:' prefix */ + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = nvme_auth_extract_key(host->dhchap_secret + 10, + host->dhchap_key_hash); + if (IS_ERR(ctrl->host_key)) { + ret = PTR_ERR(ctrl->host_key); + ctrl->host_key = NULL; + goto out_free_hash; + } + pr_debug("%s: using hash %s key %*ph\n", __func__, + ctrl->host_key->hash > 0 ? + nvme_auth_hmac_name(ctrl->host_key->hash) : "none", + (int)ctrl->host_key->len, ctrl->host_key->key); + + nvme_auth_free_key(ctrl->ctrl_key); + if (!host->dhchap_ctrl_secret) { + ctrl->ctrl_key = NULL; + goto out_unlock; + } + + ctrl->ctrl_key = nvme_auth_extract_key(host->dhchap_ctrl_secret + 10, + host->dhchap_ctrl_key_hash); + if (IS_ERR(ctrl->ctrl_key)) { + ret = PTR_ERR(ctrl->ctrl_key); + ctrl->ctrl_key = NULL; + } + pr_debug("%s: using ctrl hash %s key %*ph\n", __func__, + ctrl->ctrl_key->hash > 0 ? + nvme_auth_hmac_name(ctrl->ctrl_key->hash) : "none", + (int)ctrl->ctrl_key->len, ctrl->ctrl_key->key); + +out_free_hash: + if (ret) { + if (ctrl->host_key) { + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = NULL; + } + ctrl->shash_id = 0; + } +out_unlock: + up_read(&nvmet_config_sem); + + return ret; +} + +void nvmet_auth_sq_free(struct nvmet_sq *sq) +{ + kfree(sq->dhchap_c1); + sq->dhchap_c1 = NULL; + kfree(sq->dhchap_c2); + sq->dhchap_c2 = NULL; + kfree(sq->dhchap_skey); + sq->dhchap_skey = NULL; +} + +void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) +{ + ctrl->shash_id = 0; + + if (ctrl->host_key) { + nvme_auth_free_key(ctrl->host_key); + ctrl->host_key = NULL; + } + if (ctrl->ctrl_key) { + nvme_auth_free_key(ctrl->ctrl_key); + ctrl->ctrl_key = NULL; + } +} + +bool nvmet_check_auth_status(struct nvmet_req *req) +{ + if (req->sq->ctrl->host_key && + !req->sq->authenticated) + return false; + return true; +} + +int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, + 
unsigned int shash_len) +{ + struct crypto_shash *shash_tfm; + struct shash_desc *shash; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + const char *hash_name; + u8 *challenge = req->sq->dhchap_c1, *host_response; + u8 buf[4]; + int ret; + + hash_name = nvme_auth_hmac_name(ctrl->shash_id); + if (!hash_name) { + pr_warn("Hash ID %d invalid\n", ctrl->shash_id); + return -EINVAL; + } + + shash_tfm = crypto_alloc_shash(hash_name, 0, 0); + if (IS_ERR(shash_tfm)) { + pr_err("failed to allocate shash %s\n", hash_name); + return PTR_ERR(shash_tfm); + } + + if (shash_len != crypto_shash_digestsize(shash_tfm)) { + pr_debug("%s: hash len mismatch (len %d digest %d)\n", + __func__, shash_len, + crypto_shash_digestsize(shash_tfm)); + ret = -EINVAL; + goto out_free_tfm; + } + + host_response = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn); + if (IS_ERR(host_response)) { + ret = PTR_ERR(host_response); + goto out_free_tfm; + } + + ret = crypto_shash_setkey(shash_tfm, host_response, + ctrl->host_key->len); + if (ret) + goto out_free_response; + + pr_debug("ctrl %d qid %d host response seq %u transaction %d\n", + ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, + req->sq->dhchap_tid); + + shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm), + GFP_KERNEL); + if (!shash) { + ret = -ENOMEM; + goto out_free_response; + } + shash->tfm = shash_tfm; + ret = crypto_shash_init(shash); + if (ret) + goto out; + ret = crypto_shash_update(shash, challenge, shash_len); + if (ret) + goto out; + put_unaligned_le32(req->sq->dhchap_s1, buf); + ret = crypto_shash_update(shash, buf, 4); + if (ret) + goto out; + put_unaligned_le16(req->sq->dhchap_tid, buf); + ret = crypto_shash_update(shash, buf, 2); + if (ret) + goto out; + memset(buf, 0, 4); + ret = crypto_shash_update(shash, buf, 1); + if (ret) + goto out; + ret = crypto_shash_update(shash, "HostHost", 8); + if (ret) + goto out; + ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn)); + if (ret) + goto out; + ret 
= crypto_shash_update(shash, buf, 1); + if (ret) + goto out; + ret = crypto_shash_update(shash, ctrl->subsysnqn, + strlen(ctrl->subsysnqn)); + if (ret) + goto out; + ret = crypto_shash_final(shash, response); +out: + if (challenge != req->sq->dhchap_c1) + kfree(challenge); + kfree(shash); +out_free_response: + kfree_sensitive(host_response); +out_free_tfm: + crypto_free_shash(shash_tfm); + return 0; +} + +int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, + unsigned int shash_len) +{ + struct crypto_shash *shash_tfm; + struct shash_desc *shash; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + const char *hash_name; + u8 *challenge = req->sq->dhchap_c2, *ctrl_response; + u8 buf[4]; + int ret; + + hash_name = nvme_auth_hmac_name(ctrl->shash_id); + if (!hash_name) { + pr_warn("Hash ID %d invalid\n", ctrl->shash_id); + return -EINVAL; + } + + shash_tfm = crypto_alloc_shash(hash_name, 0, 0); + if (IS_ERR(shash_tfm)) { + pr_err("failed to allocate shash %s\n", hash_name); + return PTR_ERR(shash_tfm); + } + + if (shash_len != crypto_shash_digestsize(shash_tfm)) { + pr_debug("%s: hash len mismatch (len %d digest %d)\n", + __func__, shash_len, + crypto_shash_digestsize(shash_tfm)); + ret = -EINVAL; + goto out_free_tfm; + } + + ctrl_response = nvme_auth_transform_key(ctrl->ctrl_key, + ctrl->subsysnqn); + if (IS_ERR(ctrl_response)) { + ret = PTR_ERR(ctrl_response); + goto out_free_tfm; + } + + ret = crypto_shash_setkey(shash_tfm, ctrl_response, + ctrl->ctrl_key->len); + if (ret) + goto out_free_response; + + shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm), + GFP_KERNEL); + if (!shash) { + ret = -ENOMEM; + goto out_free_response; + } + shash->tfm = shash_tfm; + + ret = crypto_shash_init(shash); + if (ret) + goto out; + ret = crypto_shash_update(shash, challenge, shash_len); + if (ret) + goto out; + put_unaligned_le32(req->sq->dhchap_s2, buf); + ret = crypto_shash_update(shash, buf, 4); + if (ret) + goto out; + put_unaligned_le16(req->sq->dhchap_tid, 
buf); + ret = crypto_shash_update(shash, buf, 2); + if (ret) + goto out; + memset(buf, 0, 4); + ret = crypto_shash_update(shash, buf, 1); + if (ret) + goto out; + ret = crypto_shash_update(shash, "Controller", 10); + if (ret) + goto out; + ret = crypto_shash_update(shash, ctrl->subsysnqn, + strlen(ctrl->subsysnqn)); + if (ret) + goto out; + ret = crypto_shash_update(shash, buf, 1); + if (ret) + goto out; + ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn)); + if (ret) + goto out; + ret = crypto_shash_final(shash, response); +out: + if (challenge != req->sq->dhchap_c2) + kfree(challenge); + kfree(shash); +out_free_response: + kfree_sensitive(ctrl_response); +out_free_tfm: + crypto_free_shash(shash_tfm); + return 0; +} diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index e44b2988759ec2..9a7d91c64fcd99 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -11,6 +11,11 @@ #include #include #include +#ifdef CONFIG_NVME_TARGET_AUTH +#include +#endif +#include +#include #include "nvmet.h" @@ -1660,10 +1665,102 @@ static const struct config_item_type nvmet_ports_type = { static struct config_group nvmet_subsystems_group; static struct config_group nvmet_ports_group; -static void nvmet_host_release(struct config_item *item) +#ifdef CONFIG_NVME_TARGET_AUTH +static ssize_t nvmet_host_dhchap_key_show(struct config_item *item, + char *page) +{ + u8 *dhchap_secret = to_host(item)->dhchap_secret; + + if (!dhchap_secret) + return sprintf(page, "\n"); + return sprintf(page, "%s\n", dhchap_secret); +} + +static ssize_t nvmet_host_dhchap_key_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_host *host = to_host(item); + int ret; + + ret = nvmet_auth_set_key(host, page, false); + /* + * Re-authentication is a soft state, so keep the + * current authentication valid until the host + * requests re-authentication. + */ + return ret < 0 ? 
ret : count; +} + +CONFIGFS_ATTR(nvmet_host_, dhchap_key); + +static ssize_t nvmet_host_dhchap_ctrl_key_show(struct config_item *item, + char *page) +{ + u8 *dhchap_secret = to_host(item)->dhchap_ctrl_secret; + + if (!dhchap_secret) + return sprintf(page, "\n"); + return sprintf(page, "%s\n", dhchap_secret); +} + +static ssize_t nvmet_host_dhchap_ctrl_key_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_host *host = to_host(item); + int ret; + + ret = nvmet_auth_set_key(host, page, true); + /* + * Re-authentication is a soft state, so keep the + * current authentication valid until the host + * requests re-authentication. + */ + return ret < 0 ? ret : count; +} + +CONFIGFS_ATTR(nvmet_host_, dhchap_ctrl_key); + +static ssize_t nvmet_host_dhchap_hash_show(struct config_item *item, + char *page) { struct nvmet_host *host = to_host(item); + const char *hash_name = nvme_auth_hmac_name(host->dhchap_hash_id); + return sprintf(page, "%s\n", hash_name ? hash_name : "none"); +} + +static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_host *host = to_host(item); + u8 hmac_id; + + hmac_id = nvme_auth_hmac_id(page); + if (hmac_id == NVME_AUTH_HASH_INVALID) + return -EINVAL; + if (!crypto_has_shash(nvme_auth_hmac_name(hmac_id), 0, 0)) + return -ENOTSUPP; + host->dhchap_hash_id = hmac_id; + return count; +} + +CONFIGFS_ATTR(nvmet_host_, dhchap_hash); + +static struct configfs_attribute *nvmet_host_attrs[] = { + &nvmet_host_attr_dhchap_key, + &nvmet_host_attr_dhchap_ctrl_key, + &nvmet_host_attr_dhchap_hash, + NULL, +}; +#endif /* CONFIG_NVME_TARGET_AUTH */ + +static void nvmet_host_release(struct config_item *item) +{ + struct nvmet_host *host = to_host(item); +#ifdef CONFIG_NVME_TARGET_AUTH + if (host->dhchap_secret) + kfree(host->dhchap_secret); +#endif kfree(host); } @@ -1673,6 +1770,9 @@ static struct configfs_item_operations nvmet_host_item_ops = { static const struct 
config_item_type nvmet_host_type = { .ct_item_ops = &nvmet_host_item_ops, +#ifdef CONFIG_NVME_TARGET_AUTH + .ct_attrs = nvmet_host_attrs, +#endif .ct_owner = THIS_MODULE, }; @@ -1685,6 +1785,11 @@ static struct config_group *nvmet_hosts_make_group(struct config_group *group, if (!host) return ERR_PTR(-ENOMEM); +#ifdef CONFIG_NVME_TARGET_AUTH + /* Default to SHA256 */ + host->dhchap_hash_id = NVME_AUTH_HASH_SHA256; +#endif + config_group_init_type_name(&host->group, name, &nvmet_host_type); return &host->group; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 792f1562117345..eec0351e102293 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -795,6 +795,7 @@ void nvmet_sq_destroy(struct nvmet_sq *sq) wait_for_completion(&sq->confirm_done); wait_for_completion(&sq->free_done); percpu_ref_exit(&sq->ref); + nvmet_auth_sq_free(sq); if (ctrl) { /* @@ -871,6 +872,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) if (nvme_is_fabrics(cmd)) return nvmet_parse_fabrics_io_cmd(req); + if (unlikely(!nvmet_check_auth_status(req))) + return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR; + ret = nvmet_check_ctrl_status(req); if (unlikely(ret)) return ret; @@ -1275,6 +1279,11 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req) req->cmd->common.opcode, req->sq->qid); return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } + + if (unlikely(!nvmet_check_auth_status(req))) { + pr_warn("qid %d not authenticated\n", req->sq->qid); + return NVME_SC_AUTH_REQUIRED | NVME_SC_DNR; + } return 0; } @@ -1465,6 +1474,8 @@ static void nvmet_ctrl_free(struct kref *ref) flush_work(&ctrl->async_event_work); cancel_work_sync(&ctrl->fatal_err_work); + nvmet_destroy_auth(ctrl); + ida_free(&cntlid_ida, ctrl->cntlid); nvmet_async_events_free(ctrl); diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c new file mode 100644 index 00000000000000..776073a10e04ee --- /dev/null +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -0,0 
+1,502 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NVMe over Fabrics DH-HMAC-CHAP authentication command handling. + * Copyright (c) 2020 Hannes Reinecke, SUSE Software Solutions. + * All rights reserved. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include "nvmet.h" + +void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req) +{ + u32 result = le32_to_cpu(req->cqe->result.u32); + + /* Initialize in-band authentication */ + req->sq->authenticated = false; + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; + result |= (u32)NVME_CONNECT_AUTHREQ_ATR << 16; + req->cqe->result.u32 = cpu_to_le32(result); +} + +static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmf_auth_dhchap_negotiate_data *data = d; + int i, hash_id = 0, fallback_hash_id = 0, dhgid; + + pr_debug("%s: ctrl %d qid %d: data sc_d %d napd %d authid %d halen %d dhlen %d\n", + __func__, ctrl->cntlid, req->sq->qid, + data->sc_c, data->napd, data->auth_protocol[0].dhchap.authid, + data->auth_protocol[0].dhchap.halen, + data->auth_protocol[0].dhchap.dhlen); + req->sq->dhchap_tid = le16_to_cpu(data->t_id); + if (data->sc_c) + return NVME_AUTH_DHCHAP_FAILURE_CONCAT_MISMATCH; + + if (data->napd != 1) + return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; + + if (data->auth_protocol[0].dhchap.authid != + NVME_AUTH_DHCHAP_AUTH_ID) + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + + for (i = 0; i < data->auth_protocol[0].dhchap.halen; i++) { + u8 host_hmac_id = data->auth_protocol[0].dhchap.idlist[i]; + + if (!fallback_hash_id && + crypto_has_shash(nvme_auth_hmac_name(host_hmac_id), 0, 0)) + fallback_hash_id = host_hmac_id; + if (ctrl->shash_id != host_hmac_id) + continue; + hash_id = ctrl->shash_id; + break; + } + if (hash_id == 0) { + if (fallback_hash_id == 0) { + pr_debug("%s: ctrl %d qid %d: no usable hash found\n", + __func__, ctrl->cntlid, req->sq->qid); + return 
NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; + } + pr_debug("%s: ctrl %d qid %d: no usable hash found, falling back to %s\n", + __func__, ctrl->cntlid, req->sq->qid, + nvme_auth_hmac_name(fallback_hash_id)); + ctrl->shash_id = fallback_hash_id; + } + + dhgid = -1; + for (i = 0; i < data->auth_protocol[0].dhchap.dhlen; i++) { + int tmp_dhgid = data->auth_protocol[0].dhchap.idlist[i + 30]; + + if (tmp_dhgid == NVME_AUTH_DHGROUP_NULL) { + dhgid = tmp_dhgid; + break; + } + } + if (dhgid < 0) { + pr_debug("%s: ctrl %d qid %d: no usable DH group found\n", + __func__, ctrl->cntlid, req->sq->qid); + return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; + } + pr_debug("%s: ctrl %d qid %d: selected DH group %s (%d)\n", + __func__, ctrl->cntlid, req->sq->qid, + nvme_auth_dhgroup_name(dhgid), dhgid); + return 0; +} + +static u16 nvmet_auth_reply(struct nvmet_req *req, void *d) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmf_auth_dhchap_reply_data *data = d; + u16 dhvlen = le16_to_cpu(data->dhvlen); + u8 *response; + + pr_debug("%s: ctrl %d qid %d: data hl %d cvalid %d dhvlen %u\n", + __func__, ctrl->cntlid, req->sq->qid, + data->hl, data->cvalid, dhvlen); + + if (dhvlen) { + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + } + + response = kmalloc(data->hl, GFP_KERNEL); + if (!response) + return NVME_AUTH_DHCHAP_FAILURE_FAILED; + + if (!ctrl->host_key) { + pr_warn("ctrl %d qid %d no host key\n", + ctrl->cntlid, req->sq->qid); + kfree(response); + return NVME_AUTH_DHCHAP_FAILURE_FAILED; + } + if (nvmet_auth_host_hash(req, response, data->hl) < 0) { + pr_debug("ctrl %d qid %d host hash failed\n", + ctrl->cntlid, req->sq->qid); + kfree(response); + return NVME_AUTH_DHCHAP_FAILURE_FAILED; + } + + if (memcmp(data->rval, response, data->hl)) { + pr_info("ctrl %d qid %d host response mismatch\n", + ctrl->cntlid, req->sq->qid); + kfree(response); + return NVME_AUTH_DHCHAP_FAILURE_FAILED; + } + kfree(response); + pr_debug("%s: ctrl %d qid %d host authenticated\n", + __func__, 
ctrl->cntlid, req->sq->qid); + if (data->cvalid) { + req->sq->dhchap_c2 = kmalloc(data->hl, GFP_KERNEL); + if (!req->sq->dhchap_c2) + return NVME_AUTH_DHCHAP_FAILURE_FAILED; + memcpy(req->sq->dhchap_c2, data->rval + data->hl, data->hl); + + pr_debug("%s: ctrl %d qid %d challenge %*ph\n", + __func__, ctrl->cntlid, req->sq->qid, data->hl, + req->sq->dhchap_c2); + req->sq->dhchap_s2 = le32_to_cpu(data->seqnum); + } else { + req->sq->authenticated = true; + req->sq->dhchap_c2 = NULL; + } + + return 0; +} + +static u16 nvmet_auth_failure2(struct nvmet_req *req, void *d) +{ + struct nvmf_auth_dhchap_failure_data *data = d; + + return data->rescode_exp; +} + +void nvmet_execute_auth_send(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmf_auth_dhchap_success2_data *data; + void *d; + u32 tl; + u16 status = 0; + + if (req->cmd->auth_send.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + req->error_loc = + offsetof(struct nvmf_auth_send_command, secp); + goto done; + } + if (req->cmd->auth_send.spsp0 != 0x01) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + req->error_loc = + offsetof(struct nvmf_auth_send_command, spsp0); + goto done; + } + if (req->cmd->auth_send.spsp1 != 0x01) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + req->error_loc = + offsetof(struct nvmf_auth_send_command, spsp1); + goto done; + } + tl = le32_to_cpu(req->cmd->auth_send.tl); + if (!tl) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + req->error_loc = + offsetof(struct nvmf_auth_send_command, tl); + goto done; + } + if (!nvmet_check_transfer_len(req, tl)) { + pr_debug("%s: transfer length mismatch (%u)\n", __func__, tl); + return; + } + + d = kmalloc(tl, GFP_KERNEL); + if (!d) { + status = NVME_SC_INTERNAL; + goto done; + } + + status = nvmet_copy_from_sgl(req, 0, d, tl); + if (status) { + kfree(d); + goto done; + } + + data = d; + pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__, + 
ctrl->cntlid, req->sq->qid, data->auth_type, data->auth_id, + req->sq->dhchap_step); + if (data->auth_type != NVME_AUTH_COMMON_MESSAGES && + data->auth_type != NVME_AUTH_DHCHAP_MESSAGES) + goto done_failure1; + if (data->auth_type == NVME_AUTH_COMMON_MESSAGES) { + if (data->auth_id == NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE) { + /* Restart negotiation */ + pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__, + ctrl->cntlid, req->sq->qid); + if (!req->sq->qid) { + status = nvmet_setup_auth(ctrl); + if (status < 0) { + pr_err("ctrl %d qid 0 failed to setup" + "re-authentication", + ctrl->cntlid); + goto done_failure1; + } + } + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; + } else if (data->auth_id != req->sq->dhchap_step) + goto done_failure1; + /* Validate negotiation parameters */ + status = nvmet_auth_negotiate(req, d); + if (status == 0) + req->sq->dhchap_step = + NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE; + else { + req->sq->dhchap_step = + NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; + req->sq->dhchap_status = status; + status = 0; + } + goto done_kfree; + } + if (data->auth_id != req->sq->dhchap_step) { + pr_debug("%s: ctrl %d qid %d step mismatch (%d != %d)\n", + __func__, ctrl->cntlid, req->sq->qid, + data->auth_id, req->sq->dhchap_step); + goto done_failure1; + } + if (le16_to_cpu(data->t_id) != req->sq->dhchap_tid) { + pr_debug("%s: ctrl %d qid %d invalid transaction %d (expected %d)\n", + __func__, ctrl->cntlid, req->sq->qid, + le16_to_cpu(data->t_id), + req->sq->dhchap_tid); + req->sq->dhchap_step = + NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; + req->sq->dhchap_status = + NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + goto done_kfree; + } + + switch (data->auth_id) { + case NVME_AUTH_DHCHAP_MESSAGE_REPLY: + status = nvmet_auth_reply(req, d); + if (status == 0) + req->sq->dhchap_step = + NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1; + else { + req->sq->dhchap_step = + NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; + req->sq->dhchap_status = status; + status = 0; + } + goto done_kfree; 
+ break; + case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2: + req->sq->authenticated = true; + pr_debug("%s: ctrl %d qid %d ctrl authenticated\n", + __func__, ctrl->cntlid, req->sq->qid); + goto done_kfree; + break; + case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2: + status = nvmet_auth_failure2(req, d); + if (status) { + pr_warn("ctrl %d qid %d: authentication failed (%d)\n", + ctrl->cntlid, req->sq->qid, status); + req->sq->dhchap_status = status; + req->sq->authenticated = false; + status = 0; + } + goto done_kfree; + break; + default: + req->sq->dhchap_status = + NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; + req->sq->dhchap_step = + NVME_AUTH_DHCHAP_MESSAGE_FAILURE2; + req->sq->authenticated = false; + goto done_kfree; + break; + } +done_failure1: + req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_MESSAGE; + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE2; + +done_kfree: + kfree(d); +done: + pr_debug("%s: ctrl %d qid %d dhchap status %x step %x\n", __func__, + ctrl->cntlid, req->sq->qid, + req->sq->dhchap_status, req->sq->dhchap_step); + if (status) + pr_debug("%s: ctrl %d qid %d nvme status %x error loc %d\n", + __func__, ctrl->cntlid, req->sq->qid, + status, req->error_loc); + req->cqe->result.u64 = 0; + nvmet_req_complete(req, status); + if (req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 && + req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) + return; + /* Final states, clear up variables */ + nvmet_auth_sq_free(req->sq); + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) + nvmet_ctrl_fatal_error(ctrl); +} + +static int nvmet_auth_challenge(struct nvmet_req *req, void *d, int al) +{ + struct nvmf_auth_dhchap_challenge_data *data = d; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + int ret = 0; + int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id); + int data_size = sizeof(*d) + hash_len; + + if (al < data_size) { + pr_debug("%s: buffer too small (al %d need %d)\n", __func__, + al, data_size); + return -EINVAL; + } + 
memset(data, 0, data_size); + req->sq->dhchap_s1 = nvme_auth_get_seqnum(); + data->auth_type = NVME_AUTH_DHCHAP_MESSAGES; + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE; + data->t_id = cpu_to_le16(req->sq->dhchap_tid); + data->hashid = ctrl->shash_id; + data->hl = hash_len; + data->seqnum = cpu_to_le32(req->sq->dhchap_s1); + req->sq->dhchap_c1 = kmalloc(data->hl, GFP_KERNEL); + if (!req->sq->dhchap_c1) + return -ENOMEM; + get_random_bytes(req->sq->dhchap_c1, data->hl); + memcpy(data->cval, req->sq->dhchap_c1, data->hl); + pr_debug("%s: ctrl %d qid %d seq %u transaction %d hl %d dhvlen %u\n", + __func__, ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, + req->sq->dhchap_tid, data->hl, 0); + return ret; +} + +static int nvmet_auth_success1(struct nvmet_req *req, void *d, int al) +{ + struct nvmf_auth_dhchap_success1_data *data = d; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id); + + WARN_ON(al < sizeof(*data)); + memset(data, 0, sizeof(*data)); + data->auth_type = NVME_AUTH_DHCHAP_MESSAGES; + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1; + data->t_id = cpu_to_le16(req->sq->dhchap_tid); + data->hl = hash_len; + if (req->sq->dhchap_c2) { + if (!ctrl->ctrl_key) { + pr_warn("ctrl %d qid %d no ctrl key\n", + ctrl->cntlid, req->sq->qid); + return NVME_AUTH_DHCHAP_FAILURE_FAILED; + } + if (nvmet_auth_ctrl_hash(req, data->rval, data->hl)) + return NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; + data->rvalid = 1; + pr_debug("ctrl %d qid %d response %*ph\n", + ctrl->cntlid, req->sq->qid, data->hl, data->rval); + } + return 0; +} + +static void nvmet_auth_failure1(struct nvmet_req *req, void *d, int al) +{ + struct nvmf_auth_dhchap_failure_data *data = d; + + WARN_ON(al < sizeof(*data)); + data->auth_type = NVME_AUTH_COMMON_MESSAGES; + data->auth_id = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; + data->t_id = cpu_to_le16(req->sq->dhchap_tid); + data->rescode = NVME_AUTH_DHCHAP_FAILURE_REASON_FAILED; + data->rescode_exp = 
req->sq->dhchap_status; +} + +void nvmet_execute_auth_receive(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + void *d; + u32 al; + u16 status = 0; + + if (req->cmd->auth_receive.secp != NVME_AUTH_DHCHAP_PROTOCOL_IDENTIFIER) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + req->error_loc = + offsetof(struct nvmf_auth_receive_command, secp); + goto done; + } + if (req->cmd->auth_receive.spsp0 != 0x01) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + req->error_loc = + offsetof(struct nvmf_auth_receive_command, spsp0); + goto done; + } + if (req->cmd->auth_receive.spsp1 != 0x01) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + req->error_loc = + offsetof(struct nvmf_auth_receive_command, spsp1); + goto done; + } + al = le32_to_cpu(req->cmd->auth_receive.al); + if (!al) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + req->error_loc = + offsetof(struct nvmf_auth_receive_command, al); + goto done; + } + if (!nvmet_check_transfer_len(req, al)) { + pr_debug("%s: transfer length mismatch (%u)\n", __func__, al); + return; + } + + d = kmalloc(al, GFP_KERNEL); + if (!d) { + status = NVME_SC_INTERNAL; + goto done; + } + pr_debug("%s: ctrl %d qid %d step %x\n", __func__, + ctrl->cntlid, req->sq->qid, req->sq->dhchap_step); + switch (req->sq->dhchap_step) { + case NVME_AUTH_DHCHAP_MESSAGE_CHALLENGE: + status = nvmet_auth_challenge(req, d, al); + if (status < 0) { + pr_warn("ctrl %d qid %d: challenge error (%d)\n", + ctrl->cntlid, req->sq->qid, status); + status = NVME_SC_INTERNAL; + break; + } + if (status) { + req->sq->dhchap_status = status; + nvmet_auth_failure1(req, d, al); + pr_warn("ctrl %d qid %d: challenge status (%x)\n", + ctrl->cntlid, req->sq->qid, + req->sq->dhchap_status); + status = 0; + break; + } + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_REPLY; + break; + case NVME_AUTH_DHCHAP_MESSAGE_SUCCESS1: + status = nvmet_auth_success1(req, d, al); + if (status) { + req->sq->dhchap_status = status; + req->sq->authenticated = false; 
+ nvmet_auth_failure1(req, d, al); + pr_warn("ctrl %d qid %d: success1 status (%x)\n", + ctrl->cntlid, req->sq->qid, + req->sq->dhchap_status); + break; + } + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2; + break; + case NVME_AUTH_DHCHAP_MESSAGE_FAILURE1: + req->sq->authenticated = false; + nvmet_auth_failure1(req, d, al); + pr_warn("ctrl %d qid %d failure1 (%x)\n", + ctrl->cntlid, req->sq->qid, req->sq->dhchap_status); + break; + default: + pr_warn("ctrl %d qid %d unhandled step (%d)\n", + ctrl->cntlid, req->sq->qid, req->sq->dhchap_step); + req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_FAILURE1; + req->sq->dhchap_status = NVME_AUTH_DHCHAP_FAILURE_FAILED; + nvmet_auth_failure1(req, d, al); + status = 0; + break; + } + + status = nvmet_copy_to_sgl(req, 0, d, al); + kfree(d); +done: + req->cqe->result.u64 = 0; + nvmet_req_complete(req, status); + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2) + nvmet_auth_sq_free(req->sq); + else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { + nvmet_auth_sq_free(req->sq); + nvmet_ctrl_fatal_error(ctrl); + } +} diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index f23c2872990822..f91a56180d3dd6 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -93,6 +93,14 @@ u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req) case nvme_fabrics_type_property_get: req->execute = nvmet_execute_prop_get; break; +#ifdef CONFIG_NVME_TARGET_AUTH + case nvme_fabrics_type_auth_send: + req->execute = nvmet_execute_auth_send; + break; + case nvme_fabrics_type_auth_receive: + req->execute = nvmet_execute_auth_receive; + break; +#endif default: pr_debug("received unknown capsule type 0x%x\n", cmd->fabrics.fctype); @@ -108,6 +116,14 @@ u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req) struct nvme_command *cmd = req->cmd; switch (cmd->fabrics.fctype) { +#ifdef CONFIG_NVME_TARGET_AUTH + case nvme_fabrics_type_auth_send: + req->execute 
= nvmet_execute_auth_send; + break; + case nvme_fabrics_type_auth_receive: + req->execute = nvmet_execute_auth_receive; + break; +#endif default: pr_debug("received unknown capsule type 0x%x\n", cmd->fabrics.fctype); @@ -188,6 +204,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) struct nvmf_connect_data *d; struct nvmet_ctrl *ctrl = NULL; u16 status = 0; + int ret; if (!nvmet_check_transfer_len(req, sizeof(struct nvmf_connect_data))) return; @@ -230,18 +247,32 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) uuid_copy(&ctrl->hostid, &d->hostid); + ret = nvmet_setup_auth(ctrl); + if (ret < 0) { + pr_err("Failed to setup authentication, error %d\n", ret); + nvmet_ctrl_put(ctrl); + if (ret == -EPERM) + status = (NVME_SC_CONNECT_INVALID_HOST | NVME_SC_DNR); + else + status = NVME_SC_INTERNAL; + goto out; + } + status = nvmet_install_queue(ctrl, req); if (status) { nvmet_ctrl_put(ctrl); goto out; } - pr_info("creating %s controller %d for subsystem %s for NQN %s%s.\n", + pr_info("creating %s controller %d for subsystem %s for NQN %s%s%s.\n", nvmet_is_disc_subsys(ctrl->subsys) ? "discovery" : "nvm", ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, - ctrl->pi_support ? " T10-PI is enabled" : ""); + ctrl->pi_support ? " T10-PI is enabled" : "", + nvmet_has_auth(ctrl) ? 
" with DH-HMAC-CHAP" : ""); req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); + if (nvmet_has_auth(ctrl)) + nvmet_init_auth(ctrl, req); out: kfree(d); complete: @@ -301,6 +332,9 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); + req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); + if (nvmet_has_auth(ctrl)) + nvmet_init_auth(ctrl, req); out: kfree(d); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index c37f41eafc2f31..765db7541a8756 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -108,6 +108,18 @@ struct nvmet_sq { u16 size; u32 sqhd; bool sqhd_disabled; +#ifdef CONFIG_NVME_TARGET_AUTH + bool authenticated; + u16 dhchap_tid; + u16 dhchap_status; + int dhchap_step; + u8 *dhchap_c1; + u8 *dhchap_c2; + u32 dhchap_s1; + u32 dhchap_s2; + u8 *dhchap_skey; + int dhchap_skey_len; +#endif struct completion free_done; struct completion confirm_done; }; @@ -209,6 +221,11 @@ struct nvmet_ctrl { u64 err_counter; struct nvme_error_slot slots[NVMET_ERROR_LOG_SLOTS]; bool pi_support; +#ifdef CONFIG_NVME_TARGET_AUTH + struct nvme_dhchap_key *host_key; + struct nvme_dhchap_key *ctrl_key; + u8 shash_id; +#endif }; struct nvmet_subsys { @@ -270,6 +287,12 @@ static inline struct nvmet_subsys *namespaces_to_subsys( struct nvmet_host { struct config_group group; + u8 *dhchap_secret; + u8 *dhchap_ctrl_secret; + u8 dhchap_key_hash; + u8 dhchap_ctrl_key_hash; + u8 dhchap_hash_id; + u8 dhchap_dhgroup_id; }; static inline struct nvmet_host *to_host(struct config_item *item) @@ -668,4 +691,43 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio) bio_put(bio); } +#ifdef CONFIG_NVME_TARGET_AUTH +void nvmet_execute_auth_send(struct nvmet_req *req); +void nvmet_execute_auth_receive(struct nvmet_req *req); +int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, + bool set_ctrl); +int 
nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); +int nvmet_setup_auth(struct nvmet_ctrl *ctrl); +void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req); +void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); +void nvmet_auth_sq_free(struct nvmet_sq *sq); +bool nvmet_check_auth_status(struct nvmet_req *req); +int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, + unsigned int hash_len); +int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, + unsigned int hash_len); +static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) +{ + return ctrl->host_key != NULL; +} +#else +static inline int nvmet_setup_auth(struct nvmet_ctrl *ctrl) +{ + return 0; +} +static inline void nvmet_init_auth(struct nvmet_ctrl *ctrl, + struct nvmet_req *req) {}; +static inline void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) {}; +static inline void nvmet_auth_sq_free(struct nvmet_sq *sq) {}; +static inline bool nvmet_check_auth_status(struct nvmet_req *req) +{ + return true; +} +static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) +{ + return false; +} +static inline const char *nvmet_dhchap_dhgroup_name(u8 dhgid) { return NULL; } +#endif + #endif /* _NVMET_H */ From 71ebe3842ebe6541bfc080d67df11ea1848edfc1 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:52:06 +0200 Subject: [PATCH 0287/1250] nvmet-auth: Diffie-Hellman key exchange support Implement Diffie-Hellman key exchange using FFDHE groups for NVMe In-Band Authentication. This patch adds a new host configfs attribute 'dhchap_dhgroup' to select the FFDHE group to use. 
Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/Kconfig | 2 + drivers/nvme/target/auth.c | 157 +++++++++++++++++++++++++ drivers/nvme/target/configfs.c | 31 +++++ drivers/nvme/target/fabrics-cmd-auth.c | 41 +++++-- drivers/nvme/target/nvmet.h | 9 ++ 5 files changed, 232 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig index df526b59b509fd..f0c91f7686a39f 100644 --- a/drivers/nvme/target/Kconfig +++ b/drivers/nvme/target/Kconfig @@ -92,6 +92,8 @@ config NVME_TARGET_AUTH select CRYPTO_HMAC select CRYPTO_SHA256 select CRYPTO_SHA512 + select CRYPTO_DH + select CRYPTO_DH_GROUPS_RFC7919 help This enables support for NVMe over Fabrics In-band Authentication diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 5cdd23c3418530..d5624bdf834b85 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -54,6 +54,74 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, return 0; } +int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id) +{ + const char *dhgroup_kpp; + int ret = 0; + + pr_debug("%s: ctrl %d selecting dhgroup %d\n", + __func__, ctrl->cntlid, dhgroup_id); + + if (ctrl->dh_tfm) { + if (ctrl->dh_gid == dhgroup_id) { + pr_debug("%s: ctrl %d reuse existing DH group %d\n", + __func__, ctrl->cntlid, dhgroup_id); + return 0; + } + crypto_free_kpp(ctrl->dh_tfm); + ctrl->dh_tfm = NULL; + ctrl->dh_gid = 0; + } + + if (dhgroup_id == NVME_AUTH_DHGROUP_NULL) + return 0; + + dhgroup_kpp = nvme_auth_dhgroup_kpp(dhgroup_id); + if (!dhgroup_kpp) { + pr_debug("%s: ctrl %d invalid DH group %d\n", + __func__, ctrl->cntlid, dhgroup_id); + return -EINVAL; + } + ctrl->dh_tfm = crypto_alloc_kpp(dhgroup_kpp, 0, 0); + if (IS_ERR(ctrl->dh_tfm)) { + pr_debug("%s: ctrl %d failed to setup DH group %d, err %ld\n", + __func__, ctrl->cntlid, dhgroup_id, + PTR_ERR(ctrl->dh_tfm)); + ret = PTR_ERR(ctrl->dh_tfm); + 
ctrl->dh_tfm = NULL; + ctrl->dh_gid = 0; + } else { + ctrl->dh_gid = dhgroup_id; + pr_debug("%s: ctrl %d setup DH group %d\n", + __func__, ctrl->cntlid, ctrl->dh_gid); + ret = nvme_auth_gen_privkey(ctrl->dh_tfm, ctrl->dh_gid); + if (ret < 0) { + pr_debug("%s: ctrl %d failed to generate private key, err %d\n", + __func__, ctrl->cntlid, ret); + kfree_sensitive(ctrl->dh_key); + return ret; + } + ctrl->dh_keysize = crypto_kpp_maxsize(ctrl->dh_tfm); + kfree_sensitive(ctrl->dh_key); + ctrl->dh_key = kzalloc(ctrl->dh_keysize, GFP_KERNEL); + if (!ctrl->dh_key) { + pr_warn("ctrl %d failed to allocate public key\n", + ctrl->cntlid); + return -ENOMEM; + } + ret = nvme_auth_gen_pubkey(ctrl->dh_tfm, ctrl->dh_key, + ctrl->dh_keysize); + if (ret < 0) { + pr_warn("ctrl %d failed to generate public key\n", + ctrl->cntlid); + kfree(ctrl->dh_key); + ctrl->dh_key = NULL; + } + } + + return ret; +} + int nvmet_setup_auth(struct nvmet_ctrl *ctrl) { int ret = 0; @@ -81,6 +149,10 @@ int nvmet_setup_auth(struct nvmet_ctrl *ctrl) goto out_unlock; } + ret = nvmet_setup_dhgroup(ctrl, host->dhchap_dhgroup_id); + if (ret < 0) + pr_warn("Failed to setup DH group"); + if (!host->dhchap_secret) { pr_debug("No authentication provided\n"); goto out_unlock; @@ -158,6 +230,14 @@ void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) { ctrl->shash_id = 0; + if (ctrl->dh_tfm) { + crypto_free_kpp(ctrl->dh_tfm); + ctrl->dh_tfm = NULL; + ctrl->dh_gid = 0; + } + kfree_sensitive(ctrl->dh_key); + ctrl->dh_key = NULL; + if (ctrl->host_key) { nvme_auth_free_key(ctrl->host_key); ctrl->host_key = NULL; @@ -218,6 +298,21 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, if (ret) goto out_free_response; + if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) { + challenge = kmalloc(shash_len, GFP_KERNEL); + if (!challenge) { + ret = -ENOMEM; + goto out_free_response; + } + ret = nvme_auth_augmented_challenge(ctrl->shash_id, + req->sq->dhchap_skey, + req->sq->dhchap_skey_len, + req->sq->dhchap_c1, + challenge, 
shash_len); + if (ret) + goto out_free_response; + } + pr_debug("ctrl %d qid %d host response seq %u transaction %d\n", ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, req->sq->dhchap_tid); @@ -315,6 +410,21 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, if (ret) goto out_free_response; + if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) { + challenge = kmalloc(shash_len, GFP_KERNEL); + if (!challenge) { + ret = -ENOMEM; + goto out_free_response; + } + ret = nvme_auth_augmented_challenge(ctrl->shash_id, + req->sq->dhchap_skey, + req->sq->dhchap_skey_len, + req->sq->dhchap_c2, + challenge, shash_len); + if (ret) + goto out_free_response; + } + shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm), GFP_KERNEL); if (!shash) { @@ -365,3 +475,50 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, crypto_free_shash(shash_tfm); return 0; } + +int nvmet_auth_ctrl_exponential(struct nvmet_req *req, + u8 *buf, int buf_size) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + int ret = 0; + + if (!ctrl->dh_key) { + pr_warn("ctrl %d no DH public key!\n", ctrl->cntlid); + return -ENOKEY; + } + if (buf_size != ctrl->dh_keysize) { + pr_warn("ctrl %d DH public key size mismatch, need %lu is %d\n", + ctrl->cntlid, ctrl->dh_keysize, buf_size); + ret = -EINVAL; + } else { + memcpy(buf, ctrl->dh_key, buf_size); + pr_debug("%s: ctrl %d public key %*ph\n", __func__, + ctrl->cntlid, (int)buf_size, buf); + } + + return ret; +} + +int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, + u8 *pkey, int pkey_size) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + int ret; + + req->sq->dhchap_skey_len = ctrl->dh_keysize; + req->sq->dhchap_skey = kzalloc(req->sq->dhchap_skey_len, GFP_KERNEL); + if (!req->sq->dhchap_skey) + return -ENOMEM; + ret = nvme_auth_gen_shared_secret(ctrl->dh_tfm, + pkey, pkey_size, + req->sq->dhchap_skey, + req->sq->dhchap_skey_len); + if (ret) + pr_debug("failed to compute shared secred, err %d\n", ret); + else + pr_debug("%s: shared secret 
%*ph\n", __func__, + (int)req->sq->dhchap_skey_len, + req->sq->dhchap_skey); + + return ret; +} diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 9a7d91c64fcd99..1b11f6a83bb6ca 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1746,10 +1746,41 @@ static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item, CONFIGFS_ATTR(nvmet_host_, dhchap_hash); +static ssize_t nvmet_host_dhchap_dhgroup_show(struct config_item *item, + char *page) +{ + struct nvmet_host *host = to_host(item); + const char *dhgroup = nvme_auth_dhgroup_name(host->dhchap_dhgroup_id); + + return sprintf(page, "%s\n", dhgroup ? dhgroup : "none"); +} + +static ssize_t nvmet_host_dhchap_dhgroup_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_host *host = to_host(item); + int dhgroup_id; + + dhgroup_id = nvme_auth_dhgroup_id(page); + if (dhgroup_id == NVME_AUTH_DHGROUP_INVALID) + return -EINVAL; + if (dhgroup_id != NVME_AUTH_DHGROUP_NULL) { + const char *kpp = nvme_auth_dhgroup_kpp(dhgroup_id); + + if (!crypto_has_kpp(kpp, 0, 0)) + return -EINVAL; + } + host->dhchap_dhgroup_id = dhgroup_id; + return count; +} + +CONFIGFS_ATTR(nvmet_host_, dhchap_dhgroup); + static struct configfs_attribute *nvmet_host_attrs[] = { &nvmet_host_attr_dhchap_key, &nvmet_host_attr_dhchap_ctrl_key, &nvmet_host_attr_dhchap_hash, + &nvmet_host_attr_dhchap_dhgroup, NULL, }; #endif /* CONFIG_NVME_TARGET_AUTH */ diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 776073a10e04ee..5b1be7e607e2a4 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -27,7 +27,7 @@ static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d) { struct nvmet_ctrl *ctrl = req->sq->ctrl; struct nvmf_auth_dhchap_negotiate_data *data = d; - int i, hash_id = 0, fallback_hash_id = 0, dhgid; + int i, hash_id = 0, fallback_hash_id = 0, dhgid, fallback_dhgid; 
pr_debug("%s: ctrl %d qid %d: data sc_d %d napd %d authid %d halen %d dhlen %d\n", __func__, ctrl->cntlid, req->sq->qid, @@ -69,22 +69,35 @@ static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d) } dhgid = -1; + fallback_dhgid = -1; for (i = 0; i < data->auth_protocol[0].dhchap.dhlen; i++) { int tmp_dhgid = data->auth_protocol[0].dhchap.idlist[i + 30]; - if (tmp_dhgid == NVME_AUTH_DHGROUP_NULL) { + if (tmp_dhgid != ctrl->dh_gid) { dhgid = tmp_dhgid; break; } + if (fallback_dhgid < 0) { + const char *kpp = nvme_auth_dhgroup_kpp(tmp_dhgid); + + if (crypto_has_kpp(kpp, 0, 0)) + fallback_dhgid = tmp_dhgid; + } } if (dhgid < 0) { - pr_debug("%s: ctrl %d qid %d: no usable DH group found\n", + if (fallback_dhgid < 0) { + pr_debug("%s: ctrl %d qid %d: no usable DH group found\n", __func__, ctrl->cntlid, req->sq->qid); - return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; + return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; + } + pr_debug("%s: ctrl %d qid %d: configured DH group %s not found\n", + __func__, ctrl->cntlid, req->sq->qid, + nvme_auth_dhgroup_name(fallback_dhgid)); + ctrl->dh_gid = fallback_dhgid; } pr_debug("%s: ctrl %d qid %d: selected DH group %s (%d)\n", __func__, ctrl->cntlid, req->sq->qid, - nvme_auth_dhgroup_name(dhgid), dhgid); + nvme_auth_dhgroup_name(ctrl->dh_gid), ctrl->dh_gid); return 0; } @@ -100,7 +113,11 @@ static u16 nvmet_auth_reply(struct nvmet_req *req, void *d) data->hl, data->cvalid, dhvlen); if (dhvlen) { - return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + if (!ctrl->dh_tfm) + return NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; + if (nvmet_auth_ctrl_sesskey(req, data->rval + 2 * data->hl, + dhvlen) < 0) + return NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; } response = kmalloc(data->hl, GFP_KERNEL); @@ -332,6 +349,8 @@ static int nvmet_auth_challenge(struct nvmet_req *req, void *d, int al) int hash_len = nvme_auth_hmac_hash_len(ctrl->shash_id); int data_size = sizeof(*d) + hash_len; + if (ctrl->dh_tfm) + data_size += ctrl->dh_keysize; 
if (al < data_size) { pr_debug("%s: buffer too small (al %d need %d)\n", __func__, al, data_size); @@ -350,9 +369,15 @@ static int nvmet_auth_challenge(struct nvmet_req *req, void *d, int al) return -ENOMEM; get_random_bytes(req->sq->dhchap_c1, data->hl); memcpy(data->cval, req->sq->dhchap_c1, data->hl); - pr_debug("%s: ctrl %d qid %d seq %u transaction %d hl %d dhvlen %u\n", + if (ctrl->dh_tfm) { + data->dhgid = ctrl->dh_gid; + data->dhvlen = cpu_to_le16(ctrl->dh_keysize); + ret = nvmet_auth_ctrl_exponential(req, data->cval + data->hl, + ctrl->dh_keysize); + } + pr_debug("%s: ctrl %d qid %d seq %d transaction %d hl %d dhvlen %zu\n", __func__, ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, - req->sq->dhchap_tid, data->hl, 0); + req->sq->dhchap_tid, data->hl, ctrl->dh_keysize); return ret; } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 765db7541a8756..8b239aec3ca23a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -225,6 +225,10 @@ struct nvmet_ctrl { struct nvme_dhchap_key *host_key; struct nvme_dhchap_key *ctrl_key; u8 shash_id; + struct crypto_kpp *dh_tfm; + u8 dh_gid; + u8 *dh_key; + size_t dh_keysize; #endif }; @@ -701,6 +705,7 @@ int nvmet_setup_auth(struct nvmet_ctrl *ctrl); void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req); void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); void nvmet_auth_sq_free(struct nvmet_sq *sq); +int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id); bool nvmet_check_auth_status(struct nvmet_req *req); int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, unsigned int hash_len); @@ -710,6 +715,10 @@ static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl) { return ctrl->host_key != NULL; } +int nvmet_auth_ctrl_exponential(struct nvmet_req *req, + u8 *buf, int buf_size); +int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, + u8 *buf, int buf_size); #else static inline int nvmet_setup_auth(struct nvmet_ctrl *ctrl) { From 
375e2143d8f411c181eb630dd2f27a21e5a1a6e9 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 27 Jun 2022 11:52:07 +0200 Subject: [PATCH 0288/1250] nvmet-auth: expire authentication sessions Each authentication step is required to be completed within the KATO interval (or two minutes if not set). So add a workqueue function to reset the transaction ID and the expected next protocol step; this will automatically fail the next authentication command referring to the terminated authentication. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/auth.c | 1 + drivers/nvme/target/fabrics-cmd-auth.c | 20 +++++++++++++++++++- drivers/nvme/target/nvmet.h | 1 + 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index d5624bdf834b85..bf92435c783c45 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -218,6 +218,7 @@ int nvmet_setup_auth(struct nvmet_ctrl *ctrl) void nvmet_auth_sq_free(struct nvmet_sq *sq) { + cancel_delayed_work(&sq->auth_expired_work); kfree(sq->dhchap_c1); sq->dhchap_c1 = NULL; kfree(sq->dhchap_c2); diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 5b1be7e607e2a4..cc56e8c821ce34 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -12,11 +12,24 @@ #include #include "nvmet.h" +static void nvmet_auth_expired_work(struct work_struct *work) +{ + struct nvmet_sq *sq = container_of(to_delayed_work(work), + struct nvmet_sq, auth_expired_work); + + pr_debug("%s: ctrl %d qid %d transaction %u expired, resetting\n", + __func__, sq->ctrl->cntlid, sq->qid, sq->dhchap_tid); + sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; + sq->dhchap_tid = -1; +} + void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req) { u32 result = le32_to_cpu(req->cqe->result.u32); /* Initialize in-band authentication */ +
INIT_DELAYED_WORK(&req->sq->auth_expired_work, + nvmet_auth_expired_work); req->sq->authenticated = false; req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE; result |= (u32)NVME_CONNECT_AUTHREQ_ATR << 16; @@ -333,8 +346,13 @@ void nvmet_execute_auth_send(struct nvmet_req *req) req->cqe->result.u64 = 0; nvmet_req_complete(req, status); if (req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2 && - req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) + req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) { + unsigned long auth_expire_secs = ctrl->kato ? ctrl->kato : 120; + + mod_delayed_work(system_wq, &req->sq->auth_expired_work, + auth_expire_secs * HZ); return; + } /* Final states, clear up variables */ nvmet_auth_sq_free(req->sq); if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 8b239aec3ca23a..829fb1d78ee16d 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -109,6 +109,7 @@ struct nvmet_sq { u32 sqhd; bool sqhd_disabled; #ifdef CONFIG_NVME_TARGET_AUTH + struct delayed_work auth_expired_work; bool authenticated; u16 dhchap_tid; u16 dhchap_status; From c2f0eda6ad1b6d9ae7bddebfd619a2a64d1efb8f Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Wed, 27 Apr 2022 15:29:01 +0100 Subject: [PATCH 0289/1250] ARM: 9202/1: kasan: support CONFIG_KASAN_VMALLOC Simply make shadow of vmalloc area mapped on demand. Since the virtual address of vmalloc for Arm is also between MODULE_VADDR and 0x100000000 (ZONE_HIGHMEM), which means the shadow address is already included between KASAN_SHADOW_START and KASAN_SHADOW_END. Thus we need to change nothing for memory map of Arm. This can fix ARM_MODULE_PLTS with KASan, support KASan for highmem and support CONFIG_VMAP_STACK with KASan.
Signed-off-by: Lecopzer Chen Tested-by: Linus Walleij Reviewed-by: Linus Walleij Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 1 + arch/arm/mm/kasan_init.c | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0dcf88e7f9cf3b..a2c9d30551b806 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -75,6 +75,7 @@ config ARM select HAVE_ARCH_KFENCE if MMU && !XIP_KERNEL select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL + select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_PFN_VALID select HAVE_ARCH_SECCOMP diff --git a/arch/arm/mm/kasan_init.c b/arch/arm/mm/kasan_init.c index 5ad0d6c56d56ef..29caee9c79ce3c 100644 --- a/arch/arm/mm/kasan_init.c +++ b/arch/arm/mm/kasan_init.c @@ -236,7 +236,11 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); - kasan_populate_early_shadow(kasan_mem_to_shadow((void *)VMALLOC_START), + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_populate_early_shadow(kasan_mem_to_shadow((void *)VMALLOC_START), + kasan_mem_to_shadow((void *)VMALLOC_END)); + + kasan_populate_early_shadow(kasan_mem_to_shadow((void *)VMALLOC_END), kasan_mem_to_shadow((void *)-1UL) + 1); for_each_mem_range(i, &pa_start, &pa_end) { From 205b6eaef06f794142a5033645441f05f2463648 Mon Sep 17 00:00:00 2001 From: Lecopzer Chen Date: Wed, 27 Apr 2022 15:30:00 +0100 Subject: [PATCH 0290/1250] ARM: 9203/1: kconfig: fix MODULE_PLTS for KASAN with KASAN_VMALLOC When we run out of module space address with ko insertion, and with MODULE_PLTS, module would turn to try to find memory from VMALLOC address space. Unfortunately, with KASAN enabled, VMALLOC doesn't work without KASAN_VMALLOC, thus select KASAN_VMALLOC by default. 
8<--- cut here --- Unable to handle kernel paging request at virtual address bd300860 [bd300860] *pgd=41cf1811, *pte=41cf26df, *ppte=41cf265f Internal error: Oops: 80f [#1] PREEMPT SMP ARM Modules linked in: hello(O+) CPU: 0 PID: 89 Comm: insmod Tainted: G O 5.16.0-rc6+ #19 Hardware name: Generic DT based system PC is at mmioset+0x30/0xa8 LR is at 0x0 pc : [] lr : [<00000000>] psr: 20000013 sp : c451fc18 ip : bd300860 fp : c451fc2c r10: f18042cc r9 : f18042d0 r8 : 00000000 r7 : 00000001 r6 : 00000003 r5 : 01312d00 r4 : f1804300 r3 : 00000000 r2 : 00262560 r1 : 00000000 r0 : bd300860 Flags: nzCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: 43e9406a DAC: 00000051 Register r0 information: non-paged memory Register r1 information: NULL pointer Register r2 information: non-paged memory Register r3 information: NULL pointer Register r4 information: 4887-page vmalloc region starting at 0xf1802000 allocated at load_module+0x14f4/0x32a8 Register r5 information: non-paged memory Register r6 information: non-paged memory Register r7 information: non-paged memory Register r8 information: NULL pointer Register r9 information: 4887-page vmalloc region starting at 0xf1802000 allocated at load_module+0x14f4/0x32a8 Register r10 information: 4887-page vmalloc region starting at 0xf1802000 allocated at load_module+0x14f4/0x32a8 Register r11 information: non-slab/vmalloc memory Register r12 information: non-paged memory Process insmod (pid: 89, stack limit = 0xc451c000) Stack: (0xc451fc18 to 0xc4520000) fc00: f18041f0 c04803a4 fc20: c451fc44 c451fc30 c048053c c0480358 f1804030 01312cff c451fc64 c451fc48 fc40: c047f330 c0480500 f18040c0 c1b52ccc 00000001 c5be7700 c451fc74 c451fc68 fc60: f1802098 c047f300 c451fcb4 c451fc78 c026106c f180208c c4880004 00000000 fc80: c451fcb4 bf001000 c044ff48 c451fec0 f18040c0 00000000 c1b54cc4 00000000 fca0: c451fdf0 f1804268 c451fe64 c451fcb8 c0264e88 c0260d48 ffff8000 00007fff fcc0: f18040c0 c025cd00 c451fd14 00000003 
0157f008 f1804258 f180425c f1804174 fce0: f1804154 f180424c f18041f0 f180414c f1804178 f18041c0 bf0025d4 188a3fa8 fd00: 0000009e f1804170 f2b18000 c451ff10 c0d92e40 f180416c c451feec 00000001 fd20: 00000000 c451fec8 c451fe20 c451fed0 f18040cc 00000000 f17ea000 c451fdc0 fd40: 41b58ab3 c1387729 c0261c28 c047fb5c c451fe2c c451fd60 c0525308 c048033c fd60: 188a3fb4 c3ccb090 c451fe00 c3ccb080 00000000 00000000 00016920 00000000 fd80: c02d0388 c047f55c c02d0388 00000000 c451fddc c451fda0 c02d0388 00000000 fda0: 41b58ab3 c13a72d0 c0524ff0 c1705f48 c451fdfc c451fdc0 c02d0388 c047f55c fdc0: 00016920 00000000 00000003 c1bb2384 c451fdfc c3ccb080 c1bb2384 00000000 fde0: 00000000 00000000 00000000 00000000 c451fe1c c451fe00 c04e9d70 c1705f48 fe00: c1b54cc4 c1bbc71c c3ccb080 00000000 c3ccb080 00000000 00000003 c451fec0 fe20: c451fe64 c451fe30 c0525918 c0524ffc c451feb0 c1705f48 00000000 c1b54cc4 fe40: b78a3fd0 c451ff60 00000000 0157f008 00000003 c451fec0 c451ffa4 c451fe68 fe60: c0265480 c0261c34 c451feb0 7fffffff 00000000 00000002 00000000 c4880000 fe80: 41b58ab3 c138777b c02652cc c04803ec 000a0000 c451ff00 ffffff9c b6ac9f60 fea0: c451fed4 c1705f48 c04a4a90 b78a3fdc f17ea000 ffffff9c b6ac9f60 c0100244 fec0: f17ea21a f17ea300 f17ea000 00016920 f1800240 f18000ac f17fb7dc 01316000 fee0: 013161b0 00002590 01316250 00000000 00000000 00000000 00002580 00000029 ff00: 0000002a 00000013 00000000 0000000c 00000000 00000000 0157f004 c451ffb0 ff20: c1719be0 aed6f410 c451ff74 c451ff38 c0c4103c c0c407d0 c451ff84 c451ff48 ff40: 00000805 c02c8658 c1604230 c1719c30 00000805 0157f004 00000005 c451ffb0 ff60: c1719be0 aed6f410 c451ffac c451ff78 c0122130 c1705f48 c451ffac 0157f008 ff80: 00000006 0000005f 0000017b c0100244 c4880000 0000017b 00000000 c451ffa8 ffa0: c0100060 c02652d8 0157f008 00000006 00000003 0157f008 00000000 b6ac9f60 ffc0: 0157f008 00000006 0000005f 0000017b 00000000 00000000 aed85f74 00000000 ffe0: b6ac9cd8 b6ac9cc8 00030200 aecf2d60 a0000010 00000003 00000000 00000000 Backtrace: [] 
(kasan_poison) from [] (kasan_unpoison+0x48/0x5c) [] (kasan_unpoison) from [] (__asan_register_globals+0x3c/0x64) r5:01312cff r4:f1804030 [] (__asan_register_globals) from [] (_sub_I_65535_1+0x18/0xf80 [hello]) r7:c5be7700 r6:00000001 r5:c1b52ccc r4:f18040c0 [] (_sub_I_65535_1 [hello]) from [] (do_init_module+0x330/0x72c) [] (do_init_module) from [] (load_module+0x3260/0x32a8) r10:f1804268 r9:c451fdf0 r8:00000000 r7:c1b54cc4 r6:00000000 r5:f18040c0 r4:c451fec0 [] (load_module) from [] (sys_finit_module+0x1b4/0x1e8) r10:c451fec0 r9:00000003 r8:0157f008 r7:00000000 r6:c451ff60 r5:b78a3fd0 r4:c1b54cc4 [] (sys_finit_module) from [] (ret_fast_syscall+0x0/0x1c) Exception stack(0xc451ffa8 to 0xc451fff0) ffa0: 0157f008 00000006 00000003 0157f008 00000000 b6ac9f60 ffc0: 0157f008 00000006 0000005f 0000017b 00000000 00000000 aed85f74 00000000 ffe0: b6ac9cd8 b6ac9cc8 00030200 aecf2d60 r10:0000017b r9:c4880000 r8:c0100244 r7:0000017b r6:0000005f r5:00000006 r4:0157f008 Code: e92d4100 e1a08001 e1a0e003 e2522040 (a8ac410a) ---[ end trace df6e12843197b6f5 ]--- Signed-off-by: Lecopzer Chen Tested-by: Linus Walleij Reviewed-by: Linus Walleij Signed-off-by: Russell King (Oracle) --- arch/arm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a2c9d30551b806..36489a8de566bc 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1530,6 +1530,7 @@ config HW_PERF_EVENTS config ARM_MODULE_PLTS bool "Use PLTs to allow module memory to spill over into vmalloc area" depends on MODULES + select KASAN_VMALLOC if KASAN default y help Allocate PLTs when loading modules so that jumps and calls whose From f50f5a5eac8092fb9b3365ca4b1d7407cdab8427 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 30 Jun 2022 17:01:00 +0800 Subject: [PATCH 0291/1250] ext4: fix reading leftover inlined symlinks Since commit 6493792d3299 ("ext4: convert symlink external data block mapping to bdev"), create new symlink with inline_data is not supported, but it missing 
to handle the leftover inlined symlinks, which could cause below error message and fail to read symlink. ls: cannot read symbolic link 'foo': Structure needs cleaning EXT4-fs error (device sda): ext4_map_blocks:605: inode #12: block 2021161080: comm ls: lblock 0 mapped to illegal pblock 2021161080 (length 1) Fix this regression by adding ext4_read_inline_link(), which read the inline data directly and convert it through a kmalloced buffer. Fixes: 6493792d3299 ("ext4: convert symlink external data block mapping to bdev") Reported-by: Torge Matthies Signed-off-by: Zhang Yi Tested-by: Torge Matthies Link: https://lore.kernel.org/r/20220630090100.2769490-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/inline.c | 30 ++++++++++++++++++++++++++++++ fs/ext4/symlink.c | 15 +++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 75b8d81b24692c..adfc30ee4b7bea 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3583,6 +3583,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int *has_inline, __u64 start, __u64 len); +extern void *ext4_read_inline_link(struct inode *inode); struct iomap; extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index cff52ff6549d2d..1fa36cbe09ecc0 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -6,6 +6,7 @@ #include #include +#include #include #include @@ -1588,6 +1589,35 @@ int ext4_read_inline_dir(struct file *file, return ret; } +void *ext4_read_inline_link(struct inode *inode) +{ + struct ext4_iloc iloc; + int ret, inline_size; + void *link; + + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) + return ERR_PTR(ret); + + ret = -ENOMEM; + inline_size = ext4_get_inline_size(inode); + link = kmalloc(inline_size + 1, GFP_NOFS); + if (!link) + goto out; + + ret = 
ext4_read_inline_data(inode, link, inline_size, &iloc); + if (ret < 0) { + kfree(link); + goto out; + } + nd_terminate_link(link, inode->i_size, ret); +out: + if (ret < 0) + link = ERR_PTR(ret); + brelse(iloc.bh); + return link; +} + struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval) diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index d281f5bcc5264f..3d3ed3c38f5644 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -74,6 +74,21 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct buffer_head *bh; + char *inline_link; + + /* + * Create a new inlined symlink is not supported, just provide a + * method to read the leftovers. + */ + if (ext4_has_inline_data(inode)) { + if (!dentry) + return ERR_PTR(-ECHILD); + + inline_link = ext4_read_inline_link(inode); + if (!IS_ERR(inline_link)) + set_delayed_call(callback, kfree_link, inline_link); + return inline_link; + } if (!dentry) { bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); From e781b8ce9261f353df91b94303e53c31fdf9871e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 29 Jun 2022 00:00:25 -0400 Subject: [PATCH 0292/1250] ext4: update s_overhead_clusters in the superblock during an on-line resize When doing an online resize, the on-disk superblock wasn't updated. This means that when the file system is unmounted and remounted, and the on-disk overhead value is non-zero, this would cause the results of statfs(2) to be incorrect. This was partially fixed by Commits 10b01ee92df5 ("ext4: fix overhead calculation to account for the reserved gdt blocks"), 85d825dbf489 ("ext4: force overhead calculation if the s_overhead_cluster makes no sense"), and eb7054212eac ("ext4: update the cached overhead value in the superblock").
However, since it was too expensive to forcibly recalculate the overhead for bigalloc file systems at every mount, this didn't fix the problem for bigalloc file systems. This commit should address the problem when resizing file systems with the bigalloc feature enabled. Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20220629040026.112371-1-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 8b70a470129314..e5c2713aa11ad4 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1484,6 +1484,7 @@ static void ext4_update_super(struct super_block *sb, * Update the fs overhead information */ ext4_calculate_overhead(sb); + es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead); if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: added group %u:" From 2c8204b83ceaf439dff2d1a94a7e2d3ad7619287 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 29 Jun 2022 00:00:26 -0400 Subject: [PATCH 0293/1250] ext4: update the s_overhead_clusters in the backup sb's when resizing When the EXT4_IOC_RESIZE_FS ioctl is complete, update the backup superblocks. We don't do this for the old-style resize ioctls since they are quite ancient, and only used by very old versions of resize2fs --- and we don't want to update the backup superblocks every time EXT4_IOC_GROUP_ADD is called, since it might get called a lot. 
Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Link: https://lore.kernel.org/r/20220629040026.112371-2-tytso@mit.edu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- fs/ext4/ioctl.c | 22 +++++++++++++++------- fs/ext4/resize.c | 5 ++++- fs/ext4/super.c | 2 +- 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index adfc30ee4b7bea..310e976ef1fdab 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3016,7 +3016,7 @@ int ext4_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); -int ext4_update_overhead(struct super_block *sb); +int ext4_update_overhead(struct super_block *sb, bool force); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); @@ -3800,7 +3800,7 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; extern int ext4_resize_begin(struct super_block *sb); -extern void ext4_resize_end(struct super_block *sb); +extern int ext4_resize_end(struct super_block *sb, bool update_backups); static inline void ext4_set_io_unwritten_flag(struct inode *inode, struct ext4_io_end *io_end) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index cb01c1da0f9d51..1702c574407a27 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -944,7 +944,9 @@ static long ext4_ioctl_group_add(struct file *file, test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, input->group); group_add_out: - ext4_resize_end(sb); + err2 = ext4_resize_end(sb, false); + if (err == 0) + err = err2; return err; } @@ -1223,7 +1225,9 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = err2; mnt_drop_write_file(filp); group_extend_out: - ext4_resize_end(sb); + err2 = ext4_resize_end(sb, false); + if (err == 0) + err = err2; return err; } @@ -1371,7 +1375,9 @@ 
static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = ext4_register_li_request(sb, o_group); resizefs_out: - ext4_resize_end(sb); + err2 = ext4_resize_end(sb, true); + if (err == 0) + err = err2; return err; } @@ -1599,13 +1605,15 @@ static void set_overhead(struct ext4_super_block *es, const void *arg) es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } -int ext4_update_overhead(struct super_block *sb) +int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sb_rdonly(sb) || sbi->s_overhead == 0 || - sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters)) + if (sb_rdonly(sb)) + return 0; + if (!force && + (sbi->s_overhead == 0 || + sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))) return 0; - return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e5c2713aa11ad4..e4e89ca82f8cc6 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -97,10 +97,13 @@ int ext4_resize_begin(struct super_block *sb) return ret; } -void ext4_resize_end(struct super_block *sb) +int ext4_resize_end(struct super_block *sb, bool update_backups) { clear_bit_unlock(EXT4_FLAGS_RESIZING, &EXT4_SB(sb)->s_ext4_flags); smp_mb__after_atomic(); + if (update_backups) + return ext4_update_overhead(sb, true); + return 0; } static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 845f2f8aee5f9b..6a8a752d812b25 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5523,7 +5523,7 @@ static int ext4_fill_super(struct super_block *sb, struct fs_context *fc) "Quota mode: %s.", descr, ext4_quota_mode(sb)); /* Update the s_overhead_clusters if necessary */ - ext4_update_overhead(sb); + ext4_update_overhead(sb, false); return 0; free_sbi: From 08779aaa3f63ccb5cb3a2b78135132231677085e Mon Sep 17 00:00:00 2001 From: hanjinke Date: Mon, 6 Jun 2022 23:53:05 
+0800 Subject: [PATCH 0294/1250] ext4: reuse order and buddy in mb_mark_used when buddy split After each buddy split, mb_mark_used will search the proper order for the block which may consume some loop in mb_find_order_for_block. In fact, we can reuse the order and buddy generated by the buddy split. Reviewed by: lei.rao@intel.com Signed-off-by: hanjinke Link: https://lore.kernel.org/r/20220606155305.74146-1-hanjinke.666@bytedance.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9e06334771a394..b02f71f07289d1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1933,6 +1933,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) unsigned ret = 0; int len0 = len; void *buddy; + bool split = false; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); @@ -1957,12 +1958,16 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain buddy itself */ while (len) { - ord = mb_find_order_for_block(e4b, start); + if (!split) + ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! 
*/ mlen = 1 << ord; - buddy = mb_find_buddy(e4b, ord, &max); + if (!split) + buddy = mb_find_buddy(e4b, ord, &max); + else + split = false; BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; @@ -1989,6 +1994,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) mb_clear_bit(cur + 1, buddy); e4b->bd_info->bb_counters[ord]++; e4b->bd_info->bb_counters[ord]++; + split = true; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); From d7acf6d8c57a29bb33eac2fe9b5af5a89053eee2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Jun 2022 13:23:47 +0200 Subject: [PATCH 0295/1250] ext4: use ext4_debug() instead of jbd_debug() We use jbd_debug() in some places in ext4. It seems a bit strange to use jbd2 debugging output function for ext4 code. Also these days ext4_debug() uses dynamic printk so each debug message can be enabled / disabled on its own so the time when it made some sense to have these combined (to allow easier common selecting of messages to report) has passed. Just convert all jbd_debug() uses in ext4 to ext4_debug(). 
Signed-off-by: Jan Kara Reviewed-by: Lukas Czerner Link: https://lore.kernel.org/r/20220608112355.4397-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 2 +- fs/ext4/ext4_jbd2.c | 3 +-- fs/ext4/fast_commit.c | 44 +++++++++++++++++++++---------------------- fs/ext4/indirect.c | 4 ++-- fs/ext4/inode.c | 2 +- fs/ext4/orphan.c | 24 +++++++++++------------ fs/ext4/super.c | 2 +- 7 files changed, 40 insertions(+), 41 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 78ee3ef795aec6..8ff4b9192a9f5d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -666,7 +666,7 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * it's possible we've just missed a transaction commit here, * so ignore the returned status */ - jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); + ext4_debug("%s: retrying operation after ENOSPC\n", sb->s_id); (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 3477a16d08aeeb..8e1fb18f465ea1 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -267,8 +267,7 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, trace_ext4_forget(inode, is_metadata, blocknr); BUFFER_TRACE(bh, "enter"); - jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " - "data mode %x\n", + ext4_debug("forgetting bh %p: is_metadata=%d, mode %o, data mode %x\n", bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 795a60ad189784..0349cd96e93546 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -917,8 +917,8 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) mutex_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; - jbd_debug(1, "%s: will try writing %d to %d for inode %ld\n", - __func__, cur_lblk_off, new_blk_size, inode->i_ino); + ext4_debug("will try writing %d to %d for inode %ld\n", + 
cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; @@ -1168,7 +1168,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status, { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; - jbd_debug(1, "Fast commit ended with status = %d for tid %u", + ext4_debug("Fast commit ended with status = %d for tid %u", status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; @@ -1375,14 +1375,14 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found", darg.ino); + ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { - jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); + ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } @@ -1407,21 +1407,21 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { - jbd_debug(1, "Dir with inode %d not found.", darg->parent_ino); + ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) { - jbd_debug(1, "Failed to obtain dentry"); + ext4_debug("Failed to obtain dentry"); dentry_dir = NULL; goto out; } dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) { - jbd_debug(1, "Inode dentry not created."); + ext4_debug("Inode dentry not created."); ret = -ENOMEM; goto out; } @@ -1434,7 +1434,7 @@ static int ext4_fc_replay_link_internal(struct super_block *sb, * could complete. 
*/ if (ret && ret != -EEXIST) { - jbd_debug(1, "Failed to link\n"); + ext4_debug("Failed to link\n"); goto out; } @@ -1468,7 +1468,7 @@ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode not found."); + ext4_debug("Inode not found."); return 0; } @@ -1576,7 +1576,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl, /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode not found."); + ext4_debug("Inode not found."); return -EFSCORRUPTED; } @@ -1630,7 +1630,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "inode %d not found.", darg.ino); + ext4_debug("inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; goto out; @@ -1643,7 +1643,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { - jbd_debug(1, "Dir %d not found.", darg.ino); + ext4_debug("Dir %d not found.", darg.ino); goto out; } ret = ext4_init_new_dir(NULL, dir, inode); @@ -1727,7 +1727,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode not found."); + ext4_debug("Inode not found."); return 0; } @@ -1741,7 +1741,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, cur = start; remaining = len; - jbd_debug(1, "ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", + ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); @@ -1802,7 +1802,7 @@ static int 
ext4_fc_replay_add_range(struct super_block *sb, } /* Range is mapped and needs a state change */ - jbd_debug(1, "Converting from %ld to %d %lld", + ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, @@ -1845,7 +1845,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange.fc_ino)); + ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } @@ -1853,7 +1853,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl, if (ret) goto out; - jbd_debug(1, "DEL_RANGE, inode %ld, lblk %d, len %d\n", + ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { @@ -1902,7 +1902,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); if (IS_ERR(inode)) { - jbd_debug(1, "Inode %d not found.", + ext4_debug("Inode %d not found.", state->fc_modified_inodes[i]); continue; } @@ -2031,7 +2031,7 @@ static int ext4_fc_replay_scan(journal_t *journal, for (cur = start; cur < end; cur = cur + sizeof(tl) + le16_to_cpu(tl.fc_len)) { memcpy(&tl, cur, sizeof(tl)); val = cur + sizeof(tl); - jbd_debug(3, "Scan phase, tag:%s, blk %lld\n", + ext4_debug("Scan phase, tag:%s, blk %lld\n", tag2str(le16_to_cpu(tl.fc_tag)), bh->b_blocknr); switch (le16_to_cpu(tl.fc_tag)) { case EXT4_FC_TAG_ADD_RANGE: @@ -2126,7 +2126,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { - jbd_debug(1, "Replay stops\n"); + ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } @@ -2150,7 
+2150,7 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, ext4_fc_set_bitmaps_and_counters(sb); break; } - jbd_debug(3, "Replay phase, tag:%s\n", + ext4_debug("Replay phase, tag:%s\n", tag2str(le16_to_cpu(tl.fc_tag))); state->fc_replay_num_tags--; switch (le16_to_cpu(tl.fc_tag)) { diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 07a8c75b65edc5..860fc51190098f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -460,7 +460,7 @@ static int ext4_splice_branch(handle_t *handle, * the new i_size. But that is not done here - it is done in * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ - jbd_debug(5, "splicing indirect only\n"); + ext4_debug("splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) @@ -472,7 +472,7 @@ static int ext4_splice_branch(handle_t *handle, err = ext4_mark_inode_dirty(handle, ar->inode); if (unlikely(err)) goto err_out; - jbd_debug(5, "splicing direct\n"); + ext4_debug("splicing direct\n"); } return err; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 84c0eb55071d65..33fcf5ef0f6b2a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5213,7 +5213,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { - jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); + ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 7de0612eb42d81..69a9cf9137a610 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -181,8 +181,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) } else brelse(iloc.bh); - jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); - jbd_debug(4, "orphan inode %lu will point to %d\n", + ext4_debug("superblock will point to %lu\n", inode->i_ino); + ext4_debug("orphan inode 
%lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out: ext4_std_error(sb, err); @@ -251,7 +251,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) } mutex_lock(&sbi->s_orphan_lock); - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + ext4_debug("remove inode %lu from orphan list\n", inode->i_ino); prev = ei->i_orphan.prev; list_del_init(&ei->i_orphan); @@ -267,7 +267,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) ino_next = NEXT_ORPHAN(inode); if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %u\n", ino_next); + ext4_debug("superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, inode->i_sb, sbi->s_sbh, EXT4_JTR_NONE); @@ -286,7 +286,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - jbd_debug(4, "orphan inode %lu will point to %u\n", + ext4_debug("orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) { @@ -332,8 +332,8 @@ static void ext4_process_orphan(struct inode *inode, ext4_msg(sb, KERN_DEBUG, "%s: truncating inode %lu to %lld bytes", __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %lld bytes\n", - inode->i_ino, inode->i_size); + ext4_debug("truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ret = ext4_truncate(inode); @@ -353,8 +353,8 @@ static void ext4_process_orphan(struct inode *inode, ext4_msg(sb, KERN_DEBUG, "%s: deleting unreferenced inode %lu", __func__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %lu\n", - inode->i_ino); + ext4_debug("deleting unreferenced inode %lu\n", + inode->i_ino); (*nr_orphans)++; } iput(inode); /* The delete magic happens here! 
*/ @@ -391,7 +391,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) int inodes_per_ob = ext4_inodes_per_orphan_block(sb); if (!es->s_last_orphan && !oi->of_blocks) { - jbd_debug(4, "no orphan inodes to clean up\n"); + ext4_debug("no orphan inodes to clean up\n"); return; } @@ -415,7 +415,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) "clearing orphan list.\n"); es->s_last_orphan = 0; } - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + ext4_debug("Skipping orphan recovery on fs with errors.\n"); return; } @@ -459,7 +459,7 @@ void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) * so, skip the rest. */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + ext4_debug("Skipping orphan recovery on fs with errors.\n"); es->s_last_orphan = 0; break; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6a8a752d812b25..a6d71a41a0c4de 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5585,7 +5585,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, return NULL; } - jbd_debug(2, "Journal inode found at %p: %lld bytes\n", + ext4_debug("Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); From f237450c7436c18446e6fc20c9da50825c1cb382 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Jun 2022 13:23:48 +0200 Subject: [PATCH 0296/1250] jbd2: rename jbd_debug() to jbd2_debug() The name of jbd_debug() is confusing as all functions inside jbd2 have jbd2_ prefix. Rename jbd_debug() to jbd2_debug(). No functional changes. 
Signed-off-by: Jan Kara Reviewed-by: Lukas Czerner Link: https://lore.kernel.org/r/20220608112355.4397-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 6 +++--- fs/jbd2/commit.c | 30 +++++++++++++++--------------- fs/jbd2/journal.c | 34 +++++++++++++++++----------------- fs/jbd2/recovery.c | 30 +++++++++++++++--------------- fs/jbd2/revoke.c | 8 ++++---- fs/jbd2/transaction.c | 26 +++++++++++++------------- include/linux/jbd2.h | 4 ++-- 7 files changed, 69 insertions(+), 69 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 746132998c577f..51bd38da21cdda 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -203,7 +203,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) tid_t this_tid; int result, batch_count = 0; - jbd_debug(1, "Start checkpoint\n"); + jbd2_debug(1, "Start checkpoint\n"); /* * First thing: if there are any transactions in the log which @@ -212,7 +212,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) */ result = jbd2_cleanup_journal_tail(journal); trace_jbd2_checkpoint(journal, result); - jbd_debug(1, "cleanup_journal_tail returned %d\n", result); + jbd2_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -804,5 +804,5 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact trace_jbd2_drop_transaction(journal, transaction); - jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); + jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index eb315e81f1a6b9..aa14f20241d757 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -421,7 +421,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior jbd2_journal_flush? 
*/ if (journal->j_flags & JBD2_FLUSHED) { - jbd_debug(3, "super block updated\n"); + jbd2_debug(3, "super block updated\n"); mutex_lock_io(&journal->j_checkpoint_mutex); /* * We hold j_checkpoint_mutex so tail cannot change under us. @@ -435,7 +435,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) REQ_SYNC); mutex_unlock(&journal->j_checkpoint_mutex); } else { - jbd_debug(3, "superblock not updated\n"); + jbd2_debug(3, "superblock not updated\n"); } J_ASSERT(journal->j_running_transaction != NULL); @@ -467,7 +467,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; trace_jbd2_start_commit(journal, commit_transaction); - jbd_debug(1, "JBD2: starting commit of transaction %d\n", + jbd2_debug(1, "JBD2: starting commit of transaction %d\n", commit_transaction->t_tid); write_lock(&journal->j_state_lock); @@ -540,7 +540,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) __jbd2_journal_clean_checkpoint_list(journal, false); spin_unlock(&journal->j_list_lock); - jbd_debug(3, "JBD2: commit phase 1\n"); + jbd2_debug(3, "JBD2: commit phase 1\n"); /* * Clear revoked flag to reflect there is no revoked buffers @@ -573,7 +573,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) wake_up(&journal->j_wait_transaction_locked); write_unlock(&journal->j_state_lock); - jbd_debug(3, "JBD2: commit phase 2a\n"); + jbd2_debug(3, "JBD2: commit phase 2a\n"); /* * Now start flushing things to disk, in the order they appear @@ -586,7 +586,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) blk_start_plug(&plug); jbd2_journal_write_revoke_records(commit_transaction, &log_bufs); - jbd_debug(3, "JBD2: commit phase 2b\n"); + jbd2_debug(3, "JBD2: commit phase 2b\n"); /* * Way to go: we have now written out all of the data for a @@ -642,7 +642,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (!descriptor) { J_ASSERT (bufs == 0); - jbd_debug(4, "JBD2: get descriptor\n"); + 
jbd2_debug(4, "JBD2: get descriptor\n"); descriptor = jbd2_journal_get_descriptor_buffer( commit_transaction, @@ -652,7 +652,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) continue; } - jbd_debug(4, "JBD2: got buffer %llu (%p)\n", + jbd2_debug(4, "JBD2: got buffer %llu (%p)\n", (unsigned long long)descriptor->b_blocknr, descriptor->b_data); tagp = &descriptor->b_data[sizeof(journal_header_t)]; @@ -737,7 +737,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_buffers == NULL || space_left < tag_bytes + 16 + csum_size) { - jbd_debug(4, "JBD2: Submit %d IOs\n", bufs); + jbd2_debug(4, "JBD2: Submit %d IOs\n", bufs); /* Write an end-of-descriptor marker before submitting the IOs. "tag" still points to @@ -839,7 +839,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) so we incur less scheduling load. */ - jbd_debug(3, "JBD2: commit phase 3\n"); + jbd2_debug(3, "JBD2: commit phase 3\n"); while (!list_empty(&io_bufs)) { struct buffer_head *bh = list_entry(io_bufs.prev, @@ -882,7 +882,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) J_ASSERT (commit_transaction->t_shadow_list == NULL); - jbd_debug(3, "JBD2: commit phase 4\n"); + jbd2_debug(3, "JBD2: commit phase 4\n"); /* Here we wait for the revoke record and descriptor record buffers */ while (!list_empty(&log_bufs)) { @@ -906,7 +906,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (err) jbd2_journal_abort(journal, err); - jbd_debug(3, "JBD2: commit phase 5\n"); + jbd2_debug(3, "JBD2: commit phase 5\n"); write_lock(&journal->j_state_lock); J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH); commit_transaction->t_state = T_COMMIT_JFLUSH; @@ -945,7 +945,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) transaction can be removed from any checkpoint list it was on before. 
*/ - jbd_debug(3, "JBD2: commit phase 6\n"); + jbd2_debug(3, "JBD2: commit phase 6\n"); J_ASSERT(list_empty(&commit_transaction->t_inode_list)); J_ASSERT(commit_transaction->t_buffers == NULL); @@ -1122,7 +1122,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Done with this transaction! */ - jbd_debug(3, "JBD2: commit phase 7\n"); + jbd2_debug(3, "JBD2: commit phase 7\n"); J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH); @@ -1164,7 +1164,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid); trace_jbd2_end_commit(journal, commit_transaction); - jbd_debug(1, "JBD2: commit %d complete, head %d\n", + jbd2_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); write_lock(&journal->j_state_lock); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c0cbeeaec2d1aa..0a8ff211fac177 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -203,11 +203,11 @@ static int kjournald2(void *arg) if (journal->j_flags & JBD2_UNMOUNT) goto end_loop; - jbd_debug(1, "commit_sequence=%u, commit_request=%u\n", + jbd2_debug(1, "commit_sequence=%u, commit_request=%u\n", journal->j_commit_sequence, journal->j_commit_request); if (journal->j_commit_sequence != journal->j_commit_request) { - jbd_debug(1, "OK, requests differ\n"); + jbd2_debug(1, "OK, requests differ\n"); write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); @@ -222,7 +222,7 @@ static int kjournald2(void *arg) * good idea, because that depends on threads that may * be already stopped. 
*/ - jbd_debug(1, "Now suspending kjournald2\n"); + jbd2_debug(1, "Now suspending kjournald2\n"); write_unlock(&journal->j_state_lock); try_to_freeze(); write_lock(&journal->j_state_lock); @@ -252,7 +252,7 @@ static int kjournald2(void *arg) finish_wait(&journal->j_wait_commit, &wait); } - jbd_debug(1, "kjournald2 wakes\n"); + jbd2_debug(1, "kjournald2 wakes\n"); /* * Were we woken up by a commit wakeup event? @@ -260,7 +260,7 @@ static int kjournald2(void *arg) transaction = journal->j_running_transaction; if (transaction && time_after_eq(jiffies, transaction->t_expires)) { journal->j_commit_request = transaction->t_tid; - jbd_debug(1, "woke because of timeout\n"); + jbd2_debug(1, "woke because of timeout\n"); } goto loop; @@ -268,7 +268,7 @@ static int kjournald2(void *arg) del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); - jbd_debug(1, "Journal thread exiting.\n"); + jbd2_debug(1, "Journal thread exiting.\n"); write_unlock(&journal->j_state_lock); return 0; } @@ -500,7 +500,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) */ journal->j_commit_request = target; - jbd_debug(1, "JBD2: requesting commit %u/%u\n", + jbd2_debug(1, "JBD2: requesting commit %u/%u\n", journal->j_commit_request, journal->j_commit_sequence); journal->j_running_transaction->t_requested = jiffies; @@ -705,7 +705,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) } #endif while (tid_gt(tid, journal->j_commit_sequence)) { - jbd_debug(1, "JBD2: want %u, j_commit_sequence=%u\n", + jbd2_debug(1, "JBD2: want %u, j_commit_sequence=%u\n", tid, journal->j_commit_sequence); read_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_commit); @@ -1117,7 +1117,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) freed += journal->j_last - journal->j_first; trace_jbd2_update_log_tail(journal, tid, block, freed); - jbd_debug(1, + jbd2_debug(1, "Cleaning journal tail from %u to %u (offset 
%lu), " "freeing %lu\n", journal->j_tail_sequence, tid, block, freed); @@ -1496,7 +1496,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) return NULL; } - jbd_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", + jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", inode->i_sb->s_id, inode->i_ino, (long long) inode->i_size, inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); @@ -1575,7 +1575,7 @@ static int journal_reset(journal_t *journal) * attempting a write to a potential-readonly device. */ if (sb->s_start == 0) { - jbd_debug(1, "JBD2: Skipping superblock update on recovered sb " + jbd2_debug(1, "JBD2: Skipping superblock update on recovered sb " "(start %ld, seq %u, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1678,7 +1678,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, } BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); - jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", + jbd2_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", tail_block, tail_tid); lock_buffer(journal->j_sb_buffer); @@ -1719,7 +1719,7 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op) return; } - jbd_debug(1, "JBD2: Marking journal as empty (seq %u)\n", + jbd2_debug(1, "JBD2: Marking journal as empty (seq %u)\n", journal->j_tail_sequence); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); @@ -1862,7 +1862,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) errcode = journal->j_errno; if (errcode == -ESHUTDOWN) errcode = 0; - jbd_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); + jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); sb->s_errno = cpu_to_be32(errcode); jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA); @@ -2334,7 +2334,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, compat & JBD2_FEATURE_COMPAT_CHECKSUM) compat &= 
~JBD2_FEATURE_COMPAT_CHECKSUM; - jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", + jbd2_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n", compat, ro, incompat); sb = journal->j_superblock; @@ -2403,7 +2403,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat, { journal_superblock_t *sb; - jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", + jbd2_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n", compat, ro, incompat); sb = journal->j_superblock; @@ -2860,7 +2860,7 @@ static struct journal_head *journal_alloc_journal_head(void) #endif ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); if (!ret) { - jbd_debug(1, "out of memory for journal_head\n"); + jbd2_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS | __GFP_NOFAIL); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 8ca3527189f871..63594030afd31a 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -245,11 +245,11 @@ static int fc_do_one_pass(journal_t *journal, return 0; while (next_fc_block <= journal->j_fc_last) { - jbd_debug(3, "Fast commit replay: next block %ld\n", + jbd2_debug(3, "Fast commit replay: next block %ld\n", next_fc_block); err = jread(&bh, journal, next_fc_block); if (err) { - jbd_debug(3, "Fast commit replay: read error\n"); + jbd2_debug(3, "Fast commit replay: read error\n"); break; } @@ -263,7 +263,7 @@ static int fc_do_one_pass(journal_t *journal, } if (err) - jbd_debug(3, "Fast commit replay failed, err = %d\n", err); + jbd2_debug(3, "Fast commit replay failed, err = %d\n", err); return err; } @@ -297,7 +297,7 @@ int jbd2_journal_recover(journal_t *journal) */ if (!sb->s_start) { - jbd_debug(1, "No recovery required, last transaction %d\n", + jbd2_debug(1, "No recovery required, last transaction %d\n", be32_to_cpu(sb->s_sequence)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; return 0; @@ -309,10 
+309,10 @@ int jbd2_journal_recover(journal_t *journal) if (!err) err = do_one_pass(journal, &info, PASS_REPLAY); - jbd_debug(1, "JBD2: recovery, exit status %d, " + jbd2_debug(1, "JBD2: recovery, exit status %d, " "recovered transactions %u to %u\n", err, info.start_transaction, info.end_transaction); - jbd_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", + jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n", info.nr_replays, info.nr_revoke_hits, info.nr_revokes); /* Restart the log at the next transaction ID, thus invalidating @@ -362,7 +362,7 @@ int jbd2_journal_skip_recovery(journal_t *journal) #ifdef CONFIG_JBD2_DEBUG int dropped = info.end_transaction - be32_to_cpu(journal->j_superblock->s_sequence); - jbd_debug(1, + jbd2_debug(1, "JBD2: ignoring %d transaction%s from the journal.\n", dropped, (dropped == 1) ? "" : "s"); #endif @@ -484,7 +484,7 @@ static int do_one_pass(journal_t *journal, if (pass == PASS_SCAN) info->start_transaction = first_commit_ID; - jbd_debug(1, "Starting recovery pass %d\n", pass); + jbd2_debug(1, "Starting recovery pass %d\n", pass); /* * Now we walk through the log, transaction by transaction, @@ -510,7 +510,7 @@ static int do_one_pass(journal_t *journal, if (tid_geq(next_commit_ID, info->end_transaction)) break; - jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", + jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", next_commit_ID, next_log_block, jbd2_has_feature_fast_commit(journal) ? journal->j_fc_last : journal->j_last); @@ -519,7 +519,7 @@ static int do_one_pass(journal_t *journal, * either the next descriptor block or the final commit * record. 
*/ - jbd_debug(3, "JBD2: checking block %ld\n", next_log_block); + jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block); err = jread(&bh, journal, next_log_block); if (err) goto failed; @@ -542,7 +542,7 @@ static int do_one_pass(journal_t *journal, blocktype = be32_to_cpu(tmp->h_blocktype); sequence = be32_to_cpu(tmp->h_sequence); - jbd_debug(3, "Found magic %d, sequence %d\n", + jbd2_debug(3, "Found magic %d, sequence %d\n", blocktype, sequence); if (sequence != next_commit_ID) { @@ -575,7 +575,7 @@ static int do_one_pass(journal_t *journal, goto failed; } need_check_commit_time = true; - jbd_debug(1, + jbd2_debug(1, "invalid descriptor block found in %lu\n", next_log_block); } @@ -758,7 +758,7 @@ static int do_one_pass(journal_t *journal, * It likely does not belong to same journal, * just end this recovery with success. */ - jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", + jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); brelse(bh); goto done; @@ -826,7 +826,7 @@ static int do_one_pass(journal_t *journal, if (pass == PASS_SCAN && !jbd2_descriptor_block_csum_verify(journal, bh->b_data)) { - jbd_debug(1, "JBD2: invalid revoke block found in %lu\n", + jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n", next_log_block); need_check_commit_time = true; } @@ -845,7 +845,7 @@ static int do_one_pass(journal_t *journal, continue; default: - jbd_debug(3, "Unrecognised magic %d, end of scan.\n", + jbd2_debug(3, "Unrecognised magic %d, end of scan.\n", blocktype); brelse(bh); goto done; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index fa608788b93d7c..4556e468902449 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -398,7 +398,7 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, } handle->h_revoke_credits--; - jbd_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); + jbd2_debug(2, "insert revoke for block %llu, 
bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); @@ -428,7 +428,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) int did_revoke = 0; /* akpm: debug */ struct buffer_head *bh = jh2bh(jh); - jbd_debug(4, "journal_head %p, cancelling revoke\n", jh); + jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); /* Is the existing Revoke bit valid? If so, we trust it, and * only perform the full cancel if the revoke bit is set. If @@ -444,7 +444,7 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) if (need_cancel) { record = find_revoke_record(journal, bh->b_blocknr); if (record) { - jbd_debug(4, "cancelled existing revoke on " + jbd2_debug(4, "cancelled existing revoke on " "blocknr %llu\n", (unsigned long long)bh->b_blocknr); spin_lock(&journal->j_revoke_lock); list_del(&record->hash); @@ -560,7 +560,7 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction, } if (descriptor) flush_descriptor(journal, descriptor, offset); - jbd_debug(1, "Wrote %d revoke records\n", count); + jbd2_debug(1, "Wrote %d revoke records\n", count); } /* diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e9c308ae475fd7..1d1a926b25c5be 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -373,7 +373,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, return -ENOMEM; } - jbd_debug(3, "New handle %p going live.\n", handle); + jbd2_debug(3, "New handle %p going live.\n", handle); /* * We need to hold j_state_lock until t_updates has been incremented, @@ -453,7 +453,7 @@ static int start_this_handle(journal_t *journal, handle_t *handle, handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); - jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", + jbd2_debug(4, "Handle %p given %d credits (total %d, free %lu)\n", handle, blocks, 
atomic_read(&transaction->t_outstanding_credits), jbd2_log_space_left(journal)); @@ -674,7 +674,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) /* Don't extend a locked-down transaction! */ if (transaction->t_state != T_RUNNING) { - jbd_debug(3, "denied handle %p %d blocks: " + jbd2_debug(3, "denied handle %p %d blocks: " "transaction not running\n", handle, nblocks); goto error_out; } @@ -689,7 +689,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) &transaction->t_outstanding_credits); if (wanted > journal->j_max_transaction_buffers) { - jbd_debug(3, "denied handle %p %d blocks: " + jbd2_debug(3, "denied handle %p %d blocks: " "transaction too large\n", handle, nblocks); atomic_sub(nblocks, &transaction->t_outstanding_credits); goto error_out; @@ -707,7 +707,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records) handle->h_revoke_credits_requested += revoke_records; result = 0; - jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); + jbd2_debug(3, "extended handle %p by %d\n", handle, nblocks); error_out: read_unlock(&journal->j_state_lock); return result; @@ -795,7 +795,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int revoke_records, * First unlink the handle from its current transaction, and start the * commit on that. 
*/ - jbd_debug(2, "restarting handle %p\n", handle); + jbd2_debug(2, "restarting handle %p\n", handle); stop_this_handle(handle); handle->h_transaction = NULL; @@ -979,7 +979,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, journal = transaction->t_journal; - jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); + jbd2_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy); JBUFFER_TRACE(jh, "entry"); repeat: @@ -1271,7 +1271,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) struct journal_head *jh = jbd2_journal_add_journal_head(bh); int err; - jbd_debug(5, "journal_head %p\n", jh); + jbd2_debug(5, "journal_head %p\n", jh); err = -EROFS; if (is_handle_aborted(handle)) goto out; @@ -1496,7 +1496,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * of the running transaction. */ jh = bh2jh(bh); - jbd_debug(5, "journal_head %p\n", jh); + jbd2_debug(5, "journal_head %p\n", jh); JBUFFER_TRACE(jh, "entry"); /* @@ -1818,7 +1818,7 @@ int jbd2_journal_stop(handle_t *handle) pid_t pid; if (--handle->h_ref > 0) { - jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + jbd2_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, handle->h_ref); if (is_handle_aborted(handle)) return -EIO; @@ -1838,7 +1838,7 @@ int jbd2_journal_stop(handle_t *handle) if (is_handle_aborted(handle)) err = -EIO; - jbd_debug(4, "Handle %p going down\n", handle); + jbd2_debug(4, "Handle %p going down\n", handle); trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, tid, handle->h_type, handle->h_line_no, jiffies - handle->h_start_jiffies, @@ -1916,7 +1916,7 @@ int jbd2_journal_stop(handle_t *handle) * completes the commit thread, it just doesn't write * anything to disk. 
*/ - jbd_debug(2, "transaction too old, requesting commit for " + jbd2_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ jbd2_log_start_commit(journal, tid); @@ -2662,7 +2662,7 @@ static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, return -EROFS; journal = transaction->t_journal; - jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, + jbd2_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, transaction->t_tid); spin_lock(&journal->j_list_lock); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index e79d6e0b14e8e8..d4d59e43769ff9 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -58,10 +58,10 @@ extern ushort jbd2_journal_enable_debug; void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...); -#define jbd_debug(n, fmt, a...) \ +#define jbd2_debug(n, fmt, a...) \ __jbd2_debug((n), __FILE__, __func__, __LINE__, (fmt), ##a) #else -#define jbd_debug(n, fmt, a...) no_printk(fmt, ##a) +#define jbd2_debug(n, fmt, a...) no_printk(fmt, ##a) #endif extern void *jbd2_alloc(size_t size, gfp_t flags); From c56ed6eec06d47932d296a3ba64d0e4dca6bb5d4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Jun 2022 13:23:49 +0200 Subject: [PATCH 0297/1250] jbd2: remove unused exports for jbd2 debugging Jbd2 exports jbd2_journal_enable_debug and __jbd2_debug() even though the first is used only in fs/jbd2/journal.c and the second only within jbd2 code. Remove the pointless exports and make jbd2_journal_enable_debug static.
Signed-off-by: Jan Kara Reviewed-by: Lukas Czerner Link: https://lore.kernel.org/r/20220608112355.4397-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 4 +--- include/linux/jbd2.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 0a8ff211fac177..f38f57942700be 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -49,8 +49,7 @@ #include #ifdef CONFIG_JBD2_DEBUG -ushort jbd2_journal_enable_debug __read_mostly; -EXPORT_SYMBOL(jbd2_journal_enable_debug); +static ushort jbd2_journal_enable_debug __read_mostly; module_param_named(jbd2_debug, jbd2_journal_enable_debug, ushort, 0644); MODULE_PARM_DESC(jbd2_debug, "Debugging level for jbd2"); @@ -115,7 +114,6 @@ void __jbd2_debug(int level, const char *file, const char *func, printk(KERN_DEBUG "%s: (%s, %u): %pV", file, func, line, &vaf); va_end(args); } -EXPORT_SYMBOL(__jbd2_debug); #endif /* Checksumming functions */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d4d59e43769ff9..6c2aa61e0f732f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -54,7 +54,6 @@ * CONFIG_JBD2_DEBUG is on. */ #define JBD2_EXPENSIVE_CHECKING -extern ushort jbd2_journal_enable_debug; void __jbd2_debug(int level, const char *file, const char *func, unsigned int line, const char *fmt, ...); From 6c8bc8dd6d827d8bc0ddf85ac7da311a7a7faed0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Jun 2022 13:23:50 +0200 Subject: [PATCH 0298/1250] jbd2: unexport jbd2_log_start_commit() jbd2_log_start_commit() is not used outside of jbd2 so unexport it. Also make __jbd2_log_start_commit() static when we are at it. 
Signed-off-by: Jan Kara Reviewed-by: Lukas Czerner Link: https://lore.kernel.org/r/20220608112355.4397-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 3 +-- include/linux/jbd2.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f38f57942700be..97e205a0689d28 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -80,7 +80,6 @@ EXPORT_SYMBOL(jbd2_journal_errno); EXPORT_SYMBOL(jbd2_journal_ack_err); EXPORT_SYMBOL(jbd2_journal_clear_err); EXPORT_SYMBOL(jbd2_log_wait_commit); -EXPORT_SYMBOL(jbd2_log_start_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); @@ -479,7 +478,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, * Called with j_state_lock locked for writing. * Returns true if a transaction commit was started. */ -int __jbd2_log_start_commit(journal_t *journal, tid_t target) +static int __jbd2_log_start_commit(journal_t *journal, tid_t target) { /* Return if the txn has already requested to be committed */ if (journal->j_commit_request == target) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 6c2aa61e0f732f..164ddf1211c0fe 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1646,7 +1646,6 @@ extern void jbd2_clear_buffer_revoked_flags(journal_t *journal); */ int jbd2_log_start_commit(journal_t *journal, tid_t tid); -int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_transaction_committed(journal_t *journal, tid_t tid); From f8dc286e4d942dab79d1814e0708ac91052a34fa Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sat, 11 Jun 2022 21:04:26 +0800 Subject: [PATCH 0299/1250] jbd2: fix outstanding credits assert in jbd2_journal_commit_transaction() We catch an assert problem in jbd2_journal_commit_transaction() when doing 
fsstress and request fault injection tests. The problem happens in a race condition between jbd2_journal_commit_transaction() and ext4_end_io_end(). Firstly, ext4_writepages() writes back dirty pages and starts a reserved handle, and then the journal is aborted due to some previous metadata IO error, jbd2_journal_abort() starts to commit the current running transaction, the committing procedure could be raced by ext4_end_io_end() and lead to subtracting j_reserved_credits twice from commit_transaction->t_outstanding_credits, finally the t_outstanding_credits is mistakenly smaller than t_nr_buffers and triggers the assert. kjournald2 kworker jbd2_journal_commit_transaction() write_unlock(&journal->j_state_lock); atomic_sub(j_reserved_credits, t_outstanding_credits); //sub once jbd2_journal_start_reserved() start_this_handle() //detect aborted journal jbd2_journal_free_reserved() //get running transaction read_lock(&journal->j_state_lock) __jbd2_journal_unreserve_handle() atomic_sub(j_reserved_credits, t_outstanding_credits); //sub again read_unlock(&journal->j_state_lock); journal->j_running_transaction = NULL; J_ASSERT(t_nr_buffers <= t_outstanding_credits) //bomb!!! Fix this issue by using journal->j_state_lock to protect the subtraction in jbd2_journal_commit_transaction().
Fixes: 96f1e0974575 ("jbd2: avoid long hold times of j_state_lock while committing a transaction") Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220611130426.2013258-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index aa14f20241d757..7cb4f5de8155af 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -553,13 +553,13 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_switch_revoke_table(journal); + write_lock(&journal->j_state_lock); /* * Reserved credits cannot be claimed anymore, free them */ atomic_sub(atomic_read(&journal->j_reserved_credits), &commit_transaction->t_outstanding_credits); - write_lock(&journal->j_state_lock); trace_jbd2_commit_flushing(journal, commit_transaction); stats.run.rs_flushing = jiffies; stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, From 5ba7b490d9fce87b2aea9de27e13da6ef5300a17 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 7 Jul 2022 16:31:21 +0200 Subject: [PATCH 0300/1250] block/rnbd-srv: Set keep_id to true after mutex_trylock After setting keep_id if the mutex trylock fails, the keep_id stays set for the rest of the sess_dev lifetime. Therefore, set keep_id to true after mutex_trylock succeeds, so that a failure of trylock does'nt touch keep_id. 
Fixes: b168e1d85cf3 ("block/rnbd-srv: Prevent a deadlock generated by accessing sysfs in parallel") Cc: gi-oh.kim@ionos.com Signed-off-by: Md Haris Iqbal Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20220707143122.460362-2-haris.iqbal@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index beaef43a67b9de..cf9e29a08db218 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -323,10 +323,11 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev, { struct rnbd_srv_session *sess = sess_dev->sess; - sess_dev->keep_id = true; /* It is already started to close by client's close message. */ if (!mutex_trylock(&sess->lock)) return; + + sess_dev->keep_id = true; /* first remove sysfs itself to avoid deadlock */ sysfs_remove_file_self(&sess_dev->kobj, &attr->attr); rnbd_srv_destroy_dev_session_sysfs(sess_dev); From cf9db9e0f6fd15aa044d32e4018c3a572534a9a7 Mon Sep 17 00:00:00 2001 From: Md Haris Iqbal Date: Thu, 7 Jul 2022 16:31:22 +0200 Subject: [PATCH 0301/1250] block/rnbd-srv: Replace sess_dev_list with index_idr The structure rnbd_srv_session maintains a list and an xarray of rnbd_srv_dev. There is no need to keep both as one of them can serve the purpose. Since one of the places where the lookup of rnbd_srv_dev using rnbd_srv_session is IO path, an xarray would serve us better than a list traversal. Hence remove sess_dev_list from rnbd_srv_session, and replace its uses from xarray. 
Signed-off-by: Md Haris Iqbal Reviewed-by: Aleksei Marov Signed-off-by: Jack Wang Link: https://lore.kernel.org/r/20220707143122.460362-3-haris.iqbal@ionos.com Signed-off-by: Jens Axboe --- drivers/block/rnbd/rnbd-srv.c | 17 +++++++---------- drivers/block/rnbd/rnbd-srv.h | 4 ---- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index cf9e29a08db218..9a80fbce775a21 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -224,7 +224,6 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) wait_for_completion(&dc); /* wait for inflights to drop to zero */ rnbd_dev_close(sess_dev->rnbd_dev); - list_del(&sess_dev->sess_list); mutex_lock(&sess_dev->dev->lock); list_del(&sess_dev->dev_list); if (sess_dev->open_flags & FMODE_WRITE) @@ -239,14 +238,14 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id) static void destroy_sess(struct rnbd_srv_session *srv_sess) { - struct rnbd_srv_sess_dev *sess_dev, *tmp; + struct rnbd_srv_sess_dev *sess_dev; + unsigned long index; - if (list_empty(&srv_sess->sess_dev_list)) + if (xa_empty(&srv_sess->index_idr)) goto out; mutex_lock(&srv_sess->lock); - list_for_each_entry_safe(sess_dev, tmp, &srv_sess->sess_dev_list, - sess_list) + xa_for_each(&srv_sess->index_idr, index, sess_dev) rnbd_srv_destroy_dev_session_sysfs(sess_dev); mutex_unlock(&srv_sess->lock); @@ -281,7 +280,6 @@ static int create_sess(struct rtrs_srv_sess *rtrs) srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs); xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC); - INIT_LIST_HEAD(&srv_sess->sess_dev_list); mutex_init(&srv_sess->lock); mutex_lock(&sess_lock); list_add(&srv_sess->list, &sess_list); @@ -667,11 +665,12 @@ static struct rnbd_srv_sess_dev * find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name) { struct rnbd_srv_sess_dev *sess_dev; + unsigned long index; - if 
(list_empty(&srv_sess->sess_dev_list)) + if (xa_empty(&srv_sess->index_idr)) return NULL; - list_for_each_entry(sess_dev, &srv_sess->sess_dev_list, sess_list) + xa_for_each(&srv_sess->index_idr, index, sess_dev) if (!strcmp(sess_dev->pathname, dev_name)) return sess_dev; @@ -782,8 +781,6 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess, list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list); mutex_unlock(&srv_dev->lock); - list_add(&srv_sess_dev->sess_list, &srv_sess->sess_dev_list); - rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id); kfree(full_path); diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index be2ae486d407e6..30e403557c671c 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -25,8 +25,6 @@ struct rnbd_srv_session { int queue_depth; struct xarray index_idr; - /* List of struct rnbd_srv_sess_dev */ - struct list_head sess_dev_list; struct mutex lock; u8 ver; }; @@ -48,8 +46,6 @@ struct rnbd_srv_dev { struct rnbd_srv_sess_dev { /* Entry inside rnbd_srv_dev struct */ struct list_head dev_list; - /* Entry inside rnbd_srv_session struct */ - struct list_head sess_list; struct rnbd_dev *rnbd_dev; struct rnbd_srv_session *sess; struct rnbd_srv_dev *dev; From 010aa15e98266e39ece352a8f757c73f7cfa3c33 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Thu, 12 May 2022 01:46:43 +0530 Subject: [PATCH 0302/1250] soc/tegra: Set ERD bit to mask inband errors Add a function to set the ERD (Error Response Disable) bit in the MISCREG_CCROC_ERR_CONFIG register from the Control Backbone (CBB) error handler driver. ERD bit allows masking of SError due to inband errors which are caused by illegal register accesses through CBB. When the bit is set, interrupt is used for reporting errors and magic code '0xdead2003' is returned. This change is only required for Tegra194 SoC as the config is moved to CBB register space for future SoC's. 
Also, remove unmapping the apbmisc_base as it's required to get the base address for accessing the misc register. Signed-off-by: Sumit Gupta Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/tegra-apbmisc.c | 29 ++++++++++++++++++++++++-- include/soc/tegra/fuse.h | 6 ++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c index 590c862538d0aa..de833f8d240835 100644 --- a/drivers/soc/tegra/fuse/tegra-apbmisc.c +++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c @@ -16,12 +16,16 @@ #define FUSE_SKU_INFO 0x10 +#define ERD_ERR_CONFIG 0x120c +#define ERD_MASK_INBAND_ERR 0x1 + #define PMC_STRAPPING_OPT_A_RAM_CODE_SHIFT 4 #define PMC_STRAPPING_OPT_A_RAM_CODE_MASK_LONG \ (0xf << PMC_STRAPPING_OPT_A_RAM_CODE_SHIFT) #define PMC_STRAPPING_OPT_A_RAM_CODE_MASK_SHORT \ (0x3 << PMC_STRAPPING_OPT_A_RAM_CODE_SHIFT) +static void __iomem *apbmisc_base; static bool long_ram_code; static u32 strapping; static u32 chipid; @@ -93,6 +97,28 @@ u32 tegra_read_ram_code(void) } EXPORT_SYMBOL_GPL(tegra_read_ram_code); +/* + * The function sets ERD(Error Response Disable) bit. + * This allows to mask inband errors and always send an + * OKAY response from CBB to the master which caused error. 
+ */ +int tegra194_miscreg_mask_serror(void) +{ + if (!apbmisc_base) + return -EPROBE_DEFER; + + if (!of_machine_is_compatible("nvidia,tegra194")) { + WARN(1, "Only supported for Tegra194 devices!\n"); + return -EOPNOTSUPP; + } + + writel_relaxed(ERD_MASK_INBAND_ERR, + apbmisc_base + ERD_ERR_CONFIG); + + return 0; +} +EXPORT_SYMBOL(tegra194_miscreg_mask_serror); + static const struct of_device_id apbmisc_match[] __initconst = { { .compatible = "nvidia,tegra20-apbmisc", }, { .compatible = "nvidia,tegra186-misc", }, @@ -134,7 +160,7 @@ void __init tegra_init_revision(void) void __init tegra_init_apbmisc(void) { - void __iomem *apbmisc_base, *strapping_base; + void __iomem *strapping_base; struct resource apbmisc, straps; struct device_node *np; @@ -196,7 +222,6 @@ void __init tegra_init_apbmisc(void) pr_err("failed to map APBMISC registers\n"); } else { chipid = readl_relaxed(apbmisc_base + 4); - iounmap(apbmisc_base); } strapping_base = ioremap(straps.start, resource_size(&straps)); diff --git a/include/soc/tegra/fuse.h b/include/soc/tegra/fuse.h index d035e04cb86998..4595082170f08d 100644 --- a/include/soc/tegra/fuse.h +++ b/include/soc/tegra/fuse.h @@ -59,6 +59,7 @@ u32 tegra_read_chipid(void); u8 tegra_get_chip_id(void); u8 tegra_get_platform(void); bool tegra_is_silicon(void); +int tegra194_miscreg_mask_serror(void); #else static struct tegra_sku_info tegra_sku_info __maybe_unused; @@ -96,6 +97,11 @@ static inline bool tegra_is_silicon(void) { return false; } + +static inline int tegra194_miscreg_mask_serror(void) +{ + return false; +} #endif struct device *tegra_soc_device_register(void); From 97f2c3b42a166408e0e59b253b6073c9372d8b60 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Thu, 12 May 2022 01:46:47 +0530 Subject: [PATCH 0303/1250] soc/tegra: cbb: Add CBB 1.0 driver for Tegra194 Adding driver to handle errors from Control Backbone (CBB) which are generated due to illegal accesses. CBB 1.0 is used in Tegra194 SoCs. 
When an error is reported from a NOC within CBB, the driver prints debug information about failed transaction like Error Code, Error Description, Master, Address, AXI ID, Cache, Protection, Security Group etc. It then causes system crash using BUG_ON() or call WARN() based on whether the error type is fatal or not. Signed-off-by: Sumit Gupta Signed-off-by: Thierry Reding --- drivers/soc/tegra/Kconfig | 9 + drivers/soc/tegra/Makefile | 1 + drivers/soc/tegra/cbb/Makefile | 8 + drivers/soc/tegra/cbb/tegra-cbb.c | 190 +++ drivers/soc/tegra/cbb/tegra194-cbb.c | 2365 ++++++++++++++++++++++++++ include/soc/tegra/tegra-cbb.h | 47 + 6 files changed, 2620 insertions(+) create mode 100644 drivers/soc/tegra/cbb/Makefile create mode 100644 drivers/soc/tegra/cbb/tegra-cbb.c create mode 100644 drivers/soc/tegra/cbb/tegra194-cbb.c create mode 100644 include/soc/tegra/tegra-cbb.h diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig index 5725c8ef0406af..65283a93e78f4c 100644 --- a/drivers/soc/tegra/Kconfig +++ b/drivers/soc/tegra/Kconfig @@ -162,3 +162,12 @@ config SOC_TEGRA30_VOLTAGE_COUPLER bool "Voltage scaling support for Tegra30 SoCs" depends on ARCH_TEGRA_3x_SOC || COMPILE_TEST depends on REGULATOR + +config SOC_TEGRA_CBB + tristate "Tegra driver to handle error from CBB" + depends on ARCH_TEGRA_194_SOC + default y + help + Support for handling error from Tegra Control Backbone(CBB). + This driver handles the errors from CBB and prints debug + information about the failed transactions. 
diff --git a/drivers/soc/tegra/Makefile b/drivers/soc/tegra/Makefile index 054e862b63d892..d722f512dc9d38 100644 --- a/drivers/soc/tegra/Makefile +++ b/drivers/soc/tegra/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-y += fuse/ +obj-y += cbb/ obj-y += common.o obj-$(CONFIG_SOC_TEGRA_FLOWCTRL) += flowctrl.o diff --git a/drivers/soc/tegra/cbb/Makefile b/drivers/soc/tegra/cbb/Makefile new file mode 100644 index 00000000000000..711b756107033b --- /dev/null +++ b/drivers/soc/tegra/cbb/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Control Backbone Driver code. +# +ifdef CONFIG_SOC_TEGRA_CBB +obj-y += tegra-cbb.o +obj-$(CONFIG_ARCH_TEGRA_194_SOC) += tegra194-cbb.o +endif diff --git a/drivers/soc/tegra/cbb/tegra-cbb.c b/drivers/soc/tegra/cbb/tegra-cbb.c new file mode 100644 index 00000000000000..d200937353c7ca --- /dev/null +++ b/drivers/soc/tegra/cbb/tegra-cbb.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void tegra_cbb_print_err(struct seq_file *file, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + if (file) { + seq_vprintf(file, fmt, args); + } else { + vaf.fmt = fmt; + vaf.va = &args; + pr_crit("%pV", &vaf); + } + + va_end(args); +} + +void tegra_cbb_print_cache(struct seq_file *file, u32 cache) +{ + const char *buff_str, *mod_str, *rd_str, *wr_str; + + buff_str = (cache & BIT(0)) ? "Bufferable " : ""; + mod_str = (cache & BIT(1)) ? "Modifiable " : ""; + rd_str = (cache & BIT(2)) ? "Read-Allocate " : ""; + wr_str = (cache & BIT(3)) ? 
"Write-Allocate" : ""; + + if (cache == 0x0) + buff_str = "Device Non-Bufferable"; + + tegra_cbb_print_err(file, "\t Cache\t\t\t: 0x%x -- %s%s%s%s\n", + cache, buff_str, mod_str, rd_str, wr_str); +} + +void tegra_cbb_print_prot(struct seq_file *file, u32 prot) +{ + const char *data_str, *secure_str, *priv_str; + + data_str = (prot & 0x4) ? "Instruction" : "Data"; + secure_str = (prot & 0x2) ? "Non-Secure" : "Secure"; + priv_str = (prot & 0x1) ? "Privileged" : "Unprivileged"; + + tegra_cbb_print_err(file, "\t Protection\t\t: 0x%x -- %s, %s, %s Access\n", + prot, priv_str, secure_str, data_str); +} + +static int tegra_cbb_err_show(struct seq_file *file, void *data) +{ + struct tegra_cbb *cbb = file->private; + + return cbb->ops->debugfs_show(cbb, file, data); +} + +static int tegra_cbb_err_open(struct inode *inode, struct file *file) +{ + return single_open(file, tegra_cbb_err_show, inode->i_private); +} + +static const struct file_operations tegra_cbb_err_fops = { + .open = tegra_cbb_err_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release +}; + +static int tegra_cbb_err_debugfs_init(struct tegra_cbb *cbb) +{ + static struct dentry *root; + + if (!root) { + root = debugfs_create_file("tegra_cbb_err", 0444, NULL, cbb, &tegra_cbb_err_fops); + if (IS_ERR_OR_NULL(root)) { + pr_err("%s(): could not create debugfs node\n", __func__); + return PTR_ERR(root); + } + } + + return 0; +} + +void tegra_cbb_stall_enable(struct tegra_cbb *cbb) +{ + if (cbb->ops->stall_enable) + cbb->ops->stall_enable(cbb); +} + +void tegra_cbb_fault_enable(struct tegra_cbb *cbb) +{ + if (cbb->ops->fault_enable) + cbb->ops->fault_enable(cbb); +} + +void tegra_cbb_error_clear(struct tegra_cbb *cbb) +{ + if (cbb->ops->error_clear) + cbb->ops->error_clear(cbb); +} + +u32 tegra_cbb_get_status(struct tegra_cbb *cbb) +{ + if (cbb->ops->get_status) + return cbb->ops->get_status(cbb); + + return 0; +} + +int tegra_cbb_get_irq(struct platform_device *pdev, unsigned int *nonsec_irq, + 
unsigned int *sec_irq) +{ + unsigned int index = 0; + int num_intr = 0, irq; + + num_intr = platform_irq_count(pdev); + if (!num_intr) + return -EINVAL; + + if (num_intr == 2) { + irq = platform_get_irq(pdev, index); + if (irq <= 0) { + dev_err(&pdev->dev, "failed to get non-secure IRQ: %d\n", irq); + return -ENOENT; + } + + *nonsec_irq = irq; + index++; + } + + irq = platform_get_irq(pdev, index); + if (irq <= 0) { + dev_err(&pdev->dev, "failed to get secure IRQ: %d\n", irq); + return -ENOENT; + } + + *sec_irq = irq; + + if (num_intr == 1) + dev_dbg(&pdev->dev, "secure IRQ: %u\n", *sec_irq); + + if (num_intr == 2) + dev_dbg(&pdev->dev, "secure IRQ: %u, non-secure IRQ: %u\n", *sec_irq, *nonsec_irq); + + return 0; +} + +int tegra_cbb_register(struct tegra_cbb *cbb) +{ + int ret; + + if (IS_ENABLED(CONFIG_DEBUG_FS)) { + ret = tegra_cbb_err_debugfs_init(cbb); + if (ret) { + dev_err(cbb->dev, "failed to create debugfs\n"); + return ret; + } + } + + /* register interrupt handler for errors due to different initiators */ + ret = cbb->ops->interrupt_enable(cbb); + if (ret < 0) { + dev_err(cbb->dev, "Failed to register CBB Interrupt ISR"); + return ret; + } + + cbb->ops->error_enable(cbb); + dsb(sy); + + return 0; +} diff --git a/drivers/soc/tegra/cbb/tegra194-cbb.c b/drivers/soc/tegra/cbb/tegra194-cbb.c new file mode 100644 index 00000000000000..e8726b90227b4a --- /dev/null +++ b/drivers/soc/tegra/cbb/tegra194-cbb.c @@ -0,0 +1,2365 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved + * + * The driver handles Error's from Control Backbone(CBB) generated due to + * illegal accesses. When an error is reported from a NOC within CBB, + * the driver checks ErrVld status of all three Error Logger's of that NOC. + * It then prints debug information about failed transaction using ErrLog + * registers of error logger which has ErrVld set. Currently, SLV, DEC, + * TMO, SEC, UNS are the codes which are supported by CBB. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ERRLOGGER_0_ID_COREID_0 0x00000000 +#define ERRLOGGER_0_ID_REVISIONID_0 0x00000004 +#define ERRLOGGER_0_FAULTEN_0 0x00000008 +#define ERRLOGGER_0_ERRVLD_0 0x0000000c +#define ERRLOGGER_0_ERRCLR_0 0x00000010 +#define ERRLOGGER_0_ERRLOG0_0 0x00000014 +#define ERRLOGGER_0_ERRLOG1_0 0x00000018 +#define ERRLOGGER_0_RSVD_00_0 0x0000001c +#define ERRLOGGER_0_ERRLOG3_0 0x00000020 +#define ERRLOGGER_0_ERRLOG4_0 0x00000024 +#define ERRLOGGER_0_ERRLOG5_0 0x00000028 +#define ERRLOGGER_0_STALLEN_0 0x00000038 + +#define ERRLOGGER_1_ID_COREID_0 0x00000080 +#define ERRLOGGER_1_ID_REVISIONID_0 0x00000084 +#define ERRLOGGER_1_FAULTEN_0 0x00000088 +#define ERRLOGGER_1_ERRVLD_0 0x0000008c +#define ERRLOGGER_1_ERRCLR_0 0x00000090 +#define ERRLOGGER_1_ERRLOG0_0 0x00000094 +#define ERRLOGGER_1_ERRLOG1_0 0x00000098 +#define ERRLOGGER_1_RSVD_00_0 0x0000009c +#define ERRLOGGER_1_ERRLOG3_0 0x000000a0 +#define ERRLOGGER_1_ERRLOG4_0 0x000000a4 +#define ERRLOGGER_1_ERRLOG5_0 0x000000a8 +#define ERRLOGGER_1_STALLEN_0 0x000000b8 + +#define ERRLOGGER_2_ID_COREID_0 0x00000100 +#define ERRLOGGER_2_ID_REVISIONID_0 0x00000104 +#define ERRLOGGER_2_FAULTEN_0 0x00000108 +#define ERRLOGGER_2_ERRVLD_0 0x0000010c +#define ERRLOGGER_2_ERRCLR_0 0x00000110 +#define ERRLOGGER_2_ERRLOG0_0 0x00000114 +#define ERRLOGGER_2_ERRLOG1_0 0x00000118 +#define ERRLOGGER_2_RSVD_00_0 0x0000011c +#define ERRLOGGER_2_ERRLOG3_0 0x00000120 +#define ERRLOGGER_2_ERRLOG4_0 0x00000124 +#define ERRLOGGER_2_ERRLOG5_0 0x00000128 +#define ERRLOGGER_2_STALLEN_0 0x00000138 + +#define CBB_NOC_INITFLOW GENMASK(23, 20) +#define CBB_NOC_TARGFLOW GENMASK(19, 16) +#define CBB_NOC_TARG_SUBRANGE GENMASK(15, 9) +#define CBB_NOC_SEQID GENMASK(8, 0) + +#define BPMP_NOC_INITFLOW GENMASK(20, 18) +#define BPMP_NOC_TARGFLOW GENMASK(17, 13) +#define BPMP_NOC_TARG_SUBRANGE 
GENMASK(12, 9) +#define BPMP_NOC_SEQID GENMASK(8, 0) + +#define AON_NOC_INITFLOW GENMASK(22, 21) +#define AON_NOC_TARGFLOW GENMASK(20, 15) +#define AON_NOC_TARG_SUBRANGE GENMASK(14, 9) +#define AON_NOC_SEQID GENMASK(8, 0) + +#define SCE_NOC_INITFLOW GENMASK(21, 19) +#define SCE_NOC_TARGFLOW GENMASK(18, 14) +#define SCE_NOC_TARG_SUBRANGE GENMASK(13, 9) +#define SCE_NOC_SEQID GENMASK(8, 0) + +#define CBB_NOC_AXCACHE GENMASK(3, 0) +#define CBB_NOC_NON_MOD GENMASK(4, 4) +#define CBB_NOC_AXPROT GENMASK(7, 5) +#define CBB_NOC_FALCONSEC GENMASK(9, 8) +#define CBB_NOC_GRPSEC GENMASK(16, 10) +#define CBB_NOC_VQC GENMASK(18, 17) +#define CBB_NOC_MSTR_ID GENMASK(22, 19) +#define CBB_NOC_AXI_ID GENMASK(30, 23) + +#define CLUSTER_NOC_AXCACHE GENMASK(3, 0) +#define CLUSTER_NOC_AXPROT GENMASK(6, 4) +#define CLUSTER_NOC_FALCONSEC GENMASK(8, 7) +#define CLUSTER_NOC_GRPSEC GENMASK(15, 9) +#define CLUSTER_NOC_VQC GENMASK(17, 16) +#define CLUSTER_NOC_MSTR_ID GENMASK(21, 18) + +#define USRBITS_MSTR_ID GENMASK(21, 18) + +#define CBB_ERR_OPC GENMASK(4, 1) +#define CBB_ERR_ERRCODE GENMASK(10, 8) +#define CBB_ERR_LEN1 GENMASK(27, 16) + +#define DMAAPB_X_RAW_INTERRUPT_STATUS 0x2ec + +struct tegra194_cbb_packet_header { + bool lock; // [0] + u8 opc; // [4:1] + u8 errcode; // [10:8]= RD, RDW, RDL, RDX, WR, WRW, WRC, PRE, URG + u16 len1; // [27:16] + bool format; // [31] = 1 -> FlexNoC versions 2.7 & above +}; + +struct tegra194_cbb_aperture { + u8 initflow; + u8 targflow; + u8 targ_subrange; + u8 init_mapping; + u32 init_localaddress; + u8 targ_mapping; + u32 targ_localaddress; + u16 seqid; +}; + +struct tegra194_cbb_userbits { + u8 axcache; + u8 non_mod; + u8 axprot; + u8 falconsec; + u8 grpsec; + u8 vqc; + u8 mstr_id; + u8 axi_id; +}; + +struct tegra194_cbb_noc_data { + const char *name; + bool erd_mask_inband_err; + const char * const *master_id; + unsigned int max_aperture; + const struct tegra194_cbb_aperture *noc_aperture; + const char * const *routeid_initflow; + const char * const 
*routeid_targflow; + void (*parse_routeid)(struct tegra194_cbb_aperture *info, u64 routeid); + void (*parse_userbits)(struct tegra194_cbb_userbits *usrbits, u32 elog_5); +}; + +struct tegra194_axi2apb_bridge { + struct resource res; + void __iomem *base; +}; + +struct tegra194_cbb { + struct tegra_cbb base; + + const struct tegra194_cbb_noc_data *noc; + struct resource *res; + + void __iomem *regs; + unsigned int num_intr; + unsigned int sec_irq; + unsigned int nonsec_irq; + u32 errlog0; + u32 errlog1; + u32 errlog2; + u32 errlog3; + u32 errlog4; + u32 errlog5; + + struct tegra194_axi2apb_bridge *bridges; + unsigned int num_bridges; +}; + +static inline struct tegra194_cbb *to_tegra194_cbb(struct tegra_cbb *cbb) +{ + return container_of(cbb, struct tegra194_cbb, base); +} + +static LIST_HEAD(cbb_list); +static DEFINE_SPINLOCK(cbb_lock); + +static const char * const tegra194_cbb_trantype[] = { + "RD - Read, Incrementing", + "RDW - Read, Wrap", /* Not Supported */ + "RDX - Exclusive Read", /* Not Supported */ + "RDL - Linked Read", /* Not Supported */ + "WR - Write, Incrementing", + "WRW - Write, Wrap", /* Not Supported */ + "WRC - Exclusive Write", /* Not Supported */ + "PRE - Preamble Sequence for Fixed Accesses" +}; + +static const char * const tegra194_axi2apb_error[] = { + "SFIFONE - Status FIFO Not Empty interrupt", + "SFIFOF - Status FIFO Full interrupt", + "TIM - Timer(Timeout) interrupt", + "SLV - SLVERR interrupt", + "NULL", + "ERBF - Early response buffer Full interrupt", + "NULL", + "RDFIFOF - Read Response FIFO Full interrupt", + "WRFIFOF - Write Response FIFO Full interrupt", + "CH0DFIFOF - Ch0 Data FIFO Full interrupt", + "CH1DFIFOF - Ch1 Data FIFO Full interrupt", + "CH2DFIFOF - Ch2 Data FIFO Full interrupt", + "UAT - Unsupported alignment type error", + "UBS - Unsupported burst size error", + "UBE - Unsupported Byte Enable error", + "UBT - Unsupported burst type error", + "BFS - Block Firewall security error", + "ARFS - Address Range Firewall 
security error", + "CH0RFIFOF - Ch0 Request FIFO Full interrupt", + "CH1RFIFOF - Ch1 Request FIFO Full interrupt", + "CH2RFIFOF - Ch2 Request FIFO Full interrupt" +}; + +static const char * const tegra194_master_id[] = { + [0x0] = "CCPLEX", + [0x1] = "CCPLEX_DPMU", + [0x2] = "BPMP", + [0x3] = "AON", + [0x4] = "SCE", + [0x5] = "GPCDMA_PERIPHERAL", + [0x6] = "TSECA", + [0x7] = "TSECB", + [0x8] = "JTAGM_DFT", + [0x9] = "CORESIGHT_AXIAP", + [0xa] = "APE", + [0xb] = "PEATR", + [0xc] = "NVDEC", + [0xd] = "RCE", + [0xe] = "NVDEC1" +}; + +static const struct tegra_cbb_error tegra194_cbb_errors[] = { + { + .code = "SLV", + .source = "Target", + .desc = "Target error detected by CBB slave" + }, { + .code = "DEC", + .source = "Initiator NIU", + .desc = "Address decode error" + }, { + .code = "UNS", + .source = "Target NIU", + .desc = "Unsupported request. Not a valid transaction" + }, { + .code = "DISC", /* Not Supported by CBB */ + .source = "Power Disconnect", + .desc = "Disconnected target or domain" + }, { + .code = "SEC", + .source = "Initiator NIU or Firewall", + .desc = "Security violation. Firewall error" + }, { + .code = "HIDE", /* Not Supported by CBB */ + .source = "Firewall", + .desc = "Hidden security violation, reported as OK to initiator" + }, { + .code = "TMO", + .source = "Target NIU", + .desc = "Target time-out error" + }, { + .code = "RSV", + .source = "None", + .desc = "Reserved" + } +}; + +/* + * CBB NOC aperture lookup table as per file "cbb_central_noc_Structure.info". 
+ */ +static const char * const tegra194_cbbcentralnoc_routeid_initflow[] = { + [0x0] = "aon_p2ps/I/aon", + [0x1] = "ape_p2ps/I/ape_p2ps", + [0x2] = "bpmp_p2ps/I/bpmp_p2ps", + [0x3] = "ccroc_p2ps/I/ccroc_p2ps", + [0x4] = "csite_p2ps/I/0", + [0x5] = "gpcdma_mmio_p2ps/I/0", + [0x6] = "jtag_p2ps/I/0", + [0x7] = "nvdec1_p2ps/I/0", + [0x8] = "nvdec_p2ps/I/0", + [0x9] = "rce_p2ps/I/rce_p2ps", + [0xa] = "sce_p2ps/I/sce_p2ps", + [0xb] = "tseca_p2ps/I/0", + [0xc] = "tsecb_p2ps/I/0", + [0xd] = "RESERVED", + [0xe] = "RESERVED", + [0xf] = "RESERVED" +}; + +static const char * const tegra194_cbbcentralnoc_routeid_targflow[] = { + [0x0] = "SVC/T/intreg", + [0x1] = "axis_satellite_axi2apb_p2pm/T/axis_satellite_axi2apb_p2pm", + [0x2] = "axis_satellite_grout/T/axis_satellite_grout", + [0x3] = "cbb_firewall/T/cbb_firewall", + [0x4] = "gpu_p2pm/T/gpu_p2pm", + [0x5] = "host1x_p2pm/T/host1x_p2pm", + [0x6] = "sapb_3_p2pm/T/sapb_3_p2pm", + [0x7] = "smmu0_p2pm/T/smmu0_p2pm", + [0x8] = "smmu1_p2pm/T/smmu1_p2pm", + [0x9] = "smmu2_p2pm/T/smmu2_p2pm", + [0xa] = "stm_p2pm/T/stm_p2pm", + [0xb] = "RESERVED", + [0xc] = "RESERVED", + [0xd] = "RESERVED", + [0xe] = "RESERVED", + [0xf] = "RESERVED" +}; + +/* + * Fields of CBB NOC lookup table: + * Init flow, Targ flow, Targ subrange, Init mapping, Init localAddress, + * Targ mapping, Targ localAddress + * ---------------------------------------------------------------------------- + */ +static const struct tegra194_cbb_aperture tegra194_cbbcentralnoc_apert_lookup[] = { + { 0x0, 0x0, 0x00, 0x0, 0x02300000, 0, 0x00000000 }, + { 0x0, 0x1, 0x00, 0x0, 0x02003000, 0, 0x02003000 }, + { 0x0, 0x1, 0x01, 0x0, 0x02006000, 2, 0x02006000 }, + { 0x0, 0x1, 0x02, 0x0, 0x02016000, 3, 0x02016000 }, + { 0x0, 0x1, 0x03, 0x0, 0x0201d000, 4, 0x0201d000 }, + { 0x0, 0x1, 0x04, 0x0, 0x0202b000, 6, 0x0202b000 }, + { 0x0, 0x1, 0x05, 0x0, 0x02434000, 20, 0x02434000 }, + { 0x0, 0x1, 0x06, 0x0, 0x02436000, 21, 0x02436000 }, + { 0x0, 0x1, 0x07, 0x0, 0x02438000, 22, 0x02438000 }, + 
{ 0x0, 0x1, 0x08, 0x0, 0x02445000, 24, 0x02445000 }, + { 0x0, 0x1, 0x09, 0x0, 0x02446000, 25, 0x02446000 }, + { 0x0, 0x1, 0x0a, 0x0, 0x02004000, 1, 0x02004000 }, + { 0x0, 0x1, 0x0b, 0x0, 0x0201e000, 5, 0x0201e000 }, + { 0x0, 0x1, 0x0c, 0x0, 0x0202c000, 7, 0x0202c000 }, + { 0x0, 0x1, 0x0d, 0x0, 0x02204000, 8, 0x02204000 }, + { 0x0, 0x1, 0x0e, 0x0, 0x02214000, 9, 0x02214000 }, + { 0x0, 0x1, 0x0f, 0x0, 0x02224000, 10, 0x02224000 }, + { 0x0, 0x1, 0x10, 0x0, 0x02234000, 11, 0x02234000 }, + { 0x0, 0x1, 0x11, 0x0, 0x02244000, 12, 0x02244000 }, + { 0x0, 0x1, 0x12, 0x0, 0x02254000, 13, 0x02254000 }, + { 0x0, 0x1, 0x13, 0x0, 0x02264000, 14, 0x02264000 }, + { 0x0, 0x1, 0x14, 0x0, 0x02274000, 15, 0x02274000 }, + { 0x0, 0x1, 0x15, 0x0, 0x02284000, 16, 0x02284000 }, + { 0x0, 0x1, 0x16, 0x0, 0x0243a000, 23, 0x0243a000 }, + { 0x0, 0x1, 0x17, 0x0, 0x02370000, 17, 0x02370000 }, + { 0x0, 0x1, 0x18, 0x0, 0x023d0000, 18, 0x023d0000 }, + { 0x0, 0x1, 0x19, 0x0, 0x023e0000, 19, 0x023e0000 }, + { 0x0, 0x1, 0x1a, 0x0, 0x02450000, 26, 0x02450000 }, + { 0x0, 0x1, 0x1b, 0x0, 0x02460000, 27, 0x02460000 }, + { 0x0, 0x1, 0x1c, 0x0, 0x02490000, 28, 0x02490000 }, + { 0x0, 0x1, 0x1d, 0x0, 0x03130000, 31, 0x03130000 }, + { 0x0, 0x1, 0x1e, 0x0, 0x03160000, 32, 0x03160000 }, + { 0x0, 0x1, 0x1f, 0x0, 0x03270000, 33, 0x03270000 }, + { 0x0, 0x1, 0x20, 0x0, 0x032e0000, 35, 0x032e0000 }, + { 0x0, 0x1, 0x21, 0x0, 0x03300000, 36, 0x03300000 }, + { 0x0, 0x1, 0x22, 0x0, 0x13090000, 40, 0x13090000 }, + { 0x0, 0x1, 0x23, 0x0, 0x20120000, 43, 0x20120000 }, + { 0x0, 0x1, 0x24, 0x0, 0x20170000, 44, 0x20170000 }, + { 0x0, 0x1, 0x25, 0x0, 0x20190000, 45, 0x20190000 }, + { 0x0, 0x1, 0x26, 0x0, 0x201b0000, 46, 0x201b0000 }, + { 0x0, 0x1, 0x27, 0x0, 0x20250000, 47, 0x20250000 }, + { 0x0, 0x1, 0x28, 0x0, 0x20260000, 48, 0x20260000 }, + { 0x0, 0x1, 0x29, 0x0, 0x20420000, 49, 0x20420000 }, + { 0x0, 0x1, 0x2a, 0x0, 0x20460000, 50, 0x20460000 }, + { 0x0, 0x1, 0x2b, 0x0, 0x204f0000, 51, 0x204f0000 }, + { 0x0, 0x1, 0x2c, 0x0, 
0x20520000, 52, 0x20520000 }, + { 0x0, 0x1, 0x2d, 0x0, 0x20580000, 53, 0x20580000 }, + { 0x0, 0x1, 0x2e, 0x0, 0x205a0000, 54, 0x205a0000 }, + { 0x0, 0x1, 0x2f, 0x0, 0x205c0000, 55, 0x205c0000 }, + { 0x0, 0x1, 0x30, 0x0, 0x20690000, 56, 0x20690000 }, + { 0x0, 0x1, 0x31, 0x0, 0x20770000, 57, 0x20770000 }, + { 0x0, 0x1, 0x32, 0x0, 0x20790000, 58, 0x20790000 }, + { 0x0, 0x1, 0x33, 0x0, 0x20880000, 59, 0x20880000 }, + { 0x0, 0x1, 0x34, 0x0, 0x20990000, 62, 0x20990000 }, + { 0x0, 0x1, 0x35, 0x0, 0x20e10000, 65, 0x20e10000 }, + { 0x0, 0x1, 0x36, 0x0, 0x20e70000, 66, 0x20e70000 }, + { 0x0, 0x1, 0x37, 0x0, 0x20e80000, 67, 0x20e80000 }, + { 0x0, 0x1, 0x38, 0x0, 0x20f30000, 68, 0x20f30000 }, + { 0x0, 0x1, 0x39, 0x0, 0x20f50000, 69, 0x20f50000 }, + { 0x0, 0x1, 0x3a, 0x0, 0x20fc0000, 70, 0x20fc0000 }, + { 0x0, 0x1, 0x3b, 0x0, 0x21110000, 72, 0x21110000 }, + { 0x0, 0x1, 0x3c, 0x0, 0x21270000, 73, 0x21270000 }, + { 0x0, 0x1, 0x3d, 0x0, 0x21290000, 74, 0x21290000 }, + { 0x0, 0x1, 0x3e, 0x0, 0x21840000, 75, 0x21840000 }, + { 0x0, 0x1, 0x3f, 0x0, 0x21880000, 76, 0x21880000 }, + { 0x0, 0x1, 0x40, 0x0, 0x218d0000, 77, 0x218d0000 }, + { 0x0, 0x1, 0x41, 0x0, 0x21950000, 78, 0x21950000 }, + { 0x0, 0x1, 0x42, 0x0, 0x21960000, 79, 0x21960000 }, + { 0x0, 0x1, 0x43, 0x0, 0x21a10000, 80, 0x21a10000 }, + { 0x0, 0x1, 0x44, 0x0, 0x024a0000, 29, 0x024a0000 }, + { 0x0, 0x1, 0x45, 0x0, 0x024c0000, 30, 0x024c0000 }, + { 0x0, 0x1, 0x46, 0x0, 0x032c0000, 34, 0x032c0000 }, + { 0x0, 0x1, 0x47, 0x0, 0x03400000, 37, 0x03400000 }, + { 0x0, 0x1, 0x48, 0x0, 0x130a0000, 41, 0x130a0000 }, + { 0x0, 0x1, 0x49, 0x0, 0x130c0000, 42, 0x130c0000 }, + { 0x0, 0x1, 0x4a, 0x0, 0x208a0000, 60, 0x208a0000 }, + { 0x0, 0x1, 0x4b, 0x0, 0x208c0000, 61, 0x208c0000 }, + { 0x0, 0x1, 0x4c, 0x0, 0x209a0000, 63, 0x209a0000 }, + { 0x0, 0x1, 0x4d, 0x0, 0x21a40000, 81, 0x21a40000 }, + { 0x0, 0x1, 0x4e, 0x0, 0x03440000, 38, 0x03440000 }, + { 0x0, 0x1, 0x4f, 0x0, 0x20d00000, 64, 0x20d00000 }, + { 0x0, 0x1, 0x50, 0x0, 0x21000000, 71, 
0x21000000 }, + { 0x0, 0x1, 0x51, 0x0, 0x0b000000, 39, 0x0b000000 }, + { 0x0, 0x2, 0x00, 0x0, 0x00000000, 0, 0x00000000 }, + { 0x0, 0x3, 0x00, 0x0, 0x02340000, 0, 0x00000000 }, + { 0x0, 0x4, 0x00, 0x0, 0x17000000, 0, 0x17000000 }, + { 0x0, 0x4, 0x01, 0x0, 0x18000000, 1, 0x18000000 }, + { 0x0, 0x5, 0x00, 0x0, 0x13e80000, 1, 0x13e80000 }, + { 0x0, 0x5, 0x01, 0x0, 0x15810000, 12, 0x15810000 }, + { 0x0, 0x5, 0x02, 0x0, 0x15840000, 14, 0x15840000 }, + { 0x0, 0x5, 0x03, 0x0, 0x15a40000, 17, 0x15a40000 }, + { 0x0, 0x5, 0x04, 0x0, 0x13f00000, 3, 0x13f00000 }, + { 0x0, 0x5, 0x05, 0x0, 0x15820000, 13, 0x15820000 }, + { 0x0, 0x5, 0x06, 0x0, 0x13ec0000, 2, 0x13ec0000 }, + { 0x0, 0x5, 0x07, 0x0, 0x15200000, 6, 0x15200000 }, + { 0x0, 0x5, 0x08, 0x0, 0x15340000, 7, 0x15340000 }, + { 0x0, 0x5, 0x09, 0x0, 0x15380000, 8, 0x15380000 }, + { 0x0, 0x5, 0x0a, 0x0, 0x15500000, 10, 0x15500000 }, + { 0x0, 0x5, 0x0b, 0x0, 0x155c0000, 11, 0x155c0000 }, + { 0x0, 0x5, 0x0c, 0x0, 0x15a00000, 16, 0x15a00000 }, + { 0x0, 0x5, 0x0d, 0x0, 0x13e00000, 0, 0x13e00000 }, + { 0x0, 0x5, 0x0e, 0x0, 0x15100000, 5, 0x15100000 }, + { 0x0, 0x5, 0x0f, 0x0, 0x15480000, 9, 0x15480000 }, + { 0x0, 0x5, 0x10, 0x0, 0x15880000, 15, 0x15880000 }, + { 0x0, 0x5, 0x11, 0x0, 0x15a80000, 18, 0x15a80000 }, + { 0x0, 0x5, 0x12, 0x0, 0x15b00000, 19, 0x15b00000 }, + { 0x0, 0x5, 0x13, 0x0, 0x14800000, 4, 0x14800000 }, + { 0x0, 0x5, 0x14, 0x0, 0x15c00000, 20, 0x15c00000 }, + { 0x0, 0x5, 0x15, 0x0, 0x16000000, 21, 0x16000000 }, + { 0x0, 0x6, 0x00, 0x0, 0x02000000, 4, 0x02000000 }, + { 0x0, 0x6, 0x01, 0x0, 0x02007000, 5, 0x02007000 }, + { 0x0, 0x6, 0x02, 0x0, 0x02008000, 6, 0x02008000 }, + { 0x0, 0x6, 0x03, 0x0, 0x02013000, 7, 0x02013000 }, + { 0x0, 0x6, 0x04, 0x0, 0x0201c000, 8, 0x0201c000 }, + { 0x0, 0x6, 0x05, 0x0, 0x02020000, 9, 0x02020000 }, + { 0x0, 0x6, 0x06, 0x0, 0x0202a000, 10, 0x0202a000 }, + { 0x0, 0x6, 0x07, 0x0, 0x0202e000, 11, 0x0202e000 }, + { 0x0, 0x6, 0x08, 0x0, 0x06400000, 33, 0x06400000 }, + { 0x0, 0x6, 0x09, 0x0, 
0x02038000, 12, 0x02038000 }, + { 0x0, 0x6, 0x0a, 0x0, 0x00100000, 0, 0x00100000 }, + { 0x0, 0x6, 0x0b, 0x0, 0x023b0000, 13, 0x023b0000 }, + { 0x0, 0x6, 0x0c, 0x0, 0x02800000, 16, 0x02800000 }, + { 0x0, 0x6, 0x0d, 0x0, 0x030e0000, 22, 0x030e0000 }, + { 0x0, 0x6, 0x0e, 0x0, 0x03800000, 23, 0x03800000 }, + { 0x0, 0x6, 0x0f, 0x0, 0x03980000, 25, 0x03980000 }, + { 0x0, 0x6, 0x10, 0x0, 0x03a60000, 26, 0x03a60000 }, + { 0x0, 0x6, 0x11, 0x0, 0x03d80000, 31, 0x03d80000 }, + { 0x0, 0x6, 0x12, 0x0, 0x20000000, 36, 0x20000000 }, + { 0x0, 0x6, 0x13, 0x0, 0x20050000, 38, 0x20050000 }, + { 0x0, 0x6, 0x14, 0x0, 0x201e0000, 40, 0x201e0000 }, + { 0x0, 0x6, 0x15, 0x0, 0x20280000, 42, 0x20280000 }, + { 0x0, 0x6, 0x16, 0x0, 0x202c0000, 43, 0x202c0000 }, + { 0x0, 0x6, 0x17, 0x0, 0x20390000, 44, 0x20390000 }, + { 0x0, 0x6, 0x18, 0x0, 0x20430000, 45, 0x20430000 }, + { 0x0, 0x6, 0x19, 0x0, 0x20440000, 46, 0x20440000 }, + { 0x0, 0x6, 0x1a, 0x0, 0x204e0000, 47, 0x204e0000 }, + { 0x0, 0x6, 0x1b, 0x0, 0x20550000, 48, 0x20550000 }, + { 0x0, 0x6, 0x1c, 0x0, 0x20570000, 49, 0x20570000 }, + { 0x0, 0x6, 0x1d, 0x0, 0x20590000, 50, 0x20590000 }, + { 0x0, 0x6, 0x1e, 0x0, 0x20730000, 52, 0x20730000 }, + { 0x0, 0x6, 0x1f, 0x0, 0x209f0000, 54, 0x209f0000 }, + { 0x0, 0x6, 0x20, 0x0, 0x20e20000, 55, 0x20e20000 }, + { 0x0, 0x6, 0x21, 0x0, 0x20ed0000, 56, 0x20ed0000 }, + { 0x0, 0x6, 0x22, 0x0, 0x20fd0000, 57, 0x20fd0000 }, + { 0x0, 0x6, 0x23, 0x0, 0x21120000, 59, 0x21120000 }, + { 0x0, 0x6, 0x24, 0x0, 0x211a0000, 60, 0x211a0000 }, + { 0x0, 0x6, 0x25, 0x0, 0x21850000, 61, 0x21850000 }, + { 0x0, 0x6, 0x26, 0x0, 0x21860000, 62, 0x21860000 }, + { 0x0, 0x6, 0x27, 0x0, 0x21890000, 63, 0x21890000 }, + { 0x0, 0x6, 0x28, 0x0, 0x21970000, 64, 0x21970000 }, + { 0x0, 0x6, 0x29, 0x0, 0x21990000, 65, 0x21990000 }, + { 0x0, 0x6, 0x2a, 0x0, 0x21a00000, 66, 0x21a00000 }, + { 0x0, 0x6, 0x2b, 0x0, 0x21a90000, 68, 0x21a90000 }, + { 0x0, 0x6, 0x2c, 0x0, 0x21ac0000, 70, 0x21ac0000 }, + { 0x0, 0x6, 0x2d, 0x0, 0x01f80000, 3, 
0x01f80000 }, + { 0x0, 0x6, 0x2e, 0x0, 0x024e0000, 14, 0x024e0000 }, + { 0x0, 0x6, 0x2f, 0x0, 0x030c0000, 21, 0x030c0000 }, + { 0x0, 0x6, 0x30, 0x0, 0x03820000, 24, 0x03820000 }, + { 0x0, 0x6, 0x31, 0x0, 0x03aa0000, 27, 0x03aa0000 }, + { 0x0, 0x6, 0x32, 0x0, 0x03c80000, 29, 0x03c80000 }, + { 0x0, 0x6, 0x33, 0x0, 0x130e0000, 34, 0x130e0000 }, + { 0x0, 0x6, 0x34, 0x0, 0x20020000, 37, 0x20020000 }, + { 0x0, 0x6, 0x35, 0x0, 0x20060000, 39, 0x20060000 }, + { 0x0, 0x6, 0x36, 0x0, 0x20200000, 41, 0x20200000 }, + { 0x0, 0x6, 0x37, 0x0, 0x206a0000, 51, 0x206a0000 }, + { 0x0, 0x6, 0x38, 0x0, 0x20740000, 53, 0x20740000 }, + { 0x0, 0x6, 0x39, 0x0, 0x20fe0000, 58, 0x20fe0000 }, + { 0x0, 0x6, 0x3a, 0x0, 0x21a20000, 67, 0x21a20000 }, + { 0x0, 0x6, 0x3b, 0x0, 0x21aa0000, 69, 0x21aa0000 }, + { 0x0, 0x6, 0x3c, 0x0, 0x02b80000, 17, 0x02b80000 }, + { 0x0, 0x6, 0x3d, 0x0, 0x03080000, 20, 0x03080000 }, + { 0x0, 0x6, 0x3e, 0x0, 0x13100000, 35, 0x13100000 }, + { 0x0, 0x6, 0x3f, 0x0, 0x01f00000, 2, 0x01f00000 }, + { 0x0, 0x6, 0x40, 0x0, 0x03000000, 19, 0x03000000 }, + { 0x0, 0x6, 0x41, 0x0, 0x03c00000, 28, 0x03c00000 }, + { 0x0, 0x6, 0x42, 0x0, 0x03d00000, 30, 0x03d00000 }, + { 0x0, 0x6, 0x43, 0x0, 0x01700000, 1, 0x01700000 }, + { 0x0, 0x6, 0x44, 0x0, 0x02c00000, 18, 0x02c00000 }, + { 0x0, 0x6, 0x45, 0x0, 0x02600000, 15, 0x02600000 }, + { 0x0, 0x6, 0x46, 0x0, 0x06000000, 32, 0x06000000 }, + { 0x0, 0x6, 0x47, 0x0, 0x24000000, 71, 0x24000000 }, + { 0x0, 0x7, 0x00, 0x0, 0x12000000, 0, 0x12000000 }, + { 0x0, 0x8, 0x00, 0x0, 0x11000000, 0, 0x11000000 }, + { 0x0, 0x9, 0x00, 0x0, 0x10000000, 0, 0x10000000 }, + { 0x0, 0xa, 0x00, 0x0, 0x22000000, 0, 0x22000000 } +}; + +/* + * BPMP NOC aperture lookup table as per file "BPMP_NOC_Structure.info". 
+ */ +static const char * const tegra194_bpmpnoc_routeid_initflow[] = { /* BPMP NOC init flow names; the [0x..] designator is the array index of each entry */ + [0x0] = "cbb_i/I/0", + [0x1] = "cpu_m_i/I/0", + [0x2] = "cpu_p_i/I/0", + [0x3] = "cvc_i/I/0", + [0x4] = "dma_m_i/I/0", + [0x5] = "dma_p_i/I/0", + [0x6] = "RESERVED", + [0x7] = "RESERVED" +}; +/* + * BPMP NOC targ flow names; the [0x..] designator is the array index of each entry. + * NOTE(review): presumably these indices are the Targ flow values of the + * lookup table below (and the init flow table the Init flow values) - + * confirm against the code that decodes the route id. + */ +static const char * const tegra194_bpmpnoc_routeid_targflow[] = { + [0x00] = "multiport0_t/T/actmon", + [0x01] = "multiport0_t/T/ast_0", + [0x02] = "multiport0_t/T/ast_1", + [0x03] = "multiport0_t/T/atcm_cfg", + [0x04] = "multiport0_t/T/car", + [0x05] = "multiport0_t/T/central_pwr_mgr", + [0x06] = "multiport0_t/T/central_vtg_ctlr", + [0x07] = "multiport0_t/T/cfg", + [0x08] = "multiport0_t/T/dma", + [0x09] = "multiport0_t/T/err_collator", + [0x0a] = "multiport0_t/T/err_collator_car", + [0x0b] = "multiport0_t/T/fpga_misc", + [0x0c] = "multiport0_t/T/fpga_uart", + [0x0d] = "multiport0_t/T/gte", + [0x0e] = "multiport0_t/T/hsp", + [0x0f] = "multiport0_t/T/misc", + [0x10] = "multiport0_t/T/pm", + [0x11] = "multiport0_t/T/simon0", + [0x12] = "multiport0_t/T/simon1", + [0x13] = "multiport0_t/T/simon2", + [0x14] = "multiport0_t/T/simon3", + [0x15] = "multiport0_t/T/simon4", + [0x16] = "multiport0_t/T/soc_therm", + [0x17] = "multiport0_t/T/tke", + [0x18] = "multiport0_t/T/vic_0", + [0x19] = "multiport0_t/T/vic_1", + [0x1a] = "ast0_t/T/0", + [0x1b] = "ast1_t/T/0", + [0x1c] = "bpmp_noc_firewall/T/0", + [0x1d] = "cbb_t/T/0", + [0x1e] = "cpu_t/T/0", + [0x1f] = "svc_t/T/0" +}; + +/* + * Fields of BPMP NOC lookup table: + * Init flow, Targ flow, Targ subrange, Init mapping, Init localAddress, + * Targ mapping, Targ localAddress + * Each { ... } row below lists these seven values in this order; rows are + * grouped by Init flow. + * ---------------------------------------------------------------------------- + */ +static const struct tegra194_cbb_aperture tegra194_bpmpnoc_apert_lookup[] = { + { 0x0, 0x1c, 0x0, 0x0, 0x0d640000, 0, 0x00000000 }, + { 0x0, 0x1e, 0x0, 0x0, 0x0d400000, 0, 0x0d400000 }, + { 0x0, 0x00, 0x0, 0x0, 0x0d230000, 0, 0x00000000 }, + { 0x0, 0x01, 0x0, 0x0, 0x0d040000, 0, 0x00000000 }, + { 0x0, 0x02, 
0x0, 0x0, 0x0d050000, 0, 0x00000000 }, + { 0x0, 0x03, 0x0, 0x0, 0x0d000000, 0, 0x00000000 }, + { 0x0, 0x04, 0x0, 0x0, 0x20ae0000, 3, 0x000e0000 }, + { 0x0, 0x04, 0x1, 0x0, 0x20ac0000, 2, 0x000c0000 }, + { 0x0, 0x04, 0x2, 0x0, 0x20a80000, 1, 0x00080000 }, + { 0x0, 0x04, 0x3, 0x0, 0x20a00000, 0, 0x00000000 }, + { 0x0, 0x05, 0x0, 0x0, 0x0d2a0000, 0, 0x00000000 }, + { 0x0, 0x06, 0x0, 0x0, 0x0d290000, 0, 0x00000000 }, + { 0x0, 0x07, 0x0, 0x0, 0x0d2c0000, 0, 0x00000000 }, + { 0x0, 0x08, 0x0, 0x0, 0x0d0e0000, 4, 0x00080000 }, + { 0x0, 0x08, 0x1, 0x0, 0x0d060000, 0, 0x00000000 }, + { 0x0, 0x08, 0x2, 0x0, 0x0d080000, 1, 0x00020000 }, + { 0x0, 0x08, 0x3, 0x0, 0x0d0a0000, 2, 0x00040000 }, + { 0x0, 0x08, 0x4, 0x0, 0x0d0c0000, 3, 0x00060000 }, + { 0x0, 0x09, 0x0, 0x0, 0x0d650000, 0, 0x00000000 }, + { 0x0, 0x0a, 0x0, 0x0, 0x20af0000, 0, 0x00000000 }, + { 0x0, 0x0b, 0x0, 0x0, 0x0d3e0000, 0, 0x00000000 }, + { 0x0, 0x0c, 0x0, 0x0, 0x0d3d0000, 0, 0x00000000 }, + { 0x0, 0x0d, 0x0, 0x0, 0x0d1e0000, 0, 0x00000000 }, + { 0x0, 0x0e, 0x0, 0x0, 0x0d150000, 0, 0x00000000 }, + { 0x0, 0x0e, 0x1, 0x0, 0x0d160000, 1, 0x00010000 }, + { 0x0, 0x0e, 0x2, 0x0, 0x0d170000, 2, 0x00020000 }, + { 0x0, 0x0e, 0x3, 0x0, 0x0d180000, 3, 0x00030000 }, + { 0x0, 0x0e, 0x4, 0x0, 0x0d190000, 4, 0x00040000 }, + { 0x0, 0x0e, 0x5, 0x0, 0x0d1a0000, 5, 0x00050000 }, + { 0x0, 0x0e, 0x6, 0x0, 0x0d1b0000, 6, 0x00060000 }, + { 0x0, 0x0e, 0x7, 0x0, 0x0d1c0000, 7, 0x00070000 }, + { 0x0, 0x0e, 0x8, 0x0, 0x0d1d0000, 8, 0x00080000 }, + { 0x0, 0x0f, 0x0, 0x0, 0x0d660000, 0, 0x00000000 }, + { 0x0, 0x10, 0x0, 0x0, 0x0d1f0000, 0, 0x00000000 }, + { 0x0, 0x10, 0x1, 0x0, 0x0d200000, 1, 0x00010000 }, + { 0x0, 0x10, 0x2, 0x0, 0x0d210000, 2, 0x00020000 }, + { 0x0, 0x10, 0x3, 0x0, 0x0d220000, 3, 0x00030000 }, + { 0x0, 0x11, 0x0, 0x0, 0x0d240000, 0, 0x00000000 }, + { 0x0, 0x12, 0x0, 0x0, 0x0d250000, 0, 0x00000000 }, + { 0x0, 0x13, 0x0, 0x0, 0x0d260000, 0, 0x00000000 }, + { 0x0, 0x14, 0x0, 0x0, 0x0d270000, 0, 0x00000000 }, + { 0x0, 0x15, 
0x0, 0x0, 0x0d2b0000, 0, 0x00000000 }, + { 0x0, 0x16, 0x0, 0x0, 0x0d280000, 0, 0x00000000 }, + { 0x0, 0x17, 0x0, 0x0, 0x0d0f0000, 0, 0x00000000 }, + { 0x0, 0x17, 0x1, 0x0, 0x0d100000, 1, 0x00010000 }, + { 0x0, 0x17, 0x2, 0x0, 0x0d110000, 2, 0x00020000 }, + { 0x0, 0x17, 0x3, 0x0, 0x0d120000, 3, 0x00030000 }, + { 0x0, 0x17, 0x4, 0x0, 0x0d130000, 4, 0x00040000 }, + { 0x0, 0x17, 0x5, 0x0, 0x0d140000, 5, 0x00050000 }, + { 0x0, 0x18, 0x0, 0x0, 0x0d020000, 0, 0x00000000 }, + { 0x0, 0x19, 0x0, 0x0, 0x0d030000, 0, 0x00000000 }, + { 0x0, 0x1f, 0x0, 0x0, 0x0d600000, 0, 0x00000000 }, + { 0x0, 0x1f, 0x1, 0x0, 0x00000000, 0, 0x00000000 }, + { 0x1, 0x1a, 0x0, 0x0, 0x40000000, 0, 0x40000000 }, + { 0x1, 0x1a, 0x1, 0x1, 0x80000000, 1, 0x80000000 }, + { 0x1, 0x1a, 0x2, 0x0, 0x00000000, 0, 0x00000000 }, + { 0x2, 0x1c, 0x0, 0x0, 0x0d640000, 0, 0x00000000 }, + { 0x2, 0x1d, 0x0, 0x0, 0x20b00000, 8, 0x20b00000 }, + { 0x2, 0x1d, 0x1, 0x0, 0x20800000, 7, 0x20800000 }, + { 0x2, 0x1d, 0x2, 0x0, 0x20c00000, 9, 0x20c00000 }, + { 0x2, 0x1d, 0x3, 0x0, 0x0d800000, 3, 0x0d800000 }, + { 0x2, 0x1d, 0x4, 0x0, 0x20000000, 6, 0x20000000 }, + { 0x2, 0x1d, 0x5, 0x0, 0x0c000000, 2, 0x0c000000 }, + { 0x2, 0x1d, 0x6, 0x0, 0x21000000, 10, 0x21000000 }, + { 0x2, 0x1d, 0x7, 0x0, 0x0e000000, 4, 0x0e000000 }, + { 0x2, 0x1d, 0x8, 0x0, 0x22000000, 11, 0x22000000 }, + { 0x2, 0x1d, 0x9, 0x0, 0x08000000, 1, 0x08000000 }, + { 0x2, 0x1d, 0xa, 0x0, 0x24000000, 12, 0x24000000 }, + { 0x2, 0x1d, 0xb, 0x0, 0x00000000, 0, 0x00000000 }, + { 0x2, 0x1d, 0xc, 0x0, 0x28000000, 13, 0x28000000 }, + { 0x2, 0x1d, 0xd, 0x0, 0x10000000, 5, 0x10000000 }, + { 0x2, 0x1d, 0xe, 0x0, 0x30000000, 14, 0x30000000 }, + { 0x2, 0x00, 0x0, 0x0, 0x0d230000, 0, 0x00000000 }, + { 0x2, 0x01, 0x0, 0x0, 0x0d040000, 0, 0x00000000 }, + { 0x2, 0x02, 0x0, 0x0, 0x0d050000, 0, 0x00000000 }, + { 0x2, 0x03, 0x0, 0x0, 0x0d000000, 0, 0x00000000 }, + { 0x2, 0x04, 0x0, 0x0, 0x20ae0000, 3, 0x000e0000 }, + { 0x2, 0x04, 0x1, 0x0, 0x20ac0000, 2, 0x000c0000 }, + { 0x2, 
0x04, 0x2, 0x0, 0x20a80000, 1, 0x00080000 }, + { 0x2, 0x04, 0x3, 0x0, 0x20a00000, 0, 0x00000000 }, + { 0x2, 0x05, 0x0, 0x0, 0x0d2a0000, 0, 0x00000000 }, + { 0x2, 0x06, 0x0, 0x0, 0x0d290000, 0, 0x00000000 }, + { 0x2, 0x07, 0x0, 0x0, 0x0d2c0000, 0, 0x00000000 }, + { 0x2, 0x08, 0x0, 0x0, 0x0d0e0000, 4, 0x00080000 }, + { 0x2, 0x08, 0x1, 0x0, 0x0d060000, 0, 0x00000000 }, + { 0x2, 0x08, 0x2, 0x0, 0x0d080000, 1, 0x00020000 }, + { 0x2, 0x08, 0x3, 0x0, 0x0d0a0000, 2, 0x00040000 }, + { 0x2, 0x08, 0x4, 0x0, 0x0d0c0000, 3, 0x00060000 }, + { 0x2, 0x09, 0x0, 0x0, 0x0d650000, 0, 0x00000000 }, + { 0x2, 0x0a, 0x0, 0x0, 0x20af0000, 0, 0x00000000 }, + { 0x2, 0x0b, 0x0, 0x0, 0x0d3e0000, 0, 0x00000000 }, + { 0x2, 0x0c, 0x0, 0x0, 0x0d3d0000, 0, 0x00000000 }, + { 0x2, 0x0d, 0x0, 0x0, 0x0d1e0000, 0, 0x00000000 }, + { 0x2, 0x0e, 0x0, 0x0, 0x0d150000, 0, 0x00000000 }, + { 0x2, 0x0e, 0x1, 0x0, 0x0d160000, 1, 0x00010000 }, + { 0x2, 0x0e, 0x2, 0x0, 0x0d170000, 2, 0x00020000 }, + { 0x2, 0x0e, 0x3, 0x0, 0x0d180000, 3, 0x00030000 }, + { 0x2, 0x0e, 0x4, 0x0, 0x0d190000, 4, 0x00040000 }, + { 0x2, 0x0e, 0x5, 0x0, 0x0d1a0000, 5, 0x00050000 }, + { 0x2, 0x0e, 0x6, 0x0, 0x0d1b0000, 6, 0x00060000 }, + { 0x2, 0x0e, 0x7, 0x0, 0x0d1c0000, 7, 0x00070000 }, + { 0x2, 0x0e, 0x8, 0x0, 0x0d1d0000, 8, 0x00080000 }, + { 0x2, 0x0f, 0x0, 0x0, 0x0d660000, 0, 0x00000000 }, + { 0x2, 0x10, 0x0, 0x0, 0x0d1f0000, 0, 0x00000000 }, + { 0x2, 0x10, 0x1, 0x0, 0x0d200000, 1, 0x00010000 }, + { 0x2, 0x10, 0x2, 0x0, 0x0d210000, 2, 0x00020000 }, + { 0x2, 0x10, 0x3, 0x0, 0x0d220000, 3, 0x00030000 }, + { 0x2, 0x11, 0x0, 0x0, 0x0d240000, 0, 0x00000000 }, + { 0x2, 0x12, 0x0, 0x0, 0x0d250000, 0, 0x00000000 }, + { 0x2, 0x13, 0x0, 0x0, 0x0d260000, 0, 0x00000000 }, + { 0x2, 0x14, 0x0, 0x0, 0x0d270000, 0, 0x00000000 }, + { 0x2, 0x15, 0x0, 0x0, 0x0d2b0000, 0, 0x00000000 }, + { 0x2, 0x16, 0x0, 0x0, 0x0d280000, 0, 0x00000000 }, + { 0x2, 0x17, 0x0, 0x0, 0x0d0f0000, 0, 0x00000000 }, + { 0x2, 0x17, 0x1, 0x0, 0x0d100000, 1, 0x00010000 }, + { 0x2, 
0x17, 0x2, 0x0, 0x0d110000, 2, 0x00020000 }, + { 0x2, 0x17, 0x3, 0x0, 0x0d120000, 3, 0x00030000 }, + { 0x2, 0x17, 0x4, 0x0, 0x0d130000, 4, 0x00040000 }, + { 0x2, 0x17, 0x5, 0x0, 0x0d140000, 5, 0x00050000 }, + { 0x2, 0x18, 0x0, 0x0, 0x0d020000, 0, 0x00000000 }, + { 0x2, 0x19, 0x0, 0x0, 0x0d030000, 0, 0x00000000 }, + { 0x2, 0x1f, 0x0, 0x0, 0x0d600000, 0, 0x00000000 }, + { 0x2, 0x1f, 0x1, 0x0, 0x00000000, 0, 0x00000000 }, + { 0x3, 0x1b, 0x0, 0x0, 0x40000000, 0, 0x40000000 }, + { 0x3, 0x1b, 0x1, 0x1, 0x80000000, 1, 0x80000000 }, + { 0x3, 0x1c, 0x0, 0x2, 0x0d640000, 0, 0x00000000 }, + { 0x3, 0x1d, 0x0, 0x2, 0x20b00000, 8, 0x20b00000 }, + { 0x3, 0x1d, 0x1, 0x2, 0x20800000, 7, 0x20800000 }, + { 0x3, 0x1d, 0x2, 0x2, 0x20c00000, 9, 0x20c00000 }, + { 0x3, 0x1d, 0x3, 0x2, 0x0d800000, 3, 0x0d800000 }, + { 0x3, 0x1d, 0x4, 0x2, 0x20000000, 6, 0x20000000 }, + { 0x3, 0x1d, 0x5, 0x2, 0x0c000000, 2, 0x0c000000 }, + { 0x3, 0x1d, 0x6, 0x2, 0x21000000, 10, 0x21000000 }, + { 0x3, 0x1d, 0x7, 0x2, 0x0e000000, 4, 0x0e000000 }, + { 0x3, 0x1d, 0x8, 0x2, 0x22000000, 11, 0x22000000 }, + { 0x3, 0x1d, 0x9, 0x2, 0x08000000, 1, 0x08000000 }, + { 0x3, 0x1d, 0xa, 0x2, 0x24000000, 12, 0x24000000 }, + { 0x3, 0x1d, 0xb, 0x2, 0x00000000, 0, 0x00000000 }, + { 0x3, 0x1d, 0xc, 0x2, 0x28000000, 13, 0x28000000 }, + { 0x3, 0x1d, 0xd, 0x2, 0x10000000, 5, 0x10000000 }, + { 0x3, 0x1d, 0xe, 0x2, 0x30000000, 14, 0x30000000 }, + { 0x3, 0x1e, 0x0, 0x2, 0x0d400000, 0, 0x0d400000 }, + { 0x3, 0x00, 0x0, 0x2, 0x0d230000, 0, 0x00000000 }, + { 0x3, 0x01, 0x0, 0x2, 0x0d040000, 0, 0x00000000 }, + { 0x3, 0x02, 0x0, 0x2, 0x0d050000, 0, 0x00000000 }, + { 0x3, 0x03, 0x0, 0x2, 0x0d000000, 0, 0x00000000 }, + { 0x3, 0x04, 0x0, 0x2, 0x20ae0000, 3, 0x000e0000 }, + { 0x3, 0x04, 0x1, 0x2, 0x20ac0000, 2, 0x000c0000 }, + { 0x3, 0x04, 0x2, 0x2, 0x20a80000, 1, 0x00080000 }, + { 0x3, 0x04, 0x3, 0x2, 0x20a00000, 0, 0x00000000 }, + { 0x3, 0x05, 0x0, 0x2, 0x0d2a0000, 0, 0x00000000 }, + { 0x3, 0x06, 0x0, 0x2, 0x0d290000, 0, 0x00000000 }, + { 
0x3, 0x07, 0x0, 0x2, 0x0d2c0000, 0, 0x00000000 }, + { 0x3, 0x08, 0x0, 0x2, 0x0d0e0000, 4, 0x00080000 }, + { 0x3, 0x08, 0x1, 0x2, 0x0d060000, 0, 0x00000000 }, + { 0x3, 0x08, 0x2, 0x2, 0x0d080000, 1, 0x00020000 }, + { 0x3, 0x08, 0x3, 0x2, 0x0d0a0000, 2, 0x00040000 }, + { 0x3, 0x08, 0x4, 0x2, 0x0d0c0000, 3, 0x00060000 }, + { 0x3, 0x09, 0x0, 0x2, 0x0d650000, 0, 0x00000000 }, + { 0x3, 0x0a, 0x0, 0x2, 0x20af0000, 0, 0x00000000 }, + { 0x3, 0x0b, 0x0, 0x2, 0x0d3e0000, 0, 0x00000000 }, + { 0x3, 0x0c, 0x0, 0x2, 0x0d3d0000, 0, 0x00000000 }, + { 0x3, 0x0d, 0x0, 0x2, 0x0d1e0000, 0, 0x00000000 }, + { 0x3, 0x0e, 0x0, 0x2, 0x0d150000, 0, 0x00000000 }, + { 0x3, 0x0e, 0x1, 0x2, 0x0d160000, 1, 0x00010000 }, + { 0x3, 0x0e, 0x2, 0x2, 0x0d170000, 2, 0x00020000 }, + { 0x3, 0x0e, 0x3, 0x2, 0x0d180000, 3, 0x00030000 }, + { 0x3, 0x0e, 0x4, 0x2, 0x0d190000, 4, 0x00040000 }, + { 0x3, 0x0e, 0x5, 0x2, 0x0d1a0000, 5, 0x00050000 }, + { 0x3, 0x0e, 0x6, 0x2, 0x0d1b0000, 6, 0x00060000 }, + { 0x3, 0x0e, 0x7, 0x2, 0x0d1c0000, 7, 0x00070000 }, + { 0x3, 0x0e, 0x8, 0x2, 0x0d1d0000, 8, 0x00080000 }, + { 0x3, 0x0f, 0x0, 0x2, 0x0d660000, 0, 0x00000000 }, + { 0x3, 0x10, 0x0, 0x2, 0x0d1f0000, 0, 0x00000000 }, + { 0x3, 0x10, 0x1, 0x2, 0x0d200000, 1, 0x00010000 }, + { 0x3, 0x10, 0x2, 0x2, 0x0d210000, 2, 0x00020000 }, + { 0x3, 0x10, 0x3, 0x2, 0x0d220000, 3, 0x00030000 }, + { 0x3, 0x11, 0x0, 0x2, 0x0d240000, 0, 0x00000000 }, + { 0x3, 0x12, 0x0, 0x2, 0x0d250000, 0, 0x00000000 }, + { 0x3, 0x13, 0x0, 0x2, 0x0d260000, 0, 0x00000000 }, + { 0x3, 0x14, 0x0, 0x2, 0x0d270000, 0, 0x00000000 }, + { 0x3, 0x15, 0x0, 0x2, 0x0d2b0000, 0, 0x00000000 }, + { 0x3, 0x16, 0x0, 0x2, 0x0d280000, 0, 0x00000000 }, + { 0x3, 0x17, 0x0, 0x2, 0x0d0f0000, 0, 0x00000000 }, + { 0x3, 0x17, 0x1, 0x2, 0x0d100000, 1, 0x00010000 }, + { 0x3, 0x17, 0x2, 0x2, 0x0d110000, 2, 0x00020000 }, + { 0x3, 0x17, 0x3, 0x2, 0x0d120000, 3, 0x00030000 }, + { 0x3, 0x17, 0x4, 0x2, 0x0d130000, 4, 0x00040000 }, + { 0x3, 0x17, 0x5, 0x2, 0x0d140000, 5, 0x00050000 }, + { 
0x3, 0x18, 0x0, 0x2, 0x0d020000, 0, 0x00000000 }, + { 0x3, 0x19, 0x0, 0x2, 0x0d030000, 0, 0x00000000 }, + { 0x3, 0x1f, 0x0, 0x2, 0x0d600000, 0, 0x00000000 }, + { 0x3, 0x1f, 0x1, 0x0, 0x00000000, 0, 0x00000000 }, + { 0x4, 0x1b, 0x0, 0x0, 0x40000000, 0, 0x40000000 }, + { 0x4, 0x1b, 0x1, 0x1, 0x80000000, 1, 0x80000000 }, + { 0x4, 0x1e, 0x0, 0x2, 0x0d400000, 0, 0x0d400000 }, + { 0x4, 0x1e, 0x1, 0x0, 0x00000000, 0, 0x00000000 }, + { 0x5, 0x1c, 0x0, 0x0, 0x0d640000, 0, 0x00000000 }, + { 0x5, 0x1d, 0x0, 0x0, 0x20b00000, 8, 0x20b00000 }, + { 0x5, 0x1d, 0x1, 0x0, 0x20800000, 7, 0x20800000 }, + { 0x5, 0x1d, 0x2, 0x0, 0x20c00000, 9, 0x20c00000 }, + { 0x5, 0x1d, 0x3, 0x0, 0x0d800000, 3, 0x0d800000 }, + { 0x5, 0x1d, 0x4, 0x0, 0x20000000, 6, 0x20000000 }, + { 0x5, 0x1d, 0x5, 0x0, 0x0c000000, 2, 0x0c000000 }, + { 0x5, 0x1d, 0x6, 0x0, 0x21000000, 10, 0x21000000 }, + { 0x5, 0x1d, 0x7, 0x0, 0x0e000000, 4, 0x0e000000 }, + { 0x5, 0x1d, 0x8, 0x0, 0x22000000, 11, 0x22000000 }, + { 0x5, 0x1d, 0x9, 0x0, 0x08000000, 1, 0x08000000 }, + { 0x5, 0x1d, 0xa, 0x0, 0x24000000, 12, 0x24000000 }, + { 0x5, 0x1d, 0xb, 0x0, 0x00000000, 0, 0x00000000 }, + { 0x5, 0x1d, 0xc, 0x0, 0x28000000, 13, 0x28000000 }, + { 0x5, 0x1d, 0xd, 0x0, 0x10000000, 5, 0x10000000 }, + { 0x5, 0x1d, 0xe, 0x0, 0x30000000, 14, 0x30000000 }, + { 0x5, 0x00, 0x0, 0x0, 0x0d230000, 0, 0x00000000 }, + { 0x5, 0x01, 0x0, 0x0, 0x0d040000, 0, 0x00000000 }, + { 0x5, 0x02, 0x0, 0x0, 0x0d050000, 0, 0x00000000 }, + { 0x5, 0x03, 0x0, 0x0, 0x0d000000, 0, 0x00000000 }, + { 0x5, 0x04, 0x0, 0x0, 0x20ae0000, 3, 0x000e0000 }, + { 0x5, 0x04, 0x1, 0x0, 0x20ac0000, 2, 0x000c0000 }, + { 0x5, 0x04, 0x2, 0x0, 0x20a80000, 1, 0x00080000 }, + { 0x5, 0x04, 0x3, 0x0, 0x20a00000, 0, 0x00000000 }, + { 0x5, 0x05, 0x0, 0x0, 0x0d2a0000, 0, 0x00000000 }, + { 0x5, 0x06, 0x0, 0x0, 0x0d290000, 0, 0x00000000 }, + { 0x5, 0x07, 0x0, 0x0, 0x0d2c0000, 0, 0x00000000 }, + { 0x5, 0x08, 0x0, 0x0, 0x0d0e0000, 4, 0x00080000 }, + { 0x5, 0x08, 0x1, 0x0, 0x0d060000, 0, 0x00000000 }, 
+ { 0x5, 0x08, 0x2, 0x0, 0x0d080000, 1, 0x00020000 }, + { 0x5, 0x08, 0x3, 0x0, 0x0d0a0000, 2, 0x00040000 }, + { 0x5, 0x08, 0x4, 0x0, 0x0d0c0000, 3, 0x00060000 }, + { 0x5, 0x09, 0x0, 0x0, 0x0d650000, 0, 0x00000000 }, + { 0x5, 0x0a, 0x0, 0x0, 0x20af0000, 0, 0x00000000 }, + { 0x5, 0x0b, 0x0, 0x0, 0x0d3e0000, 0, 0x00000000 }, + { 0x5, 0x0c, 0x0, 0x0, 0x0d3d0000, 0, 0x00000000 }, + { 0x5, 0x0d, 0x0, 0x0, 0x0d1e0000, 0, 0x00000000 }, + { 0x5, 0x0e, 0x0, 0x0, 0x0d150000, 0, 0x00000000 }, + { 0x5, 0x0e, 0x1, 0x0, 0x0d160000, 1, 0x00010000 }, + { 0x5, 0x0e, 0x2, 0x0, 0x0d170000, 2, 0x00020000 }, + { 0x5, 0x0e, 0x3, 0x0, 0x0d180000, 3, 0x00030000 }, + { 0x5, 0x0e, 0x4, 0x0, 0x0d190000, 4, 0x00040000 }, + { 0x5, 0x0e, 0x5, 0x0, 0x0d1a0000, 5, 0x00050000 }, + { 0x5, 0x0e, 0x6, 0x0, 0x0d1b0000, 6, 0x00060000 }, + { 0x5, 0x0e, 0x7, 0x0, 0x0d1c0000, 7, 0x00070000 }, + { 0x5, 0x0e, 0x8, 0x0, 0x0d1d0000, 8, 0x00080000 }, + { 0x5, 0x0f, 0x0, 0x0, 0x0d660000, 0, 0x00000000 }, + { 0x5, 0x10, 0x0, 0x0, 0x0d1f0000, 0, 0x00000000 }, + { 0x5, 0x10, 0x1, 0x0, 0x0d200000, 1, 0x00010000 }, + { 0x5, 0x10, 0x2, 0x0, 0x0d210000, 2, 0x00020000 }, + { 0x5, 0x10, 0x3, 0x0, 0x0d220000, 3, 0x00030000 }, + { 0x5, 0x11, 0x0, 0x0, 0x0d240000, 0, 0x00000000 }, + { 0x5, 0x12, 0x0, 0x0, 0x0d250000, 0, 0x00000000 }, + { 0x5, 0x13, 0x0, 0x0, 0x0d260000, 0, 0x00000000 }, + { 0x5, 0x14, 0x0, 0x0, 0x0d270000, 0, 0x00000000 }, + { 0x5, 0x15, 0x0, 0x0, 0x0d2b0000, 0, 0x00000000 }, + { 0x5, 0x16, 0x0, 0x0, 0x0d280000, 0, 0x00000000 }, + { 0x5, 0x17, 0x0, 0x0, 0x0d0f0000, 0, 0x00000000 }, + { 0x5, 0x17, 0x1, 0x0, 0x0d100000, 1, 0x00010000 }, + { 0x5, 0x17, 0x2, 0x0, 0x0d110000, 2, 0x00020000 }, + { 0x5, 0x17, 0x3, 0x0, 0x0d120000, 3, 0x00030000 }, + { 0x5, 0x17, 0x4, 0x0, 0x0d130000, 4, 0x00040000 }, + { 0x5, 0x17, 0x5, 0x0, 0x0d140000, 5, 0x00050000 }, + { 0x5, 0x18, 0x0, 0x0, 0x0d020000, 0, 0x00000000 }, + { 0x5, 0x19, 0x0, 0x0, 0x0d030000, 0, 0x00000000 }, + { 0x5, 0x1f, 0x0, 0x0, 0x0d600000, 0, 0x00000000 }, + 
{ 0x5, 0x1f, 0x1, 0x0, 0x00000000, 0, 0x00000000 } +}; + +/* + * AON NOC aperture lookup table as per file "AON_NOC_Structure.info". + */ +static const char * const tegra194_aonnoc_routeid_initflow[] = { /* AON NOC init flow names; the [0x..] designator is the array index of each entry */ + [0x0] = "cbb_i/I/0", + [0x1] = "cpu_p_i/I/0", + [0x2] = "dma_m_i/I/0", + [0x3] = "dma_p_i/I/0" +}; +/* + * AON NOC targ flow names; the [0x..] designator is the array index of each entry. + * Several indices (e.g. 0x11-0x1f, 0x2d-0x2f) hold the string RESERVED + * instead of a flow name. + * NOTE(review): presumably indexed by the Targ flow value of the AON NOC + * lookup table - confirm against the code that decodes the route id. + */ +static const char * const tegra194_aonnoc_routeid_targflow[] = { + [0x00] = "multiport1_t/T/aon_misc", + [0x01] = "multiport1_t/T/avic0", + [0x02] = "multiport1_t/T/avic1", + [0x03] = "multiport1_t/T/can1", + [0x04] = "multiport1_t/T/can2", + [0x05] = "multiport1_t/T/dma", + [0x06] = "multiport1_t/T/dmic", + [0x07] = "multiport1_t/T/err_collator", + [0x08] = "multiport1_t/T/fpga_misc", + [0x09] = "multiport1_t/T/gte", + [0x0a] = "multiport1_t/T/hsp", + [0x0b] = "multiport1_t/T/i2c2", + [0x0c] = "multiport1_t/T/i2c8", + [0x0d] = "multiport1_t/T/pwm", + [0x0e] = "multiport1_t/T/spi2", + [0x0f] = "multiport1_t/T/tke", + [0x10] = "multiport1_t/T/uartg", + [0x11] = "RESERVED", + [0x12] = "RESERVED", + [0x13] = "RESERVED", + [0x14] = "RESERVED", + [0x15] = "RESERVED", + [0x16] = "RESERVED", + [0x17] = "RESERVED", + [0x18] = "RESERVED", + [0x19] = "RESERVED", + [0x1a] = "RESERVED", + [0x1b] = "RESERVED", + [0x1c] = "RESERVED", + [0x1d] = "RESERVED", + [0x1e] = "RESERVED", + [0x1f] = "RESERVED", + [0x20] = "multiport0_t/T/aovc", + [0x21] = "multiport0_t/T/atcm", + [0x22] = "multiport0_t/T/cast", + [0x23] = "multiport0_t/T/dast", + [0x24] = "multiport0_t/T/err_collator_car", + [0x25] = "multiport0_t/T/gpio", + [0x26] = "multiport0_t/T/i2c10", + [0x27] = "multiport0_t/T/mss", + [0x28] = "multiport0_t/T/padctl_a12", + [0x29] = "multiport0_t/T/padctl_a14", + [0x2a] = "multiport0_t/T/padctl_a15", + [0x2b] = "multiport0_t/T/rtc", + [0x2c] = "multiport0_t/T/tsc", + [0x2d] = "RESERVED", + [0x2e] = "RESERVED", + [0x2f] = "RESERVED", + [0x30] = "multiport2_t/T/aon_vref_ro", + [0x31] = "multiport2_t/T/aopm", + [0x32] = "multiport2_t/T/car", + [0x33] = "multiport2_t/T/pmc", + [0x34] 
= "ast1_t/T/0", + [0x35] = "cbb_t/T/0", + [0x36] = "cpu_t/T/0", + [0x37] = "firewall_t/T/0", + [0x38] = "svc_t/T/0", + [0x39] = "uartc/T/uartc", + [0x3a] = "RESERVED", + [0x3b] = "RESERVED", + [0x3c] = "RESERVED", + [0x3d] = "RESERVED", + [0x3e] = "RESERVED", + [0x3f] = "RESERVED" +}; + +/* + * Fields of AON NOC lookup table: + * Init flow, Targ flow, Targ subrange, Init mapping, Init localAddress, + * Targ mapping, Targ localAddress + * ---------------------------------------------------------------------------- + */ +static const struct tegra194_cbb_aperture tegra194_aonnoc_aperture_lookup[] = { + { 0x0, 0x37, 0x00, 0, 0x0c640000, 0, 0x00000000 }, + { 0x0, 0x20, 0x00, 0, 0x0c3b0000, 0, 0x00000000 }, + { 0x0, 0x21, 0x00, 0, 0x0c000000, 0, 0x00000000 }, + { 0x0, 0x22, 0x00, 0, 0x0c040000, 0, 0x00000000 }, + { 0x0, 0x23, 0x00, 0, 0x0c050000, 0, 0x00000000 }, + { 0x0, 0x24, 0x00, 0, 0x20cf0000, 0, 0x00000000 }, + { 0x0, 0x25, 0x00, 0, 0x0c2f0000, 0, 0x00000000 }, + { 0x0, 0x26, 0x00, 0, 0x0c230000, 0, 0x00000000 }, + { 0x0, 0x27, 0x00, 0, 0x0c350000, 0, 0x00000000 }, + { 0x0, 0x28, 0x00, 0, 0x0c301000, 0, 0x00000000 }, + { 0x0, 0x29, 0x00, 0, 0x0c302000, 0, 0x00000000 }, + { 0x0, 0x2a, 0x00, 0, 0x0c303000, 0, 0x00000000 }, + { 0x0, 0x2b, 0x00, 0, 0x0c2a0000, 0, 0x00000000 }, + { 0x0, 0x2c, 0x00, 0, 0x0c2b0000, 0, 0x00000000 }, + { 0x0, 0x2c, 0x01, 0, 0x0c2c0000, 1, 0x00010000 }, + { 0x0, 0x2c, 0x02, 0, 0x0c2d0000, 2, 0x00020000 }, + { 0x0, 0x2c, 0x03, 0, 0x0c2e0000, 3, 0x00030000 }, + { 0x0, 0x00, 0x00, 0, 0x0c660000, 0, 0x00000000 }, + { 0x0, 0x01, 0x00, 0, 0x0c020000, 0, 0x00000000 }, + { 0x0, 0x02, 0x00, 0, 0x0c030000, 0, 0x00000000 }, + { 0x0, 0x03, 0x00, 0, 0x0c310000, 0, 0x00000000 }, + { 0x0, 0x04, 0x00, 0, 0x0c320000, 0, 0x00000000 }, + { 0x0, 0x05, 0x00, 0, 0x0c0a0000, 2, 0x00040000 }, + { 0x0, 0x05, 0x01, 0, 0x0c0b0000, 3, 0x00050000 }, + { 0x0, 0x05, 0x02, 0, 0x0c0e0000, 5, 0x00080000 }, + { 0x0, 0x05, 0x03, 0, 0x0c060000, 0, 0x00000000 }, + { 0x0, 0x05, 
0x04, 0, 0x0c080000, 1, 0x00020000 }, + { 0x0, 0x05, 0x05, 0, 0x0c0c0000, 4, 0x00060000 }, + { 0x0, 0x06, 0x00, 0, 0x0c330000, 0, 0x00000000 }, + { 0x0, 0x07, 0x00, 0, 0x0c650000, 0, 0x00000000 }, + { 0x0, 0x08, 0x00, 0, 0x0c3e0000, 0, 0x00000000 }, + { 0x0, 0x09, 0x00, 0, 0x0c1e0000, 0, 0x00000000 }, + { 0x0, 0x0a, 0x00, 0, 0x0c150000, 0, 0x00000000 }, + { 0x0, 0x0a, 0x01, 0, 0x0c160000, 1, 0x00010000 }, + { 0x0, 0x0a, 0x02, 0, 0x0c170000, 2, 0x00020000 }, + { 0x0, 0x0a, 0x03, 0, 0x0c180000, 3, 0x00030000 }, + { 0x0, 0x0a, 0x04, 0, 0x0c190000, 4, 0x00040000 }, + { 0x0, 0x0a, 0x05, 0, 0x0c1a0000, 5, 0x00050000 }, + { 0x0, 0x0a, 0x06, 0, 0x0c1b0000, 6, 0x00060000 }, + { 0x0, 0x0a, 0x07, 0, 0x0c1c0000, 7, 0x00070000 }, + { 0x0, 0x0a, 0x08, 0, 0x0c1d0000, 8, 0x00080000 }, + { 0x0, 0x0b, 0x00, 0, 0x0c240000, 0, 0x00000000 }, + { 0x0, 0x0c, 0x00, 0, 0x0c250000, 0, 0x00000000 }, + { 0x0, 0x0d, 0x00, 0, 0x0c340000, 0, 0x00000000 }, + { 0x0, 0x0e, 0x00, 0, 0x0c260000, 0, 0x00000000 }, + { 0x0, 0x0f, 0x00, 0, 0x0c0f0000, 0, 0x00000000 }, + { 0x0, 0x0f, 0x01, 0, 0x0c100000, 1, 0x00010000 }, + { 0x0, 0x0f, 0x02, 0, 0x0c110000, 2, 0x00020000 }, + { 0x0, 0x0f, 0x03, 0, 0x0c120000, 3, 0x00030000 }, + { 0x0, 0x0f, 0x04, 0, 0x0c130000, 4, 0x00040000 }, + { 0x0, 0x0f, 0x05, 0, 0x0c140000, 5, 0x00050000 }, + { 0x0, 0x10, 0x00, 0, 0x0c290000, 0, 0x00000000 }, + { 0x0, 0x30, 0x00, 0, 0x20ce0000, 0, 0x00000000 }, + { 0x0, 0x31, 0x00, 0, 0x0c1f0000, 0, 0x00000000 }, + { 0x0, 0x31, 0x01, 0, 0x0c200000, 1, 0x00010000 }, + { 0x0, 0x31, 0x02, 0, 0x0c210000, 2, 0x00020000 }, + { 0x0, 0x31, 0x03, 0, 0x0c220000, 3, 0x00030000 }, + { 0x0, 0x32, 0x00, 0, 0x20cc0000, 3, 0x001c0000 }, + { 0x0, 0x32, 0x01, 0, 0x20c80000, 2, 0x00180000 }, + { 0x0, 0x32, 0x02, 0, 0x20c00000, 1, 0x00100000 }, + { 0x0, 0x32, 0x03, 0, 0x20b00000, 0, 0x00000000 }, + { 0x0, 0x33, 0x00, 0, 0x0c360000, 0, 0x00000000 }, + { 0x0, 0x33, 0x01, 0, 0x0c370000, 1, 0x00010000 }, + { 0x0, 0x33, 0x02, 0, 0x0c3a0000, 3, 0x00040000 }, 
+ { 0x0, 0x33, 0x03, 0, 0x0c380000, 2, 0x00020000 }, + { 0x0, 0x38, 0x00, 0, 0x0c600000, 0, 0x00000000 }, + { 0x0, 0x38, 0x01, 0, 0x00000000, 0, 0x00000000 }, + { 0x0, 0x39, 0x00, 0, 0x0c280000, 0, 0x00000000 }, + { 0x1, 0x35, 0x00, 0, 0x00000000, 0, 0x00000000 }, + { 0x1, 0x35, 0x01, 0, 0x00100000, 1, 0x00100000 }, + { 0x1, 0x35, 0x02, 0, 0x05a00000, 11, 0x05a00000 }, + { 0x1, 0x35, 0x03, 0, 0x05b00000, 32, 0x05b00000 }, + { 0x1, 0x35, 0x04, 0, 0x05c00000, 33, 0x05c00000 }, + { 0x1, 0x35, 0x05, 0, 0x05d00000, 12, 0x05d00000 }, + { 0x1, 0x35, 0x06, 0, 0x20000000, 19, 0x20000000 }, + { 0x1, 0x35, 0x07, 0, 0x20100000, 20, 0x20100000 }, + { 0x1, 0x35, 0x08, 0, 0x20a00000, 24, 0x20a00000 }, + { 0x1, 0x35, 0x09, 0, 0x20d00000, 25, 0x20d00000 }, + { 0x1, 0x35, 0x0a, 0, 0x00200000, 2, 0x00200000 }, + { 0x1, 0x35, 0x0b, 0, 0x05800000, 10, 0x05800000 }, + { 0x1, 0x35, 0x0c, 0, 0x05e00000, 13, 0x05e00000 }, + { 0x1, 0x35, 0x0d, 0, 0x20200000, 21, 0x20200000 }, + { 0x1, 0x35, 0x0e, 0, 0x20800000, 23, 0x20800000 }, + { 0x1, 0x35, 0x0f, 0, 0x20e00000, 26, 0x20e00000 }, + { 0x1, 0x35, 0x10, 0, 0x00400000, 3, 0x00400000 }, + { 0x1, 0x35, 0x11, 0, 0x20400000, 22, 0x20400000 }, + { 0x1, 0x35, 0x12, 0, 0x00800000, 4, 0x00800000 }, + { 0x1, 0x35, 0x13, 0, 0x05000000, 9, 0x05000000 }, + { 0x1, 0x35, 0x14, 0, 0x0c800000, 34, 0x0c800000 }, + { 0x1, 0x35, 0x15, 0, 0x01000000, 5, 0x01000000 }, + { 0x1, 0x35, 0x16, 0, 0x03000000, 7, 0x03000000 }, + { 0x1, 0x35, 0x17, 0, 0x04000000, 8, 0x04000000 }, + { 0x1, 0x35, 0x18, 0, 0x0d000000, 16, 0x0d000000 }, + { 0x1, 0x35, 0x19, 0, 0x21000000, 27, 0x21000000 }, + { 0x1, 0x35, 0x1a, 0, 0x02000000, 6, 0x02000000 }, + { 0x1, 0x35, 0x1b, 0, 0x06000000, 14, 0x06000000 }, + { 0x1, 0x35, 0x1c, 0, 0x0e000000, 17, 0x0e000000 }, + { 0x1, 0x35, 0x1d, 0, 0x22000000, 28, 0x22000000 }, + { 0x1, 0x35, 0x1e, 0, 0x08000000, 15, 0x08000000 }, + { 0x1, 0x35, 0x1f, 0, 0x24000000, 29, 0x24000000 }, + { 0x1, 0x35, 0x20, 0, 0x28000000, 30, 0x28000000 }, + { 0x1, 0x35, 
0x21, 0, 0x10000000, 18, 0x10000000 }, + { 0x1, 0x35, 0x22, 0, 0x30000000, 31, 0x30000000 }, + { 0x1, 0x37, 0x00, 0, 0x0c640000, 0, 0x00000000 }, + { 0x1, 0x20, 0x00, 0, 0x0c3b0000, 0, 0x00000000 }, + { 0x1, 0x21, 0x00, 0, 0x0c000000, 0, 0x00000000 }, + { 0x1, 0x22, 0x00, 0, 0x0c040000, 0, 0x00000000 }, + { 0x1, 0x23, 0x00, 0, 0x0c050000, 0, 0x00000000 }, + { 0x1, 0x24, 0x00, 0, 0x20cf0000, 0, 0x00000000 }, + { 0x1, 0x25, 0x00, 0, 0x0c2f0000, 0, 0x00000000 }, + { 0x1, 0x26, 0x00, 0, 0x0c230000, 0, 0x00000000 }, + { 0x1, 0x27, 0x00, 0, 0x0c350000, 0, 0x00000000 }, + { 0x1, 0x28, 0x00, 0, 0x0c301000, 0, 0x00000000 }, + { 0x1, 0x29, 0x00, 0, 0x0c302000, 0, 0x00000000 }, + { 0x1, 0x2a, 0x00, 0, 0x0c303000, 0, 0x00000000 }, + { 0x1, 0x2b, 0x00, 0, 0x0c2a0000, 0, 0x00000000 }, + { 0x1, 0x2c, 0x00, 0, 0x0c2b0000, 0, 0x00000000 }, + { 0x1, 0x2c, 0x01, 0, 0x0c2c0000, 1, 0x00010000 }, + { 0x1, 0x2c, 0x02, 0, 0x0c2d0000, 2, 0x00020000 }, + { 0x1, 0x2c, 0x03, 0, 0x0c2e0000, 3, 0x00030000 }, + { 0x1, 0x00, 0x00, 0, 0x0c660000, 0, 0x00000000 }, + { 0x1, 0x01, 0x00, 0, 0x0c020000, 0, 0x00000000 }, + { 0x1, 0x02, 0x00, 0, 0x0c030000, 0, 0x00000000 }, + { 0x1, 0x03, 0x00, 0, 0x0c310000, 0, 0x00000000 }, + { 0x1, 0x04, 0x00, 0, 0x0c320000, 0, 0x00000000 }, + { 0x1, 0x05, 0x00, 0, 0x0c0a0000, 2, 0x00040000 }, + { 0x1, 0x05, 0x01, 0, 0x0c0b0000, 3, 0x00050000 }, + { 0x1, 0x05, 0x02, 0, 0x0c0e0000, 5, 0x00080000 }, + { 0x1, 0x05, 0x03, 0, 0x0c060000, 0, 0x00000000 }, + { 0x1, 0x05, 0x04, 0, 0x0c080000, 1, 0x00020000 }, + { 0x1, 0x05, 0x05, 0, 0x0c0c0000, 4, 0x00060000 }, + { 0x1, 0x06, 0x00, 0, 0x0c330000, 0, 0x00000000 }, + { 0x1, 0x07, 0x00, 0, 0x0c650000, 0, 0x00000000 }, + { 0x1, 0x08, 0x00, 0, 0x0c3e0000, 0, 0x00000000 }, + { 0x1, 0x09, 0x00, 0, 0x0c1e0000, 0, 0x00000000 }, + { 0x1, 0x0a, 0x00, 0, 0x0c150000, 0, 0x00000000 }, + { 0x1, 0x0a, 0x01, 0, 0x0c160000, 1, 0x00010000 }, + { 0x1, 0x0a, 0x02, 0, 0x0c170000, 2, 0x00020000 }, + { 0x1, 0x0a, 0x03, 0, 0x0c180000, 3, 0x00030000 
}, + { 0x1, 0x0a, 0x04, 0, 0x0c190000, 4, 0x00040000 }, + { 0x1, 0x0a, 0x05, 0, 0x0c1a0000, 5, 0x00050000 }, + { 0x1, 0x0a, 0x06, 0, 0x0c1b0000, 6, 0x00060000 }, + { 0x1, 0x0a, 0x07, 0, 0x0c1c0000, 7, 0x00070000 }, + { 0x1, 0x0a, 0x08, 0, 0x0c1d0000, 8, 0x00080000 }, + { 0x1, 0x0b, 0x00, 0, 0x0c240000, 0, 0x00000000 }, + { 0x1, 0x0c, 0x00, 0, 0x0c250000, 0, 0x00000000 }, + { 0x1, 0x0d, 0x00, 0, 0x0c340000, 0, 0x00000000 }, + { 0x1, 0x0e, 0x00, 0, 0x0c260000, 0, 0x00000000 }, + { 0x1, 0x0f, 0x00, 0, 0x0c0f0000, 0, 0x00000000 }, + { 0x1, 0x0f, 0x01, 0, 0x0c100000, 1, 0x00010000 }, + { 0x1, 0x0f, 0x02, 0, 0x0c110000, 2, 0x00020000 }, + { 0x1, 0x0f, 0x03, 0, 0x0c120000, 3, 0x00030000 }, + { 0x1, 0x0f, 0x04, 0, 0x0c130000, 4, 0x00040000 }, + { 0x1, 0x0f, 0x05, 0, 0x0c140000, 5, 0x00050000 }, + { 0x1, 0x10, 0x00, 0, 0x0c290000, 0, 0x00000000 }, + { 0x1, 0x30, 0x00, 0, 0x20ce0000, 0, 0x00000000 }, + { 0x1, 0x31, 0x00, 0, 0x0c1f0000, 0, 0x00000000 }, + { 0x1, 0x31, 0x01, 0, 0x0c200000, 1, 0x00010000 }, + { 0x1, 0x31, 0x02, 0, 0x0c210000, 2, 0x00020000 }, + { 0x1, 0x31, 0x03, 0, 0x0c220000, 3, 0x00030000 }, + { 0x1, 0x32, 0x00, 0, 0x20cc0000, 3, 0x001c0000 }, + { 0x1, 0x32, 0x01, 0, 0x20c80000, 2, 0x00180000 }, + { 0x1, 0x32, 0x02, 0, 0x20c00000, 1, 0x00100000 }, + { 0x1, 0x32, 0x03, 0, 0x20b00000, 0, 0x00000000 }, + { 0x1, 0x33, 0x00, 0, 0x0c360000, 0, 0x00000000 }, + { 0x1, 0x33, 0x01, 0, 0x0c370000, 1, 0x00010000 }, + { 0x1, 0x33, 0x02, 0, 0x0c3a0000, 3, 0x00040000 }, + { 0x1, 0x33, 0x03, 0, 0x0c380000, 2, 0x00020000 }, + { 0x1, 0x38, 0x00, 0, 0x0c600000, 0, 0x00000000 }, + { 0x1, 0x38, 0x01, 0, 0x00000000, 0, 0x00000000 }, + { 0x1, 0x39, 0x00, 0, 0x0c280000, 0, 0x00000000 }, + { 0x2, 0x34, 0x00, 0, 0x40000000, 0, 0x40000000 }, + { 0x2, 0x34, 0x01, 0, 0x80000000, 1, 0x80000000 }, + { 0x2, 0x36, 0x00, 0, 0x0c400000, 0, 0x0c400000 }, + { 0x2, 0x36, 0x01, 0, 0x00000000, 0, 0x00000000 }, + { 0x3, 0x35, 0x00, 0, 0x00000000, 0, 0x00000000 }, + { 0x3, 0x35, 0x01, 0, 0x00100000, 
1, 0x00100000 }, + { 0x3, 0x35, 0x02, 0, 0x05a00000, 11, 0x05a00000 }, + { 0x3, 0x35, 0x03, 0, 0x05b00000, 32, 0x05b00000 }, + { 0x3, 0x35, 0x04, 0, 0x05c00000, 33, 0x05c00000 }, + { 0x3, 0x35, 0x05, 0, 0x05d00000, 12, 0x05d00000 }, + { 0x3, 0x35, 0x06, 0, 0x20000000, 19, 0x20000000 }, + { 0x3, 0x35, 0x07, 0, 0x20100000, 20, 0x20100000 }, + { 0x3, 0x35, 0x08, 0, 0x20a00000, 24, 0x20a00000 }, + { 0x3, 0x35, 0x09, 0, 0x20d00000, 25, 0x20d00000 }, + { 0x3, 0x35, 0x0a, 0, 0x00200000, 2, 0x00200000 }, + { 0x3, 0x35, 0x0b, 0, 0x05800000, 10, 0x05800000 }, + { 0x3, 0x35, 0x0c, 0, 0x05e00000, 13, 0x05e00000 }, + { 0x3, 0x35, 0x0d, 0, 0x20200000, 21, 0x20200000 }, + { 0x3, 0x35, 0x0e, 0, 0x20800000, 23, 0x20800000 }, + { 0x3, 0x35, 0x0f, 0, 0x20e00000, 26, 0x20e00000 }, + { 0x3, 0x35, 0x10, 0, 0x00400000, 3, 0x00400000 }, + { 0x3, 0x35, 0x11, 0, 0x20400000, 22, 0x20400000 }, + { 0x3, 0x35, 0x12, 0, 0x00800000, 4, 0x00800000 }, + { 0x3, 0x35, 0x13, 0, 0x05000000, 9, 0x05000000 }, /* as for all cbb_t (0x35) rows, Init localAddress equals Targ localAddress */ + { 0x3, 0x35, 0x14, 0, 0x0c800000, 34, 0x0c800000 }, + { 0x3, 0x35, 0x15, 0, 0x01000000, 5, 0x01000000 }, + { 0x3, 0x35, 0x16, 0, 0x03000000, 7, 0x03000000 }, + { 0x3, 0x35, 0x17, 0, 0x04000000, 8, 0x04000000 }, + { 0x3, 0x35, 0x18, 0, 0x0d000000, 16, 0x0d000000 }, + { 0x3, 0x35, 0x19, 0, 0x21000000, 27, 0x21000000 }, + { 0x3, 0x35, 0x1a, 0, 0x02000000, 6, 0x02000000 }, + { 0x3, 0x35, 0x1b, 0, 0x06000000, 14, 0x06000000 }, + { 0x3, 0x35, 0x1c, 0, 0x0e000000, 17, 0x0e000000 }, + { 0x3, 0x35, 0x1d, 0, 0x22000000, 28, 0x22000000 }, + { 0x3, 0x35, 0x1e, 0, 0x08000000, 15, 0x08000000 }, + { 0x3, 0x35, 0x1f, 0, 0x24000000, 29, 0x24000000 }, + { 0x3, 0x35, 0x20, 0, 0x28000000, 30, 0x28000000 }, + { 0x3, 0x35, 0x21, 0, 0x10000000, 18, 0x10000000 }, + { 0x3, 0x35, 0x22, 0, 0x30000000, 31, 0x30000000 }, + { 0x3, 0x37, 0x00, 0, 0x0c640000, 0, 0x00000000 }, + { 0x3, 0x20, 0x00, 0, 0x0c3b0000, 0, 0x00000000 }, + { 0x3, 0x21, 0x00, 0, 0x0c000000, 0, 0x00000000 }, + { 0x3, 0x22, 0x00, 0, 0x0c040000, 0, 0x00000000 
}, + { 0x3, 0x23, 0x00, 0, 0x0c050000, 0, 0x00000000 }, + { 0x3, 0x24, 0x00, 0, 0x20cf0000, 0, 0x00000000 }, + { 0x3, 0x25, 0x00, 0, 0x0c2f0000, 0, 0x00000000 }, + { 0x3, 0x26, 0x00, 0, 0x0c230000, 0, 0x00000000 }, + { 0x3, 0x27, 0x00, 0, 0x0c350000, 0, 0x00000000 }, + { 0x3, 0x28, 0x00, 0, 0x0c301000, 0, 0x00000000 }, + { 0x3, 0x29, 0x00, 0, 0x0c302000, 0, 0x00000000 }, + { 0x3, 0x2a, 0x00, 0, 0x0c303000, 0, 0x00000000 }, + { 0x3, 0x2b, 0x00, 0, 0x0c2a0000, 0, 0x00000000 }, + { 0x3, 0x2c, 0x00, 0, 0x0c2b0000, 0, 0x00000000 }, + { 0x3, 0x2c, 0x01, 0, 0x0c2c0000, 1, 0x00010000 }, + { 0x3, 0x2c, 0x02, 0, 0x0c2d0000, 2, 0x00020000 }, + { 0x3, 0x2c, 0x03, 0, 0x0c2e0000, 3, 0x00030000 }, + { 0x3, 0x00, 0x00, 0, 0x0c660000, 0, 0x00000000 }, + { 0x3, 0x01, 0x00, 0, 0x0c020000, 0, 0x00000000 }, + { 0x3, 0x02, 0x00, 0, 0x0c030000, 0, 0x00000000 }, + { 0x3, 0x03, 0x00, 0, 0x0c310000, 0, 0x00000000 }, + { 0x3, 0x04, 0x00, 0, 0x0c320000, 0, 0x00000000 }, + { 0x3, 0x05, 0x00, 0, 0x0c0a0000, 2, 0x00040000 }, + { 0x3, 0x05, 0x01, 0, 0x0c0b0000, 3, 0x00050000 }, + { 0x3, 0x05, 0x02, 0, 0x0c0e0000, 5, 0x00080000 }, + { 0x3, 0x05, 0x03, 0, 0x0c060000, 0, 0x00000000 }, + { 0x3, 0x05, 0x04, 0, 0x0c080000, 1, 0x00020000 }, + { 0x3, 0x05, 0x05, 0, 0x0c0c0000, 4, 0x00060000 }, + { 0x3, 0x06, 0x00, 0, 0x0c330000, 0, 0x00000000 }, + { 0x3, 0x07, 0x00, 0, 0x0c650000, 0, 0x00000000 }, + { 0x3, 0x08, 0x00, 0, 0x0c3e0000, 0, 0x00000000 }, + { 0x3, 0x09, 0x00, 0, 0x0c1e0000, 0, 0x00000000 }, + { 0x3, 0x0a, 0x00, 0, 0x0c150000, 0, 0x00000000 }, + { 0x3, 0x0a, 0x01, 0, 0x0c160000, 1, 0x00010000 }, + { 0x3, 0x0a, 0x02, 0, 0x0c170000, 2, 0x00020000 }, + { 0x3, 0x0a, 0x03, 0, 0x0c180000, 3, 0x00030000 }, + { 0x3, 0x0a, 0x04, 0, 0x0c190000, 4, 0x00040000 }, + { 0x3, 0x0a, 0x05, 0, 0x0c1a0000, 5, 0x00050000 }, + { 0x3, 0x0a, 0x06, 0, 0x0c1b0000, 6, 0x00060000 }, + { 0x3, 0x0a, 0x07, 0, 0x0c1c0000, 7, 0x00070000 }, + { 0x3, 0x0a, 0x08, 0, 0x0c1d0000, 8, 0x00080000 }, + { 0x3, 0x0b, 0x00, 0, 0x0c240000, 
0, 0x00000000 }, + { 0x3, 0x0c, 0x00, 0, 0x0c250000, 0, 0x00000000 }, + { 0x3, 0x0d, 0x00, 0, 0x0c340000, 0, 0x00000000 }, + { 0x3, 0x0e, 0x00, 0, 0x0c260000, 0, 0x00000000 }, + { 0x3, 0x0f, 0x00, 0, 0x0c0f0000, 0, 0x00000000 }, + { 0x3, 0x0f, 0x01, 0, 0x0c100000, 1, 0x00010000 }, + { 0x3, 0x0f, 0x02, 0, 0x0c110000, 2, 0x00020000 }, + { 0x3, 0x0f, 0x03, 0, 0x0c120000, 3, 0x00030000 }, + { 0x3, 0x0f, 0x04, 0, 0x0c130000, 4, 0x00040000 }, + { 0x3, 0x0f, 0x05, 0, 0x0c140000, 5, 0x00050000 }, + { 0x3, 0x10, 0x00, 0, 0x0c290000, 0, 0x00000000 }, + { 0x3, 0x30, 0x00, 0, 0x20ce0000, 0, 0x00000000 }, + { 0x3, 0x31, 0x00, 0, 0x0c1f0000, 0, 0x00000000 }, + { 0x3, 0x31, 0x01, 0, 0x0c200000, 1, 0x00010000 }, + { 0x3, 0x31, 0x02, 0, 0x0c210000, 2, 0x00020000 }, + { 0x3, 0x31, 0x03, 0, 0x0c220000, 3, 0x00030000 }, + { 0x3, 0x32, 0x00, 0, 0x20cc0000, 3, 0x001c0000 }, + { 0x3, 0x32, 0x01, 0, 0x20c80000, 2, 0x00180000 }, + { 0x3, 0x32, 0x02, 0, 0x20c00000, 1, 0x00100000 }, + { 0x3, 0x32, 0x03, 0, 0x20b00000, 0, 0x00000000 }, + { 0x3, 0x33, 0x00, 0, 0x0c360000, 0, 0x00000000 }, + { 0x3, 0x33, 0x01, 0, 0x0c370000, 1, 0x00010000 }, + { 0x3, 0x33, 0x02, 0, 0x0c3a0000, 3, 0x00040000 }, + { 0x3, 0x33, 0x03, 0, 0x0c380000, 2, 0x00020000 }, + { 0x3, 0x38, 0x00, 0, 0x0c600000, 0, 0x00000000 }, + { 0x3, 0x38, 0x01, 0, 0x00000000, 0, 0x00000000 }, + { 0x3, 0x39, 0x00, 0, 0x0c280000, 0, 0x00000000 } +}; + +/* + * SCE/RCE NOC aperture lookup table as per file "AON_NOC_Structure.info". 
+ */ +static const char * const tegra194_scenoc_routeid_initflow[] = { + [0x0] = "cbb_i/I/0", + [0x1] = "cpu_m_i/I/0", + [0x2] = "cpu_p_i/I/0", + [0x3] = "dma_m_i/I/0", + [0x4] = "dma_p_i/I/0", + [0x5] = "RESERVED", + [0x6] = "RESERVED", + [0x7] = "RESERVED" +}; + +static const char * const tegra194_scenoc_routeid_targflow[] = { + [0x00] = "multiport0_t/T/atcm_cfg", + [0x01] = "multiport0_t/T/car", + [0x02] = "multiport0_t/T/cast", + [0x03] = "multiport0_t/T/cfg", + [0x04] = "multiport0_t/T/dast", + [0x05] = "multiport0_t/T/dma", + [0x06] = "multiport0_t/T/err_collator", + [0x07] = "multiport0_t/T/err_collator_car", + [0x08] = "multiport0_t/T/fpga_misc", + [0x09] = "multiport0_t/T/fpga_uart", + [0x0a] = "multiport0_t/T/gte", + [0x0b] = "multiport0_t/T/hsp", + [0x0c] = "multiport0_t/T/misc", + [0x0d] = "multiport0_t/T/pm", + [0x0e] = "multiport0_t/T/tke", + [0x0f] = "RESERVED", + [0x10] = "multiport1_t/T/hsm", + [0x11] = "multiport1_t/T/vic0", + [0x12] = "multiport1_t/T/vic1", + [0x13] = "ast0_t/T/0", + [0x14] = "ast1_t/T/0", + [0x15] = "cbb_t/T/0", + [0x16] = "cpu_t/T/0", + [0x17] = "sce_noc_firewall/T/0", + [0x18] = "svc_t/T/0", + [0x19] = "RESERVED", + [0x1a] = "RESERVED", + [0x1b] = "RESERVED", + [0x1c] = "RESERVED", + [0x1d] = "RESERVED", + [0x1e] = "RESERVED", + [0x1f] = "RESERVED" +}; + +/* + * Fields of SCE/RCE NOC lookup table: + * Init flow, Targ flow, Targ subrange, Init mapping, Init localAddress, + * Targ mapping, Targ localAddress + * ---------------------------------------------------------------------------- + */ +static const struct tegra194_cbb_aperture tegra194_scenoc_apert_lookup[] = { + { 0x0, 0x16, 0x0, 0, 0x0b400000, 0, 0x0b400000 }, + { 0x0, 0x16, 0x1, 0, 0x0bc00000, 1, 0x0bc00000 }, + { 0x0, 0x0, 0x0, 0, 0x0b000000, 0, 0x00000000 }, + { 0x0, 0x0, 0x1, 0, 0x0b800000, 1, 0x00000000 }, + { 0x0, 0x1, 0x0, 0, 0x20de0000, 3, 0x000e0000 }, + { 0x0, 0x1, 0x1, 0, 0x210e0000, 7, 0x000e0000 }, + { 0x0, 0x1, 0x2, 0, 0x20dc0000, 2, 0x000c0000 }, + { 0x0, 
0x1, 0x3, 0, 0x210c0000, 6, 0x000c0000 }, + { 0x0, 0x1, 0x4, 0, 0x20d80000, 1, 0x00080000 }, + { 0x0, 0x1, 0x5, 0, 0x21080000, 5, 0x00080000 }, + { 0x0, 0x1, 0x6, 0, 0x20d00000, 0, 0x00000000 }, + { 0x0, 0x1, 0x7, 0, 0x21000000, 4, 0x00000000 }, + { 0x0, 0x2, 0x0, 0, 0x0b040000, 0, 0x00000000 }, + { 0x0, 0x2, 0x1, 0, 0x0b840000, 1, 0x00000000 }, + { 0x0, 0x3, 0x0, 0, 0x0b230000, 0, 0x00000000 }, + { 0x0, 0x3, 0x1, 0, 0x0ba30000, 1, 0x00000000 }, + { 0x0, 0x4, 0x0, 0, 0x0b050000, 0, 0x00000000 }, + { 0x0, 0x4, 0x1, 0, 0x0b850000, 1, 0x00000000 }, + { 0x0, 0x5, 0x0, 0, 0x0b060000, 0, 0x00000000 }, + { 0x0, 0x5, 0x1, 0, 0x0b070000, 1, 0x00010000 }, + { 0x0, 0x5, 0x2, 0, 0x0b080000, 2, 0x00020000 }, + { 0x0, 0x5, 0x3, 0, 0x0b090000, 3, 0x00030000 }, + { 0x0, 0x5, 0x4, 0, 0x0b0a0000, 4, 0x00040000 }, + { 0x0, 0x5, 0x5, 0, 0x0b0b0000, 5, 0x00050000 }, + { 0x0, 0x5, 0x6, 0, 0x0b0c0000, 6, 0x00060000 }, + { 0x0, 0x5, 0x7, 0, 0x0b0d0000, 7, 0x00070000 }, + { 0x0, 0x5, 0x8, 0, 0x0b0e0000, 8, 0x00080000 }, + { 0x0, 0x5, 0x9, 0, 0x0b860000, 9, 0x00000000 }, + { 0x0, 0x5, 0xa, 0, 0x0b870000, 10, 0x00010000 }, + { 0x0, 0x5, 0xb, 0, 0x0b880000, 11, 0x00020000 }, + { 0x0, 0x5, 0xc, 0, 0x0b890000, 12, 0x00030000 }, + { 0x0, 0x5, 0xd, 0, 0x0b8a0000, 13, 0x00040000 }, + { 0x0, 0x5, 0xe, 0, 0x0b8b0000, 14, 0x00050000 }, + { 0x0, 0x5, 0xf, 0, 0x0b8c0000, 15, 0x00060000 }, + { 0x0, 0x5, 0x10, 0, 0x0b8d0000, 16, 0x00070000 }, + { 0x0, 0x5, 0x11, 0, 0x0b8e0000, 17, 0x00080000 }, + { 0x0, 0x6, 0x0, 0, 0x0b650000, 0, 0x00000000 }, + { 0x0, 0x6, 0x1, 0, 0x0be50000, 1, 0x00000000 }, + { 0x0, 0x7, 0x0, 0, 0x20df0000, 0, 0x00000000 }, + { 0x0, 0x7, 0x1, 0, 0x210f0000, 1, 0x00000000 }, + { 0x0, 0x8, 0x0, 0, 0x0b3e0000, 0, 0x00000000 }, + { 0x0, 0x8, 0x1, 0, 0x0bbe0000, 1, 0x00000000 }, + { 0x0, 0x9, 0x0, 0, 0x0b3d0000, 0, 0x00000000 }, + { 0x0, 0x9, 0x1, 0, 0x0bbd0000, 1, 0x00000000 }, + { 0x0, 0xa, 0x0, 0, 0x0b1e0000, 0, 0x00000000 }, + { 0x0, 0xa, 0x1, 0, 0x0b9e0000, 1, 0x00000000 }, + { 0x0, 
0xb, 0x0, 0, 0x0b150000, 0, 0x00000000 }, + { 0x0, 0xb, 0x1, 0, 0x0b160000, 1, 0x00010000 }, + { 0x0, 0xb, 0x2, 0, 0x0b170000, 2, 0x00020000 }, + { 0x0, 0xb, 0x3, 0, 0x0b180000, 3, 0x00030000 }, + { 0x0, 0xb, 0x4, 0, 0x0b190000, 4, 0x00040000 }, + { 0x0, 0xb, 0x5, 0, 0x0b1a0000, 5, 0x00050000 }, + { 0x0, 0xb, 0x6, 0, 0x0b1b0000, 6, 0x00060000 }, + { 0x0, 0xb, 0x7, 0, 0x0b1c0000, 7, 0x00070000 }, + { 0x0, 0xb, 0x8, 0, 0x0b1d0000, 8, 0x00080000 }, + { 0x0, 0xb, 0x9, 0, 0x0b950000, 9, 0x00000000 }, + { 0x0, 0xb, 0xa, 0, 0x0b960000, 10, 0x00010000 }, + { 0x0, 0xb, 0xb, 0, 0x0b970000, 11, 0x00020000 }, + { 0x0, 0xb, 0xc, 0, 0x0b980000, 12, 0x00030000 }, + { 0x0, 0xb, 0xd, 0, 0x0b990000, 13, 0x00040000 }, + { 0x0, 0xb, 0xe, 0, 0x0b9a0000, 14, 0x00050000 }, + { 0x0, 0xb, 0xf, 0, 0x0b9b0000, 15, 0x00060000 }, + { 0x0, 0xb, 0x10, 0, 0x0b9c0000, 16, 0x00070000 }, + { 0x0, 0xb, 0x11, 0, 0x0b9d0000, 17, 0x00080000 }, + { 0x0, 0xc, 0x0, 0, 0x0b660000, 0, 0x00000000 }, + { 0x0, 0xc, 0x1, 0, 0x0be60000, 1, 0x00000000 }, + { 0x0, 0xd, 0x0, 0, 0x0b1f0000, 0, 0x00000000 }, + { 0x0, 0xd, 0x1, 0, 0x0b200000, 1, 0x00010000 }, + { 0x0, 0xd, 0x2, 0, 0x0b210000, 2, 0x00020000 }, + { 0x0, 0xd, 0x3, 0, 0x0b220000, 3, 0x00030000 }, + { 0x0, 0xd, 0x4, 0, 0x0b9f0000, 4, 0x00000000 }, + { 0x0, 0xd, 0x5, 0, 0x0ba00000, 5, 0x00010000 }, + { 0x0, 0xd, 0x6, 0, 0x0ba10000, 6, 0x00020000 }, + { 0x0, 0xd, 0x7, 0, 0x0ba20000, 7, 0x00030000 }, + { 0x0, 0xe, 0x0, 0, 0x0b0f0000, 0, 0x00000000 }, + { 0x0, 0xe, 0x1, 0, 0x0b100000, 1, 0x00010000 }, + { 0x0, 0xe, 0x2, 0, 0x0b110000, 2, 0x00020000 }, + { 0x0, 0xe, 0x3, 0, 0x0b120000, 3, 0x00030000 }, + { 0x0, 0xe, 0x4, 0, 0x0b130000, 4, 0x00040000 }, + { 0x0, 0xe, 0x5, 0, 0x0b140000, 5, 0x00050000 }, + { 0x0, 0xe, 0x6, 0, 0x0b8f0000, 6, 0x00000000 }, + { 0x0, 0xe, 0x7, 0, 0x0b900000, 7, 0x00010000 }, + { 0x0, 0xe, 0x8, 0, 0x0b910000, 8, 0x00020000 }, + { 0x0, 0xe, 0x9, 0, 0x0b920000, 9, 0x00030000 }, + { 0x0, 0xe, 0xa, 0, 0x0b930000, 10, 0x00040000 }, + { 0x0, 
0xe, 0xb, 0, 0x0b940000, 11, 0x00050000 }, + { 0x0, 0x10, 0x0, 0, 0x0b240000, 0, 0x00000000 }, + { 0x0, 0x10, 0x1, 0, 0x0ba40000, 1, 0x00000000 }, + { 0x0, 0x11, 0x0, 0, 0x0b020000, 0, 0x00000000 }, + { 0x0, 0x11, 0x1, 0, 0x0b820000, 1, 0x00000000 }, + { 0x0, 0x12, 0x0, 0, 0x0b030000, 0, 0x00000000 }, + { 0x0, 0x12, 0x1, 0, 0x0b830000, 1, 0x00000000 }, + { 0x0, 0x17, 0x0, 0, 0x0b640000, 0, 0x00000000 }, + { 0x0, 0x17, 0x1, 0, 0x0be40000, 1, 0x00000000 }, + { 0x0, 0x18, 0x0, 0, 0x0b600000, 0, 0x00000000 }, + { 0x0, 0x18, 0x1, 0, 0x0be00000, 1, 0x00000000 }, + { 0x0, 0x18, 0x2, 0, 0x00000000, 0, 0x00000000 }, + { 0x0, 0x18, 0x3, 0, 0x00000000, 0, 0x00000000 }, + { 0x1, 0x13, 0x0, 0, 0x40000000, 0, 0x40000000 }, + { 0x1, 0x13, 0x1, 1, 0x80000000, 1, 0x80000000 }, + { 0x1, 0x13, 0x2, 0, 0x00000000, 0, 0x00000000 }, + { 0x2, 0x15, 0x0, 0, 0x20c00000, 8, 0x20c00000 }, + { 0x2, 0x15, 0x1, 0, 0x21100000, 22, 0x21100000 }, + { 0x2, 0x15, 0x2, 0, 0x20e00000, 9, 0x20e00000 }, + { 0x2, 0x15, 0x3, 0, 0x21200000, 23, 0x21200000 }, + { 0x2, 0x15, 0x4, 0, 0x20800000, 7, 0x20800000 }, + { 0x2, 0x15, 0x5, 0, 0x21400000, 24, 0x21400000 }, + { 0x2, 0x15, 0x6, 0, 0x0b000000, 18, 0x0b000000 }, + { 0x2, 0x15, 0x7, 0, 0x0b800000, 3, 0x0b800000 }, + { 0x2, 0x15, 0x8, 0, 0x20000000, 6, 0x20000000 }, + { 0x2, 0x15, 0x9, 0, 0x21800000, 25, 0x21800000 }, + { 0x2, 0x15, 0xa, 0, 0x0a000000, 2, 0x0a000000 }, + { 0x2, 0x15, 0xb, 0, 0x0a000000, 17, 0x0a000000 }, + { 0x2, 0x15, 0xc, 0, 0x20000000, 21, 0x20000000 }, + { 0x2, 0x15, 0xd, 0, 0x21000000, 10, 0x21000000 }, + { 0x2, 0x15, 0xe, 0, 0x08000000, 1, 0x08000000 }, + { 0x2, 0x15, 0xf, 0, 0x08000000, 16, 0x08000000 }, + { 0x2, 0x15, 0x10, 0, 0x22000000, 11, 0x22000000 }, + { 0x2, 0x15, 0x11, 0, 0x22000000, 26, 0x22000000 }, + { 0x2, 0x15, 0x12, 0, 0x0c000000, 4, 0x0c000000 }, + { 0x2, 0x15, 0x13, 0, 0x0c000000, 19, 0x0c000000 }, + { 0x2, 0x15, 0x14, 0, 0x24000000, 12, 0x24000000 }, + { 0x2, 0x15, 0x15, 0, 0x24000000, 27, 0x24000000 }, + { 0x2, 
0x15, 0x16, 0, 0x00000000, 0, 0x00000000 }, + { 0x2, 0x15, 0x17, 0, 0x00000000, 15, 0x00000000 }, + { 0x2, 0x15, 0x18, 0, 0x28000000, 13, 0x28000000 }, + { 0x2, 0x15, 0x19, 0, 0x28000000, 28, 0x28000000 }, + { 0x2, 0x15, 0x1a, 0, 0x10000000, 5, 0x10000000 }, + { 0x2, 0x15, 0x1b, 0, 0x10000000, 20, 0x10000000 }, + { 0x2, 0x15, 0x1c, 0, 0x30000000, 14, 0x30000000 }, + { 0x2, 0x15, 0x1d, 0, 0x30000000, 29, 0x30000000 }, + { 0x2, 0x0, 0x0, 0, 0x0b000000, 0, 0x00000000 }, + { 0x2, 0x0, 0x1, 0, 0x0b800000, 1, 0x00000000 }, + { 0x2, 0x1, 0x0, 0, 0x20de0000, 3, 0x000e0000 }, + { 0x2, 0x1, 0x1, 0, 0x210e0000, 7, 0x000e0000 }, + { 0x2, 0x1, 0x2, 0, 0x20dc0000, 2, 0x000c0000 }, + { 0x2, 0x1, 0x3, 0, 0x210c0000, 6, 0x000c0000 }, + { 0x2, 0x1, 0x4, 0, 0x20d80000, 1, 0x00080000 }, + { 0x2, 0x1, 0x5, 0, 0x21080000, 5, 0x00080000 }, + { 0x2, 0x1, 0x6, 0, 0x20d00000, 0, 0x00000000 }, + { 0x2, 0x1, 0x7, 0, 0x21000000, 4, 0x00000000 }, + { 0x2, 0x2, 0x0, 0, 0x0b040000, 0, 0x00000000 }, + { 0x2, 0x2, 0x1, 0, 0x0b840000, 1, 0x00000000 }, + { 0x2, 0x3, 0x0, 0, 0x0b230000, 0, 0x00000000 }, + { 0x2, 0x3, 0x1, 0, 0x0ba30000, 1, 0x00000000 }, + { 0x2, 0x4, 0x0, 0, 0x0b050000, 0, 0x00000000 }, + { 0x2, 0x4, 0x1, 0, 0x0b850000, 1, 0x00000000 }, + { 0x2, 0x5, 0x0, 0, 0x0b060000, 0, 0x00000000 }, + { 0x2, 0x5, 0x1, 0, 0x0b070000, 1, 0x00010000 }, + { 0x2, 0x5, 0x2, 0, 0x0b080000, 2, 0x00020000 }, + { 0x2, 0x5, 0x3, 0, 0x0b090000, 3, 0x00030000 }, + { 0x2, 0x5, 0x4, 0, 0x0b0a0000, 4, 0x00040000 }, + { 0x2, 0x5, 0x5, 0, 0x0b0b0000, 5, 0x00050000 }, + { 0x2, 0x5, 0x6, 0, 0x0b0c0000, 6, 0x00060000 }, + { 0x2, 0x5, 0x7, 0, 0x0b0d0000, 7, 0x00070000 }, + { 0x2, 0x5, 0x8, 0, 0x0b0e0000, 8, 0x00080000 }, + { 0x2, 0x5, 0x9, 0, 0x0b860000, 9, 0x00000000 }, + { 0x2, 0x5, 0xa, 0, 0x0b870000, 10, 0x00010000 }, + { 0x2, 0x5, 0xb, 0, 0x0b880000, 11, 0x00020000 }, + { 0x2, 0x5, 0xc, 0, 0x0b890000, 12, 0x00030000 }, + { 0x2, 0x5, 0xd, 0, 0x0b8a0000, 13, 0x00040000 }, + { 0x2, 0x5, 0xe, 0, 0x0b8b0000, 14, 
0x00050000 }, + { 0x2, 0x5, 0xf, 0, 0x0b8c0000, 15, 0x00060000 }, + { 0x2, 0x5, 0x10, 0, 0x0b8d0000, 16, 0x00070000 }, + { 0x2, 0x5, 0x11, 0, 0x0b8e0000, 17, 0x00080000 }, + { 0x2, 0x6, 0x0, 0, 0x0b650000, 0, 0x00000000 }, + { 0x2, 0x6, 0x1, 0, 0x0be50000, 1, 0x00000000 }, + { 0x2, 0x7, 0x0, 0, 0x20df0000, 0, 0x00000000 }, + { 0x2, 0x7, 0x1, 0, 0x210f0000, 1, 0x00000000 }, + { 0x2, 0x8, 0x0, 0, 0x0b3e0000, 0, 0x00000000 }, + { 0x2, 0x8, 0x1, 0, 0x0bbe0000, 1, 0x00000000 }, + { 0x2, 0x9, 0x0, 0, 0x0b3d0000, 0, 0x00000000 }, + { 0x2, 0x9, 0x1, 0, 0x0bbd0000, 1, 0x00000000 }, + { 0x2, 0xa, 0x0, 0, 0x0b1e0000, 0, 0x00000000 }, + { 0x2, 0xa, 0x1, 0, 0x0b9e0000, 1, 0x00000000 }, + { 0x2, 0xb, 0x0, 0, 0x0b150000, 0, 0x00000000 }, + { 0x2, 0xb, 0x1, 0, 0x0b160000, 1, 0x00010000 }, + { 0x2, 0xb, 0x2, 0, 0x0b170000, 2, 0x00020000 }, + { 0x2, 0xb, 0x3, 0, 0x0b180000, 3, 0x00030000 }, + { 0x2, 0xb, 0x4, 0, 0x0b190000, 4, 0x00040000 }, + { 0x2, 0xb, 0x5, 0, 0x0b1a0000, 5, 0x00050000 }, + { 0x2, 0xb, 0x6, 0, 0x0b1b0000, 6, 0x00060000 }, + { 0x2, 0xb, 0x7, 0, 0x0b1c0000, 7, 0x00070000 }, + { 0x2, 0xb, 0x8, 0, 0x0b1d0000, 8, 0x00080000 }, + { 0x2, 0xb, 0x9, 0, 0x0b950000, 9, 0x00000000 }, + { 0x2, 0xb, 0xa, 0, 0x0b960000, 10, 0x00010000 }, + { 0x2, 0xb, 0xb, 0, 0x0b970000, 11, 0x00020000 }, + { 0x2, 0xb, 0xc, 0, 0x0b980000, 12, 0x00030000 }, + { 0x2, 0xb, 0xd, 0, 0x0b990000, 13, 0x00040000 }, + { 0x2, 0xb, 0xe, 0, 0x0b9a0000, 14, 0x00050000 }, + { 0x2, 0xb, 0xf, 0, 0x0b9b0000, 15, 0x00060000 }, + { 0x2, 0xb, 0x10, 0, 0x0b9c0000, 16, 0x00070000 }, + { 0x2, 0xb, 0x11, 0, 0x0b9d0000, 17, 0x00080000 }, + { 0x2, 0xc, 0x0, 0, 0x0b660000, 0, 0x00000000 }, + { 0x2, 0xc, 0x1, 0, 0x0be60000, 1, 0x00000000 }, + { 0x2, 0xd, 0x0, 0, 0x0b1f0000, 0, 0x00000000 }, + { 0x2, 0xd, 0x1, 0, 0x0b200000, 1, 0x00010000 }, + { 0x2, 0xd, 0x2, 0, 0x0b210000, 2, 0x00020000 }, + { 0x2, 0xd, 0x3, 0, 0x0b220000, 3, 0x00030000 }, + { 0x2, 0xd, 0x4, 0, 0x0b9f0000, 4, 0x00000000 }, + { 0x2, 0xd, 0x5, 0, 
0x0ba00000, 5, 0x00010000 }, + { 0x2, 0xd, 0x6, 0, 0x0ba10000, 6, 0x00020000 }, + { 0x2, 0xd, 0x7, 0, 0x0ba20000, 7, 0x00030000 }, + { 0x2, 0xe, 0x0, 0, 0x0b0f0000, 0, 0x00000000 }, + { 0x2, 0xe, 0x1, 0, 0x0b100000, 1, 0x00010000 }, + { 0x2, 0xe, 0x2, 0, 0x0b110000, 2, 0x00020000 }, + { 0x2, 0xe, 0x3, 0, 0x0b120000, 3, 0x00030000 }, + { 0x2, 0xe, 0x4, 0, 0x0b130000, 4, 0x00040000 }, + { 0x2, 0xe, 0x5, 0, 0x0b140000, 5, 0x00050000 }, + { 0x2, 0xe, 0x6, 0, 0x0b8f0000, 6, 0x00000000 }, + { 0x2, 0xe, 0x7, 0, 0x0b900000, 7, 0x00010000 }, + { 0x2, 0xe, 0x8, 0, 0x0b910000, 8, 0x00020000 }, + { 0x2, 0xe, 0x9, 0, 0x0b920000, 9, 0x00030000 }, + { 0x2, 0xe, 0xa, 0, 0x0b930000, 10, 0x00040000 }, + { 0x2, 0xe, 0xb, 0, 0x0b940000, 11, 0x00050000 }, + { 0x2, 0x10, 0x0, 0, 0x0b240000, 0, 0x00000000 }, + { 0x2, 0x10, 0x1, 0, 0x0ba40000, 1, 0x00000000 }, + { 0x2, 0x11, 0x0, 0, 0x0b020000, 0, 0x00000000 }, + { 0x2, 0x11, 0x1, 0, 0x0b820000, 1, 0x00000000 }, + { 0x2, 0x12, 0x0, 0, 0x0b030000, 0, 0x00000000 }, + { 0x2, 0x12, 0x1, 0, 0x0b830000, 1, 0x00000000 }, + { 0x2, 0x17, 0x0, 0, 0x0b640000, 0, 0x00000000 }, + { 0x2, 0x17, 0x1, 0, 0x0be40000, 1, 0x00000000 }, + { 0x2, 0x18, 0x0, 0, 0x0b600000, 0, 0x00000000 }, + { 0x2, 0x18, 0x1, 0, 0x0be00000, 1, 0x00000000 }, + { 0x2, 0x18, 0x2, 0, 0x00000000, 0, 0x00000000 }, + { 0x2, 0x18, 0x3, 0, 0x00000000, 0, 0x00000000 }, + { 0x3, 0x14, 0x0, 0, 0x40000000, 0, 0x40000000 }, + { 0x3, 0x14, 0x1, 1, 0x80000000, 1, 0x80000000 }, + { 0x3, 0x16, 0x0, 2, 0x0b400000, 0, 0x0b400000 }, + { 0x3, 0x16, 0x1, 2, 0x0bc00000, 1, 0x0bc00000 }, + { 0x3, 0x16, 0x2, 0, 0x00000000, 0, 0x00000000 }, + { 0x3, 0x16, 0x3, 0, 0x00000000, 0, 0x00000000 }, + { 0x4, 0x15, 0x0, 0, 0x20c00000, 8, 0x20c00000 }, + { 0x4, 0x15, 0x1, 0, 0x21100000, 22, 0x21100000 }, + { 0x4, 0x15, 0x2, 0, 0x20e00000, 9, 0x20e00000 }, + { 0x4, 0x15, 0x3, 0, 0x21200000, 23, 0x21200000 }, + { 0x4, 0x15, 0x4, 0, 0x20800000, 7, 0x20800000 }, + { 0x4, 0x15, 0x5, 0, 0x21400000, 24, 0x21400000 }, + { 
0x4, 0x15, 0x6, 0, 0x0b000000, 18, 0x0b000000 }, + { 0x4, 0x15, 0x7, 0, 0x0b800000, 3, 0x0b800000 }, + { 0x4, 0x15, 0x8, 0, 0x20000000, 6, 0x20000000 }, + { 0x4, 0x15, 0x9, 0, 0x21800000, 25, 0x21800000 }, + { 0x4, 0x15, 0xa, 0, 0x0a000000, 2, 0x0a000000 }, + { 0x4, 0x15, 0xb, 0, 0x0a000000, 17, 0x0a000000 }, + { 0x4, 0x15, 0xc, 0, 0x20000000, 21, 0x20000000 }, + { 0x4, 0x15, 0xd, 0, 0x21000000, 10, 0x21000000 }, + { 0x4, 0x15, 0xe, 0, 0x08000000, 1, 0x08000000 }, + { 0x4, 0x15, 0xf, 0, 0x08000000, 16, 0x08000000 }, + { 0x4, 0x15, 0x10, 0, 0x22000000, 11, 0x22000000 }, + { 0x4, 0x15, 0x11, 0, 0x22000000, 26, 0x22000000 }, + { 0x4, 0x15, 0x12, 0, 0x0c000000, 4, 0x0c000000 }, + { 0x4, 0x15, 0x13, 0, 0x0c000000, 19, 0x0c000000 }, + { 0x4, 0x15, 0x14, 0, 0x24000000, 12, 0x24000000 }, + { 0x4, 0x15, 0x15, 0, 0x24000000, 27, 0x24000000 }, + { 0x4, 0x15, 0x16, 0, 0x00000000, 0, 0x00000000 }, + { 0x4, 0x15, 0x17, 0, 0x00000000, 15, 0x00000000 }, + { 0x4, 0x15, 0x18, 0, 0x28000000, 13, 0x28000000 }, + { 0x4, 0x15, 0x19, 0, 0x28000000, 28, 0x28000000 }, + { 0x4, 0x15, 0x1a, 0, 0x10000000, 5, 0x10000000 }, + { 0x4, 0x15, 0x1b, 0, 0x10000000, 20, 0x10000000 }, + { 0x4, 0x15, 0x1c, 0, 0x30000000, 14, 0x30000000 }, + { 0x4, 0x15, 0x1d, 0, 0x30000000, 29, 0x30000000 }, + { 0x4, 0x0, 0x0, 0, 0x0b000000, 0, 0x00000000 }, + { 0x4, 0x0, 0x1, 0, 0x0b800000, 1, 0x00000000 }, + { 0x4, 0x1, 0x0, 0, 0x20de0000, 3, 0x000e0000 }, + { 0x4, 0x1, 0x1, 0, 0x210e0000, 7, 0x000e0000 }, + { 0x4, 0x1, 0x2, 0, 0x20dc0000, 2, 0x000c0000 }, + { 0x4, 0x1, 0x3, 0, 0x210c0000, 6, 0x000c0000 }, + { 0x4, 0x1, 0x4, 0, 0x20d80000, 1, 0x00080000 }, + { 0x4, 0x1, 0x5, 0, 0x21080000, 5, 0x00080000 }, + { 0x4, 0x1, 0x6, 0, 0x20d00000, 0, 0x00000000 }, + { 0x4, 0x1, 0x7, 0, 0x21000000, 4, 0x00000000 }, + { 0x4, 0x2, 0x0, 0, 0x0b040000, 0, 0x00000000 }, + { 0x4, 0x2, 0x1, 0, 0x0b840000, 1, 0x00000000 }, + { 0x4, 0x3, 0x0, 0, 0x0b230000, 0, 0x00000000 }, + { 0x4, 0x3, 0x1, 0, 0x0ba30000, 1, 0x00000000 }, + { 0x4, 
0x4, 0x0, 0, 0x0b050000, 0, 0x00000000 }, + { 0x4, 0x4, 0x1, 0, 0x0b850000, 1, 0x00000000 }, + { 0x4, 0x5, 0x0, 0, 0x0b060000, 0, 0x00000000 }, + { 0x4, 0x5, 0x1, 0, 0x0b070000, 1, 0x00010000 }, + { 0x4, 0x5, 0x2, 0, 0x0b080000, 2, 0x00020000 }, + { 0x4, 0x5, 0x3, 0, 0x0b090000, 3, 0x00030000 }, + { 0x4, 0x5, 0x4, 0, 0x0b0a0000, 4, 0x00040000 }, + { 0x4, 0x5, 0x5, 0, 0x0b0b0000, 5, 0x00050000 }, + { 0x4, 0x5, 0x6, 0, 0x0b0c0000, 6, 0x00060000 }, + { 0x4, 0x5, 0x7, 0, 0x0b0d0000, 7, 0x00070000 }, + { 0x4, 0x5, 0x8, 0, 0x0b0e0000, 8, 0x00080000 }, + { 0x4, 0x5, 0x9, 0, 0x0b860000, 9, 0x00000000 }, + { 0x4, 0x5, 0xa, 0, 0x0b870000, 10, 0x00010000 }, + { 0x4, 0x5, 0xb, 0, 0x0b880000, 11, 0x00020000 }, + { 0x4, 0x5, 0xc, 0, 0x0b890000, 12, 0x00030000 }, + { 0x4, 0x5, 0xd, 0, 0x0b8a0000, 13, 0x00040000 }, + { 0x4, 0x5, 0xe, 0, 0x0b8b0000, 14, 0x00050000 }, + { 0x4, 0x5, 0xf, 0, 0x0b8c0000, 15, 0x00060000 }, + { 0x4, 0x5, 0x10, 0, 0x0b8d0000, 16, 0x00070000 }, + { 0x4, 0x5, 0x11, 0, 0x0b8e0000, 17, 0x00080000 }, + { 0x4, 0x6, 0x0, 0, 0x0b650000, 0, 0x00000000 }, + { 0x4, 0x6, 0x1, 0, 0x0be50000, 1, 0x00000000 }, + { 0x4, 0x7, 0x0, 0, 0x20df0000, 0, 0x00000000 }, + { 0x4, 0x7, 0x1, 0, 0x210f0000, 1, 0x00000000 }, + { 0x4, 0x8, 0x0, 0, 0x0b3e0000, 0, 0x00000000 }, + { 0x4, 0x8, 0x1, 0, 0x0bbe0000, 1, 0x00000000 }, + { 0x4, 0x9, 0x0, 0, 0x0b3d0000, 0, 0x00000000 }, + { 0x4, 0x9, 0x1, 0, 0x0bbd0000, 1, 0x00000000 }, + { 0x4, 0xa, 0x0, 0, 0x0b1e0000, 0, 0x00000000 }, + { 0x4, 0xa, 0x1, 0, 0x0b9e0000, 1, 0x00000000 }, + { 0x4, 0xb, 0x0, 0, 0x0b150000, 0, 0x00000000 }, + { 0x4, 0xb, 0x1, 0, 0x0b160000, 1, 0x00010000 }, + { 0x4, 0xb, 0x2, 0, 0x0b170000, 2, 0x00020000 }, + { 0x4, 0xb, 0x3, 0, 0x0b180000, 3, 0x00030000 }, + { 0x4, 0xb, 0x4, 0, 0x0b190000, 4, 0x00040000 }, + { 0x4, 0xb, 0x5, 0, 0x0b1a0000, 5, 0x00050000 }, + { 0x4, 0xb, 0x6, 0, 0x0b1b0000, 6, 0x00060000 }, + { 0x4, 0xb, 0x7, 0, 0x0b1c0000, 7, 0x00070000 }, + { 0x4, 0xb, 0x8, 0, 0x0b1d0000, 8, 0x00080000 }, + { 0x4, 
0xb, 0x9, 0, 0x0b950000, 9, 0x00000000 }, + { 0x4, 0xb, 0xa, 0, 0x0b960000, 10, 0x00010000 }, + { 0x4, 0xb, 0xb, 0, 0x0b970000, 11, 0x00020000 }, + { 0x4, 0xb, 0xc, 0, 0x0b980000, 12, 0x00030000 }, + { 0x4, 0xb, 0xd, 0, 0x0b990000, 13, 0x00040000 }, + { 0x4, 0xb, 0xe, 0, 0x0b9a0000, 14, 0x00050000 }, + { 0x4, 0xb, 0xf, 0, 0x0b9b0000, 15, 0x00060000 }, + { 0x4, 0xb, 0x10, 0, 0x0b9c0000, 16, 0x00070000 }, + { 0x4, 0xb, 0x11, 0, 0x0b9d0000, 17, 0x00080000 }, + { 0x4, 0xc, 0x0, 0, 0x0b660000, 0, 0x00000000 }, + { 0x4, 0xc, 0x1, 0, 0x0be60000, 1, 0x00000000 }, + { 0x4, 0xd, 0x0, 0, 0x0b1f0000, 0, 0x00000000 }, + { 0x4, 0xd, 0x1, 0, 0x0b200000, 1, 0x00010000 }, + { 0x4, 0xd, 0x2, 0, 0x0b210000, 2, 0x00020000 }, + { 0x4, 0xd, 0x3, 0, 0x0b220000, 3, 0x00030000 }, + { 0x4, 0xd, 0x4, 0, 0x0b9f0000, 4, 0x00000000 }, + { 0x4, 0xd, 0x5, 0, 0x0ba00000, 5, 0x00010000 }, + { 0x4, 0xd, 0x6, 0, 0x0ba10000, 6, 0x00020000 }, + { 0x4, 0xd, 0x7, 0, 0x0ba20000, 7, 0x00030000 }, + { 0x4, 0xe, 0x0, 0, 0x0b0f0000, 0, 0x00000000 }, + { 0x4, 0xe, 0x1, 0, 0x0b100000, 1, 0x00010000 }, + { 0x4, 0xe, 0x2, 0, 0x0b110000, 2, 0x00020000 }, + { 0x4, 0xe, 0x3, 0, 0x0b120000, 3, 0x00030000 }, + { 0x4, 0xe, 0x4, 0, 0x0b130000, 4, 0x00040000 }, + { 0x4, 0xe, 0x5, 0, 0x0b140000, 5, 0x00050000 }, + { 0x4, 0xe, 0x6, 0, 0x0b8f0000, 6, 0x00000000 }, + { 0x4, 0xe, 0x7, 0, 0x0b900000, 7, 0x00010000 }, + { 0x4, 0xe, 0x8, 0, 0x0b910000, 8, 0x00020000 }, + { 0x4, 0xe, 0x9, 0, 0x0b920000, 9, 0x00030000 }, + { 0x4, 0xe, 0xa, 0, 0x0b930000, 10, 0x00040000 }, + { 0x4, 0xe, 0xb, 0, 0x0b940000, 11, 0x00050000 }, + { 0x4, 0x10, 0x0, 0, 0x0b240000, 0, 0x00000000 }, + { 0x4, 0x10, 0x1, 0, 0x0ba40000, 1, 0x00000000 }, + { 0x4, 0x11, 0x0, 0, 0x0b020000, 0, 0x00000000 }, + { 0x4, 0x11, 0x1, 0, 0x0b820000, 1, 0x00000000 }, + { 0x4, 0x12, 0x0, 0, 0x0b030000, 0, 0x00000000 }, + { 0x4, 0x12, 0x1, 0, 0x0b830000, 1, 0x00000000 }, + { 0x4, 0x17, 0x0, 0, 0x0b640000, 0, 0x00000000 }, + { 0x4, 0x17, 0x1, 0, 0x0be40000, 1, 0x00000000 }, 
+ { 0x4, 0x18, 0x0, 0, 0x0b600000, 0, 0x00000000 }, + { 0x4, 0x18, 0x1, 0, 0x0be00000, 1, 0x00000000 }, + { 0x4, 0x18, 0x2, 0, 0x00000000, 0, 0x00000000 }, + { 0x4, 0x18, 0x3, 0, 0x00000000, 0, 0x00000000 } +}; + +static void cbbcentralnoc_parse_routeid(struct tegra194_cbb_aperture *info, u64 routeid) +{ + info->initflow = FIELD_GET(CBB_NOC_INITFLOW, routeid); + info->targflow = FIELD_GET(CBB_NOC_TARGFLOW, routeid); + info->targ_subrange = FIELD_GET(CBB_NOC_TARG_SUBRANGE, routeid); + info->seqid = FIELD_GET(CBB_NOC_SEQID, routeid); +} + +static void bpmpnoc_parse_routeid(struct tegra194_cbb_aperture *info, u64 routeid) +{ + info->initflow = FIELD_GET(BPMP_NOC_INITFLOW, routeid); + info->targflow = FIELD_GET(BPMP_NOC_TARGFLOW, routeid); + info->targ_subrange = FIELD_GET(BPMP_NOC_TARG_SUBRANGE, routeid); + info->seqid = FIELD_GET(BPMP_NOC_SEQID, routeid); +} + +static void aonnoc_parse_routeid(struct tegra194_cbb_aperture *info, u64 routeid) +{ + info->initflow = FIELD_GET(AON_NOC_INITFLOW, routeid); + info->targflow = FIELD_GET(AON_NOC_TARGFLOW, routeid); + info->targ_subrange = FIELD_GET(AON_NOC_TARG_SUBRANGE, routeid); + info->seqid = FIELD_GET(AON_NOC_SEQID, routeid); +} + +static void scenoc_parse_routeid(struct tegra194_cbb_aperture *info, u64 routeid) +{ + info->initflow = FIELD_GET(SCE_NOC_INITFLOW, routeid); + info->targflow = FIELD_GET(SCE_NOC_TARGFLOW, routeid); + info->targ_subrange = FIELD_GET(SCE_NOC_TARG_SUBRANGE, routeid); + info->seqid = FIELD_GET(SCE_NOC_SEQID, routeid); +} + +static void cbbcentralnoc_parse_userbits(struct tegra194_cbb_userbits *usrbits, u32 elog_5) +{ + usrbits->axcache = FIELD_GET(CBB_NOC_AXCACHE, elog_5); + usrbits->non_mod = FIELD_GET(CBB_NOC_NON_MOD, elog_5); + usrbits->axprot = FIELD_GET(CBB_NOC_AXPROT, elog_5); + usrbits->falconsec = FIELD_GET(CBB_NOC_FALCONSEC, elog_5); + usrbits->grpsec = FIELD_GET(CBB_NOC_GRPSEC, elog_5); + usrbits->vqc = FIELD_GET(CBB_NOC_VQC, elog_5); + usrbits->mstr_id = FIELD_GET(CBB_NOC_MSTR_ID, 
elog_5) - 1; + usrbits->axi_id = FIELD_GET(CBB_NOC_AXI_ID, elog_5); +} + +static void clusternoc_parse_userbits(struct tegra194_cbb_userbits *usrbits, u32 elog_5) +{ + usrbits->axcache = FIELD_GET(CLUSTER_NOC_AXCACHE, elog_5); + usrbits->axprot = FIELD_GET(CLUSTER_NOC_AXCACHE, elog_5); + usrbits->falconsec = FIELD_GET(CLUSTER_NOC_FALCONSEC, elog_5); + usrbits->grpsec = FIELD_GET(CLUSTER_NOC_GRPSEC, elog_5); + usrbits->vqc = FIELD_GET(CLUSTER_NOC_VQC, elog_5); + usrbits->mstr_id = FIELD_GET(CLUSTER_NOC_MSTR_ID, elog_5) - 1; +} + +static void tegra194_cbb_fault_enable(struct tegra_cbb *cbb) +{ + struct tegra194_cbb *priv = to_tegra194_cbb(cbb); + + writel(1, priv->regs + ERRLOGGER_0_FAULTEN_0); + writel(1, priv->regs + ERRLOGGER_1_FAULTEN_0); + writel(1, priv->regs + ERRLOGGER_2_FAULTEN_0); +} + +static void tegra194_cbb_stall_enable(struct tegra_cbb *cbb) +{ + struct tegra194_cbb *priv = to_tegra194_cbb(cbb); + + writel(1, priv->regs + ERRLOGGER_0_STALLEN_0); + writel(1, priv->regs + ERRLOGGER_1_STALLEN_0); + writel(1, priv->regs + ERRLOGGER_2_STALLEN_0); +} + +static void tegra194_cbb_error_clear(struct tegra_cbb *cbb) +{ + struct tegra194_cbb *priv = to_tegra194_cbb(cbb); + + writel(1, priv->regs + ERRLOGGER_0_ERRCLR_0); + writel(1, priv->regs + ERRLOGGER_1_ERRCLR_0); + writel(1, priv->regs + ERRLOGGER_2_ERRCLR_0); + dsb(sy); +} + +static u32 tegra194_cbb_get_status(struct tegra_cbb *cbb) +{ + struct tegra194_cbb *priv = to_tegra194_cbb(cbb); + u32 value; + + value = readl(priv->regs + ERRLOGGER_0_ERRVLD_0); + value |= (readl(priv->regs + ERRLOGGER_1_ERRVLD_0) << 1); + value |= (readl(priv->regs + ERRLOGGER_2_ERRVLD_0) << 2); + + dsb(sy); + return value; +} + +static u32 tegra194_axi2apb_status(void __iomem *addr) +{ + u32 value; + + value = readl(addr + DMAAPB_X_RAW_INTERRUPT_STATUS); + writel(0xffffffff, addr + DMAAPB_X_RAW_INTERRUPT_STATUS); + + return value; +} + +static bool tegra194_axi2apb_fatal(struct seq_file *file, unsigned int bridge, u32 status) +{ + 
bool is_fatal = true; + size_t i; + + for (i = 0; i < ARRAY_SIZE(tegra194_axi2apb_error); i++) { + if (status & BIT(i)) { + tegra_cbb_print_err(file, "\t AXI2APB_%d bridge error: %s\n", + bridge + 1, tegra194_axi2apb_error[i]); + if (strstr(tegra194_axi2apb_error[i], "Firewall")) + is_fatal = false; + } + } + + return is_fatal; +} + +/* + * Fetch InitlocalAddress from NOC Aperture lookup table + * using Targflow, Targsubrange + */ +static u32 get_init_localaddress(const struct tegra194_cbb_aperture *info, + const struct tegra194_cbb_aperture *aper, unsigned int max) +{ + unsigned int t_f = 0, t_sr = 0; + u32 addr = 0; + + for (t_f = 0; t_f < max; t_f++) { + if (aper[t_f].targflow == info->targflow) { + t_sr = t_f; + + do { + if (aper[t_sr].targ_subrange == info->targ_subrange) { + addr = aper[t_sr].init_localaddress; + return addr; + } + + if (t_sr >= max) + return 0; + + t_sr++; + } while (aper[t_sr].targflow == aper[t_sr - 1].targflow); + + t_f = t_sr; + } + } + + return addr; +} + +static void print_errlog5(struct seq_file *file, struct tegra194_cbb *cbb) +{ + struct tegra194_cbb_userbits userbits; + + cbb->noc->parse_userbits(&userbits, cbb->errlog5); + + if (!strcmp(cbb->noc->name, "cbb-noc")) { + tegra_cbb_print_err(file, "\t Non-Modify\t\t: %#x\n", userbits.non_mod); + tegra_cbb_print_err(file, "\t AXI ID\t\t: %#x\n", userbits.axi_id); + } + + tegra_cbb_print_err(file, "\t Master ID\t\t: %s\n", + cbb->noc->master_id[userbits.mstr_id]); + tegra_cbb_print_err(file, "\t Security Group(GRPSEC): %#x\n", userbits.grpsec); + tegra_cbb_print_cache(file, userbits.axcache); + tegra_cbb_print_prot(file, userbits.axprot); + tegra_cbb_print_err(file, "\t FALCONSEC\t\t: %#x\n", userbits.falconsec); + tegra_cbb_print_err(file, "\t Virtual Queuing Channel(VQC): %#x\n", userbits.vqc); +} + +/* + * Fetch Base Address/InitlocalAddress from NOC aperture lookup table using TargFlow & + * Targ_subRange extracted from RouteId. 
Perform address reconstruction as below: + * + * Address = Base Address + (ErrLog3 + ErrLog4) + */ +static void +print_errlog3_4(struct seq_file *file, u32 errlog3, u32 errlog4, + const struct tegra194_cbb_aperture *info, + const struct tegra194_cbb_aperture *aperture, unsigned int max) +{ + u64 addr = (u64)errlog4 << 32 | errlog3; + + /* + * If errlog4[7] = "1", then it's a joker entry. Joker entries are a rare phenomenon and + * such addresses are not reliable. Debugging should be done using only the RouteId + * information. + */ + if (errlog4 & 0x80) + tegra_cbb_print_err(file, "\t debug using RouteId alone as below address is a " + "joker entry and not reliable"); + + addr += get_init_localaddress(info, aperture, max); + + tegra_cbb_print_err(file, "\t Address accessed\t: %#llx\n", addr); +} + +/* + * Get RouteId from ErrLog1+ErrLog2 registers and fetch values of + * InitFlow, TargFlow, Targ_subRange and SeqId values from RouteId + */ +static void +print_errlog1_2(struct seq_file *file, struct tegra194_cbb *cbb, + struct tegra194_cbb_aperture *info) +{ + u64 routeid = (u64)cbb->errlog2 << 32 | cbb->errlog1; + u32 seqid = 0; + + tegra_cbb_print_err(file, "\t RouteId\t\t: %#llx\n", routeid); + + cbb->noc->parse_routeid(info, routeid); + + tegra_cbb_print_err(file, "\t InitFlow\t\t: %s\n", + cbb->noc->routeid_initflow[info->initflow]); + + tegra_cbb_print_err(file, "\t Targflow\t\t: %s\n", + cbb->noc->routeid_targflow[info->targflow]); + + tegra_cbb_print_err(file, "\t TargSubRange\t\t: %d\n", info->targ_subrange); + tegra_cbb_print_err(file, "\t SeqId\t\t\t: %d\n", seqid); +} + +/* + * Print transcation type, error code and description from ErrLog0 for all + * errors. For NOC slave errors, all relevant error info is printed using + * ErrLog0 only. But additional information is printed for errors from + * APB slaves because for them: + * - All errors are logged as SLV(slave) errors due to APB having only single + * bit pslverr to report all errors. 
+ * - Exact cause is printed by reading DMAAPB_X_RAW_INTERRUPT_STATUS register. + * - The driver prints information showing AXI2APB bridge and exact error + * only if there is error in any AXI2APB slave. + * - There is still no way to disambiguate a DEC error from SLV error type. + */ +static bool print_errlog0(struct seq_file *file, struct tegra194_cbb *cbb) +{ + struct tegra194_cbb_packet_header hdr; + bool is_fatal = true; + + hdr.lock = cbb->errlog0 & 0x1; + hdr.opc = FIELD_GET(CBB_ERR_OPC, cbb->errlog0); + hdr.errcode = FIELD_GET(CBB_ERR_ERRCODE, cbb->errlog0); + hdr.len1 = FIELD_GET(CBB_ERR_LEN1, cbb->errlog0); + hdr.format = (cbb->errlog0 >> 31); + + tegra_cbb_print_err(file, "\t Transaction Type\t: %s\n", + tegra194_cbb_trantype[hdr.opc]); + tegra_cbb_print_err(file, "\t Error Code\t\t: %s\n", + tegra194_cbb_errors[hdr.errcode].code); + tegra_cbb_print_err(file, "\t Error Source\t\t: %s\n", + tegra194_cbb_errors[hdr.errcode].source); + tegra_cbb_print_err(file, "\t Error Description\t: %s\n", + tegra194_cbb_errors[hdr.errcode].desc); + + /* + * Do not crash system for errors which are only notifications to indicate a transaction + * was not allowed to be attempted. + */ + if (!strcmp(tegra194_cbb_errors[hdr.errcode].code, "SEC") || + !strcmp(tegra194_cbb_errors[hdr.errcode].code, "DEC") || + !strcmp(tegra194_cbb_errors[hdr.errcode].code, "UNS") || + !strcmp(tegra194_cbb_errors[hdr.errcode].code, "DISC")) { + is_fatal = false; + } else if (!strcmp(tegra194_cbb_errors[hdr.errcode].code, "SLV") && + cbb->num_bridges > 0) { + unsigned int i; + u32 status; + + /* For all SLV errors, read DMAAPB_X_RAW_INTERRUPT_STATUS + * register to get error status for all AXI2APB bridges. + * Print bridge details if a bit is set in a bridge's + * status register due to error in a APB slave connected + * to that bridge. For other NOC slaves, none of the status + * register will be set. 
+ */ + + for (i = 0; i < cbb->num_bridges; i++) { + status = tegra194_axi2apb_status(cbb->bridges[i].base); + + if (status) + is_fatal = tegra194_axi2apb_fatal(file, i, status); + } + } + + tegra_cbb_print_err(file, "\t Packet header Lock\t: %d\n", hdr.lock); + tegra_cbb_print_err(file, "\t Packet header Len1\t: %d\n", hdr.len1); + + if (hdr.format) + tegra_cbb_print_err(file, "\t NOC protocol version\t: %s\n", + "version >= 2.7"); + else + tegra_cbb_print_err(file, "\t NOC protocol version\t: %s\n", + "version < 2.7"); + + return is_fatal; +} + +/* + * Print debug information about failed transaction using + * ErrLog registers of error loggger having ErrVld set + */ +static bool print_errloggerX_info(struct seq_file *file, struct tegra194_cbb *cbb, + int errloggerX) +{ + struct tegra194_cbb_aperture info = { 0, }; + bool is_fatal = true; + + tegra_cbb_print_err(file, "\tError Logger\t\t: %d\n", errloggerX); + + if (errloggerX == 0) { + cbb->errlog0 = readl(cbb->regs + ERRLOGGER_0_ERRLOG0_0); + cbb->errlog1 = readl(cbb->regs + ERRLOGGER_0_ERRLOG1_0); + cbb->errlog2 = readl(cbb->regs + ERRLOGGER_0_RSVD_00_0); + cbb->errlog3 = readl(cbb->regs + ERRLOGGER_0_ERRLOG3_0); + cbb->errlog4 = readl(cbb->regs + ERRLOGGER_0_ERRLOG4_0); + cbb->errlog5 = readl(cbb->regs + ERRLOGGER_0_ERRLOG5_0); + } else if (errloggerX == 1) { + cbb->errlog0 = readl(cbb->regs + ERRLOGGER_1_ERRLOG0_0); + cbb->errlog1 = readl(cbb->regs + ERRLOGGER_1_ERRLOG1_0); + cbb->errlog2 = readl(cbb->regs + ERRLOGGER_1_RSVD_00_0); + cbb->errlog3 = readl(cbb->regs + ERRLOGGER_1_ERRLOG3_0); + cbb->errlog4 = readl(cbb->regs + ERRLOGGER_1_ERRLOG4_0); + cbb->errlog5 = readl(cbb->regs + ERRLOGGER_1_ERRLOG5_0); + } else if (errloggerX == 2) { + cbb->errlog0 = readl(cbb->regs + ERRLOGGER_2_ERRLOG0_0); + cbb->errlog1 = readl(cbb->regs + ERRLOGGER_2_ERRLOG1_0); + cbb->errlog2 = readl(cbb->regs + ERRLOGGER_2_RSVD_00_0); + cbb->errlog3 = readl(cbb->regs + ERRLOGGER_2_ERRLOG3_0); + cbb->errlog4 = readl(cbb->regs + 
ERRLOGGER_2_ERRLOG4_0); + cbb->errlog5 = readl(cbb->regs + ERRLOGGER_2_ERRLOG5_0); + } + + tegra_cbb_print_err(file, "\tErrLog0\t\t\t: %#x\n", cbb->errlog0); + is_fatal = print_errlog0(file, cbb); + + tegra_cbb_print_err(file, "\tErrLog1\t\t\t: %#x\n", cbb->errlog1); + tegra_cbb_print_err(file, "\tErrLog2\t\t\t: %#x\n", cbb->errlog2); + print_errlog1_2(file, cbb, &info); + + tegra_cbb_print_err(file, "\tErrLog3\t\t\t: %#x\n", cbb->errlog3); + tegra_cbb_print_err(file, "\tErrLog4\t\t\t: %#x\n", cbb->errlog4); + print_errlog3_4(file, cbb->errlog3, cbb->errlog4, &info, cbb->noc->noc_aperture, + cbb->noc->max_aperture); + + tegra_cbb_print_err(file, "\tErrLog5\t\t\t: %#x\n", cbb->errlog5); + + if (cbb->errlog5) + print_errlog5(file, cbb); + + return is_fatal; +} + +static bool print_errlog(struct seq_file *file, struct tegra194_cbb *cbb, u32 errvld) +{ + bool is_fatal = true; + + pr_crit("**************************************\n"); + pr_crit("CPU:%d, Error:%s\n", smp_processor_id(), cbb->noc->name); + + if (errvld & 0x1) + is_fatal = print_errloggerX_info(file, cbb, 0); + else if (errvld & 0x2) + is_fatal = print_errloggerX_info(file, cbb, 1); + else if (errvld & 0x4) + is_fatal = print_errloggerX_info(file, cbb, 2); + + tegra_cbb_error_clear(&cbb->base); + tegra_cbb_print_err(file, "\t**************************************\n"); + return is_fatal; +} + +#ifdef CONFIG_DEBUG_FS +static DEFINE_MUTEX(cbb_err_mutex); + +static int tegra194_cbb_debugfs_show(struct tegra_cbb *cbb, struct seq_file *file, void *data) +{ + struct tegra_cbb *noc; + + mutex_lock(&cbb_err_mutex); + + list_for_each_entry(noc, &cbb_list, node) { + struct tegra194_cbb *priv = to_tegra194_cbb(noc); + u32 status; + + status = tegra_cbb_get_status(noc); + if (status) + print_errlog(file, priv, status); + } + + mutex_unlock(&cbb_err_mutex); + + return 0; +} +#endif + +/* + * Handler for CBB errors from different initiators + */ +static irqreturn_t tegra194_cbb_err_isr(int irq, void *data) +{ + bool 
is_inband_err = false, is_fatal = false; + //struct tegra194_cbb *cbb = data; + struct tegra_cbb *noc; + unsigned long flags; + u8 mstr_id = 0; + + spin_lock_irqsave(&cbb_lock, flags); + + /* XXX only process interrupts for "cbb" instead of iterating over all NOCs? */ + list_for_each_entry(noc, &cbb_list, node) { + struct tegra194_cbb *priv = to_tegra194_cbb(noc); + u32 status = 0; + + status = tegra_cbb_get_status(noc); + + if (status && ((irq == priv->sec_irq) || (irq == priv->nonsec_irq))) { + tegra_cbb_print_err(NULL, "CPU:%d, Error: %s@%llx, irq=%d\n", + smp_processor_id(), priv->noc->name, priv->res->start, + irq); + + mstr_id = FIELD_GET(USRBITS_MSTR_ID, priv->errlog5) - 1; + is_fatal = print_errlog(NULL, priv, status); + + /* + * If illegal request is from CCPLEX(0x1) + * initiator then call BUG() to crash system. + */ + if ((mstr_id == 0x1) && priv->noc->erd_mask_inband_err) + is_inband_err = 1; + } + } + + spin_unlock_irqrestore(&cbb_lock, flags); + + if (is_inband_err) { + if (is_fatal) + BUG(); + else + WARN(true, "Warning due to CBB Error\n"); + } + + return IRQ_HANDLED; +} + +/* + * Register handler for CBB_NONSECURE & CBB_SECURE interrupts + * for reporting CBB errors + */ +static int tegra194_cbb_interrupt_enable(struct tegra_cbb *cbb) +{ + struct tegra194_cbb *priv = to_tegra194_cbb(cbb); + struct device *dev = cbb->dev; + int err; + + if (priv->sec_irq) { + err = devm_request_irq(dev, priv->sec_irq, tegra194_cbb_err_isr, 0, dev_name(dev), + priv); + if (err) { + dev_err(dev, "failed to register interrupt %u: %d\n", priv->sec_irq, err); + return err; + } + } + + if (priv->nonsec_irq) { + err = devm_request_irq(dev, priv->nonsec_irq, tegra194_cbb_err_isr, 0, + dev_name(dev), priv); + if (err) { + dev_err(dev, "failed to register interrupt %u: %d\n", priv->nonsec_irq, + err); + return err; + } + } + + return 0; +} + +static void tegra194_cbb_error_enable(struct tegra_cbb *cbb) +{ + /* + * Set “StallEn=1” to enable queuing of error packets till + * 
first is served & cleared + */ + tegra_cbb_stall_enable(cbb); + + /* set “FaultEn=1” to enable error reporting signal “Fault” */ + tegra_cbb_fault_enable(cbb); +} + +static const struct tegra_cbb_ops tegra194_cbb_ops = { + .get_status = tegra194_cbb_get_status, + .error_clear = tegra194_cbb_error_clear, + .fault_enable = tegra194_cbb_fault_enable, + .stall_enable = tegra194_cbb_stall_enable, + .error_enable = tegra194_cbb_error_enable, + .interrupt_enable = tegra194_cbb_interrupt_enable, +#ifdef CONFIG_DEBUG_FS + .debugfs_show = tegra194_cbb_debugfs_show, +#endif +}; + +static struct tegra194_cbb_noc_data tegra194_cbb_central_noc_data = { + .name = "cbb-noc", + .erd_mask_inband_err = true, + .master_id = tegra194_master_id, + .noc_aperture = tegra194_cbbcentralnoc_apert_lookup, + .max_aperture = ARRAY_SIZE(tegra194_cbbcentralnoc_apert_lookup), + .routeid_initflow = tegra194_cbbcentralnoc_routeid_initflow, + .routeid_targflow = tegra194_cbbcentralnoc_routeid_targflow, + .parse_routeid = cbbcentralnoc_parse_routeid, + .parse_userbits = cbbcentralnoc_parse_userbits +}; + +static struct tegra194_cbb_noc_data tegra194_aon_noc_data = { + .name = "aon-noc", + .erd_mask_inband_err = false, + .master_id = tegra194_master_id, + .noc_aperture = tegra194_aonnoc_aperture_lookup, + .max_aperture = ARRAY_SIZE(tegra194_aonnoc_aperture_lookup), + .routeid_initflow = tegra194_aonnoc_routeid_initflow, + .routeid_targflow = tegra194_aonnoc_routeid_targflow, + .parse_routeid = aonnoc_parse_routeid, + .parse_userbits = clusternoc_parse_userbits +}; + +static struct tegra194_cbb_noc_data tegra194_bpmp_noc_data = { + .name = "bpmp-noc", + .erd_mask_inband_err = false, + .master_id = tegra194_master_id, + .noc_aperture = tegra194_bpmpnoc_apert_lookup, + .max_aperture = ARRAY_SIZE(tegra194_bpmpnoc_apert_lookup), + .routeid_initflow = tegra194_bpmpnoc_routeid_initflow, + .routeid_targflow = tegra194_bpmpnoc_routeid_targflow, + .parse_routeid = bpmpnoc_parse_routeid, + .parse_userbits = 
clusternoc_parse_userbits +}; + +static struct tegra194_cbb_noc_data tegra194_rce_noc_data = { + .name = "rce-noc", + .erd_mask_inband_err = false, + .master_id = tegra194_master_id, + .noc_aperture = tegra194_scenoc_apert_lookup, + .max_aperture = ARRAY_SIZE(tegra194_scenoc_apert_lookup), + .routeid_initflow = tegra194_scenoc_routeid_initflow, + .routeid_targflow = tegra194_scenoc_routeid_targflow, + .parse_routeid = scenoc_parse_routeid, + .parse_userbits = clusternoc_parse_userbits +}; + +static struct tegra194_cbb_noc_data tegra194_sce_noc_data = { + .name = "sce-noc", + .erd_mask_inband_err = false, + .master_id = tegra194_master_id, + .noc_aperture = tegra194_scenoc_apert_lookup, + .max_aperture = ARRAY_SIZE(tegra194_scenoc_apert_lookup), + .routeid_initflow = tegra194_scenoc_routeid_initflow, + .routeid_targflow = tegra194_scenoc_routeid_targflow, + .parse_routeid = scenoc_parse_routeid, + .parse_userbits = clusternoc_parse_userbits +}; + +static const struct of_device_id tegra194_cbb_match[] = { + { .compatible = "nvidia,tegra194-cbb-noc", .data = &tegra194_cbb_central_noc_data }, + { .compatible = "nvidia,tegra194-aon-noc", .data = &tegra194_aon_noc_data }, + { .compatible = "nvidia,tegra194-bpmp-noc", .data = &tegra194_bpmp_noc_data }, + { .compatible = "nvidia,tegra194-rce-noc", .data = &tegra194_rce_noc_data }, + { .compatible = "nvidia,tegra194-sce-noc", .data = &tegra194_sce_noc_data }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, tegra194_cbb_match); + +static int tegra194_cbb_get_bridges(struct tegra194_cbb *cbb, struct device_node *np) +{ + struct tegra_cbb *entry; + struct resource res; + unsigned long flags; + unsigned int i; + int err; + + spin_lock_irqsave(&cbb_lock, flags); + + list_for_each_entry(entry, &cbb_list, node) { + struct tegra194_cbb *priv = to_tegra194_cbb(entry); + + if (priv->bridges) { + cbb->num_bridges = priv->num_bridges; + cbb->bridges = priv->bridges; + break; + } + } + + spin_unlock_irqrestore(&cbb_lock, flags); + + 
if (!cbb->bridges) { + while (of_address_to_resource(np, cbb->num_bridges, &res) == 0) + cbb->num_bridges++; + + cbb->bridges = devm_kcalloc(cbb->base.dev, cbb->num_bridges, + sizeof(*cbb->bridges), GFP_KERNEL); + if (!cbb->bridges) + return -ENOMEM; + + for (i = 0; i < cbb->num_bridges; i++) { + err = of_address_to_resource(np, i, &cbb->bridges[i].res); + if (err < 0) + return err; + + cbb->bridges[i].base = devm_ioremap_resource(cbb->base.dev, + &cbb->bridges[i].res); + if (IS_ERR(cbb->bridges[i].base)) { + dev_err(cbb->base.dev, "failed to map AXI2APB range\n"); + return PTR_ERR(cbb->bridges[i].base); + } + } + } + + if (cbb->num_bridges > 0) { + dev_dbg(cbb->base.dev, "AXI2APB bridge info present:\n"); + + for (i = 0; i < cbb->num_bridges; i++) + dev_dbg(cbb->base.dev, " %u: %pR\n", i, &cbb->bridges[i].res); + } + + return 0; +} + +static int tegra194_cbb_probe(struct platform_device *pdev) +{ + const struct tegra194_cbb_noc_data *noc; + struct tegra194_cbb *cbb; + struct device_node *np; + unsigned long flags; + int err; + + noc = of_device_get_match_data(&pdev->dev); + + if (noc->erd_mask_inband_err) { + /* + * Set Error Response Disable(ERD) bit to mask SError/inband + * error and only trigger interrupts for illegal access from + * CCPLEX initiator. 
+ */ + err = tegra194_miscreg_mask_serror(); + if (err) { + dev_err(&pdev->dev, "couldn't mask inband errors\n"); + return err; + } + } + + cbb = devm_kzalloc(&pdev->dev, sizeof(*cbb), GFP_KERNEL); + if (!cbb) + return -ENOMEM; + + INIT_LIST_HEAD(&cbb->base.node); + cbb->base.ops = &tegra194_cbb_ops; + cbb->base.dev = &pdev->dev; + cbb->noc = noc; + + cbb->regs = devm_platform_get_and_ioremap_resource(pdev, 0, &cbb->res); + if (IS_ERR(cbb->regs)) + return PTR_ERR(cbb->regs); + + err = tegra_cbb_get_irq(pdev, &cbb->nonsec_irq, &cbb->sec_irq); + if (err) + return err; + + np = of_parse_phandle(pdev->dev.of_node, "nvidia,axi2apb", 0); + if (np) { + err = tegra194_cbb_get_bridges(cbb, np); + of_node_put(np); + if (err < 0) + return err; + } + + platform_set_drvdata(pdev, cbb); + + spin_lock_irqsave(&cbb_lock, flags); + list_add(&cbb->base.node, &cbb_list); + spin_unlock_irqrestore(&cbb_lock, flags); + + err = tegra_cbb_register(&cbb->base); + if (err) { + /* + * cbb is devm-allocated and is released when probe fails; do not + * leave a dangling entry on cbb_list. + */ + spin_lock_irqsave(&cbb_lock, flags); + list_del(&cbb->base.node); + spin_unlock_irqrestore(&cbb_lock, flags); + } + + return err; +} + +static int tegra194_cbb_remove(struct platform_device *pdev) +{ + struct tegra194_cbb *cbb = platform_get_drvdata(pdev); + struct tegra_cbb *noc, *tmp; + unsigned long flags; + + spin_lock_irqsave(&cbb_lock, flags); + + list_for_each_entry_safe(noc, tmp, &cbb_list, node) { + struct tegra194_cbb *priv = to_tegra194_cbb(noc); + + if (cbb->res->start == priv->res->start) { + list_del(&noc->node); + break; + } + } + + spin_unlock_irqrestore(&cbb_lock, flags); + + return 0; +} + +static int __maybe_unused tegra194_cbb_resume_noirq(struct device *dev) +{ + struct tegra194_cbb *cbb = dev_get_drvdata(dev); + + tegra194_cbb_error_enable(&cbb->base); + dsb(sy); + + dev_dbg(dev, "%s resumed\n", cbb->noc->name); + return 0; +} + +static const struct dev_pm_ops tegra194_cbb_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, tegra194_cbb_resume_noirq) +}; + +static struct platform_driver tegra194_cbb_driver = { + .probe = tegra194_cbb_probe, + .remove = tegra194_cbb_remove, + .driver = { + .owner = THIS_MODULE, + .name = "tegra194-cbb", +
.of_match_table = of_match_ptr(tegra194_cbb_match), + .pm = &tegra194_cbb_pm, + }, +}; + +static int __init tegra194_cbb_init(void) +{ + return platform_driver_register(&tegra194_cbb_driver); +} +pure_initcall(tegra194_cbb_init); + +static void __exit tegra194_cbb_exit(void) +{ + platform_driver_unregister(&tegra194_cbb_driver); +} +module_exit(tegra194_cbb_exit); + +MODULE_AUTHOR("Sumit Gupta "); +MODULE_DESCRIPTION("Control Backbone error handling driver for Tegra194"); +MODULE_LICENSE("GPL"); diff --git a/include/soc/tegra/tegra-cbb.h b/include/soc/tegra/tegra-cbb.h new file mode 100644 index 00000000000000..e864c2ebe794e4 --- /dev/null +++ b/include/soc/tegra/tegra-cbb.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved + */ + +#ifndef TEGRA_CBB_H +#define TEGRA_CBB_H + +#include + +struct tegra_cbb_error { + const char *code; + const char *source; + const char *desc; +}; + +struct tegra_cbb { + struct device *dev; + const struct tegra_cbb_ops *ops; + struct list_head node; +}; + +struct tegra_cbb_ops { + int (*debugfs_show)(struct tegra_cbb *cbb, struct seq_file *s, void *v); + int (*interrupt_enable)(struct tegra_cbb *cbb); + void (*error_enable)(struct tegra_cbb *cbb); + void (*fault_enable)(struct tegra_cbb *cbb); + void (*stall_enable)(struct tegra_cbb *cbb); + void (*error_clear)(struct tegra_cbb *cbb); + u32 (*get_status)(struct tegra_cbb *cbb); +}; + +int tegra_cbb_get_irq(struct platform_device *pdev, unsigned int *nonsec_irq, + unsigned int *sec_irq); +__printf(2, 3) +void tegra_cbb_print_err(struct seq_file *file, const char *fmt, ...); + +void tegra_cbb_print_cache(struct seq_file *file, u32 cache); +void tegra_cbb_print_prot(struct seq_file *file, u32 prot); +int tegra_cbb_register(struct tegra_cbb *cbb); + +void tegra_cbb_fault_enable(struct tegra_cbb *cbb); +void tegra_cbb_stall_enable(struct tegra_cbb *cbb); +void tegra_cbb_error_clear(struct tegra_cbb *cbb); +u32 
tegra_cbb_get_status(struct tegra_cbb *cbb); + +#endif /* TEGRA_CBB_H */ From eb9a50b64a59a6f90a094660d293b9d402637022 Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Thu, 12 May 2022 01:46:50 +0530 Subject: [PATCH 0304/1250] soc/tegra: cbb: Add driver for Tegra234 CBB 2.0 Adding driver to handle errors from CBB version 2.0 which is used in Tegra234 SoC. The driver prints debug information about failed transaction on receiving interrupt from the error notifier. The error notifier collates the interrupts from various error monitor blocks and presents a single interrupt to the SoC interrupt controller. For timeout errors, the driver also does the lookup to find timed out clients and prints their client ID. Drivers for hardware that needs to be reset on timeout will have to call BPMP from the client IP's driver. BPMP firmware will also clear the timeout bit after resetting the IP so that next transactions are send to them after reset. Signed-off-by: Sumit Gupta Signed-off-by: Thierry Reding --- drivers/soc/tegra/Kconfig | 2 +- drivers/soc/tegra/cbb/Makefile | 1 + drivers/soc/tegra/cbb/tegra234-cbb.c | 847 +++++++++++++++++++++++++++ 3 files changed, 849 insertions(+), 1 deletion(-) create mode 100644 drivers/soc/tegra/cbb/tegra234-cbb.c diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig index 65283a93e78f4c..bd360488cd8241 100644 --- a/drivers/soc/tegra/Kconfig +++ b/drivers/soc/tegra/Kconfig @@ -165,7 +165,7 @@ config SOC_TEGRA30_VOLTAGE_COUPLER config SOC_TEGRA_CBB tristate "Tegra driver to handle error from CBB" - depends on ARCH_TEGRA_194_SOC + depends on ARCH_TEGRA_194_SOC || ARCH_TEGRA_234_SOC default y help Support for handling error from Tegra Control Backbone(CBB). 
diff --git a/drivers/soc/tegra/cbb/Makefile b/drivers/soc/tegra/cbb/Makefile index 711b756107033b..e3ac6cdddf5c9e 100644 --- a/drivers/soc/tegra/cbb/Makefile +++ b/drivers/soc/tegra/cbb/Makefile @@ -5,4 +5,5 @@ ifdef CONFIG_SOC_TEGRA_CBB obj-y += tegra-cbb.o obj-$(CONFIG_ARCH_TEGRA_194_SOC) += tegra194-cbb.o +obj-$(CONFIG_ARCH_TEGRA_234_SOC) += tegra234-cbb.o endif diff --git a/drivers/soc/tegra/cbb/tegra234-cbb.c b/drivers/soc/tegra/cbb/tegra234-cbb.c new file mode 100644 index 00000000000000..0d01803e7eecf4 --- /dev/null +++ b/drivers/soc/tegra/cbb/tegra234-cbb.c @@ -0,0 +1,847 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved + * + * The driver handles Error's from Control Backbone(CBB) version 2.0. + * generated due to illegal accesses. The driver prints debug information + * about failed transaction on receiving interrupt from Error Notifier. + * Error types supported by CBB2.0 are: + * UNSUPPORTED_ERR, PWRDOWN_ERR, TIMEOUT_ERR, FIREWALL_ERR, DECODE_ERR, + * SLAVE_ERR + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FABRIC_EN_CFG_INTERRUPT_ENABLE_0_0 0x0 +#define FABRIC_EN_CFG_STATUS_0_0 0x40 +#define FABRIC_EN_CFG_ADDR_INDEX_0_0 0x60 +#define FABRIC_EN_CFG_ADDR_LOW_0 0x80 +#define FABRIC_EN_CFG_ADDR_HI_0 0x84 + +#define FABRIC_MN_MASTER_ERR_EN_0 0x200 +#define FABRIC_MN_MASTER_ERR_FORCE_0 0x204 +#define FABRIC_MN_MASTER_ERR_STATUS_0 0x208 +#define FABRIC_MN_MASTER_ERR_OVERFLOW_STATUS_0 0x20c + +#define FABRIC_MN_MASTER_LOG_ERR_STATUS_0 0x300 +#define FABRIC_MN_MASTER_LOG_ADDR_LOW_0 0x304 +#define FABRIC_MN_MASTER_LOG_ADDR_HIGH_0 0x308 +#define FABRIC_MN_MASTER_LOG_ATTRIBUTES0_0 0x30c +#define FABRIC_MN_MASTER_LOG_ATTRIBUTES1_0 0x310 +#define FABRIC_MN_MASTER_LOG_ATTRIBUTES2_0 0x314 +#define FABRIC_MN_MASTER_LOG_USER_BITS0_0 0x318 + +#define AXI_SLV_TIMEOUT_STATUS_0_0 0x8 
+#define APB_BLOCK_TMO_STATUS_0 0xc00 +#define APB_BLOCK_NUM_TMO_OFFSET 0x20 + +#define FAB_EM_EL_MSTRID GENMASK(29, 24) +#define FAB_EM_EL_VQC GENMASK(17, 16) +#define FAB_EM_EL_GRPSEC GENMASK(14, 8) +#define FAB_EM_EL_FALCONSEC GENMASK(1, 0) + +#define FAB_EM_EL_FABID GENMASK(20, 16) +#define FAB_EM_EL_SLAVEID GENMASK(7, 0) + +#define FAB_EM_EL_ACCESSID GENMASK(7, 0) + +#define FAB_EM_EL_AXCACHE GENMASK(27, 24) +#define FAB_EM_EL_AXPROT GENMASK(22, 20) +#define FAB_EM_EL_BURSTLENGTH GENMASK(19, 12) +#define FAB_EM_EL_BURSTTYPE GENMASK(9, 8) +#define FAB_EM_EL_BEATSIZE GENMASK(6, 4) +#define FAB_EM_EL_ACCESSTYPE GENMASK(0, 0) + +#define USRBITS_MSTR_ID GENMASK(29, 24) + +#define REQ_SOCKET_ID GENMASK(27, 24) + +enum tegra234_cbb_fabric_ids { + CBB_FAB_ID, + SCE_FAB_ID, + RCE_FAB_ID, + DCE_FAB_ID, + AON_FAB_ID, + PSC_FAB_ID, + BPMP_FAB_ID, + FSI_FAB_ID, + MAX_FAB_ID, +}; + +struct tegra234_slave_lookup { + const char *name; + unsigned int offset; +}; + +struct tegra234_cbb_fabric { + const char *name; + phys_addr_t off_mask_erd; + bool erd_mask_inband_err; + const char * const *master_id; + unsigned int notifier_offset; + const struct tegra_cbb_error *errors; + const struct tegra234_slave_lookup *slave_map; +}; + +struct tegra234_cbb { + struct tegra_cbb base; + + const struct tegra234_cbb_fabric *fabric; + struct resource *res; + void __iomem *regs; + + int num_intr; + int sec_irq; + + /* record */ + void __iomem *mon; + unsigned int type; + u32 mask; + u64 access; + u32 mn_attr0; + u32 mn_attr1; + u32 mn_attr2; + u32 mn_user_bits; +}; + +static inline struct tegra234_cbb *to_tegra234_cbb(struct tegra_cbb *cbb) +{ + return container_of(cbb, struct tegra234_cbb, base); +} + +static LIST_HEAD(cbb_list); +static DEFINE_SPINLOCK(cbb_lock); + +static void tegra234_cbb_fault_enable(struct tegra_cbb *cbb) +{ + struct tegra234_cbb *priv = to_tegra234_cbb(cbb); + void __iomem *addr; + + addr = priv->regs + priv->fabric->notifier_offset; + writel(0x1ff, addr + 
FABRIC_EN_CFG_INTERRUPT_ENABLE_0_0); + dsb(sy); +} + +static void tegra234_cbb_error_clear(struct tegra_cbb *cbb) +{ + struct tegra234_cbb *priv = to_tegra234_cbb(cbb); + + writel(0x3f, priv->mon + FABRIC_MN_MASTER_ERR_STATUS_0); + dsb(sy); +} + +static u32 tegra234_cbb_get_status(struct tegra_cbb *cbb) +{ + struct tegra234_cbb *priv = to_tegra234_cbb(cbb); + void __iomem *addr; + u32 value; + + addr = priv->regs + priv->fabric->notifier_offset; + value = readl(addr + FABRIC_EN_CFG_STATUS_0_0); + dsb(sy); + + return value; +} + +static void tegra234_cbb_mask_serror(struct tegra234_cbb *cbb) +{ + writel(0x1, cbb->regs + cbb->fabric->off_mask_erd); + dsb(sy); +} + +static u32 tegra234_cbb_get_tmo_slv(void __iomem *addr) +{ + u32 timeout; + + timeout = readl(addr); + return timeout; +} + +static void tegra234_cbb_tmo_slv(struct seq_file *file, const char *slave, void __iomem *addr, + u32 status) +{ + tegra_cbb_print_err(file, "\t %s : %#x\n", slave, status); +} + +static void tegra234_cbb_lookup_apbslv(struct seq_file *file, const char *slave, + void __iomem *base) +{ + unsigned int block = 0; + void __iomem *addr; + char name[64]; + u32 status; + + status = tegra234_cbb_get_tmo_slv(base); + if (status) + tegra_cbb_print_err(file, "\t %s_BLOCK_TMO_STATUS : %#x\n", slave, status); + + while (status) { + if (status & BIT(0)) { + u32 timeout, clients, client = 0; + + addr = base + APB_BLOCK_NUM_TMO_OFFSET + (block * 4); + timeout = tegra234_cbb_get_tmo_slv(addr); + clients = timeout; + + while (timeout) { + if (timeout & BIT(0)) { + /* Report each timed-out client as its own bit; an all-ones readback is passed through unchanged */ + u32 value = (clients == 0xffffffff) ? clients : BIT(client); + + snprintf(name, sizeof(name), "%s_BLOCK%u_TMO", slave, block); + + tegra234_cbb_tmo_slv(file, name, addr, value); + } + + timeout >>= 1; + client++; + } + } + + status >>= 1; + block++; + } +} + +static void tegra234_lookup_slave_timeout(struct seq_file *file, struct tegra234_cbb *cbb, + u8 slave_id, u8 fab_id) +{ + const struct tegra234_slave_lookup *map = cbb->fabric->slave_map; + void __iomem
*addr; + + /* + * 1) Get slave node name and address mapping using slave_id. + * 2) Check if the timed out slave node is APB or AXI. + * 3) If AXI, then print timeout register and reset axi slave + * using _SN_<>_SLV_TIMEOUT_STATUS_0_0 register. + * 4) If APB, then perform an additional lookup to find the client + * which timed out. + * a) Get block number from the index of set bit in + * _SN_AXI2APB_<>_BLOCK_TMO_STATUS_0 register. + * b) Get address of register repective to block number i.e. + * _SN_AXI2APB_<>_BLOCK_TMO_0. + * c) Read the register in above step to get client_id which + * timed out as per the set bits. + * d) Reset the timedout client and print details. + * e) Goto step-a till all bits are set. + */ + + addr = cbb->regs + map[slave_id].offset; + + if (strstr(map[slave_id].name, "AXI2APB")) { + addr += APB_BLOCK_TMO_STATUS_0; + + tegra234_cbb_lookup_apbslv(file, map[slave_id].name, addr); + } else { + char name[64]; + u32 status; + + addr += AXI_SLV_TIMEOUT_STATUS_0_0; + + status = tegra234_cbb_get_tmo_slv(addr); + if (status) { + sprintf(name, "%s_SLV_TIMEOUT_STATUS", map[slave_id].name); + tegra234_cbb_tmo_slv(file, name, addr, status); + } + } +} + +static void tegra234_cbb_print_error(struct seq_file *file, struct tegra234_cbb *cbb, u32 status, + u32 overflow) +{ + unsigned int type = 0; + + if (status & (status - 1)) + tegra_cbb_print_err(file, "\t Multiple type of errors reported\n"); + + while (status) { + if (status & 0x1) + tegra_cbb_print_err(file, "\t Error Code\t\t: %s\n", + cbb->fabric->errors[type].code); + + status >>= 1; + type++; + } + + type = 0; + + while (overflow) { + if (overflow & 0x1) + tegra_cbb_print_err(file, "\t Overflow\t\t: Multiple %s\n", + cbb->fabric->errors[type].code); + + overflow >>= 1; + type++; + } +} + +static void print_errlog_err(struct seq_file *file, struct tegra234_cbb *cbb) +{ + u8 cache_type, prot_type, burst_length, mstr_id, grpsec, vqc, falconsec, beat_size; + u8 access_type, access_id, slave_id, 
fab_id, burst_type; + char fabric_name[20]; + + mstr_id = FIELD_GET(FAB_EM_EL_MSTRID, cbb->mn_user_bits); + vqc = FIELD_GET(FAB_EM_EL_VQC, cbb->mn_user_bits); + grpsec = FIELD_GET(FAB_EM_EL_GRPSEC, cbb->mn_user_bits); + falconsec = FIELD_GET(FAB_EM_EL_FALCONSEC, cbb->mn_user_bits); + + fab_id = FIELD_GET(FAB_EM_EL_FABID, cbb->mn_attr2); + slave_id = FIELD_GET(FAB_EM_EL_SLAVEID, cbb->mn_attr2); + + access_id = FIELD_GET(FAB_EM_EL_ACCESSID, cbb->mn_attr1); + + cache_type = FIELD_GET(FAB_EM_EL_AXCACHE, cbb->mn_attr0); + prot_type = FIELD_GET(FAB_EM_EL_AXPROT, cbb->mn_attr0); + burst_length = FIELD_GET(FAB_EM_EL_BURSTLENGTH, cbb->mn_attr0); + burst_type = FIELD_GET(FAB_EM_EL_BURSTTYPE, cbb->mn_attr0); + beat_size = FIELD_GET(FAB_EM_EL_BEATSIZE, cbb->mn_attr0); + access_type = FIELD_GET(FAB_EM_EL_ACCESSTYPE, cbb->mn_attr0); + + tegra_cbb_print_err(file, "\n"); + tegra_cbb_print_err(file, "\t Error Code\t\t: %s\n", + cbb->fabric->errors[cbb->type].code); + + tegra_cbb_print_err(file, "\t MASTER_ID\t\t: %s\n", cbb->fabric->master_id[mstr_id]); + tegra_cbb_print_err(file, "\t Address\t\t: %#llx\n", cbb->access); + + tegra_cbb_print_cache(file, cache_type); + tegra_cbb_print_prot(file, prot_type); + + tegra_cbb_print_err(file, "\t Access_Type\t\t: %s", (access_type) ? 
"Write\n" : "Read\n"); + tegra_cbb_print_err(file, "\t Access_ID\t\t: %#x", access_id); + + if (fab_id == PSC_FAB_ID) + strcpy(fabric_name, "psc-fabric"); + else if (fab_id == FSI_FAB_ID) + strcpy(fabric_name, "fsi-fabric"); + else + strcpy(fabric_name, cbb->fabric->name); + + tegra_cbb_print_err(file, "\t Fabric\t\t: %s\n", fabric_name); + tegra_cbb_print_err(file, "\t Slave_Id\t\t: %#x\n", slave_id); + tegra_cbb_print_err(file, "\t Burst_length\t\t: %#x\n", burst_length); + tegra_cbb_print_err(file, "\t Burst_type\t\t: %#x\n", burst_type); + tegra_cbb_print_err(file, "\t Beat_size\t\t: %#x\n", beat_size); + tegra_cbb_print_err(file, "\t VQC\t\t\t: %#x\n", vqc); + tegra_cbb_print_err(file, "\t GRPSEC\t\t: %#x\n", grpsec); + tegra_cbb_print_err(file, "\t FALCONSEC\t\t: %#x\n", falconsec); + + if ((fab_id == PSC_FAB_ID) || (fab_id == FSI_FAB_ID)) + return; + + if (!strcmp(cbb->fabric->errors[cbb->type].code, "TIMEOUT_ERR")) { + tegra234_lookup_slave_timeout(file, cbb, slave_id, fab_id); + return; + } + + tegra_cbb_print_err(file, "\t Slave\t\t\t: %s\n", cbb->fabric->slave_map[slave_id].name); +} + +static int print_errmonX_info(struct seq_file *file, struct tegra234_cbb *cbb) +{ + u32 overflow, status, error; + + status = readl(cbb->mon + FABRIC_MN_MASTER_ERR_STATUS_0); + if (!status) { + pr_err("Error Notifier received a spurious notification\n"); + return -ENODATA; + } + + if (status == 0xffffffff) { + pr_err("CBB registers returning all 1's which is invalid\n"); + return -EINVAL; + } + + overflow = readl(cbb->mon + FABRIC_MN_MASTER_ERR_OVERFLOW_STATUS_0); + + tegra234_cbb_print_error(file, cbb, status, overflow); + + error = readl(cbb->mon + FABRIC_MN_MASTER_LOG_ERR_STATUS_0); + if (!error) { + pr_info("Error Monitor doesn't have Error Logger\n"); + return -EINVAL; + } + + cbb->type = 0; + + while (error) { + if (error & BIT(0)) { + u32 hi, lo; + + hi = readl(cbb->mon + FABRIC_MN_MASTER_LOG_ADDR_HIGH_0); + lo = readl(cbb->mon + FABRIC_MN_MASTER_LOG_ADDR_LOW_0); + 
+ cbb->access = (u64)hi << 32 | lo; + + cbb->mn_attr0 = readl(cbb->mon + FABRIC_MN_MASTER_LOG_ATTRIBUTES0_0); + cbb->mn_attr1 = readl(cbb->mon + FABRIC_MN_MASTER_LOG_ATTRIBUTES1_0); + cbb->mn_attr2 = readl(cbb->mon + FABRIC_MN_MASTER_LOG_ATTRIBUTES2_0); + cbb->mn_user_bits = readl(cbb->mon + FABRIC_MN_MASTER_LOG_USER_BITS0_0); + + print_errlog_err(file, cbb); + } + + cbb->type++; + error >>= 1; + } + + return 0; +} + +static int print_err_notifier(struct seq_file *file, struct tegra234_cbb *cbb, u32 status) +{ + unsigned int index = 0; + int err; + + pr_crit("**************************************\n"); + pr_crit("CPU:%d, Error:%s, Errmon:%d\n", smp_processor_id(), + cbb->fabric->name, status); + + while (status) { + if (status & BIT(0)) { + unsigned int notifier = cbb->fabric->notifier_offset; + u32 hi, lo, mask = BIT(index); + phys_addr_t addr; + u64 offset; + + writel(mask, cbb->regs + notifier + FABRIC_EN_CFG_ADDR_INDEX_0_0); + hi = readl(cbb->regs + notifier + FABRIC_EN_CFG_ADDR_HI_0); + lo = readl(cbb->regs + notifier + FABRIC_EN_CFG_ADDR_LOW_0); + + addr = (u64)hi << 32 | lo; + + offset = addr - cbb->res->start; + cbb->mon = cbb->regs + offset; + cbb->mask = BIT(index); + + err = print_errmonX_info(file, cbb); + tegra234_cbb_error_clear(&cbb->base); + if (err) + return err; + } + + status >>= 1; + index++; + } + + tegra_cbb_print_err(file, "\t**************************************\n"); + return 0; +} + +#ifdef CONFIG_DEBUG_FS +static DEFINE_MUTEX(cbb_debugfs_mutex); + +static int tegra234_cbb_debugfs_show(struct tegra_cbb *cbb, struct seq_file *file, void *data) +{ + int err = 0; + + mutex_lock(&cbb_debugfs_mutex); + + list_for_each_entry(cbb, &cbb_list, node) { + struct tegra234_cbb *priv = to_tegra234_cbb(cbb); + u32 status; + + status = tegra_cbb_get_status(&priv->base); + if (status) { + err = print_err_notifier(file, priv, status); + if (err) + break; + } + } + + mutex_unlock(&cbb_debugfs_mutex); + return err; +} +#endif + +/* + * Handler for CBB errors 
+ */ +static irqreturn_t tegra234_cbb_isr(int irq, void *data) +{ + bool is_inband_err = false; + struct tegra_cbb *cbb; + unsigned long flags; + u8 mstr_id; + int err; + + spin_lock_irqsave(&cbb_lock, flags); + + list_for_each_entry(cbb, &cbb_list, node) { + struct tegra234_cbb *priv = to_tegra234_cbb(cbb); + u32 status = tegra_cbb_get_status(cbb); + + if (status && (irq == priv->sec_irq)) { + tegra_cbb_print_err(NULL, "CPU:%d, Error: %s@%llx, irq=%d\n", + smp_processor_id(), priv->fabric->name, + priv->res->start, irq); + + err = print_err_notifier(NULL, priv, status); + if (err) + goto unlock; + + mstr_id = FIELD_GET(USRBITS_MSTR_ID, priv->mn_user_bits); + + /* + * If illegal request is from CCPLEX(id:0x1) master then call BUG() to + * crash system. + */ + if ((mstr_id == 0x1) && priv->fabric->off_mask_erd) + is_inband_err = 1; + } + } + +unlock: + spin_unlock_irqrestore(&cbb_lock, flags); + WARN_ON(is_inband_err); + return IRQ_HANDLED; +} + +/* + * Register handler for CBB_SECURE interrupt for reporting errors + */ +static int tegra234_cbb_interrupt_enable(struct tegra_cbb *cbb) +{ + struct tegra234_cbb *priv = to_tegra234_cbb(cbb); + + if (priv->sec_irq) { + int err = devm_request_irq(cbb->dev, priv->sec_irq, tegra234_cbb_isr, 0, + dev_name(cbb->dev), priv); + if (err) { + dev_err(cbb->dev, "failed to register interrupt %u: %d\n", priv->sec_irq, + err); + return err; + } + } + + return 0; +} + +static void tegra234_cbb_error_enable(struct tegra_cbb *cbb) +{ + tegra_cbb_fault_enable(cbb); +} + +static const struct tegra_cbb_ops tegra234_cbb_ops = { + .get_status = tegra234_cbb_get_status, + .error_clear = tegra234_cbb_error_clear, + .fault_enable = tegra234_cbb_fault_enable, + .error_enable = tegra234_cbb_error_enable, + .interrupt_enable = tegra234_cbb_interrupt_enable, +#ifdef CONFIG_DEBUG_FS + .debugfs_show = tegra234_cbb_debugfs_show, +#endif +}; + +static const char * const tegra234_master_id[] = { + [0x00] = "TZ", + [0x01] = "CCPLEX", + [0x02] = "CCPMU", 
+ [0x03] = "BPMP_FW", + [0x04] = "AON", + [0x05] = "SCE", + [0x06] = "GPCDMA_P", + [0x07] = "TSECA_NONSECURE", + [0x08] = "TSECA_LIGHTSECURE", + [0x09] = "TSECA_HEAVYSECURE", + [0x0a] = "CORESIGHT", + [0x0b] = "APE", + [0x0c] = "PEATRANS", + [0x0d] = "JTAGM_DFT", + [0x0e] = "RCE", + [0x0f] = "DCE", + [0x10] = "PSC_FW_USER", + [0x11] = "PSC_FW_SUPERVISOR", + [0x12] = "PSC_FW_MACHINE", + [0x13] = "PSC_BOOT", + [0x14] = "BPMP_BOOT", + [0x15] = "NVDEC_NONSECURE", + [0x16] = "NVDEC_LIGHTSECURE", + [0x17] = "NVDEC_HEAVYSECURE", + [0x18] = "CBB_INTERNAL", + [0x19] = "RSVD" +}; + +static const struct tegra_cbb_error tegra234_cbb_errors[] = { + { + .code = "SLAVE_ERR", + .desc = "Slave being accessed responded with an error" + }, { + .code = "DECODE_ERR", + .desc = "Attempt to access an address hole" + }, { + .code = "FIREWALL_ERR", + .desc = "Attempt to access a region which is firewall protected" + }, { + .code = "TIMEOUT_ERR", + .desc = "No response returned by slave" + }, { + .code = "PWRDOWN_ERR", + .desc = "Attempt to access a portion of fabric that is powered down" + }, { + .code = "UNSUPPORTED_ERR", + .desc = "Attempt to access a slave through an unsupported access" + } +}; + +static const struct tegra234_slave_lookup tegra234_aon_slave_map[] = { + { "AXI2APB", 0x00000 }, + { "AST", 0x14000 }, + { "CBB", 0x15000 }, + { "CPU", 0x16000 }, +}; + +static const struct tegra234_cbb_fabric tegra234_aon_fabric = { + .name = "aon-fabric", + .master_id = tegra234_master_id, + .slave_map = tegra234_aon_slave_map, + .errors = tegra234_cbb_errors, + .notifier_offset = 0x17000, +}; + +static const struct tegra234_slave_lookup tegra234_bpmp_slave_map[] = { + { "AXI2APB", 0x00000 }, + { "AST0", 0x15000 }, + { "AST1", 0x16000 }, + { "CBB", 0x17000 }, + { "CPU", 0x18000 }, +}; + +static const struct tegra234_cbb_fabric tegra234_bpmp_fabric = { + .name = "bpmp-fabric", + .master_id = tegra234_master_id, + .slave_map = tegra234_bpmp_slave_map, + .errors = tegra234_cbb_errors, + 
.notifier_offset = 0x19000, +}; + +static const struct tegra234_slave_lookup tegra234_cbb_slave_map[] = { + { "AON", 0x40000 }, + { "BPMP", 0x41000 }, + { "CBB", 0x42000 }, + { "HOST1X", 0x43000 }, + { "STM", 0x44000 }, + { "FSI", 0x45000 }, + { "PSC", 0x46000 }, + { "PCIE_C1", 0x47000 }, + { "PCIE_C2", 0x48000 }, + { "PCIE_C3", 0x49000 }, + { "PCIE_C0", 0x4a000 }, + { "PCIE_C4", 0x4b000 }, + { "GPU", 0x4c000 }, + { "SMMU0", 0x4d000 }, + { "SMMU1", 0x4e000 }, + { "SMMU2", 0x4f000 }, + { "SMMU3", 0x50000 }, + { "SMMU4", 0x51000 }, + { "PCIE_C10", 0x52000 }, + { "PCIE_C7", 0x53000 }, + { "PCIE_C8", 0x54000 }, + { "PCIE_C9", 0x55000 }, + { "PCIE_C5", 0x56000 }, + { "PCIE_C6", 0x57000 }, + { "DCE", 0x58000 }, + { "RCE", 0x59000 }, + { "SCE", 0x5a000 }, + { "AXI2APB_1", 0x70000 }, + { "AXI2APB_10", 0x71000 }, + { "AXI2APB_11", 0x72000 }, + { "AXI2APB_12", 0x73000 }, + { "AXI2APB_13", 0x74000 }, + { "AXI2APB_14", 0x75000 }, + { "AXI2APB_15", 0x76000 }, + { "AXI2APB_16", 0x77000 }, + { "AXI2APB_17", 0x78000 }, + { "AXI2APB_18", 0x79000 }, + { "AXI2APB_19", 0x7a000 }, + { "AXI2APB_2", 0x7b000 }, + { "AXI2APB_20", 0x7c000 }, + { "AXI2APB_21", 0x7d000 }, + { "AXI2APB_22", 0x7e000 }, + { "AXI2APB_23", 0x7f000 }, + { "AXI2APB_25", 0x80000 }, + { "AXI2APB_26", 0x81000 }, + { "AXI2APB_27", 0x82000 }, + { "AXI2APB_28", 0x83000 }, + { "AXI2APB_29", 0x84000 }, + { "AXI2APB_30", 0x85000 }, + { "AXI2APB_31", 0x86000 }, + { "AXI2APB_32", 0x87000 }, + { "AXI2APB_33", 0x88000 }, + { "AXI2APB_34", 0x89000 }, + { "AXI2APB_35", 0x92000 }, + { "AXI2APB_4", 0x8b000 }, + { "AXI2APB_5", 0x8c000 }, + { "AXI2APB_6", 0x8d000 }, + { "AXI2APB_7", 0x8e000 }, + { "AXI2APB_8", 0x8f000 }, + { "AXI2APB_9", 0x90000 }, + { "AXI2APB_3", 0x91000 }, +}; + +static const struct tegra234_cbb_fabric tegra234_cbb_fabric = { + .name = "cbb-fabric", + .master_id = tegra234_master_id, + .slave_map = tegra234_cbb_slave_map, + .errors = tegra234_cbb_errors, + .notifier_offset = 0x60000, + .off_mask_erd = 0x3a004 +}; + 
+static const struct tegra234_slave_lookup tegra234_dce_slave_map[] = { + { "AXI2APB", 0x00000 }, + { "AST0", 0x15000 }, + { "AST1", 0x16000 }, + { "CPU", 0x18000 }, +}; + +static const struct tegra234_cbb_fabric tegra234_dce_fabric = { + .name = "dce-fabric", + .master_id = tegra234_master_id, + .slave_map = tegra234_dce_slave_map, + .errors = tegra234_cbb_errors, + .notifier_offset = 0x19000, +}; + +static const struct tegra234_slave_lookup tegra234_rce_slave_map[] = { + { "AXI2APB", 0x00000 }, + { "AST0", 0x15000 }, + { "AST1", 0x16000 }, + { "CPU", 0x18000 }, +}; + +static const struct tegra234_cbb_fabric tegra234_rce_fabric = { + .name = "rce-fabric", + .master_id = tegra234_master_id, + .slave_map = tegra234_rce_slave_map, + .errors = tegra234_cbb_errors, + .notifier_offset = 0x19000, +}; + +static const struct tegra234_slave_lookup tegra234_sce_slave_map[] = { + { "AXI2APB", 0x00000 }, + { "AST0", 0x15000 }, + { "AST1", 0x16000 }, + { "CBB", 0x17000 }, + { "CPU", 0x18000 }, +}; + +static const struct tegra234_cbb_fabric tegra234_sce_fabric = { + .name = "sce-fabric", + .master_id = tegra234_master_id, + .slave_map = tegra234_sce_slave_map, + .errors = tegra234_cbb_errors, + .notifier_offset = 0x19000, +}; + +static const struct of_device_id tegra234_cbb_dt_ids[] = { + { .compatible = "nvidia,tegra234-cbb-fabric", .data = &tegra234_cbb_fabric }, + { .compatible = "nvidia,tegra234-aon-fabric", .data = &tegra234_aon_fabric }, + { .compatible = "nvidia,tegra234-bpmp-fabric", .data = &tegra234_bpmp_fabric }, + { .compatible = "nvidia,tegra234-dce-fabric", .data = &tegra234_dce_fabric }, + { .compatible = "nvidia,tegra234-rce-fabric", .data = &tegra234_rce_fabric }, + { .compatible = "nvidia,tegra234-sce-fabric", .data = &tegra234_sce_fabric }, + { /* sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, tegra234_cbb_dt_ids); + +static int tegra234_cbb_probe(struct platform_device *pdev) +{ + const struct tegra234_cbb_fabric *fabric; + struct tegra234_cbb *cbb; + unsigned 
long flags = 0; + int err; + + fabric = of_device_get_match_data(&pdev->dev); + + cbb = devm_kzalloc(&pdev->dev, sizeof(*cbb), GFP_KERNEL); + if (!cbb) + return -ENOMEM; + + INIT_LIST_HEAD(&cbb->base.node); + cbb->base.ops = &tegra234_cbb_ops; + cbb->base.dev = &pdev->dev; + cbb->fabric = fabric; + + cbb->regs = devm_platform_get_and_ioremap_resource(pdev, 0, &cbb->res); + if (IS_ERR(cbb->regs)) + return PTR_ERR(cbb->regs); + + err = tegra_cbb_get_irq(pdev, NULL, &cbb->sec_irq); + if (err) + return err; + + platform_set_drvdata(pdev, cbb); + + spin_lock_irqsave(&cbb_lock, flags); + list_add(&cbb->base.node, &cbb_list); + spin_unlock_irqrestore(&cbb_lock, flags); + + /* set ERD bit to mask SError and generate interrupt to report error */ + if (cbb->fabric->off_mask_erd) + tegra234_cbb_mask_serror(cbb); + + return tegra_cbb_register(&cbb->base); +} + +static int tegra234_cbb_remove(struct platform_device *pdev) +{ + return 0; +} + +static int __maybe_unused tegra234_cbb_resume_noirq(struct device *dev) +{ + struct tegra234_cbb *cbb = dev_get_drvdata(dev); + + tegra234_cbb_error_enable(&cbb->base); + + dev_dbg(dev, "%s resumed\n", cbb->fabric->name); + + return 0; +} + +static const struct dev_pm_ops tegra234_cbb_pm = { + SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, tegra234_cbb_resume_noirq) +}; + +static struct platform_driver tegra234_cbb_driver = { + .probe = tegra234_cbb_probe, + .remove = tegra234_cbb_remove, + .driver = { + .owner = THIS_MODULE, + .name = "tegra234-cbb", + .of_match_table = tegra234_cbb_dt_ids, + .pm = &tegra234_cbb_pm, + }, +}; + +static int __init tegra234_cbb_init(void) +{ + return platform_driver_register(&tegra234_cbb_driver); +} +pure_initcall(tegra234_cbb_init); + +static void __exit tegra234_cbb_exit(void) +{ + platform_driver_unregister(&tegra234_cbb_driver); +} +module_exit(tegra234_cbb_exit); + +MODULE_DESCRIPTION("Control Backbone 2.0 error handling driver for Tegra234"); +MODULE_LICENSE("GPL"); From 3bc9dd1530335886b01eeb19281137c91495627d 
Mon Sep 17 00:00:00 2001 From: Sumit Gupta Date: Thu, 12 May 2022 01:46:51 +0530 Subject: [PATCH 0305/1250] soc/tegra: cbb: Add support for Tegra241 (Grace) Adding support for Tegra241 (Grace) which uses fabrics based on the CBB 2.0 architecture. Since Tegra241 requires ACPI, implement ACPI-based probe support. Fabrics reporting errors in Tegra241 are "CBB" and "BPMP". The CBB fabric connects various other CBB 2.0 based fabrics and also services the Initiators and Targets which are connected to itself. The BPMP fabric is present in the BPMP cluster. Signed-off-by: Sumit Gupta Signed-off-by: Thierry Reding --- drivers/soc/tegra/cbb/tegra234-cbb.c | 271 ++++++++++++++++++++++++++- 1 file changed, 269 insertions(+), 2 deletions(-) diff --git a/drivers/soc/tegra/cbb/tegra234-cbb.c b/drivers/soc/tegra/cbb/tegra234-cbb.c index 0d01803e7eecf4..8bae8166732cb5 100644 --- a/drivers/soc/tegra/cbb/tegra234-cbb.c +++ b/drivers/soc/tegra/cbb/tegra234-cbb.c @@ -10,6 +10,7 @@ * SLAVE_ERR */ +#include #include #include #include @@ -293,14 +294,33 @@ static void tegra234_cbb_print_error(struct seq_file *file, struct tegra234_cbb static void print_errlog_err(struct seq_file *file, struct tegra234_cbb *cbb) { u8 cache_type, prot_type, burst_length, mstr_id, grpsec, vqc, falconsec, beat_size; - u8 access_type, access_id, slave_id, fab_id, burst_type; + u8 access_type, access_id, requester_socket_id, local_socket_id, slave_id, fab_id; char fabric_name[20]; + bool is_numa = false; + u8 burst_type; + + if (num_possible_nodes() > 1) + is_numa = true; mstr_id = FIELD_GET(FAB_EM_EL_MSTRID, cbb->mn_user_bits); vqc = FIELD_GET(FAB_EM_EL_VQC, cbb->mn_user_bits); grpsec = FIELD_GET(FAB_EM_EL_GRPSEC, cbb->mn_user_bits); falconsec = FIELD_GET(FAB_EM_EL_FALCONSEC, cbb->mn_user_bits); + /* + * For SOC with multiple NUMA nodes, print cross socket access + * errors only if initiator/master_id is CCPLEX, CPMU or GPU. 
+ */ + if (is_numa) { + local_socket_id = numa_node_id(); + requester_socket_id = FIELD_GET(REQ_SOCKET_ID, cbb->mn_attr2); + + if (requester_socket_id != local_socket_id) { + if ((mstr_id != 0x1) && (mstr_id != 0x2) && (mstr_id != 0xB)) + return; + } + } + fab_id = FIELD_GET(FAB_EM_EL_FABID, cbb->mn_attr2); slave_id = FIELD_GET(FAB_EM_EL_SLAVEID, cbb->mn_attr2); @@ -333,6 +353,15 @@ static void print_errlog_err(struct seq_file *file, struct tegra234_cbb *cbb) else strcpy(fabric_name, cbb->fabric->name); + if (is_numa) { + tegra_cbb_print_err(file, "\t Requester_Socket_Id\t: %#x\n", + requester_socket_id); + tegra_cbb_print_err(file, "\t Local_Socket_Id\t: %#x\n", + local_socket_id); + tegra_cbb_print_err(file, "\t No. of NUMA_NODES\t: %#x\n", + num_possible_nodes()); + } + tegra_cbb_print_err(file, "\t Fabric\t\t: %s\n", fabric_name); tegra_cbb_print_err(file, "\t Slave_Id\t\t: %#x\n", slave_id); tegra_cbb_print_err(file, "\t Burst_length\t\t: %#x\n", burst_length); @@ -750,6 +779,200 @@ static const struct tegra234_cbb_fabric tegra234_sce_fabric = { .notifier_offset = 0x19000, }; +static const char * const tegra241_master_id[] = { + [0x0] = "TZ", + [0x1] = "CCPLEX", + [0x2] = "CCPMU", + [0x3] = "BPMP_FW", + [0x4] = "PSC_FW_USER", + [0x5] = "PSC_FW_SUPERVISOR", + [0x6] = "PSC_FW_MACHINE", + [0x7] = "PSC_BOOT", + [0x8] = "BPMP_BOOT", + [0x9] = "JTAGM_DFT", + [0xa] = "CORESIGHT", + [0xb] = "GPU", + [0xc] = "PEATRANS", + [0xd ... 0x3f] = "RSVD" +}; + +/* + * Possible causes for Slave and Timeout errors. + * SLAVE_ERR: + * Slave being accessed responded with an error. Slave could return + * an error for various cases : + * Unsupported access, clamp setting when power gated, register + * level firewall(SCR), address hole within the slave, etc + * + * TIMEOUT_ERR: + * No response returned by slave. 
Can be due to slave being clock + * gated, under reset, powered down or slave inability to respond + * for an internal slave issue + */ +static const struct tegra_cbb_error tegra241_cbb_errors[] = { + { + .code = "SLAVE_ERR", + .desc = "Slave being accessed responded with an error." + }, { + .code = "DECODE_ERR", + .desc = "Attempt to access an address hole or Reserved region of memory." + }, { + .code = "FIREWALL_ERR", + .desc = "Attempt to access a region which is firewalled." + }, { + .code = "TIMEOUT_ERR", + .desc = "No response returned by slave." + }, { + .code = "PWRDOWN_ERR", + .desc = "Attempt to access a portion of the fabric that is powered down." + }, { + .code = "UNSUPPORTED_ERR", + .desc = "Attempt to access a slave through an unsupported access." + }, { + .code = "POISON_ERR", + .desc = "Slave responds with poison error to indicate error in data." + }, { + .code = "RSVD" + }, { + .code = "RSVD" + }, { + .code = "RSVD" + }, { + .code = "RSVD" + }, { + .code = "RSVD" + }, { + .code = "RSVD" + }, { + .code = "RSVD" + }, { + .code = "RSVD" + }, { + .code = "RSVD" + }, { + .code = "NO_SUCH_ADDRESS_ERR", + .desc = "The address belongs to the pri_target range but there is no register " + "implemented at the address." + }, { + .code = "TASK_ERR", + .desc = "Attempt to update a PRI task when the current task has still not " + "completed." + }, { + .code = "EXTERNAL_ERR", + .desc = "Indicates that an external PRI register access met with an error due to " + "any issue in the unit." + }, { + .code = "INDEX_ERR", + .desc = "Applicable to PRI index aperture pair, when the programmed index is " + "outside the range defined in the manual." + }, { + .code = "RESET_ERR", + .desc = "Target in Reset Error: Attempt to access a SubPri or external PRI " + "register but they are in reset." + }, { + .code = "REGISTER_RST_ERR", + .desc = "Attempt to access a PRI register but the register is partial or " + "completely in reset." 
+ }, { + .code = "POWER_GATED_ERR", + .desc = "Returned by external PRI client when the external access goes to a power " + "gated domain." + }, { + .code = "SUBPRI_FS_ERR", + .desc = "Subpri is floorswept: Attempt to access a subpri through the main pri " + "target but subPri logic is floorswept." + }, { + .code = "SUBPRI_CLK_OFF_ERR", + .desc = "Subpri clock is off: Attempt to access a subpri through the main pri " + "target but subPris clock is gated/off." + }, +}; + +static const struct tegra234_slave_lookup tegra241_cbb_slave_map[] = { + { "CCPLEX", 0x50000 }, + { "PCIE_C8", 0x51000 }, + { "PCIE_C9", 0x52000 }, + { "RSVD", 0x00000 }, + { "RSVD", 0x00000 }, + { "RSVD", 0x00000 }, + { "RSVD", 0x00000 }, + { "RSVD", 0x00000 }, + { "RSVD", 0x00000 }, + { "RSVD", 0x00000 }, + { "RSVD", 0x00000 }, + { "AON", 0x5b000 }, + { "BPMP", 0x5c000 }, + { "RSVD", 0x00000 }, + { "RSVD", 0x00000 }, + { "PSC", 0x5d000 }, + { "STM", 0x5e000 }, + { "AXI2APB_1", 0x70000 }, + { "AXI2APB_10", 0x71000 }, + { "AXI2APB_11", 0x72000 }, + { "AXI2APB_12", 0x73000 }, + { "AXI2APB_13", 0x74000 }, + { "AXI2APB_14", 0x75000 }, + { "AXI2APB_15", 0x76000 }, + { "AXI2APB_16", 0x77000 }, + { "AXI2APB_17", 0x78000 }, + { "AXI2APB_18", 0x79000 }, + { "AXI2APB_19", 0x7a000 }, + { "AXI2APB_2", 0x7b000 }, + { "AXI2APB_20", 0x7c000 }, + { "AXI2APB_4", 0x87000 }, + { "AXI2APB_5", 0x88000 }, + { "AXI2APB_6", 0x89000 }, + { "AXI2APB_7", 0x8a000 }, + { "AXI2APB_8", 0x8b000 }, + { "AXI2APB_9", 0x8c000 }, + { "AXI2APB_3", 0x8d000 }, + { "AXI2APB_21", 0x7d000 }, + { "AXI2APB_22", 0x7e000 }, + { "AXI2APB_23", 0x7f000 }, + { "AXI2APB_24", 0x80000 }, + { "AXI2APB_25", 0x81000 }, + { "AXI2APB_26", 0x82000 }, + { "AXI2APB_27", 0x83000 }, + { "AXI2APB_28", 0x84000 }, + { "PCIE_C4", 0x53000 }, + { "PCIE_C5", 0x54000 }, + { "PCIE_C6", 0x55000 }, + { "PCIE_C7", 0x56000 }, + { "PCIE_C2", 0x57000 }, + { "PCIE_C3", 0x58000 }, + { "PCIE_C0", 0x59000 }, + { "PCIE_C1", 0x5a000 }, + { "AXI2APB_29", 0x85000 }, + { 
"AXI2APB_30", 0x86000 }, +}; + +static const struct tegra234_cbb_fabric tegra241_cbb_fabric = { + .name = "cbb-fabric", + .master_id = tegra241_master_id, + .slave_map = tegra241_cbb_slave_map, + .errors = tegra241_cbb_errors, + .notifier_offset = 0x60000, + .off_mask_erd = 0x40004, +}; + +static const struct tegra234_slave_lookup tegra241_bpmp_slave_map[] = { + { "RSVD", 0x00000 }, + { "RSVD", 0x00000 }, + { "CBB", 0x15000 }, + { "CPU", 0x16000 }, + { "AXI2APB", 0x00000 }, + { "DBB0", 0x17000 }, + { "DBB1", 0x18000 }, +}; + +static const struct tegra234_cbb_fabric tegra241_bpmp_fabric = { + .name = "bpmp-fabric", + .master_id = tegra241_master_id, + .slave_map = tegra241_bpmp_slave_map, + .errors = tegra241_cbb_errors, + .notifier_offset = 0x19000, +}; + static const struct of_device_id tegra234_cbb_dt_ids[] = { { .compatible = "nvidia,tegra234-cbb-fabric", .data = &tegra234_cbb_fabric }, { .compatible = "nvidia,tegra234-aon-fabric", .data = &tegra234_aon_fabric }, @@ -761,6 +984,37 @@ static const struct of_device_id tegra234_cbb_dt_ids[] = { }; MODULE_DEVICE_TABLE(of, tegra234_cbb_dt_ids); +struct tegra234_cbb_acpi_uid { + const char *hid; + const char *uid; + const struct tegra234_cbb_fabric *fabric; +}; + +static const struct tegra234_cbb_acpi_uid tegra234_cbb_acpi_uids[] = { + { "NVDA1070", "1", &tegra241_cbb_fabric }, + { "NVDA1070", "2", &tegra241_bpmp_fabric }, + { }, +}; + +static const struct +tegra234_cbb_fabric *tegra234_cbb_acpi_get_fabric(struct acpi_device *adev) +{ + const struct tegra234_cbb_acpi_uid *entry; + + for (entry = tegra234_cbb_acpi_uids; entry->hid; entry++) { + if (acpi_dev_hid_uid_match(adev, entry->hid, entry->uid)) + return entry->fabric; + } + + return NULL; +} + +static const struct acpi_device_id tegra241_cbb_acpi_ids[] = { + { "NVDA1070" }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, tegra241_cbb_acpi_ids); + static int tegra234_cbb_probe(struct platform_device *pdev) { const struct tegra234_cbb_fabric *fabric; @@ -768,7 +1022,19 @@ 
static int tegra234_cbb_probe(struct platform_device *pdev) unsigned long flags = 0; int err; - fabric = of_device_get_match_data(&pdev->dev); + if (pdev->dev.of_node) { + fabric = of_device_get_match_data(&pdev->dev); + } else { + struct acpi_device *device = ACPI_COMPANION(&pdev->dev); + if (!device) + return -ENODEV; + + fabric = tegra234_cbb_acpi_get_fabric(device); + if (!fabric) { + dev_err(&pdev->dev, "no device match found\n"); + return -ENODEV; + } + } cbb = devm_kzalloc(&pdev->dev, sizeof(*cbb), GFP_KERNEL); if (!cbb) @@ -827,6 +1093,7 @@ static struct platform_driver tegra234_cbb_driver = { .owner = THIS_MODULE, .name = "tegra234-cbb", .of_match_table = tegra234_cbb_dt_ids, + .acpi_match_table = tegra241_cbb_acpi_ids, .pm = &tegra234_cbb_pm, }, }; From a16a833a156bd6c014d43933944788a1d78f5367 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 6 Jul 2022 15:00:48 +0800 Subject: [PATCH 0306/1250] soc/tegra: fuse: Add missing DMADEVICES dependency WARNING: unmet direct dependencies detected for TEGRA20_APB_DMA Depends on [n]: DMADEVICES [=n] && (ARCH_TEGRA [=y] || COMPILE_TEST [=n]) Selected by [y]: - SOC_TEGRA_FUSE [=y] && ARCH_TEGRA [=y] && ARCH_TEGRA_2x_SOC [=y] TEGRA20_APB_DMA depends on DMADEVICES, so add this condition check while select it. 
Fixes: 19d41e5e9c68 ("soc/tegra: fuse: Add APB DMA dependency for Tegra20") Suggested-by: Dmitry Osipenko Signed-off-by: YueHaibing Signed-off-by: Thierry Reding --- drivers/soc/tegra/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig index bd360488cd8241..cb188544fd6dcc 100644 --- a/drivers/soc/tegra/Kconfig +++ b/drivers/soc/tegra/Kconfig @@ -136,7 +136,7 @@ config SOC_TEGRA_FUSE def_bool y depends on ARCH_TEGRA select SOC_BUS - select TEGRA20_APB_DMA if ARCH_TEGRA_2x_SOC + select TEGRA20_APB_DMA if (ARCH_TEGRA_2x_SOC && DMADEVICES) config SOC_TEGRA_FLOWCTRL bool From 4773d1c739e22101a92f89c0ae0983190ddbe112 Mon Sep 17 00:00:00 2001 From: Liang He Date: Wed, 15 Jun 2022 20:32:32 +0800 Subject: [PATCH 0307/1250] soc/tegra: fuse: Add missing of_node_put() In tegra_init_apbmisc(), of_find_matching_node() will return a node pointer with refcount incremented. We should use of_node_put() in each failure path or when it is not used anymore. 
Signed-off-by: Liang He Signed-off-by: Thierry Reding --- drivers/soc/tegra/fuse/tegra-apbmisc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/soc/tegra/fuse/tegra-apbmisc.c b/drivers/soc/tegra/fuse/tegra-apbmisc.c index de833f8d240835..3351bd872ab272 100644 --- a/drivers/soc/tegra/fuse/tegra-apbmisc.c +++ b/drivers/soc/tegra/fuse/tegra-apbmisc.c @@ -208,12 +208,12 @@ void __init tegra_init_apbmisc(void) */ if (of_address_to_resource(np, 0, &apbmisc) < 0) { pr_err("failed to get APBMISC registers\n"); - return; + goto put; } if (of_address_to_resource(np, 1, &straps) < 0) { pr_err("failed to get strapping options registers\n"); - return; + goto put; } } @@ -233,4 +233,7 @@ void __init tegra_init_apbmisc(void) } long_ram_code = of_property_read_bool(np, "nvidia,long-ram-code"); + +put: + of_node_put(np); } From 55553f3437c8b185b71fe4bd8106141a4e4192de Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 16:25:10 -0400 Subject: [PATCH 0308/1250] mm/migrate: Convert expected_page_refs() to folio_expected_refs() Now that both callers have a folio, convert this function to take a folio & rename it. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- mm/migrate.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index ea5398d0f7f1f0..61cd8d270b030d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -336,13 +336,18 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) } #endif -static int expected_page_refs(struct address_space *mapping, struct page *page) +static int folio_expected_refs(struct address_space *mapping, + struct folio *folio) { - int expected_count = 1; + int refs = 1; + if (!mapping) + return refs; - if (mapping) - expected_count += compound_nr(page) + page_has_private(page); - return expected_count; + refs += folio_nr_pages(folio); + if (folio_test_private(folio)) + refs++; + + return refs; } /* @@ -359,7 +364,7 @@ int folio_migrate_mapping(struct address_space *mapping, XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct zone *oldzone, *newzone; int dirty; - int expected_count = expected_page_refs(mapping, &folio->page) + extra_count; + int expected_count = folio_expected_refs(mapping, folio) + extra_count; long nr = folio_nr_pages(folio); if (!mapping) { @@ -669,7 +674,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, return migrate_page(mapping, &dst->page, &src->page, mode); /* Check whether page does not have extra refs before we do more work */ - expected_count = expected_page_refs(mapping, &src->page); + expected_count = folio_expected_refs(mapping, src); if (folio_ref_count(src) != expected_count) return -EAGAIN; From e8172b8e1728b41160dc8fef7fdd9ffbcaa152c6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 09:22:19 -0400 Subject: [PATCH 0309/1250] btrfs: Convert btree_migratepage to migrate_folio Use a folio throughout this function. migrate_page() will be converted later. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: David Sterba --- fs/btrfs/disk-io.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index dbac856565718b..e3ecc38a9ebcca 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -952,28 +952,28 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ } #ifdef CONFIG_MIGRATION -static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) +static int btree_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { /* * we can't safely write a btree page from here, * we haven't done the locking hook */ - if (PageDirty(page)) + if (folio_test_dirty(src)) return -EAGAIN; /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. */ - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) + if (folio_get_private(src) && + !filemap_release_folio(src, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, mode); + return migrate_page(mapping, &dst->page, &src->page, mode); } +#else +#define btree_migrate_folio NULL #endif - static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1073,10 +1073,8 @@ static const struct address_space_operations btree_aops = { .writepages = btree_writepages, .release_folio = btree_release_folio, .invalidate_folio = btree_invalidate_folio, -#ifdef CONFIG_MIGRATION - .migratepage = btree_migratepage, -#endif - .dirty_folio = btree_dirty_folio, + .migrate_folio = btree_migrate_folio, + .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( From 27826326e888a185d7d191670cf445dec88e9218 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 09:22:19 -0400 Subject: [PATCH 
0310/1250] nfs: Convert to migrate_folio Use a folio throughout this function. migrate_page() will be converted later. Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Anna Schumaker Reviewed-by: Christoph Hellwig --- fs/nfs/file.c | 4 +--- fs/nfs/internal.h | 6 ++++-- fs/nfs/write.c | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2d72b1b7ed74c9..549baed763513b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -533,9 +533,7 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidate_folio = nfs_invalidate_folio, .release_folio = nfs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = nfs_migrate_page, -#endif + .migrate_folio = nfs_migrate_folio, .launder_folio = nfs_launder_folio, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8f8cd6e2d4dbcb..437ebe544aafb5 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -578,8 +578,10 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) #endif #ifdef CONFIG_MIGRATION -extern int nfs_migrate_page(struct address_space *, - struct page *, struct page *, enum migrate_mode); +int nfs_migrate_folio(struct address_space *, struct folio *dst, + struct folio *src, enum migrate_mode); +#else +#define nfs_migrate_folio NULL #endif static inline int diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1c706465d090b0..649b9e63345974 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2119,27 +2119,27 @@ int nfs_wb_page(struct inode *inode, struct page *page) } #ifdef CONFIG_MIGRATION -int nfs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) +int nfs_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { /* - * If PagePrivate is set, then the page is currently associated 
with + * If the private flag is set, the folio is currently associated with * an in-progress read or write request. Don't try to migrate it. * * FIXME: we could do this in principle, but we'll need a way to ensure * that we can safely release the inode reference while holding - * the page lock. + * the folio lock. */ - if (PagePrivate(page)) + if (folio_test_private(src)) return -EBUSY; - if (PageFsCache(page)) { + if (folio_test_fscache(src)) { if (mode == MIGRATE_ASYNC) return -EBUSY; - wait_on_page_fscache(page); + folio_wait_fscache(src); } - return migrate_page(mapping, newpage, page, mode); + return migrate_page(mapping, &dst->page, &src->page, mode); } #endif From af9c33968b722c5871974541067dc180377501df Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 10:27:41 -0400 Subject: [PATCH 0311/1250] mm/migrate: Convert migrate_page() to migrate_folio() Convert all callers to pass a folio. Most have the folio already available. Switch all users from aops->migratepage to aops->migrate_folio. Also turn the documentation into kerneldoc. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: David Sterba --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 4 +-- fs/btrfs/disk-io.c | 2 +- fs/nfs/write.c | 2 +- include/linux/migrate.h | 5 ++- mm/migrate.c | 37 +++++++++++---------- mm/migrate_device.c | 3 +- mm/shmem.c | 2 +- mm/swap_state.c | 2 +- 8 files changed, 30 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 094f06b4ce3359..8423df021b7138 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -216,8 +216,8 @@ i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj, * However...! * * The mmu-notifier can be invalidated for a - * migrate_page, that is alreadying holding the lock - * on the page. 
Such a try_to_unmap() will result + * migrate_folio, that is alreadying holding the lock + * on the folio. Such a try_to_unmap() will result * in us calling put_pages() and so recursively try * to lock the page. We avoid that deadlock with * a trylock_page() and in exchange we risk missing diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e3ecc38a9ebcca..03bbb7e96bf34e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -968,7 +968,7 @@ static int btree_migrate_folio(struct address_space *mapping, if (folio_get_private(src) && !filemap_release_folio(src, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, &dst->page, &src->page, mode); + return migrate_folio(mapping, dst, src, mode); } #else #define btree_migrate_folio NULL diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 649b9e63345974..69569696dde0bd 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -2139,7 +2139,7 @@ int nfs_migrate_folio(struct address_space *mapping, struct folio *dst, folio_wait_fscache(src); } - return migrate_page(mapping, &dst->page, &src->page, mode); + return migrate_folio(mapping, dst, src, mode); } #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 82c735ba6109f5..c9986d5da335fe 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -62,9 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); -extern int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode); +int migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded); diff --git a/mm/migrate.c b/mm/migrate.c index 61cd8d270b030d..77aeb7e12f6298 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -593,34 +593,37 
@@ EXPORT_SYMBOL(folio_migrate_copy); * Migration functions ***********************************************************/ -/* - * Common logic to directly migrate a single LRU page suitable for - * pages that do not use PagePrivate/PagePrivate2. +/** + * migrate_folio() - Simple folio migration. + * @mapping: The address_space containing the folio. + * @dst: The folio to migrate the data to. + * @src: The folio containing the current data. + * @mode: How to migrate the page. * - * Pages are locked upon entry and exit. + * Common logic to directly migrate a single LRU folio suitable for + * folios that do not use PagePrivate/PagePrivate2. + * + * Folios are locked upon entry and exit. */ -int migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) +int migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { - struct folio *newfolio = page_folio(newpage); - struct folio *folio = page_folio(page); int rc; - BUG_ON(folio_test_writeback(folio)); /* Writeback must be complete */ + BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ - rc = folio_migrate_mapping(mapping, newfolio, folio, 0); + rc = folio_migrate_mapping(mapping, dst, src, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(newfolio, folio); + folio_migrate_copy(dst, src); else - folio_migrate_flags(newfolio, folio); + folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } -EXPORT_SYMBOL(migrate_page); +EXPORT_SYMBOL(migrate_folio); #ifdef CONFIG_BLOCK /* Returns true if all buffers are successfully locked */ @@ -671,7 +674,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, head = folio_buffers(src); if (!head) - return migrate_page(mapping, &dst->page, &src->page, mode); + return migrate_folio(mapping, dst, src, mode); /* Check whether page does not have extra refs before we do more work */ expected_count = 
folio_expected_refs(mapping, src); @@ -848,7 +851,7 @@ static int fallback_migrate_folio(struct address_space *mapping, !filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; - return migrate_page(mapping, &dst->page, &src->page, mode); + return migrate_folio(mapping, dst, src, mode); } /* @@ -875,7 +878,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, struct address_space *mapping = folio_mapping(src); if (!mapping) - rc = migrate_page(mapping, &dst->page, &src->page, mode); + rc = migrate_folio(mapping, dst, src, mode); else if (mapping->a_ops->migrate_folio) /* * Most folios have a mapping and most filesystems diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5052093d0262d7..5dd97c39ca6ae1 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -718,7 +718,8 @@ void migrate_vma_pages(struct migrate_vma *migrate) continue; } - r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY); + r = migrate_folio(mapping, page_folio(newpage), + page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } diff --git a/mm/shmem.c b/mm/shmem.c index 28a62be1d41e5d..15c61456e0875f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3801,7 +3801,7 @@ const struct address_space_operations shmem_aops = { .write_end = shmem_write_end, #endif #ifdef CONFIG_MIGRATION - .migratepage = migrate_page, + .migrate_folio = migrate_folio, #endif .error_remove_page = shmem_error_remove_page, }; diff --git a/mm/swap_state.c b/mm/swap_state.c index f5b6f563890805..0a2021fc55ade0 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, .dirty_folio = noop_dirty_folio, #ifdef CONFIG_MIGRATION - .migratepage = migrate_page, + .migrate_folio = migrate_folio, #endif }; From 441b3afcb2e31aca89f6e2cd6642f141c7bbe142 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: 
Mon, 6 Jun 2022 12:55:08 -0400 Subject: [PATCH 0312/1250] mm/migrate: Add filemap_migrate_folio() There is nothing iomap-specific about iomap_migrate_page(), and it fits a pattern used by several other filesystems, so move it to mm/migrate.c, convert it to be filemap_migrate_folio() and convert the iomap filesystems to use it. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong --- fs/gfs2/aops.c | 2 +- fs/iomap/buffered-io.c | 25 ------------------------- fs/xfs/xfs_aops.c | 2 +- fs/zonefs/super.c | 2 +- include/linux/iomap.h | 6 ------ include/linux/pagemap.h | 6 ++++++ mm/migrate.c | 20 ++++++++++++++++++++ 7 files changed, 29 insertions(+), 34 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 106e90a365838e..57ff883d432c75 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -774,7 +774,7 @@ static const struct address_space_operations gfs2_aops = { .invalidate_folio = iomap_invalidate_folio, .bmap = gfs2_bmap, .direct_IO = noop_direct_IO, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 66278a14bfa7c9..5a91aa1db945bc 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -489,31 +489,6 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); -#ifdef CONFIG_MIGRATION -int -iomap_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) -{ - struct folio *folio = page_folio(page); - struct folio *newfolio = page_folio(newpage); - int ret; - - ret = folio_migrate_mapping(mapping, newfolio, folio, 0); - if (ret != MIGRATEPAGE_SUCCESS) - return ret; - - if (folio_test_private(folio)) - folio_attach_private(newfolio, folio_detach_private(folio)); - - if (mode !=
MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(newfolio, folio); - else - folio_migrate_flags(newfolio, folio); - return MIGRATEPAGE_SUCCESS; -} -EXPORT_SYMBOL_GPL(iomap_migrate_page); -#endif /* CONFIG_MIGRATION */ - static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 8ec38b25187bdb..5d1a995b15f833 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -570,7 +570,7 @@ const struct address_space_operations xfs_address_space_operations = { .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, .direct_IO = noop_direct_IO, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = xfs_iomap_swapfile_activate, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 053299758deb98..cc6d4cf580ac6f 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -271,7 +271,7 @@ static const struct address_space_operations zonefs_file_aops = { .dirty_folio = filemap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, - .migratepage = iomap_migrate_page, + .migrate_folio = filemap_migrate_folio, .is_partially_uptodate = iomap_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .direct_IO = noop_direct_IO, diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e552097c67e0bb..758a1125e72fb2 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -231,12 +231,6 @@ void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags); void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len); -#ifdef CONFIG_MIGRATION -int iomap_migrate_page(struct address_space *mapping, struct page *newpage, - struct page 
*page, enum migrate_mode mode); -#else -#define iomap_migrate_page NULL -#endif int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops); int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 87d4ea571240e9..cc9adbaddb591b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1078,6 +1078,12 @@ static inline int __must_check write_one_page(struct page *page) int __set_page_dirty_nobuffers(struct page *page); bool noop_dirty_folio(struct address_space *mapping, struct folio *folio); +#ifdef CONFIG_MIGRATION +int filemap_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode); +#else +#define filemap_migrate_folio NULL +#endif void page_endio(struct page *page, bool is_write, int err); void folio_end_private_2(struct folio *folio); diff --git a/mm/migrate.c b/mm/migrate.c index 77aeb7e12f6298..4ed8f0d53c77c2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -784,6 +784,26 @@ int buffer_migrate_folio_norefs(struct address_space *mapping, } #endif +int filemap_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) +{ + int ret; + + ret = folio_migrate_mapping(mapping, dst, src, 0); + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + + if (folio_get_private(src)) + folio_attach_private(dst, folio_detach_private(src)); + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + return MIGRATEPAGE_SUCCESS; +} +EXPORT_SYMBOL_GPL(filemap_migrate_folio); + /* * Writeback a folio to clean the dirty state */ From 9c5161d1eefb24389a077b43e8e11322f2e4cd42 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 10:47:21 -0400 Subject: [PATCH 0313/1250] btrfs: Convert btrfs_migratepage to migrate_folio Use filemap_migrate_folio() to do the bulk of the work, and then copy 
the ordered flag across if needed. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig Acked-by: David Sterba --- fs/btrfs/inode.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 81737eff92f3d8..5f41d869c6484d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8255,30 +8255,24 @@ static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) } #ifdef CONFIG_MIGRATION -static int btrfs_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, +static int btrfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - int ret; + int ret = filemap_migrate_folio(mapping, dst, src, mode); - ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) - attach_page_private(newpage, detach_page_private(page)); - - if (PageOrdered(page)) { - ClearPageOrdered(page); - SetPageOrdered(newpage); + if (folio_test_ordered(src)) { + folio_clear_ordered(src); + folio_set_ordered(dst); } - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } +#else +#define btrfs_migrate_folio NULL #endif static void btrfs_invalidate_folio(struct folio *folio, size_t offset, @@ -11422,9 +11416,7 @@ static const struct address_space_operations btrfs_aops = { .direct_IO = noop_direct_IO, .invalidate_folio = btrfs_invalidate_folio, .release_folio = btrfs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = btrfs_migratepage, -#endif + .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, From 9f11d68b27211e6a73b0540dc757c91c01ae74bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 10:47:21 -0400 Subject: 
[PATCH 0314/1250] ubifs: Convert to filemap_migrate_folio() filemap_migrate_folio() is a little more general than ubifs really needs, but it's better to share the code. Signed-off-by: Matthew Wilcox (Oracle) --- fs/ubifs/file.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 04ced154960fa6..f2353dd676ef08 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1461,29 +1461,6 @@ static bool ubifs_dirty_folio(struct address_space *mapping, return ret; } -#ifdef CONFIG_MIGRATION -static int ubifs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) -{ - int rc; - - rc = migrate_page_move_mapping(mapping, newpage, page, 0); - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - if (PagePrivate(page)) { - detach_page_private(page); - attach_page_private(newpage, (void *)1); - } - - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); - return MIGRATEPAGE_SUCCESS; -} -#endif - static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags) { struct inode *inode = folio->mapping->host; @@ -1649,10 +1626,8 @@ const struct address_space_operations ubifs_file_address_operations = { .write_end = ubifs_write_end, .invalidate_folio = ubifs_invalidate_folio, .dirty_folio = ubifs_dirty_folio, -#ifdef CONFIG_MIGRATION - .migratepage = ubifs_migrate_page, -#endif - .release_folio = ubifs_release_folio, + .migrate_folio = filemap_migrate_folio, + .release_folio = ubifs_release_folio, }; const struct inode_operations ubifs_file_inode_operations = { From 9c16c4c68213afc512aadb854c9603f4eff6b977 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 10:47:21 -0400 Subject: [PATCH 0315/1250] f2fs: Convert to filemap_migrate_folio() filemap_migrate_folio() fits f2fs's needs perfectly. 
Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Chao Yu --- fs/f2fs/checkpoint.c | 4 +--- fs/f2fs/data.c | 40 +--------------------------------------- fs/f2fs/f2fs.h | 4 ---- fs/f2fs/node.c | 4 +--- 4 files changed, 3 insertions(+), 49 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6d8b2bf14de0fd..8259e0fa97e1fc 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -463,9 +463,7 @@ const struct address_space_operations f2fs_meta_aops = { .dirty_folio = f2fs_dirty_meta_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif + .migrate_folio = filemap_migrate_folio, }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7fcbcf97973724..318a3f91ad74ba 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3751,42 +3751,6 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return blknr; } -#ifdef CONFIG_MIGRATION -#include <linux/migrate.h> - -int f2fs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode) -{ - int rc, extra_count = 0; - - BUG_ON(PageWriteback(page)); - - rc = migrate_page_move_mapping(mapping, newpage, - page, extra_count); - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - /* guarantee to start from no stale private field */ - set_page_private(newpage, 0); - if (PagePrivate(page)) { - set_page_private(newpage, page_private(page)); - SetPagePrivate(newpage); - get_page(newpage); - - set_page_private(page, 0); - ClearPagePrivate(page); - put_page(page); - } - - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); - - return MIGRATEPAGE_SUCCESS; -} -#endif - #ifdef CONFIG_SWAP static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, unsigned int blkcnt) @@ -4018,15 +3982,13 @@ const struct address_space_operations
f2fs_dblock_aops = { .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, .dirty_folio = f2fs_dirty_data_folio, + .migrate_folio = filemap_migrate_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, .direct_IO = noop_direct_IO, .bmap = f2fs_bmap, .swap_activate = f2fs_swap_activate, .swap_deactivate = f2fs_swap_deactivate, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif }; void f2fs_clear_page_cache_dirty_tag(struct page *page) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d9bbecd008d22a..f258a1b6faed3a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3764,10 +3764,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted, void f2fs_write_failed(struct inode *inode, loff_t to); void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length); bool f2fs_release_folio(struct folio *folio, gfp_t wait); -#ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode); -#endif bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len); void f2fs_clear_page_cache_dirty_tag(struct page *page); int f2fs_init_post_read_processing(void); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index cf6f7fc83c0829..12bba66a8a3001 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2165,9 +2165,7 @@ const struct address_space_operations f2fs_node_aops = { .dirty_folio = f2fs_dirty_node_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, -#ifdef CONFIG_MIGRATION - .migratepage = f2fs_migrate_page, -#endif + .migrate_folio = filemap_migrate_folio, }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, From 9b553d25025600ecaeb903b9250279b50c1c6054 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 10:47:21 -0400 Subject: [PATCH 0316/1250] aio: Convert to migrate_folio Use a folio throughout this function. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- fs/aio.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 3c249b93863274..a1911e86859c70 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -400,8 +400,8 @@ static const struct file_operations aio_ring_fops = { }; #if IS_ENABLED(CONFIG_MIGRATION) -static int aio_migratepage(struct address_space *mapping, struct page *new, - struct page *old, enum migrate_mode mode) +static int aio_migrate_folio(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode) { struct kioctx *ctx; unsigned long flags; @@ -435,10 +435,10 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, goto out; } - idx = old->index; + idx = src->index; if (idx < (pgoff_t)ctx->nr_pages) { - /* Make sure the old page hasn't already been changed */ - if (ctx->ring_pages[idx] != old) + /* Make sure the old folio hasn't already been changed */ + if (ctx->ring_pages[idx] != &src->page) rc = -EAGAIN; } else rc = -EINVAL; @@ -447,27 +447,27 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, goto out_unlock; /* Writeback must be complete */ - BUG_ON(PageWriteback(old)); - get_page(new); + BUG_ON(folio_test_writeback(src)); + folio_get(dst); - rc = migrate_page_move_mapping(mapping, new, old, 1); + rc = folio_migrate_mapping(mapping, dst, src, 1); if (rc != MIGRATEPAGE_SUCCESS) { - put_page(new); + folio_put(dst); goto out_unlock; } /* Take completion_lock to prevent other writes to the ring buffer - * while the old page is copied to the new. This prevents new + * while the old folio is copied to the new. This prevents new * events from being lost. 
*/ spin_lock_irqsave(&ctx->completion_lock, flags); - migrate_page_copy(new, old); - BUG_ON(ctx->ring_pages[idx] != old); - ctx->ring_pages[idx] = new; + folio_migrate_copy(dst, src); + BUG_ON(ctx->ring_pages[idx] != &src->page); + ctx->ring_pages[idx] = &dst->page; spin_unlock_irqrestore(&ctx->completion_lock, flags); - /* The old page is no longer accessible. */ - put_page(old); + /* The old folio is no longer accessible. */ + folio_put(src); out_unlock: mutex_unlock(&ctx->ring_lock); @@ -475,13 +475,13 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, spin_unlock(&mapping->private_lock); return rc; } +#else +#define aio_migrate_folio NULL #endif static const struct address_space_operations aio_ctx_aops = { .dirty_folio = noop_dirty_folio, -#if IS_ENABLED(CONFIG_MIGRATION) - .migratepage = aio_migratepage, -#endif + .migrate_folio = aio_migrate_folio, }; static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) From 5567427fd70edcd4809a5b1df03f363e72f997bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 10:47:21 -0400 Subject: [PATCH 0317/1250] hugetlb: Convert to migrate_folio This involves converting migrate_huge_page_move_mapping(). We also need a folio variant of hugetlb_set_page_subpool(), but that's for a later patch. 
Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Muchun Song Reviewed-by: Mike Kravetz --- fs/hugetlbfs/inode.c | 23 ++++++++++++++--------- include/linux/migrate.h | 6 +++--- mm/migrate.c | 18 +++++++++--------- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 14d33f725e059f..eca1d0fabd7ea9 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -954,28 +954,33 @@ static int hugetlbfs_symlink(struct user_namespace *mnt_userns, return error; } -static int hugetlbfs_migrate_page(struct address_space *mapping, - struct page *newpage, struct page *page, +#ifdef CONFIG_MIGRATION +static int hugetlbfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc; - rc = migrate_huge_page_move_mapping(mapping, newpage, page); + rc = migrate_huge_page_move_mapping(mapping, dst, src); if (rc != MIGRATEPAGE_SUCCESS) return rc; - if (hugetlb_page_subpool(page)) { - hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page)); - hugetlb_set_page_subpool(page, NULL); + if (hugetlb_page_subpool(&src->page)) { + hugetlb_set_page_subpool(&dst->page, + hugetlb_page_subpool(&src->page)); + hugetlb_set_page_subpool(&src->page, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); + folio_migrate_copy(dst, src); else - migrate_page_states(newpage, page); + folio_migrate_flags(dst, src); return MIGRATEPAGE_SUCCESS; } +#else +#define hugetlbfs_migrate_folio NULL +#endif static int hugetlbfs_error_remove_page(struct address_space *mapping, struct page *page) @@ -1142,7 +1147,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .dirty_folio = noop_dirty_folio, - .migratepage = hugetlbfs_migrate_page, + .migrate_folio = hugetlbfs_migrate_folio, .error_remove_page = hugetlbfs_error_remove_page, }; diff --git a/include/linux/migrate.h 
b/include/linux/migrate.h index c9986d5da335fe..13f793309b7539 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -72,8 +72,8 @@ extern int isolate_movable_page(struct page *page, isolate_mode_t mode); extern void migrate_page_states(struct page *newpage, struct page *page); extern void migrate_page_copy(struct page *newpage, struct page *page); -extern int migrate_huge_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page); +int migrate_huge_page_move_mapping(struct address_space *mapping, + struct folio *dst, struct folio *src); extern int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, int extra_count); void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, @@ -104,7 +104,7 @@ static inline void migrate_page_copy(struct page *newpage, struct page *page) {} static inline int migrate_huge_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page) + struct folio *dst, struct folio *src) { return -ENOSYS; } diff --git a/mm/migrate.c b/mm/migrate.c index 4ed8f0d53c77c2..0dd3ec9525b354 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -474,26 +474,26 @@ EXPORT_SYMBOL(folio_migrate_mapping); * of folio_migrate_mapping(). 
*/ int migrate_huge_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page) + struct folio *dst, struct folio *src) { - XA_STATE(xas, &mapping->i_pages, page_index(page)); + XA_STATE(xas, &mapping->i_pages, folio_index(src)); int expected_count; xas_lock_irq(&xas); - expected_count = 2 + page_has_private(page); - if (!page_ref_freeze(page, expected_count)) { + expected_count = 2 + folio_has_private(src); + if (!folio_ref_freeze(src, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; } - newpage->index = page->index; - newpage->mapping = page->mapping; + dst->index = src->index; + dst->mapping = src->mapping; - get_page(newpage); + folio_get(dst); - xas_store(&xas, newpage); + xas_store(&xas, dst); - page_ref_unfreeze(page, expected_count - 1); + folio_ref_unfreeze(src, expected_count - 1); xas_unlock_irq(&xas); From 48e7ede1b860e44edfd0231f888a7f16af86d6a0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 11:30:43 -0400 Subject: [PATCH 0318/1250] secretmem: Convert to migrate_folio This is little more than changing the types over; there's no real work being done in this function. 
Signed-off-by: Matthew Wilcox (Oracle) --- mm/secretmem.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 1c7f1775b56e7f..658a7486efa97f 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -133,9 +133,8 @@ static const struct file_operations secretmem_fops = { .mmap = secretmem_mmap, }; -static int secretmem_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) +static int secretmem_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { return -EBUSY; } @@ -149,7 +148,7 @@ static void secretmem_free_folio(struct folio *folio) const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, .free_folio = secretmem_free_folio, - .migratepage = secretmem_migratepage, + .migrate_folio = secretmem_migrate_folio, }; static int secretmem_setattr(struct user_namespace *mnt_userns, From c33b866a97842ac96f4373f737ba608dd157f08a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 11:53:31 -0400 Subject: [PATCH 0319/1250] fs: Remove aops->migratepage() With all users converted to migrate_folio(), remove this operation. 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/fs.h | 2 -- mm/compaction.c | 5 ++--- mm/migrate.c | 3 --- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9e6b17da4e11b1..7e06919b8f6030 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -367,8 +367,6 @@ struct address_space_operations { */ int (*migrate_folio)(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); - int (*migratepage) (struct address_space *, - struct page *, struct page *, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); diff --git a/mm/compaction.c b/mm/compaction.c index 458f49f9ab09b5..a2c53fcf933e6f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1031,7 +1031,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* * Only pages without mappings or that have a - * ->migratepage callback are possible to migrate + * ->migrate_folio callback are possible to migrate * without blocking. 
However, we can be racing with * truncation so it's necessary to lock the page * to stabilise the mapping as truncation holds @@ -1043,8 +1043,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, mapping = page_mapping(page); migrate_dirty = !mapping || - mapping->a_ops->migrate_folio || - mapping->a_ops->migratepage; + mapping->a_ops->migrate_folio; unlock_page(page); if (!migrate_dirty) goto isolate_fail_put; diff --git a/mm/migrate.c b/mm/migrate.c index 0dd3ec9525b354..1b4b977809a1c0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -909,9 +909,6 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, */ rc = mapping->a_ops->migrate_folio(mapping, dst, src, mode); - else if (mapping->a_ops->migratepage) - rc = mapping->a_ops->migratepage(mapping, &dst->page, - &src->page, mode); else rc = fallback_migrate_folio(mapping, dst, src, mode); } else { From 84578adbb0e0657003e646e0af699ef74b99386e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 6 Jun 2022 13:29:10 -0400 Subject: [PATCH 0320/1250] mm/folio-compat: Remove migration compatibility functions migrate_page_move_mapping(), migrate_page_copy() and migrate_page_states() are all now unused after converting all the filesystems from aops->migratepage() to aops->migrate_folio(). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Christoph Hellwig --- include/linux/migrate.h | 11 ----------- mm/folio-compat.c | 22 ---------------------- mm/ksm.c | 2 +- 3 files changed, 1 insertion(+), 34 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 13f793309b7539..ae5bb67a9ba1f2 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -70,12 +70,8 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, extern struct page *alloc_migration_target(struct page *page, unsigned long private); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); -extern void migrate_page_states(struct page *newpage, struct page *page); -extern void migrate_page_copy(struct page *newpage, struct page *page); int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src); -extern int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, int extra_count); void migration_entry_wait_on_locked(swp_entry_t entry, pte_t *ptep, spinlock_t *ptl); void folio_migrate_flags(struct folio *newfolio, struct folio *folio); @@ -96,13 +92,6 @@ static inline struct page *alloc_migration_target(struct page *page, static inline int isolate_movable_page(struct page *page, isolate_mode_t mode) { return -EBUSY; } -static inline void migrate_page_states(struct page *newpage, struct page *page) -{ -} - -static inline void migrate_page_copy(struct page *newpage, - struct page *page) {} - static inline int migrate_huge_page_move_mapping(struct address_space *mapping, struct folio *dst, struct folio *src) { diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 20bc15b57d93e2..458618c7302c39 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -51,28 +51,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -#ifdef CONFIG_MIGRATION -int migrate_page_move_mapping(struct address_space 
*mapping, - struct page *newpage, struct page *page, int extra_count) -{ - return folio_migrate_mapping(mapping, page_folio(newpage), - page_folio(page), extra_count); -} -EXPORT_SYMBOL(migrate_page_move_mapping); - -void migrate_page_states(struct page *newpage, struct page *page) -{ - folio_migrate_flags(page_folio(newpage), page_folio(page)); -} -EXPORT_SYMBOL(migrate_page_states); - -void migrate_page_copy(struct page *newpage, struct page *page) -{ - folio_migrate_copy(page_folio(newpage), page_folio(page)); -} -EXPORT_SYMBOL(migrate_page_copy); -#endif - bool set_page_writeback(struct page *page) { return folio_start_writeback(page_folio(page)); diff --git a/mm/ksm.c b/mm/ksm.c index 54f78c9eecaee7..e8f8c1a2bb3968 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -712,7 +712,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * the same is in reuse_ksm_page() case; but if page is swapcache - * in migrate_page_move_mapping(), it might still be our page, + * in folio_migrate_mapping(), it might still be our page, * in which case it's essential to keep the node. */ while (!get_page_unless_zero(page)) { From 9594da4cec1db0491d35b38d5988eb989720d6f4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jun 2022 07:37:10 +0200 Subject: [PATCH 0321/1250] ntfs3: refactor ntfs_writepages Handle the resident case with an explicit generic_writepages call instead of using the obscure overload that makes mpage_writepages with a NULL get_block do the same thing. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox (Oracle) --- fs/ntfs3/inode.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index be4ebdd8048b04..28c09c25b823d1 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -851,12 +851,10 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct inode *inode = mapping->host; - struct ntfs_inode *ni = ntfs_i(inode); /* Redirect call to 'ntfs_writepage' for resident files. */ - get_block_t *get_block = is_resident(ni) ? NULL : &ntfs_get_block; - - return mpage_writepages(mapping, wbc, get_block); + if (is_resident(ntfs_i(mapping->host))) + return generic_writepages(mapping, wbc); + return mpage_writepages(mapping, wbc, ntfs_get_block); } static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, From 8862fa5da9f144d0554c2177aea7ce0b6f97d8c0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jun 2022 07:37:11 +0200 Subject: [PATCH 0322/1250] ext2: remove nobh support The nobh mode is an obscure feature to save lowmem for large memory 32-bit configurations while trading for much slower performance and has been long obsolete. Remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox (Oracle) --- Documentation/filesystems/ext2.rst | 2 -- fs/ext2/ext2.h | 1 - fs/ext2/inode.c | 51 ++---------------------------- fs/ext2/namei.c | 10 ++---- fs/ext2/super.c | 6 ++-- 5 files changed, 7 insertions(+), 63 deletions(-) diff --git a/Documentation/filesystems/ext2.rst b/Documentation/filesystems/ext2.rst index 154101cf0e4f56..92aae683e16a77 100644 --- a/Documentation/filesystems/ext2.rst +++ b/Documentation/filesystems/ext2.rst @@ -59,8 +59,6 @@ acl Enable POSIX Access Control Lists support (requires CONFIG_EXT2_FS_POSIX_ACL).
noacl Don't support POSIX ACLs. -nobh Do not attach buffer_heads to file pagecache. - quota, usrquota Enable user disk quota support (requires CONFIG_QUOTA). diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index d4f306aa5aceb1..28de11a22e5f6c 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -795,7 +795,6 @@ extern const struct file_operations ext2_file_operations; /* inode.c */ extern void ext2_set_file_ops(struct inode *inode); extern const struct address_space_operations ext2_aops; -extern const struct address_space_operations ext2_nobh_aops; extern const struct iomap_ops ext2_iomap_ops; /* namei.c */ diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 58a9d061f17d14..c5229033baf050 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -908,25 +908,6 @@ static int ext2_write_end(struct file *file, struct address_space *mapping, return ret; } -static int -ext2_nobh_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) -{ - int ret; - - ret = nobh_write_begin(mapping, pos, len, pagep, fsdata, - ext2_get_block); - if (ret < 0) - ext2_write_failed(mapping, pos + len); - return ret; -} - -static int ext2_nobh_writepage(struct page *page, - struct writeback_control *wbc) -{ - return nobh_writepage(page, ext2_get_block, wbc); -} - static sector_t ext2_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,ext2_get_block); @@ -978,21 +959,6 @@ const struct address_space_operations ext2_aops = { .error_remove_page = generic_error_remove_page, }; -const struct address_space_operations ext2_nobh_aops = { - .dirty_folio = block_dirty_folio, - .invalidate_folio = block_invalidate_folio, - .read_folio = ext2_read_folio, - .readahead = ext2_readahead, - .writepage = ext2_nobh_writepage, - .write_begin = ext2_nobh_write_begin, - .write_end = nobh_write_end, - .bmap = ext2_bmap, - .direct_IO = ext2_direct_IO, - .writepages = ext2_writepages, - .migrate_folio = 
buffer_migrate_folio, - .error_remove_page = generic_error_remove_page, -}; - static const struct address_space_operations ext2_dax_aops = { .writepages = ext2_dax_writepages, .direct_IO = noop_direct_IO, @@ -1298,13 +1264,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (IS_DAX(inode)) { + if (IS_DAX(inode)) error = dax_zero_range(inode, newsize, PAGE_ALIGN(newsize) - newsize, NULL, &ext2_iomap_ops); - } else if (test_opt(inode->i_sb, NOBH)) - error = nobh_truncate_page(inode->i_mapping, - newsize, ext2_get_block); else error = block_truncate_page(inode->i_mapping, newsize, ext2_get_block); @@ -1396,8 +1359,6 @@ void ext2_set_file_ops(struct inode *inode) inode->i_fop = &ext2_file_operations; if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext2_dax_aops; - else if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; else inode->i_mapping->a_ops = &ext2_aops; } @@ -1497,10 +1458,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; } else if (S_ISLNK(inode->i_mode)) { if (ext2_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; @@ -1510,10 +1468,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) } else { inode->i_op = &ext2_symlink_inode_operations; inode_nohighmem(inode); - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; } } else { inode->i_op = &ext2_special_inode_operations; diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 5f6b7560eb3f3a..5fd9a22d2b70c7 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -178,10 +178,7 @@ static int 
ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir, /* slow symlink */ inode->i_op = &ext2_symlink_inode_operations; inode_nohighmem(inode); - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; err = page_symlink(inode, symname, l); if (err) goto out_fail; @@ -247,10 +244,7 @@ static int ext2_mkdir(struct user_namespace * mnt_userns, inode->i_op = &ext2_dir_inode_operations; inode->i_fop = &ext2_dir_operations; - if (test_opt(inode->i_sb, NOBH)) - inode->i_mapping->a_ops = &ext2_nobh_aops; - else - inode->i_mapping->a_ops = &ext2_aops; + inode->i_mapping->a_ops = &ext2_aops; inode_inc_link_count(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f6a19f6d9f6d5b..a1c1263c07ab3d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -296,9 +296,6 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noacl"); #endif - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); - if (test_opt(sb, USRQUOTA)) seq_puts(seq, ",usrquota"); @@ -551,7 +548,8 @@ static int parse_options(char *options, struct super_block *sb, clear_opt (opts->s_mount_opt, OLDALLOC); break; case Opt_nobh: - set_opt (opts->s_mount_opt, NOBH); + ext2_msg(sb, KERN_INFO, + "nobh option not supported"); break; #ifdef CONFIG_EXT2_FS_XATTR case Opt_user_xattr: From 3f05372ce9ddbd643a763f60c0d2a115cb2de008 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jun 2022 07:37:12 +0200 Subject: [PATCH 0323/1250] jfs: stop using the nobh helper The nobh mode is an obscure feature to save lowmem for large memory 32-bit configurations while trading for much slower performance and has been long obsolete. Switch to the regular buffer head based helpers instead.
Signed-off-by: Christoph Hellwig Signed-off-by: Matthew Wilcox (Oracle) --- fs/jfs/inode.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 259326556ada67..d1ec920aa030a8 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -301,13 +301,25 @@ static int jfs_write_begin(struct file *file, struct address_space *mapping, { int ret; - ret = nobh_write_begin(mapping, pos, len, pagep, fsdata, jfs_get_block); + ret = block_write_begin(mapping, pos, len, pagep, jfs_get_block); if (unlikely(ret)) jfs_write_failed(mapping, pos + len); return ret; } +static int jfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + jfs_write_failed(mapping, pos + len); + return ret; +} + static sector_t jfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, jfs_get_block); @@ -346,7 +358,7 @@ const struct address_space_operations jfs_aops = { .writepage = jfs_writepage, .writepages = jfs_writepages, .write_begin = jfs_write_begin, - .write_end = nobh_write_end, + .write_end = jfs_write_end, .bmap = jfs_bmap, .direct_IO = jfs_direct_IO, }; @@ -399,7 +411,7 @@ void jfs_truncate(struct inode *ip) { jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size); - nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); + block_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); IWRITE_LOCK(ip, RDWRLOCK_NORMAL); jfs_truncate_nolock(ip, ip->i_size); From 215e71b6ee7ad3363c6e6bd979adbb56e070f6de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jun 2022 07:37:13 +0200 Subject: [PATCH 0324/1250] fs: remove the nobh helpers All callers are gone, so remove the now dead code. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox (Oracle) --- fs/buffer.c | 324 ------------------------------------ fs/mpage.c | 25 +-- include/linux/buffer_head.h | 8 - include/linux/mpage.h | 2 - 4 files changed, 1 insertion(+), 358 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index ce9844d7c10fac..5717d1881d2faf 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2537,330 +2537,6 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, } EXPORT_SYMBOL(block_page_mkwrite); -/* - * nobh_write_begin()'s prereads are special: the buffer_heads are freed - * immediately, while under the page lock. So it needs a special end_io - * handler which does not touch the bh after unlocking it. - */ -static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate) -{ - __end_buffer_read_notouch(bh, uptodate); -} - -/* - * Attach the singly-linked list of buffers created by nobh_write_begin, to - * the page (converting it to circular linked list and taking care of page - * dirty races). - */ -static void attach_nobh_buffers(struct page *page, struct buffer_head *head) -{ - struct buffer_head *bh; - - BUG_ON(!PageLocked(page)); - - spin_lock(&page->mapping->private_lock); - bh = head; - do { - if (PageDirty(page)) - set_buffer_dirty(bh); - if (!bh->b_this_page) - bh->b_this_page = head; - bh = bh->b_this_page; - } while (bh != head); - attach_page_private(page, head); - spin_unlock(&page->mapping->private_lock); -} - -/* - * On entry, the page is fully not uptodate. - * On exit the page is fully uptodate in the areas outside (from,to) - * The filesystem needs to handle block truncation upon failure. 
- */ -int nobh_write_begin(struct address_space *mapping, loff_t pos, unsigned len, - struct page **pagep, void **fsdata, - get_block_t *get_block) -{ - struct inode *inode = mapping->host; - const unsigned blkbits = inode->i_blkbits; - const unsigned blocksize = 1 << blkbits; - struct buffer_head *head, *bh; - struct page *page; - pgoff_t index; - unsigned from, to; - unsigned block_in_page; - unsigned block_start, block_end; - sector_t block_in_file; - int nr_reads = 0; - int ret = 0; - int is_mapped_to_disk = 1; - - index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; - - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; - *fsdata = NULL; - - if (page_has_buffers(page)) { - ret = __block_write_begin(page, pos, len, get_block); - if (unlikely(ret)) - goto out_release; - return ret; - } - - if (PageMappedToDisk(page)) - return 0; - - /* - * Allocate buffers so that we can keep track of state, and potentially - * attach them to the page if an error occurs. In the common case of - * no error, they will just be freed again without ever being attached - * to the page (which is all OK, because we're under the page lock). - * - * Be careful: the buffer linked list is a NULL terminated one, rather - * than the circular one we're used to. - */ - head = alloc_page_buffers(page, blocksize, false); - if (!head) { - ret = -ENOMEM; - goto out_release; - } - - block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits); - - /* - * We loop across all blocks in the page, whether or not they are - * part of the affected region. This is so we can discover if the - * page is fully mapped-to-disk. 
- */ - for (block_start = 0, block_in_page = 0, bh = head; - block_start < PAGE_SIZE; - block_in_page++, block_start += blocksize, bh = bh->b_this_page) { - int create; - - block_end = block_start + blocksize; - bh->b_state = 0; - create = 1; - if (block_start >= to) - create = 0; - ret = get_block(inode, block_in_file + block_in_page, - bh, create); - if (ret) - goto failed; - if (!buffer_mapped(bh)) - is_mapped_to_disk = 0; - if (buffer_new(bh)) - clean_bdev_bh_alias(bh); - if (PageUptodate(page)) { - set_buffer_uptodate(bh); - continue; - } - if (buffer_new(bh) || !buffer_mapped(bh)) { - zero_user_segments(page, block_start, from, - to, block_end); - continue; - } - if (buffer_uptodate(bh)) - continue; /* reiserfs does this */ - if (block_start < from || block_end > to) { - lock_buffer(bh); - bh->b_end_io = end_buffer_read_nobh; - submit_bh(REQ_OP_READ, 0, bh); - nr_reads++; - } - } - - if (nr_reads) { - /* - * The page is locked, so these buffers are protected from - * any VM or truncate activity. Hence we don't need to care - * for the buffer_head refcounts. - */ - for (bh = head; bh; bh = bh->b_this_page) { - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - ret = -EIO; - } - if (ret) - goto failed; - } - - if (is_mapped_to_disk) - SetPageMappedToDisk(page); - - *fsdata = head; /* to be released by nobh_write_end */ - - return 0; - -failed: - BUG_ON(!ret); - /* - * Error recovery is a bit difficult. We need to zero out blocks that - * were newly allocated, and dirty them to ensure they get written out. - * Buffers need to be attached to the page at this point, otherwise - * the handling of potential IO errors during writeout would be hard - * (could try doing synchronous writeout, but what if that fails too?) 
- */ - attach_nobh_buffers(page, head); - page_zero_new_buffers(page, from, to); - -out_release: - unlock_page(page); - put_page(page); - *pagep = NULL; - - return ret; -} -EXPORT_SYMBOL(nobh_write_begin); - -int nobh_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - struct inode *inode = page->mapping->host; - struct buffer_head *head = fsdata; - struct buffer_head *bh; - BUG_ON(fsdata != NULL && page_has_buffers(page)); - - if (unlikely(copied < len) && head) - attach_nobh_buffers(page, head); - if (page_has_buffers(page)) - return generic_write_end(file, mapping, pos, len, - copied, page, fsdata); - - SetPageUptodate(page); - set_page_dirty(page); - if (pos+copied > inode->i_size) { - i_size_write(inode, pos+copied); - mark_inode_dirty(inode); - } - - unlock_page(page); - put_page(page); - - while (head) { - bh = head; - head = head->b_this_page; - free_buffer_head(bh); - } - - return copied; -} -EXPORT_SYMBOL(nobh_write_end); - -/* - * nobh_writepage() - based on block_full_write_page() except - * that it tries to operate without attaching bufferheads to - * the page. - */ -int nobh_writepage(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) -{ - struct inode * const inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - const pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; - int ret; - - /* Is the page fully inside i_size? */ - if (page->index < end_index) - goto out; - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (page->index >= end_index+1 || !offset) { - unlock_page(page); - return 0; /* don't care */ - } - - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. 
For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - zero_user_segment(page, offset, PAGE_SIZE); -out: - ret = mpage_writepage(page, get_block, wbc); - if (ret == -EAGAIN) - ret = __block_write_full_page(inode, page, get_block, wbc, - end_buffer_async_write); - return ret; -} -EXPORT_SYMBOL(nobh_writepage); - -int nobh_truncate_page(struct address_space *mapping, - loff_t from, get_block_t *get_block) -{ - pgoff_t index = from >> PAGE_SHIFT; - struct inode *inode = mapping->host; - unsigned blocksize = i_blocksize(inode); - struct folio *folio; - struct buffer_head map_bh; - size_t offset; - sector_t iblock; - int err; - - /* Block boundary? Nothing to do */ - if (!(from & (blocksize - 1))) - return 0; - - folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_CREAT, - mapping_gfp_mask(mapping)); - err = -ENOMEM; - if (!folio) - goto out; - - if (folio_buffers(folio)) - goto has_buffers; - - iblock = from >> inode->i_blkbits; - map_bh.b_size = blocksize; - map_bh.b_state = 0; - err = get_block(inode, iblock, &map_bh, 0); - if (err) - goto unlock; - /* unmapped? It's a hole - nothing to do */ - if (!buffer_mapped(&map_bh)) - goto unlock; - - /* Ok, it's mapped. 
Make sure it's up-to-date */ - if (!folio_test_uptodate(folio)) { - err = mapping->a_ops->read_folio(NULL, folio); - if (err) { - folio_put(folio); - goto out; - } - folio_lock(folio); - if (!folio_test_uptodate(folio)) { - err = -EIO; - goto unlock; - } - if (folio_buffers(folio)) - goto has_buffers; - } - offset = offset_in_folio(folio, from); - folio_zero_segment(folio, offset, round_up(offset, blocksize)); - folio_mark_dirty(folio); - err = 0; - -unlock: - folio_unlock(folio); - folio_put(folio); -out: - return err; - -has_buffers: - folio_unlock(folio); - folio_put(folio); - return block_truncate_page(mapping, from, get_block); -} -EXPORT_SYMBOL(nobh_truncate_page); - int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block) { diff --git a/fs/mpage.c b/fs/mpage.c index 681a4b9a36e3c7..b7e0b7fbb41f20 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -404,7 +404,6 @@ struct mpage_data { struct bio *bio; sector_t last_block_in_bio; get_block_t *get_block; - unsigned use_writepage; }; /* @@ -624,15 +623,10 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, if (bio) bio = mpage_bio_submit(bio); - if (mpd->use_writepage) { - ret = mapping->a_ops->writepage(page, wbc); - } else { - ret = -EAGAIN; - goto out; - } /* * The caller has a ref on the inode, so *mapping is stable */ + ret = mapping->a_ops->writepage(page, wbc); mapping_set_error(mapping, ret); out: mpd->bio = bio; @@ -674,7 +668,6 @@ mpage_writepages(struct address_space *mapping, .bio = NULL, .last_block_in_bio = 0, .get_block = get_block, - .use_writepage = 1, }; ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); @@ -685,19 +678,3 @@ mpage_writepages(struct address_space *mapping, return ret; } EXPORT_SYMBOL(mpage_writepages); - -int mpage_writepage(struct page *page, get_block_t get_block, - struct writeback_control *wbc) -{ - struct mpage_data mpd = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = get_block, - 
.use_writepage = 0, - }; - int ret = __mpage_writepage(page, wbc, &mpd); - if (mpd.bio) - mpage_bio_submit(mpd.bio); - return ret; -} -EXPORT_SYMBOL(mpage_writepage); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b0366c89d6a4d1..61afb81cfdaea4 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -258,14 +258,6 @@ static inline vm_fault_t block_page_mkwrite_return(int err) } sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *); -int nobh_write_begin(struct address_space *, loff_t, unsigned len, - struct page **, void **, get_block_t*); -int nobh_write_end(struct file *, struct address_space *, - loff_t, unsigned, unsigned, - struct page *, void *); -int nobh_truncate_page(struct address_space *, loff_t, get_block_t *); -int nobh_writepage(struct page *page, get_block_t *get_block, - struct writeback_control *wbc); #ifdef CONFIG_MIGRATION extern int buffer_migrate_folio(struct address_space *, diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 43986f7ec4dd35..1bdc39daac0a3e 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h @@ -19,7 +19,5 @@ void mpage_readahead(struct readahead_control *, get_block_t get_block); int mpage_read_folio(struct folio *folio, get_block_t get_block); int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block); -int mpage_writepage(struct page *page, get_block_t *get_block, - struct writeback_control *wbc); #endif From cf95d50205f62c4f5f538676def847292cf39fa9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jun 2022 07:37:14 +0200 Subject: [PATCH 0325/1250] fs: don't call ->writepage from __mpage_writepage All callers of mpage_writepage use block_write_full_page as their ->writepage implementation when called from mpage_writepages (although for ntfs3 this is obfuscated a bit).
Just call block_write_full_page directly instead of going through the ->writepage indirection. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox (Oracle) --- fs/mpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/mpage.c b/fs/mpage.c index b7e0b7fbb41f20..bf7d1cf621e2f8 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -626,7 +626,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, /* * The caller has a ref on the inode, so *mapping is stable */ - ret = mapping->a_ops->writepage(page, wbc); + ret = block_write_full_page(page, mpd->get_block, wbc); mapping_set_error(mapping, ret); out: mpd->bio = bio; From 03b33c09ea22fa89dd204ad0a2058e512c691b9f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 Jun 2022 07:37:15 +0200 Subject: [PATCH 0326/1250] fs: remove the NULL get_block case in mpage_writepages No one calls mpage_writepages with a NULL get_block parameter, so remove support for that case. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Matthew Wilcox (Oracle) --- fs/mpage.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index bf7d1cf621e2f8..8326ff8a7a96eb 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -638,8 +638,6 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write * @get_block: the filesystem's block mapper function. - * If this is NULL then use a_ops->writepage. Otherwise, go - * direct-to-BIO. * * This is a library function, which implements the writepages() * address_space_operation.
@@ -656,24 +654,16 @@ int mpage_writepages(struct address_space *mapping, struct writeback_control *wbc, get_block_t get_block) { + struct mpage_data mpd = { + .get_block = get_block, + }; struct blk_plug plug; int ret; blk_start_plug(&plug); - - if (!get_block) - ret = generic_writepages(mapping, wbc); - else { - struct mpage_data mpd = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = get_block, - }; - - ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) - mpage_bio_submit(mpd.bio); - } + ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); + if (mpd.bio) + mpage_bio_submit(mpd.bio); blk_finish_plug(&plug); return ret; } From 9df125af0822d3e2bde7508e9536d67ab541a166 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Thu, 7 Jul 2022 21:59:48 -0400 Subject: [PATCH 0327/1250] bus: mhi: ep: Check dev_set_name() return value It's possible that dev_set_name() returns -ENOMEM, catch and handle this. Signed-off-by: Bo Liu Reviewed-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20220708015948.4091-1-liubo03@inspur.com Signed-off-by: Manivannan Sadhasivam --- drivers/bus/mhi/ep/main.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/bus/mhi/ep/main.c b/drivers/bus/mhi/ep/main.c index 40109a79017a2a..1dc8a3557a4649 100644 --- a/drivers/bus/mhi/ep/main.c +++ b/drivers/bus/mhi/ep/main.c @@ -1242,9 +1242,13 @@ static int mhi_ep_create_device(struct mhi_ep_cntrl *mhi_cntrl, u32 ch_id) /* Channel name is same for both UL and DL */ mhi_dev->name = mhi_chan->name; - dev_set_name(&mhi_dev->dev, "%s_%s", + ret = dev_set_name(&mhi_dev->dev, "%s_%s", dev_name(&mhi_cntrl->mhi_dev->dev), mhi_dev->name); + if (ret) { + put_device(&mhi_dev->dev); + return ret; + } ret = device_add(&mhi_dev->dev); if (ret) @@ -1408,7 +1412,10 @@ int mhi_ep_register_controller(struct mhi_ep_cntrl *mhi_cntrl, goto err_free_irq; } - dev_set_name(&mhi_dev->dev, "mhi_ep%u", mhi_cntrl->index); + ret = 
dev_set_name(&mhi_dev->dev, "mhi_ep%u", mhi_cntrl->index); + if (ret) + goto err_put_dev; + mhi_dev->name = dev_name(&mhi_dev->dev); mhi_cntrl->mhi_dev = mhi_dev; From 1ed2a471703506345dc9a2db657b6aa94b7168fa Mon Sep 17 00:00:00 2001 From: Stefan Mahnke-Hartmann Date: Fri, 3 Jun 2022 10:41:58 +0200 Subject: [PATCH 0328/1250] tpm: Add upgrade/reduced mode support for TPM1.2 modules In case a TPM in failure mode is detected, the TPM should be accessible through a transparent communication channel for analysing purposes (e.g. TPM_GetTestResult) or a field upgrade. Since a TPM in failure mode has similar reduced functionality as in field upgrade mode, the flag TPM_CHIP_FLAG_FIRMWARE_UPGRADE is also valid. As described in TCG TPM Main Part1 Design Principles, Revision 116, chapter 9.2.1. the TPM also allows an update function in case a TPM is in failure mode. If the TPM in failure mode is detected, the function tpm1_auto_startup() sets TPM_CHIP_FLAG_FIRMWARE_UPGRADE flag, which is used later during driver initialization/deinitialization to disable functionality which makes no sense or will fail in the current TPM state. 
The following functionality is affected: * Do not register TPM as a hwrng * Do not get pcr allocation * Do not register sysfs entries which provide information impossible to obtain in limited mode Signed-off-by: Stefan Mahnke-Hartmann Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm.h | 1 + drivers/char/tpm/tpm1-cmd.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm.h b/drivers/char/tpm/tpm.h index 2163c6ee0d364f..24ee4e1cc452a0 100644 --- a/drivers/char/tpm/tpm.h +++ b/drivers/char/tpm/tpm.h @@ -55,6 +55,7 @@ enum tpm_addr { #define TPM_WARN_DOING_SELFTEST 0x802 #define TPM_ERR_DEACTIVATED 0x6 #define TPM_ERR_DISABLED 0x7 +#define TPM_ERR_FAILEDSELFTEST 0x1C #define TPM_ERR_INVALID_POSTINIT 38 #define TPM_TAG_RQU_COMMAND 193 diff --git a/drivers/char/tpm/tpm1-cmd.c b/drivers/char/tpm/tpm1-cmd.c index f7dc986fa4a0a2..cf64c738510529 100644 --- a/drivers/char/tpm/tpm1-cmd.c +++ b/drivers/char/tpm/tpm1-cmd.c @@ -709,7 +709,12 @@ int tpm1_auto_startup(struct tpm_chip *chip) if (rc) goto out; rc = tpm1_do_selftest(chip); - if (rc) { + if (rc == TPM_ERR_FAILEDSELFTEST) { + dev_warn(&chip->dev, "TPM self test failed, switching to the firmware upgrade mode\n"); + /* A TPM in this state possibly allows or needs a firmware upgrade */ + chip->flags |= TPM_CHIP_FLAG_FIRMWARE_UPGRADE; + return 0; + } else if (rc) { dev_err(&chip->dev, "TPM self test failed\n"); goto out; } From c79aa0e22aa4c333f545515e2c518fbeb83f1831 Mon Sep 17 00:00:00 2001 From: Alexander Steffen Date: Wed, 8 Jun 2022 19:31:11 +0200 Subject: [PATCH 0329/1250] dt-bindings: trivial-devices: Add Infineon SLB9673 TPM Initial device to be supported by the upcoming tpm_tis_i2c driver. More to be added later. 
Signed-off-by: Alexander Steffen Acked-by: Rob Herring Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- Documentation/devicetree/bindings/trivial-devices.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/trivial-devices.yaml b/Documentation/devicetree/bindings/trivial-devices.yaml index 6aafa71806a3d6..47a88e891a0678 100644 --- a/Documentation/devicetree/bindings/trivial-devices.yaml +++ b/Documentation/devicetree/bindings/trivial-devices.yaml @@ -139,6 +139,8 @@ properties: - infineon,slb9635tt # Infineon SLB9645 I2C TPM (new protocol, max 400khz) - infineon,slb9645tt + # Infineon SLB9673 I2C TPM 2.0 + - infineon,slb9673 # Infineon TLV493D-A1B6 I2C 3D Magnetic Sensor - infineon,tlv493d-a1b6 # Infineon Multi-phase Digital VR Controller xdpe11280 From 9cfa7df97bbb12b119e7373fd4eefa19398f5f9a Mon Sep 17 00:00:00 2001 From: Alexander Steffen Date: Wed, 8 Jun 2022 19:31:12 +0200 Subject: [PATCH 0330/1250] tpm: Add tpm_tis_verify_crc to the tpm_tis_phy_ops protocol layer Some TPMs, e.g. those implementing the I2C variant of TIS, can verify data transfers to/from the FIFO with a CRC. The CRC is calculated over the entirety of the FIFO register. Since the phy_ops layer is not aware when the core layer is done reading/writing the FIFO, CRC verification must be triggered from the core layer. To this end, add an optional phy_ops API call. 
Co-developed-by: Johannes Holland Signed-off-by: Johannes Holland Signed-off-by: Alexander Steffen Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_core.c | 14 ++++++++++++++ drivers/char/tpm/tpm_tis_core.h | 10 ++++++++++ 2 files changed, 24 insertions(+) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index dc56b976d8162c..757623bacfd502 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -289,6 +289,7 @@ static int tpm_tis_recv(struct tpm_chip *chip, u8 *buf, size_t count) int size = 0; int status; u32 expected; + int rc; if (count < TPM_HEADER_SIZE) { size = -EIO; @@ -328,6 +329,13 @@ static int tpm_tis_recv(struct tpm_chip *chip, u8 *buf, size_t count) goto out; } + rc = tpm_tis_verify_crc(priv, (size_t)size, buf); + if (rc < 0) { + dev_err(&chip->dev, "CRC mismatch for response.\n"); + size = rc; + goto out; + } + out: tpm_tis_ready(chip); return size; @@ -443,6 +451,12 @@ static int tpm_tis_send_main(struct tpm_chip *chip, const u8 *buf, size_t len) if (rc < 0) return rc; + rc = tpm_tis_verify_crc(priv, len, buf); + if (rc < 0) { + dev_err(&chip->dev, "CRC mismatch for command.\n"); + return rc; + } + /* go and do it */ rc = tpm_tis_write8(priv, TPM_STS(priv->locality), TPM_STS_GO); if (rc < 0) diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h index 6c203f36b8a1b2..66a5a13cd1df29 100644 --- a/drivers/char/tpm/tpm_tis_core.h +++ b/drivers/char/tpm/tpm_tis_core.h @@ -121,6 +121,8 @@ struct tpm_tis_phy_ops { u8 *result, enum tpm_tis_io_mode mode); int (*write_bytes)(struct tpm_tis_data *data, u32 addr, u16 len, const u8 *value, enum tpm_tis_io_mode mode); + int (*verify_crc)(struct tpm_tis_data *data, size_t len, + const u8 *value); }; static inline int tpm_tis_read_bytes(struct tpm_tis_data *data, u32 addr, @@ -188,6 +190,14 @@ static inline int tpm_tis_write32(struct tpm_tis_data *data, u32 addr, return rc; } +static inline int 
tpm_tis_verify_crc(struct tpm_tis_data *data, size_t len, + const u8 *value) +{ + if (!data->phy_ops->verify_crc) + return 0; + return data->phy_ops->verify_crc(data, len, value); +} + static inline bool is_bsw(void) { #ifdef CONFIG_X86 From e874c3f16a02d069e130749b53b22076bbc13493 Mon Sep 17 00:00:00 2001 From: Alexander Steffen Date: Wed, 8 Jun 2022 19:31:13 +0200 Subject: [PATCH 0331/1250] tpm: Add tpm_tis_i2c backend for tpm_tis_core Implement the TCG I2C Interface driver, as specified in the TCG PC Client Platform TPM Profile (PTP) specification for TPM 2.0 v1.04 revision 14, section 8, I2C Interface Definition. This driver supports Guard Times. That is, if required by the TPM, the driver has to wait by a vendor-specific time after each I2C read/write. The specific time is read from the TPM_I2C_INTERFACE_CAPABILITY register. Unfortunately, the TCG specified almost but not quite compatible register addresses. Therefore, the TIS register addresses need to be mapped to I2C ones. The locality is stripped because for now, only locality 0 is supported. Add a sanity check to I2C reads of e.g. TPM_ACCESS and TPM_STS. This is to detect communication errors and issues due to non-standard behaviour (E.g. the clock stretching quirk in the BCM2835, see 4dbfb5f4401f). In case the sanity check fails, attempt a retry. 
Co-developed-by: Johannes Holland Signed-off-by: Johannes Holland Co-developed-by: Amir Mizinski Signed-off-by: Amir Mizinski Signed-off-by: Alexander Steffen Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/Kconfig | 12 + drivers/char/tpm/Makefile | 1 + drivers/char/tpm/tpm_tis_i2c.c | 391 +++++++++++++++++++++++++++++++++ 3 files changed, 404 insertions(+) create mode 100644 drivers/char/tpm/tpm_tis_i2c.c diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig index 4a5516406c22ed..927088b2c3d3f2 100644 --- a/drivers/char/tpm/Kconfig +++ b/drivers/char/tpm/Kconfig @@ -74,6 +74,18 @@ config TCG_TIS_SPI_CR50 If you have a H1 secure module running Cr50 firmware on SPI bus, say Yes and it will be accessible from within Linux. +config TCG_TIS_I2C + tristate "TPM Interface Specification 1.3 Interface / TPM 2.0 FIFO Interface - (I2C - generic)" + depends on I2C + select CRC_CCITT + select TCG_TIS_CORE + help + If you have a TPM security chip, compliant with the TCG TPM PTP + (I2C interface) specification and connected to an I2C bus master, + say Yes and it will be accessible from within Linux. + To compile this driver as a module, choose M here; + the module will be called tpm_tis_i2c. 
+ config TCG_TIS_SYNQUACER tristate "TPM Interface Specification 1.2 Interface / TPM 2.0 FIFO Interface (MMIO - SynQuacer)" depends on ARCH_SYNQUACER || COMPILE_TEST diff --git a/drivers/char/tpm/Makefile b/drivers/char/tpm/Makefile index 66d39ea6bd10a3..0222b1ddb3105e 100644 --- a/drivers/char/tpm/Makefile +++ b/drivers/char/tpm/Makefile @@ -29,6 +29,7 @@ tpm_tis_spi-$(CONFIG_TCG_TIS_SPI_CR50) += tpm_tis_spi_cr50.o obj-$(CONFIG_TCG_TIS_I2C_CR50) += tpm_tis_i2c_cr50.o +obj-$(CONFIG_TCG_TIS_I2C) += tpm_tis_i2c.o obj-$(CONFIG_TCG_TIS_I2C_ATMEL) += tpm_i2c_atmel.o obj-$(CONFIG_TCG_TIS_I2C_INFINEON) += tpm_i2c_infineon.o obj-$(CONFIG_TCG_TIS_I2C_NUVOTON) += tpm_i2c_nuvoton.o diff --git a/drivers/char/tpm/tpm_tis_i2c.c b/drivers/char/tpm/tpm_tis_i2c.c new file mode 100644 index 00000000000000..8e0686fe4eb1a2 --- /dev/null +++ b/drivers/char/tpm/tpm_tis_i2c.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2014-2021 Nuvoton Technology corporation + * Copyright (C) 2019-2022 Infineon Technologies AG + * + * This device driver implements the TPM interface as defined in the TCG PC + * Client Platform TPM Profile (PTP) Specification for TPM 2.0 v1.04 + * Revision 14. + * + * It is based on the tpm_tis_spi device driver. 
+ */ + +#include <linux/i2c.h> +#include <linux/crc-ccitt.h> +#include "tpm_tis_core.h" + +/* TPM registers */ +#define TPM_I2C_LOC_SEL 0x00 +#define TPM_I2C_ACCESS 0x04 +#define TPM_I2C_INTERFACE_CAPABILITY 0x30 +#define TPM_I2C_DEVICE_ADDRESS 0x38 +#define TPM_I2C_DATA_CSUM_ENABLE 0x40 +#define TPM_DATA_CSUM 0x44 +#define TPM_I2C_DID_VID 0x48 +#define TPM_I2C_RID 0x4C + +/* TIS-compatible register address to avoid clash with TPM_ACCESS (0x00) */ +#define TPM_LOC_SEL 0x0FFF + +/* Mask to extract the I2C register from TIS register addresses */ +#define TPM_TIS_REGISTER_MASK 0x0FFF + +/* Default Guard Time of 250µs until interface capability register is read */ +#define GUARD_TIME_DEFAULT_MIN 250 +#define GUARD_TIME_DEFAULT_MAX 300 + +/* Guard Time of 250µs after I2C slave NACK */ +#define GUARD_TIME_ERR_MIN 250 +#define GUARD_TIME_ERR_MAX 300 + +/* Guard Time bit masks; SR is repeated start, RW is read then write, etc. */ +#define TPM_GUARD_TIME_SR_MASK 0x40000000 +#define TPM_GUARD_TIME_RR_MASK 0x00100000 +#define TPM_GUARD_TIME_RW_MASK 0x00080000 +#define TPM_GUARD_TIME_WR_MASK 0x00040000 +#define TPM_GUARD_TIME_WW_MASK 0x00020000 +#define TPM_GUARD_TIME_MIN_MASK 0x0001FE00 +#define TPM_GUARD_TIME_MIN_SHIFT 9 + +/* Masks with bits that must be read zero */ +#define TPM_ACCESS_READ_ZERO 0x48 +#define TPM_INT_ENABLE_ZERO 0x7FFFFF6 +#define TPM_STS_READ_ZERO 0x23 +#define TPM_INTF_CAPABILITY_ZERO 0x0FFFF000 +#define TPM_I2C_INTERFACE_CAPABILITY_ZERO 0x80000000 + +struct tpm_tis_i2c_phy { + struct tpm_tis_data priv; + struct i2c_client *i2c_client; + bool guard_time_read; + bool guard_time_write; + u16 guard_time_min; + u16 guard_time_max; + u8 *io_buf; +}; + +static inline struct tpm_tis_i2c_phy * +to_tpm_tis_i2c_phy(struct tpm_tis_data *data) +{ + return container_of(data, struct tpm_tis_i2c_phy, priv); +} + +/* + * tpm_tis_core uses the register addresses as defined in Table 19 "Allocation + * of Register Space for FIFO TPM Access" of the TCG PC Client PTP + * Specification.
In order for this code to work together with tpm_tis_core, + * those addresses need to mapped to the registers defined for I2C TPMs in + * Table 51 "I2C-TPM Register Overview". + * + * For most addresses this can be done by simply stripping off the locality + * information from the address. A few addresses need to be mapped explicitly, + * since the corresponding I2C registers have been moved around. TPM_LOC_SEL is + * only defined for I2C TPMs and is also mapped explicitly here to distinguish + * it from TPM_ACCESS(0). + * + * Locality information is ignored, since this driver assumes exclusive access + * to the TPM and always uses locality 0. + */ +static u8 tpm_tis_i2c_address_to_register(u32 addr) +{ + addr &= TPM_TIS_REGISTER_MASK; + + switch (addr) { + case TPM_ACCESS(0): + return TPM_I2C_ACCESS; + case TPM_LOC_SEL: + return TPM_I2C_LOC_SEL; + case TPM_DID_VID(0): + return TPM_I2C_DID_VID; + case TPM_RID(0): + return TPM_I2C_RID; + default: + return addr; + } +} + +static int tpm_tis_i2c_retry_transfer_until_ack(struct tpm_tis_data *data, + struct i2c_msg *msg) +{ + struct tpm_tis_i2c_phy *phy = to_tpm_tis_i2c_phy(data); + bool guard_time; + int i = 0; + int ret; + + if (msg->flags & I2C_M_RD) + guard_time = phy->guard_time_read; + else + guard_time = phy->guard_time_write; + + do { + ret = i2c_transfer(phy->i2c_client->adapter, msg, 1); + if (ret < 0) + usleep_range(GUARD_TIME_ERR_MIN, GUARD_TIME_ERR_MAX); + else if (guard_time) + usleep_range(phy->guard_time_min, phy->guard_time_max); + /* retry on TPM NACK */ + } while (ret < 0 && i++ < TPM_RETRY); + + return ret; +} + +/* Check that bits which must be read zero are not set */ +static int tpm_tis_i2c_sanity_check_read(u8 reg, u16 len, u8 *buf) +{ + u32 zero_mask; + u32 value; + + switch (len) { + case sizeof(u8): + value = buf[0]; + break; + case sizeof(u16): + value = le16_to_cpup((__le16 *)buf); + break; + case sizeof(u32): + value = le32_to_cpup((__le32 *)buf); + break; + default: + /* unknown length, 
skip check */ + return 0; + } + + switch (reg) { + case TPM_I2C_ACCESS: + zero_mask = TPM_ACCESS_READ_ZERO; + break; + case TPM_INT_ENABLE(0) & TPM_TIS_REGISTER_MASK: + zero_mask = TPM_INT_ENABLE_ZERO; + break; + case TPM_STS(0) & TPM_TIS_REGISTER_MASK: + zero_mask = TPM_STS_READ_ZERO; + break; + case TPM_INTF_CAPS(0) & TPM_TIS_REGISTER_MASK: + zero_mask = TPM_INTF_CAPABILITY_ZERO; + break; + case TPM_I2C_INTERFACE_CAPABILITY: + zero_mask = TPM_I2C_INTERFACE_CAPABILITY_ZERO; + break; + default: + /* unknown register, skip check */ + return 0; + } + + if (unlikely((value & zero_mask) != 0x00)) { + pr_debug("TPM I2C read of register 0x%02x failed sanity check: 0x%x\n", reg, value); + return -EIO; + } + + return 0; +} + +static int tpm_tis_i2c_read_bytes(struct tpm_tis_data *data, u32 addr, u16 len, + u8 *result, enum tpm_tis_io_mode io_mode) +{ + struct tpm_tis_i2c_phy *phy = to_tpm_tis_i2c_phy(data); + struct i2c_msg msg = { .addr = phy->i2c_client->addr }; + u8 reg = tpm_tis_i2c_address_to_register(addr); + int i; + int ret; + + for (i = 0; i < TPM_RETRY; i++) { + /* write register */ + msg.len = sizeof(reg); + msg.buf = &reg; + msg.flags = 0; + ret = tpm_tis_i2c_retry_transfer_until_ack(data, &msg); + if (ret < 0) + return ret; + + /* read data */ + msg.buf = result; + msg.len = len; + msg.flags = I2C_M_RD; + ret = tpm_tis_i2c_retry_transfer_until_ack(data, &msg); + if (ret < 0) + return ret; + + ret = tpm_tis_i2c_sanity_check_read(reg, len, result); + if (ret == 0) + return 0; + + usleep_range(GUARD_TIME_ERR_MIN, GUARD_TIME_ERR_MAX); + } + + return ret; +} + +static int tpm_tis_i2c_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, + const u8 *value, + enum tpm_tis_io_mode io_mode) +{ + struct tpm_tis_i2c_phy *phy = to_tpm_tis_i2c_phy(data); + struct i2c_msg msg = { .addr = phy->i2c_client->addr }; + u8 reg = tpm_tis_i2c_address_to_register(addr); + int ret; + + if (len > TPM_BUFSIZE - 1) + return -EIO; + + /* write register and data in one go */ +
phy->io_buf[0] = reg; + memcpy(phy->io_buf + sizeof(reg), value, len); + + msg.len = sizeof(reg) + len; + msg.buf = phy->io_buf; + ret = tpm_tis_i2c_retry_transfer_until_ack(data, &msg); + if (ret < 0) + return ret; + + return 0; +} + +static int tpm_tis_i2c_verify_crc(struct tpm_tis_data *data, size_t len, + const u8 *value) +{ + u16 crc_tpm, crc_host; + int rc; + + rc = tpm_tis_read16(data, TPM_DATA_CSUM, &crc_tpm); + if (rc < 0) + return rc; + + /* reflect crc result, regardless of host endianness */ + crc_host = swab16(crc_ccitt(0, value, len)); + if (crc_tpm != crc_host) + return -EIO; + + return 0; +} + +/* + * Guard Time: + * After each I2C operation, the TPM might require the master to wait. + * The time period is vendor-specific and must be read from the + * TPM_I2C_INTERFACE_CAPABILITY register. + * + * Before the Guard Time is read (or after the TPM failed to send an I2C NACK), + * a Guard Time of 250µs applies. + * + * Various flags in the same register indicate if a guard time is needed: + * - SR: <I2C read with repeated start> <guard time> <I2C read> + * - RR: <I2C read> <guard time> <I2C read> + * - RW: <I2C read> <guard time> <I2C write> + * - WR: <I2C write> <guard time> <I2C read> + * - WW: <I2C write> <guard time> <I2C write> + * + * See TCG PC Client PTP Specification v1.04, 8.1.10 GUARD_TIME + */ +static int tpm_tis_i2c_init_guard_time(struct tpm_tis_i2c_phy *phy) +{ + u32 i2c_caps; + int ret; + + phy->guard_time_read = true; + phy->guard_time_write = true; + phy->guard_time_min = GUARD_TIME_DEFAULT_MIN; + phy->guard_time_max = GUARD_TIME_DEFAULT_MAX; + + ret = tpm_tis_i2c_read_bytes(&phy->priv, TPM_I2C_INTERFACE_CAPABILITY, + sizeof(i2c_caps), (u8 *)&i2c_caps, + TPM_TIS_PHYS_32); + if (ret) + return ret; + + phy->guard_time_read = (i2c_caps & TPM_GUARD_TIME_RR_MASK) || + (i2c_caps & TPM_GUARD_TIME_RW_MASK); + phy->guard_time_write = (i2c_caps & TPM_GUARD_TIME_WR_MASK) || + (i2c_caps & TPM_GUARD_TIME_WW_MASK); + phy->guard_time_min = (i2c_caps & TPM_GUARD_TIME_MIN_MASK) >> + TPM_GUARD_TIME_MIN_SHIFT; + /* guard_time_max = guard_time_min * 1.2 */ + phy->guard_time_max = phy->guard_time_min + phy->guard_time_min / 5; + + return 0; +} +
+static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_resume); + +static const struct tpm_tis_phy_ops tpm_i2c_phy_ops = { + .read_bytes = tpm_tis_i2c_read_bytes, + .write_bytes = tpm_tis_i2c_write_bytes, + .verify_crc = tpm_tis_i2c_verify_crc, +}; + +static int tpm_tis_i2c_probe(struct i2c_client *dev, + const struct i2c_device_id *id) +{ + struct tpm_tis_i2c_phy *phy; + const u8 crc_enable = 1; + const u8 locality = 0; + int ret; + + phy = devm_kzalloc(&dev->dev, sizeof(struct tpm_tis_i2c_phy), + GFP_KERNEL); + if (!phy) + return -ENOMEM; + + phy->io_buf = devm_kzalloc(&dev->dev, TPM_BUFSIZE, GFP_KERNEL); + if (!phy->io_buf) + return -ENOMEM; + + phy->i2c_client = dev; + + /* must precede all communication with the tpm */ + ret = tpm_tis_i2c_init_guard_time(phy); + if (ret) + return ret; + + ret = tpm_tis_i2c_write_bytes(&phy->priv, TPM_LOC_SEL, sizeof(locality), + &locality, TPM_TIS_PHYS_8); + if (ret) + return ret; + + ret = tpm_tis_i2c_write_bytes(&phy->priv, TPM_I2C_DATA_CSUM_ENABLE, + sizeof(crc_enable), &crc_enable, + TPM_TIS_PHYS_8); + if (ret) + return ret; + + return tpm_tis_core_init(&dev->dev, &phy->priv, -1, &tpm_i2c_phy_ops, + NULL); +} + +static int tpm_tis_i2c_remove(struct i2c_client *client) +{ + struct tpm_chip *chip = i2c_get_clientdata(client); + + tpm_chip_unregister(chip); + tpm_tis_remove(chip); + return 0; +} + +static const struct i2c_device_id tpm_tis_i2c_id[] = { + { "tpm_tis_i2c", 0 }, + {} +}; +MODULE_DEVICE_TABLE(i2c, tpm_tis_i2c_id); + +#ifdef CONFIG_OF +static const struct of_device_id of_tis_i2c_match[] = { + { .compatible = "infineon,slb9673", }, + {} +}; +MODULE_DEVICE_TABLE(of, of_tis_i2c_match); +#endif + +static struct i2c_driver tpm_tis_i2c_driver = { + .driver = { + .owner = THIS_MODULE, + .name = "tpm_tis_i2c", + .pm = &tpm_tis_pm, + .of_match_table = of_match_ptr(of_tis_i2c_match), + }, + .probe = tpm_tis_i2c_probe, + .remove = tpm_tis_i2c_remove, + .id_table = tpm_tis_i2c_id, +}; 
+module_i2c_driver(tpm_tis_i2c_driver); + +MODULE_DESCRIPTION("TPM Driver for native I2C access"); +MODULE_LICENSE("GPL"); From 332e673041bcc01884a1b8a41c005fa2776fdb8c Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 27 Jun 2022 17:19:58 +0800 Subject: [PATCH 0332/1250] X.509: Support parsing certificate using SM2 algorithm The SM2-with-SM3 certificate generated by latest openssl no longer reuses the OID_id_ecPublicKey, but directly uses OID_sm2. This patch supports this type of x509 certificate parsing. Signed-off-by: Tianjia Zhang Signed-off-by: Jarkko Sakkinen --- crypto/asymmetric_keys/x509_cert_parser.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crypto/asymmetric_keys/x509_cert_parser.c b/crypto/asymmetric_keys/x509_cert_parser.c index 2899ed80bb18e3..7a9b084e2043d7 100644 --- a/crypto/asymmetric_keys/x509_cert_parser.c +++ b/crypto/asymmetric_keys/x509_cert_parser.c @@ -508,6 +508,9 @@ int x509_extract_key_data(void *context, size_t hdrlen, case OID_gost2012PKey512: ctx->cert->pub->pkey_algo = "ecrdsa"; break; + case OID_sm2: + ctx->cert->pub->pkey_algo = "sm2"; + break; case OID_id_ecPublicKey: if (parse_OID(ctx->params, ctx->params_size, &oid) != 0) return -EBADMSG; From fec88eeea7b0f221d630fe3cac1271ce2d9ab824 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 27 Jun 2022 17:21:07 +0800 Subject: [PATCH 0333/1250] sign-file: Fix confusing error messages When an error occurs, use errx() instead of err() to display the error message, because openssl has its own error record. When an error occurs, errno will not be changed, while err() displays the errno error message. It will cause confusion. For example, when CMS_add1_signer() fails, the following message will appear: sign-file: CMS_add1_signer: Success errx() ignores errno and does not cause such issue. 
Signed-off-by: Tianjia Zhang Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- scripts/sign-file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sign-file.c b/scripts/sign-file.c index 7434e9ea926e2a..598ef5465f8256 100644 --- a/scripts/sign-file.c +++ b/scripts/sign-file.c @@ -114,7 +114,7 @@ static void drain_openssl_errors(void) bool __cond = (cond); \ display_openssl_errors(__LINE__); \ if (__cond) { \ - err(1, fmt, ## __VA_ARGS__); \ + errx(1, fmt, ## __VA_ARGS__); \ } \ } while(0) From 36fdd1af0c2d002c06ef15b898ea31923413f281 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Mon, 27 Jun 2022 17:21:41 +0800 Subject: [PATCH 0334/1250] pkcs7: parser support SM2 and SM3 algorithms combination Support parsing the message signature of the SM2 and SM3 algorithm combination. This group of algorithms has been well supported. One of the main users is module signature verification. Signed-off-by: Tianjia Zhang Reviewed-by: Vitaly Chikunov Reviewed-by: Stefan Berger Signed-off-by: Jarkko Sakkinen --- crypto/asymmetric_keys/pkcs7_parser.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 6592279d839aff..24e2e4a6d84284 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -248,6 +248,9 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen, case OID_sha224: ctx->sinfo->sig->hash_algo = "sha224"; break; + case OID_sm3: + ctx->sinfo->sig->hash_algo = "sm3"; + break; default: printk("Unsupported digest algo: %u\n", ctx->last_oid); return -ENOPKG; @@ -277,6 +280,10 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, ctx->sinfo->sig->pkey_algo = "ecdsa"; ctx->sinfo->sig->encoding = "x962"; break; + case OID_SM2_with_SM3: + ctx->sinfo->sig->pkey_algo = "sm2"; + ctx->sinfo->sig->encoding = "raw"; + break; default: printk("Unsupported pkey algo: %u\n", ctx->last_oid); return -ENOPKG; From 
85b909e55c02cb41a87e7c704f12e282b279306b Mon Sep 17 00:00:00 2001 From: Elvira Khabirova Date: Mon, 27 Jun 2022 17:21:42 +0800 Subject: [PATCH 0335/1250] pkcs7: support EC-RDSA/streebog in SignerInfo Allow using EC-RDSA/streebog in pkcs7 certificates in a similar way to how it's done in the x509 parser. This is needed e.g. for loading kernel modules signed with EC-RDSA. Signed-off-by: Elvira Khabirova Reviewed-by: Vitaly Chikunov Reviewed-by: Tianjia Zhang Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- crypto/asymmetric_keys/pkcs7_parser.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/crypto/asymmetric_keys/pkcs7_parser.c b/crypto/asymmetric_keys/pkcs7_parser.c index 24e2e4a6d84284..277482bb177771 100644 --- a/crypto/asymmetric_keys/pkcs7_parser.c +++ b/crypto/asymmetric_keys/pkcs7_parser.c @@ -251,6 +251,12 @@ int pkcs7_sig_note_digest_algo(void *context, size_t hdrlen, case OID_sm3: ctx->sinfo->sig->hash_algo = "sm3"; break; + case OID_gost2012Digest256: + ctx->sinfo->sig->hash_algo = "streebog256"; + break; + case OID_gost2012Digest512: + ctx->sinfo->sig->hash_algo = "streebog512"; + break; default: printk("Unsupported digest algo: %u\n", ctx->last_oid); return -ENOPKG; @@ -284,6 +290,11 @@ int pkcs7_sig_note_pkey_algo(void *context, size_t hdrlen, ctx->sinfo->sig->pkey_algo = "sm2"; ctx->sinfo->sig->encoding = "raw"; break; + case OID_gost2012PKey256: + case OID_gost2012PKey512: + ctx->sinfo->sig->pkey_algo = "ecrdsa"; + ctx->sinfo->sig->encoding = "raw"; + break; default: printk("Unsupported pkey algo: %u\n", ctx->last_oid); return -ENOPKG; From 9c69021a91e5fdd746af0b3e748c7cdfc0fd60bd Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 28 Jun 2022 11:37:20 +0800 Subject: [PATCH 0336/1250] KEYS: asymmetric: enforce SM2 signature use pkey algo The signature verification of SM2 needs to add the Za value and recalculate sig->digest, which requires the detection of the pkey_algo in public_key_verify_signature(). 
As Eric Biggers said, the pkey_algo field in sig is attacker-controlled and should be use pkey->pkey_algo instead of sig->pkey_algo, and secondly, if sig->pkey_algo is NULL, it will also cause signature verification failure. The software_key_determine_akcipher() already forces the algorithms are matched, so the SM3 algorithm is enforced in the SM2 signature, although this has been checked, we still avoid using any algorithm information in the signature as input. Fixes: 215525639631 ("X.509: support OSCCA SM2-with-SM3 certificate verification") Reported-by: Eric Biggers Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Tianjia Zhang Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- crypto/asymmetric_keys/public_key.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 7c9e6be35c30c1..2f8352e8886022 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -304,6 +304,10 @@ static int cert_sig_digest_update(const struct public_key_signature *sig, BUG_ON(!sig->data); + /* SM2 signatures always use the SM3 hash algorithm */ + if (!sig->hash_algo || strcmp(sig->hash_algo, "sm3") != 0) + return -EINVAL; + ret = sm2_compute_z_digest(tfm_pkey, SM2_DEFAULT_USERID, SM2_DEFAULT_USERID_LEN, dgst); if (ret) @@ -414,8 +418,7 @@ int public_key_verify_signature(const struct public_key *pkey, if (ret) goto error_free_key; - if (sig->pkey_algo && strcmp(sig->pkey_algo, "sm2") == 0 && - sig->data_size) { + if (strcmp(pkey->pkey_algo, "sm2") == 0 && sig->data_size) { ret = cert_sig_digest_update(sig, tfm); if (ret) goto error_free_key; From 802349eb85a626feb6294ace9a2fb20d102519bf Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 1 Jul 2022 17:13:22 +0800 Subject: [PATCH 0337/1250] tpm: fix platform_no_drv_owner.cocci warning Eliminate the following coccicheck warning: ./drivers/char/tpm/tpm_tis_i2c.c:379:3-8: No need to set .owner 
here. The core will do it. Remove .owner field if calls are used which set it automatically Signed-off-by: Yang Li Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_i2c.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/char/tpm/tpm_tis_i2c.c b/drivers/char/tpm/tpm_tis_i2c.c index 8e0686fe4eb1a2..ba0911b1d1ff31 100644 --- a/drivers/char/tpm/tpm_tis_i2c.c +++ b/drivers/char/tpm/tpm_tis_i2c.c @@ -376,7 +376,6 @@ MODULE_DEVICE_TABLE(of, of_tis_i2c_match); static struct i2c_driver tpm_tis_i2c_driver = { .driver = { - .owner = THIS_MODULE, .name = "tpm_tis_i2c", .pm = &tpm_tis_pm, .of_match_table = of_match_ptr(of_tis_i2c_match), From 69cb69ea55420388b444ee30b1530ec15ab584f7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 10 Jul 2022 16:15:10 -0400 Subject: [PATCH 0338/1250] ida: Remove assertions that an ID was allocated Nobody finds value in asserting that an ID that we try to free was already allocated, so just remove those assertions. 
Signed-off-by: Matthew Wilcox (Oracle) --- lib/idr.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index 7ecdfdb5309e74..aacb3e6d895d6d 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -491,25 +491,22 @@ void ida_free(struct ida *ida, unsigned int id) struct ida_bitmap *bitmap; unsigned long flags; - if ((int)id < 0) - return; - xas_lock_irqsave(&xas, flags); bitmap = xas_load(&xas); if (xa_is_value(bitmap)) { unsigned long v = xa_to_value(bitmap); if (bit >= BITS_PER_XA_VALUE) - goto err; + goto not_found; if (!(v & (1UL << bit))) - goto err; + goto not_found; v &= ~(1UL << bit); if (!v) goto delete; xas_store(&xas, xa_mk_value(v)); } else { if (!test_bit(bit, bitmap->bitmap)) - goto err; + goto not_found; __clear_bit(bit, bitmap->bitmap); xas_set_mark(&xas, XA_FREE_MARK); if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) { @@ -518,11 +515,8 @@ void ida_free(struct ida *ida, unsigned int id) xas_store(&xas, NULL); } } +not_found: xas_unlock_irqrestore(&xas, flags); - return; - err: - xas_unlock_irqrestore(&xas, flags); - WARN(1, "ida_free called for id=%d which is not allocated.\n", id); } EXPORT_SYMBOL(ida_free); From f9a45e4dcf9043a53b76617799daa96a2c118327 Mon Sep 17 00:00:00 2001 From: William Zhang Date: Wed, 6 Jul 2022 23:57:58 -0700 Subject: [PATCH 0339/1250] spi: bcm63xx-hsspi: bcmbca: Replace ARCH_BCM_63XX with ARCH_BCMBCA Prepare for the BCM63138 ARCH_BCM_63XX migration to ARCH_BCMBCA. Make SPI_BCM63XX_HSSPI depending on ARCH_BCMBCA. 
Signed-off-by: William Zhang Acked-by: Florian Fainelli Signed-off-by: Florian Fainelli --- drivers/spi/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 3b1044ebc4006b..35ce57878b2778 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -183,7 +183,7 @@ config SPI_BCM63XX config SPI_BCM63XX_HSSPI tristate "Broadcom BCM63XX HS SPI controller driver" - depends on BCM63XX || BMIPS_GENERIC || ARCH_BCM_63XX || COMPILE_TEST + depends on BCM63XX || BMIPS_GENERIC || ARCH_BCMBCA || COMPILE_TEST help This enables support for the High Speed SPI controller present on newer Broadcom BCM63XX SoCs. From 37ac3db2a692578a38e8e414706be22acc0b294c Mon Sep 17 00:00:00 2001 From: William Zhang Date: Wed, 6 Jul 2022 23:57:59 -0700 Subject: [PATCH 0340/1250] tty: serial: bcm63xx: bcmbca: Replace ARCH_BCM_63XX with ARCH_BCMBCA Prepare for the BCM63138 ARCH_BCM_63XX migration to ARCH_BCMBCA. Make SERIAL_BCM63XX depend on ARCH_BCMBCA.
Signed-off-by: William Zhang Acked-by: Florian Fainelli Signed-off-by: Florian Fainelli --- drivers/tty/serial/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index a452748c69b254..7172cd1792dfc8 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -1099,8 +1099,8 @@ config SERIAL_TIMBERDALE config SERIAL_BCM63XX tristate "Broadcom BCM63xx/BCM33xx UART support" select SERIAL_CORE - depends on ARCH_BCM4908 || ARCH_BCM_63XX || BCM63XX || BMIPS_GENERIC || COMPILE_TEST - default ARCH_BCM4908 || ARCH_BCM_63XX || BCM63XX || BMIPS_GENERIC + depends on ARCH_BCM4908 || ARCH_BCMBCA || BCM63XX || BMIPS_GENERIC || COMPILE_TEST + default ARCH_BCM4908 || ARCH_BCMBCA || BCM63XX || BMIPS_GENERIC help This enables the driver for the onchip UART core found on the following chipsets: From 1679eb4a6194649e48d729f9e53eb47f9b6a8fdb Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Mon, 11 Jul 2022 09:17:38 +0800 Subject: [PATCH 0341/1250] tpm: eventlog: Fix section mismatch for DEBUG_SECTION_MISMATCH If DEBUG_SECTION_MISMATCH is enabled, __calc_tpm2_event_size() will not be inlined; this causes a section mismatch like this: WARNING: modpost: vmlinux.o(.text.unlikely+0xe30c): Section mismatch in reference from the variable L0 to the function .init.text:early_ioremap() The function L0() references the function __init early_memremap(). This is often because L0 lacks a __init annotation or the annotation of early_ioremap is wrong. Fix it by using __always_inline instead of inline for the called-once function __calc_tpm2_event_size().
Fixes: 44038bc514a2 ("tpm: Abstract crypto agile event size calculations") Cc: stable@vger.kernel.org # v5.3 Reported-by: WANG Xuerui Signed-off-by: Huacai Chen Signed-off-by: Jarkko Sakkinen --- include/linux/tpm_eventlog.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/tpm_eventlog.h b/include/linux/tpm_eventlog.h index 739ba9a03ec16b..20c0ff54b7a0d3 100644 --- a/include/linux/tpm_eventlog.h +++ b/include/linux/tpm_eventlog.h @@ -157,7 +157,7 @@ struct tcg_algorithm_info { * Return: size of the event on success, 0 on failure */ -static inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, +static __always_inline int __calc_tpm2_event_size(struct tcg_pcr_event2_head *event, struct tcg_pcr_event *event_header, bool do_mapping) { From 16950c52737ee221d99c238001c1624c3bde91d8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 11 Jun 2022 00:03:19 +0900 Subject: [PATCH 0342/1250] modpost: put get_secindex() call inside sec_name() There are 5 call-sites of sec_name(). In all the places, sec_name() is used together with get_secindex(). So, it is simpler to merge two function calls sec_name(elf, get_secindex(elf, sym)) into one call: sec_name_of_symbol(elf, sym) While I was here, I also inserted the array range check: if (secindex >= info->num_sections) return ""; If sym->st_shndx is a special section index (between SHN_LORESERVE and SHN_HIRESERVE), there is no corresponding section header. For example, if a symbol specifies an absolute value, sym->st_shndx is SHN_ABS (=0xfff1). The current users do not cause the out-of-range access of info->sechddrs[], but it is better to avoid such a pitfall. 
Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers --- scripts/mod/modpost.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 620dc8c4c81404..ba7c5a8ad448ca 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -339,8 +339,19 @@ static const char *sech_name(const struct elf_info *info, Elf_Shdr *sechdr) sechdr->sh_name); } -static const char *sec_name(const struct elf_info *info, int secindex) +static const char *sec_name_of_symbol(const struct elf_info *info, + const Elf_Sym *sym) { + unsigned int secindex = get_secindex(info, sym); + + /* + * If sym->st_shndx is a special section index, there is no + * corresponding section header. + * Return "" if the index is out of range of info->sechdrs[] array. + */ + if (secindex >= info->num_sections) + return ""; + return sech_name(info, &info->sechdrs[secindex]); } @@ -649,7 +660,7 @@ static void handle_symbol(struct module *mod, struct elf_info *info, const char *name, *secname; name = symname + strlen("__ksymtab_"); - secname = sec_name(info, get_secindex(info, sym)); + secname = sec_name_of_symbol(info, sym); if (strstarts(secname, "___ksymtab_gpl+")) sym_add_exported(name, mod, true); @@ -1217,7 +1228,7 @@ static Elf_Sym *find_elf_symbol2(struct elf_info *elf, Elf_Addr addr, if (is_shndx_special(sym->st_shndx)) continue; - symsec = sec_name(elf, get_secindex(elf, sym)); + symsec = sec_name_of_symbol(elf, sym); if (strcmp(symsec, sec) != 0) continue; if (!is_valid_name(elf, sym)) @@ -1457,7 +1468,7 @@ static void default_mismatch_handler(const char *modname, struct elf_info *elf, if (strstarts(fromsym, "reference___initcall")) return; - tosec = sec_name(elf, get_secindex(elf, sym)); + tosec = sec_name_of_symbol(elf, sym); to = find_elf_symbol(elf, r->r_addend, sym); tosym = sym_name(elf, to); @@ -1559,7 +1570,7 @@ static void extable_mismatch_handler(const char* modname, struct elf_info *elf, Elf_Rela* 
r, Elf_Sym* sym, const char *fromsec) { - const char* tosec = sec_name(elf, get_secindex(elf, sym)); + const char *tosec = sec_name_of_symbol(elf, sym); sec_mismatch_count++; @@ -1593,7 +1604,7 @@ static void extable_mismatch_handler(const char* modname, struct elf_info *elf, static void check_section_mismatch(const char *modname, struct elf_info *elf, Elf_Rela *r, Elf_Sym *sym, const char *fromsec) { - const char *tosec = sec_name(elf, get_secindex(elf, sym)); + const char *tosec = sec_name_of_symbol(elf, sym); const struct sectioncheck *mismatch = section_mismatch(fromsec, tosec); if (mismatch) { From 06b8307368c8bffc9c0fb34862f0ad050999008c Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 12 Jun 2022 02:22:32 +0900 Subject: [PATCH 0343/1250] certs: move scripts/check-blacklist-hashes.awk to certs/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This script is only used in certs/Makefile, so certs/ is a better home for it. Signed-off-by: Masahiro Yamada Reviewed-by: Mickaël Salaün Reviewed-by: Jarkko Sakkinen --- MAINTAINERS | 1 - certs/Makefile | 2 +- {scripts => certs}/check-blacklist-hashes.awk | 0 3 files changed, 1 insertion(+), 2 deletions(-) rename {scripts => certs}/check-blacklist-hashes.awk (100%) diff --git a/MAINTAINERS b/MAINTAINERS index f679152bdbadf9..cd569d35d55c16 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4718,7 +4718,6 @@ L: keyrings@vger.kernel.org S: Maintained F: Documentation/admin-guide/module-signing.rst F: certs/ -F: scripts/check-blacklist-hashes.awk F: scripts/sign-file.c F: tools/certs/ diff --git a/certs/Makefile b/certs/Makefile index 88a73b28d254a2..854647dbce05eb 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -13,7 +13,7 @@ CFLAGS_blacklist_hashes.o := -I $(obj) quiet_cmd_check_and_copy_blacklist_hash_list = GEN $@ cmd_check_and_copy_blacklist_hash_list = \ - $(AWK) -f $(srctree)/scripts/check-blacklist-hashes.awk $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) >&2; \ + $(AWK) -f 
$(srctree)/$(src)/check-blacklist-hashes.awk $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) >&2; \ cat $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) > $@ $(obj)/blacklist_hash_list: $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) FORCE diff --git a/scripts/check-blacklist-hashes.awk b/certs/check-blacklist-hashes.awk similarity index 100% rename from scripts/check-blacklist-hashes.awk rename to certs/check-blacklist-hashes.awk From 6b8b1077a377ebc2ef52b56dc2d8883e2f5e29ea Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 12 Jun 2022 02:22:33 +0900 Subject: [PATCH 0344/1250] certs: unify blacklist_hashes.c and blacklist_nohashes.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These two files are very similar. Unify them. Signed-off-by: Masahiro Yamada Reviewed-by: Mickaël Salaün Reviewed-by: Jarkko Sakkinen --- certs/Makefile | 12 +++++------- certs/blacklist_hashes.c | 1 - certs/blacklist_nohashes.c | 6 ------ 3 files changed, 5 insertions(+), 14 deletions(-) delete mode 100644 certs/blacklist_nohashes.c diff --git a/certs/Makefile b/certs/Makefile index 854647dbce05eb..9486ed924731b8 100644 --- a/certs/Makefile +++ b/certs/Makefile @@ -4,24 +4,22 @@ # obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o -obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o +obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist.o blacklist_hashes.o obj-$(CONFIG_SYSTEM_REVOCATION_LIST) += revocation_certificates.o -ifneq ($(CONFIG_SYSTEM_BLACKLIST_HASH_LIST),) $(obj)/blacklist_hashes.o: $(obj)/blacklist_hash_list CFLAGS_blacklist_hashes.o := -I $(obj) quiet_cmd_check_and_copy_blacklist_hash_list = GEN $@ cmd_check_and_copy_blacklist_hash_list = \ + $(if $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST), \ $(AWK) -f $(srctree)/$(src)/check-blacklist-hashes.awk $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) >&2; \ - cat $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) > $@ + { cat $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST); echo $(comma) NULL; } > $@, \ + echo NULL > $@) 
$(obj)/blacklist_hash_list: $(CONFIG_SYSTEM_BLACKLIST_HASH_LIST) FORCE $(call if_changed,check_and_copy_blacklist_hash_list) -obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist_hashes.o -else -obj-$(CONFIG_SYSTEM_BLACKLIST_KEYRING) += blacklist_nohashes.o -endif + targets += blacklist_hash_list quiet_cmd_extract_certs = CERT $@ diff --git a/certs/blacklist_hashes.c b/certs/blacklist_hashes.c index 86d66fe1134899..0c5476abebd96a 100644 --- a/certs/blacklist_hashes.c +++ b/certs/blacklist_hashes.c @@ -3,5 +3,4 @@ const char __initconst *const blacklist_hashes[] = { #include "blacklist_hash_list" - , NULL }; diff --git a/certs/blacklist_nohashes.c b/certs/blacklist_nohashes.c deleted file mode 100644 index 753b703ef0ef8d..00000000000000 --- a/certs/blacklist_nohashes.c +++ /dev/null @@ -1,6 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "blacklist.h" - -const char __initconst *const blacklist_hashes[] = { - NULL -}; From fadf028f0be9af6e5105a2e326199939537f28c2 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 14 Jun 2022 14:51:49 +0900 Subject: [PATCH 0345/1250] kbuild: remove sed command from cmd_ar_builtin Replace a pipeline of echo and sed with printf to decrease process forks. 
Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers --- scripts/Makefile.build | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index cac070aee79157..784f46d41959b6 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -358,9 +358,8 @@ $(subdir-modorder): $(obj)/%/modules.order: $(obj)/% ; quiet_cmd_ar_builtin = AR $@ cmd_ar_builtin = rm -f $@; \ - echo $(patsubst $(obj)/%,%,$(real-prereqs)) | \ - sed -E 's:([^ ]+):$(obj)/\1:g' | \ - xargs $(AR) cDPrST $@ + $(if $(real-prereqs), printf "$(obj)/%s " $(patsubst $(obj)/%,%,$(real-prereqs)) | xargs) \ + $(AR) cDPrST $@ $(obj)/built-in.a: $(real-obj-y) FORCE $(call if_changed,ar_builtin) From b63a0c3e51a45d53f0b7946fb0eb1b0621ef9fcb Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 28 Jun 2022 14:04:07 -0700 Subject: [PATCH 0346/1250] kbuild: drop support for CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 The difference in most compilers between `-O3` and `-O2` is mostly down to whether loops with statically determinable trip counts are fully unrolled vs unrolled to a multiple of SIMD width. This patch is effectively a revert of commit 15f5db60a137 ("kbuild,arc: add CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 for ARC") without re-adding ARCH_CFLAGS Ever since commit cfdbc2e16e65 ("ARC: Build system: Makefiles, Kconfig, Linker script") ARC has been built with -O3, though the reason for doing so was not specified in inline comments or the commit message. This commit does not re-add -O3 to arch/arc/Makefile. Folks looking to experiment with `-O3` (or any compiler flag for that matter) may pass them along to the command line invocation of make: $ make KCFLAGS=-O3 Code that looks to re-add an explicit Kconfig option for `-O3` should provide: 1. A rigorous and reproducible performance profile of a reasonable userspace workload that demonstrates a hot loop in the kernel that would benefit from `-O3` over `-O2`. 2. 
Disassembly of said loop body before and after. 3. Provides stats on terms of increase in file size. Link: https://lore.kernel.org/linux-kbuild/CA+55aFz2sNBbZyg-_i8_Ldr2e8o9dfvdSfHHuRzVtP2VMAUWPg@mail.gmail.com/ Signed-off-by: Nick Desaulniers Signed-off-by: Masahiro Yamada --- Makefile | 2 -- arch/arc/configs/axs101_defconfig | 1 - arch/arc/configs/axs103_defconfig | 1 - arch/arc/configs/axs103_smp_defconfig | 1 - arch/arc/configs/haps_hs_defconfig | 1 - arch/arc/configs/haps_hs_smp_defconfig | 1 - arch/arc/configs/hsdk_defconfig | 1 - arch/arc/configs/nsim_700_defconfig | 1 - arch/arc/configs/nsimosci_defconfig | 1 - arch/arc/configs/nsimosci_hs_defconfig | 1 - arch/arc/configs/nsimosci_hs_smp_defconfig | 1 - arch/arc/configs/tb10x_defconfig | 1 - arch/arc/configs/vdk_hs38_defconfig | 1 - arch/arc/configs/vdk_hs38_smp_defconfig | 1 - init/Kconfig | 7 ------- 15 files changed, 22 deletions(-) diff --git a/Makefile b/Makefile index faa4880f25f7e8..f8e325709bf37e 100644 --- a/Makefile +++ b/Makefile @@ -755,8 +755,6 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 -else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 -KBUILD_CFLAGS += -O3 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os endif diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig index 0016149f958304..e31a8ebc3eccb0 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -9,7 +9,6 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig index 5b031582a1cf8b..e0e8567f0d7585 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -9,7 +9,6 @@ CONFIG_NAMESPACES=y # 
CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig index d4eec39e0112c4..fcbc952bc75bb9 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -9,7 +9,6 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig index 7337cdf4ffddab..d87ad7e88d624a 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -11,7 +11,6 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EXPERT=y CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig index bc927221afc0e0..8d82cdb7f86a6a 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -11,7 +11,6 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig index aa000075a57574..f856b03e0fb5cd 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -9,7 +9,6 @@ CONFIG_NAMESPACES=y # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y CONFIG_BLK_DEV_RAM=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git 
a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig index 326f6cde782689..a1ce12bf5b1659 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -11,7 +11,6 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index bf39a0091679c7..ca10f4a2c823fb 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -10,7 +10,6 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig index 7121bd71c543ad..31b6ec3683c65b 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -10,7 +10,6 @@ CONFIG_NAMESPACES=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig index f9863b294a707e..41a0037f48a58a 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -8,7 +8,6 @@ CONFIG_IKCONFIG_PROC=y # CONFIG_UTS_NS is not set # CONFIG_PID_NS is not set CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set CONFIG_KPROBES=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig index a12656ec00720e..d93b65008d4afd 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -14,7 +14,6 @@ 
CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio" CONFIG_INITRAMFS_ROOT_UID=2100 CONFIG_INITRAMFS_ROOT_GID=501 # CONFIG_RD_GZIP is not set -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_KALLSYMS_ALL=y # CONFIG_AIO is not set CONFIG_EMBEDDED=y diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig index d7c858df520cc0..0c3b214168197d 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -4,7 +4,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig index 015c1d43889e69..f9ad9d3ee702d9 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -4,7 +4,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_BLK_DEV_INITRD=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/init/Kconfig b/init/Kconfig index c7900e8975f181..1b4d8acc3def04 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1401,13 +1401,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. -config CC_OPTIMIZE_FOR_PERFORMANCE_O3 - bool "Optimize more for performance (-O3)" - depends on ARC - help - Choosing this option will pass "-O3" to your compiler to optimize - the kernel yet more for performance. 
- config CC_OPTIMIZE_FOR_SIZE bool "Optimize for size (-Os)" help From 28aa004b62610576a64d08de24b741fbda67c693 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 7 Jul 2022 07:43:29 +0300 Subject: [PATCH 0347/1250] init/Kconfig: update KALLSYMS_ALL help text CONFIG_KALLSYMS_ALL is required for kernel live patching which is a common use case that is enabled in some major distros. Update the Kconfig help text to reflect that. While at it, s/e.g./i.e./ to match the text intention. Signed-off-by: Baruch Siach Signed-off-by: Masahiro Yamada --- init/Kconfig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 1b4d8acc3def04..db5257bebc181a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1715,16 +1715,17 @@ config KALLSYMS_ALL help Normally kallsyms only contains the symbols of functions for nicer OOPS messages and backtraces (i.e., symbols from the text and inittext - sections). This is sufficient for most cases. And only in very rare - cases (e.g., when a debugger is used) all symbols are required (e.g., - names of variables from the data sections, etc). + sections). This is sufficient for most cases. And only if you want to + enable kernel live patching, or other less common use cases (e.g., + when a debugger is used) all symbols are required (i.e., names of + variables from the data sections, etc). This option makes sure that all symbols are loaded into the kernel image (i.e., symbols from all sections) in cost of increased kernel size (depending on the kernel configuration, it may be 300KiB or something like this). - Say N unless you really need all symbols. + Say N unless you really need all symbols, or kernel live patching. 
config KALLSYMS_ABSOLUTE_PERCPU bool From 52cae4381f691d40d6213245a61d2a2c1eeaeeb2 Mon Sep 17 00:00:00 2001 From: Herve Codina Date: Mon, 4 Jul 2022 12:28:43 +0200 Subject: [PATCH 0348/1250] clk: lan966x: Fix the lan966x clock gate register address The register address used for the clock gate register is the base register address coming from first reg map (ie. the generic clock registers) instead of the second reg map defining the clock gate register. Use the correct clock gate register address. Fixes: 5ad5915dea00 ("clk: lan966x: Extend lan966x clock driver for clock gating support") Signed-off-by: Herve Codina Tested-by: Michael Walle Reviewed-by: Claudiu Beznea Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220704102845.168438-2-herve.codina@bootlin.com --- drivers/clk/clk-lan966x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/clk-lan966x.c b/drivers/clk/clk-lan966x.c index d1535ac13e8942..81cb90955d68b9 100644 --- a/drivers/clk/clk-lan966x.c +++ b/drivers/clk/clk-lan966x.c @@ -213,7 +213,7 @@ static int lan966x_gate_clk_register(struct device *dev, hw_data->hws[i] = devm_clk_hw_register_gate(dev, clk_gate_desc[idx].name, - "lan966x", 0, base, + "lan966x", 0, gate_base, clk_gate_desc[idx].bit_idx, 0, &clk_gate_lock); From 4e03ba2fea702f1499c7a29e30e2916ec6cfa269 Mon Sep 17 00:00:00 2001 From: Liang He Date: Thu, 30 Jun 2022 22:39:49 +0800 Subject: [PATCH 0349/1250] clk: at91: dt-compat: Hold reference returned by of_get_parent() We need to hold the reference returned by of_get_parent() and use it to call of_node_put() for refcount balance. 
Fixes: 62061d357c7f ("clk: at91: move DT compatibility code to its own file") Signed-off-by: Liang He Reviewed-by: Claudiu Beznea Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20220630143949.218922-1-windhl@126.com --- drivers/clk/at91/dt-compat.c | 108 +++++++++++++++++++++++++++-------- 1 file changed, 84 insertions(+), 24 deletions(-) diff --git a/drivers/clk/at91/dt-compat.c b/drivers/clk/at91/dt-compat.c index 8ca8bcacf66de1..85a964cb2d89ee 100644 --- a/drivers/clk/at91/dt-compat.c +++ b/drivers/clk/at91/dt-compat.c @@ -33,8 +33,11 @@ static void __init of_sama5d2_clk_audio_pll_frac_setup(struct device_node *np) const char *name = np->name; const char *parent_name; struct regmap *regmap; + struct device_node *parent_np; - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -56,8 +59,11 @@ static void __init of_sama5d2_clk_audio_pll_pad_setup(struct device_node *np) const char *name = np->name; const char *parent_name; struct regmap *regmap; + struct device_node *parent_np; - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -79,8 +85,11 @@ static void __init of_sama5d2_clk_audio_pll_pmc_setup(struct device_node *np) const char *name = np->name; const char *parent_name; struct regmap *regmap; + struct device_node *parent_np; - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -120,7 +129,7 @@ static void __init of_sama5d2_clk_generated_setup(struct device_node *np) struct clk_hw *hw; unsigned int num_parents; const char *parent_names[GENERATED_SOURCE_MAX]; - struct device_node *gcknp; + struct device_node *gcknp, *parent_np; struct clk_range range 
= CLK_RANGE(0, 0); struct regmap *regmap; @@ -134,7 +143,9 @@ static void __init of_sama5d2_clk_generated_setup(struct device_node *np) if (!num || num > PERIPHERAL_MAX) return; - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -180,8 +191,11 @@ static void __init of_sama5d4_clk_h32mx_setup(struct device_node *np) const char *name = np->name; const char *parent_name; struct regmap *regmap; + struct device_node *parent_np; - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -243,12 +257,15 @@ static void __init of_at91rm9200_clk_main_osc_setup(struct device_node *np) const char *parent_name; struct regmap *regmap; bool bypass; + struct device_node *parent_np; of_property_read_string(np, "clock-output-names", &name); bypass = of_property_read_bool(np, "atmel,osc-bypass"); parent_name = of_clk_get_parent_name(np, 0); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -268,12 +285,15 @@ static void __init of_at91sam9x5_clk_main_rc_osc_setup(struct device_node *np) u32 accuracy = 0; const char *name = np->name; struct regmap *regmap; + struct device_node *parent_np; of_property_read_string(np, "clock-output-names", &name); of_property_read_u32(np, "clock-frequency", &frequency); of_property_read_u32(np, "clock-accuracy", &accuracy); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -292,11 +312,14 @@ static void __init of_at91rm9200_clk_main_setup(struct device_node *np) const char *parent_name; const char *name = np->name; struct 
regmap *regmap; + struct device_node *parent_np; parent_name = of_clk_get_parent_name(np, 0); of_property_read_string(np, "clock-output-names", &name); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -316,13 +339,16 @@ static void __init of_at91sam9x5_clk_main_setup(struct device_node *np) unsigned int num_parents; const char *name = np->name; struct regmap *regmap; + struct device_node *parent_np; num_parents = of_clk_get_parent_count(np); if (num_parents == 0 || num_parents > 2) return; of_clk_parent_fill(np, parent_names, num_parents); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -373,6 +399,7 @@ of_at91_clk_master_setup(struct device_node *np, const char *name = np->name; struct clk_master_characteristics *characteristics; struct regmap *regmap; + struct device_node *parent_np; num_parents = of_clk_get_parent_count(np); if (num_parents == 0 || num_parents > MASTER_SOURCE_MAX) @@ -386,7 +413,9 @@ of_at91_clk_master_setup(struct device_node *np, if (!characteristics) return; - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -433,6 +462,7 @@ of_at91_clk_periph_setup(struct device_node *np, u8 type) const char *name; struct device_node *periphclknp; struct regmap *regmap; + struct device_node *parent_np; parent_name = of_clk_get_parent_name(np, 0); if (!parent_name) @@ -442,7 +472,9 @@ of_at91_clk_periph_setup(struct device_node *np, u8 type) if (!num || num > PERIPHERAL_MAX) return; - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if 
(IS_ERR(regmap)) return; @@ -601,6 +633,7 @@ of_at91_clk_pll_setup(struct device_node *np, struct regmap *regmap; const char *parent_name; const char *name = np->name; + struct device_node *parent_np; struct clk_pll_characteristics *characteristics; if (of_property_read_u32(np, "reg", &id)) @@ -610,7 +643,9 @@ of_at91_clk_pll_setup(struct device_node *np, of_property_read_string(np, "clock-output-names", &name); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -665,12 +700,15 @@ of_at91sam9x5_clk_plldiv_setup(struct device_node *np) const char *parent_name; const char *name = np->name; struct regmap *regmap; + struct device_node *parent_np; parent_name = of_clk_get_parent_name(np, 0); of_property_read_string(np, "clock-output-names", &name); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -694,7 +732,7 @@ of_at91_clk_prog_setup(struct device_node *np, unsigned int num_parents; const char *parent_names[PROG_SOURCE_MAX]; const char *name; - struct device_node *progclknp; + struct device_node *progclknp, *parent_np; struct regmap *regmap; num_parents = of_clk_get_parent_count(np); @@ -707,7 +745,9 @@ of_at91_clk_prog_setup(struct device_node *np, if (!num || num > (PROG_ID_MAX + 1)) return; - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -756,13 +796,16 @@ static void __init of_at91sam9260_clk_slow_setup(struct device_node *np) unsigned int num_parents; const char *name = np->name; struct regmap *regmap; + struct device_node *parent_np; num_parents = of_clk_get_parent_count(np); if (num_parents != 2) return; of_clk_parent_fill(np, parent_names, 
num_parents); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -788,6 +831,7 @@ static void __init of_at91sam9x5_clk_smd_setup(struct device_node *np) const char *parent_names[SMD_SOURCE_MAX]; const char *name = np->name; struct regmap *regmap; + struct device_node *parent_np; num_parents = of_clk_get_parent_count(np); if (num_parents == 0 || num_parents > SMD_SOURCE_MAX) @@ -797,7 +841,9 @@ static void __init of_at91sam9x5_clk_smd_setup(struct device_node *np) of_property_read_string(np, "clock-output-names", &name); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -818,7 +864,7 @@ static void __init of_at91rm9200_clk_sys_setup(struct device_node *np) u32 id; struct clk_hw *hw; const char *name; - struct device_node *sysclknp; + struct device_node *sysclknp, *parent_np; const char *parent_name; struct regmap *regmap; @@ -826,7 +872,9 @@ static void __init of_at91rm9200_clk_sys_setup(struct device_node *np) if (num > (SYSTEM_MAX_ID + 1)) return; - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -859,6 +907,7 @@ static void __init of_at91sam9x5_clk_usb_setup(struct device_node *np) const char *parent_names[USB_SOURCE_MAX]; const char *name = np->name; struct regmap *regmap; + struct device_node *parent_np; num_parents = of_clk_get_parent_count(np); if (num_parents == 0 || num_parents > USB_SOURCE_MAX) @@ -868,7 +917,9 @@ static void __init of_at91sam9x5_clk_usb_setup(struct device_node *np) of_property_read_string(np, "clock-output-names", &name); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = 
syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -888,6 +939,7 @@ static void __init of_at91sam9n12_clk_usb_setup(struct device_node *np) const char *parent_name; const char *name = np->name; struct regmap *regmap; + struct device_node *parent_np; parent_name = of_clk_get_parent_name(np, 0); if (!parent_name) @@ -895,7 +947,9 @@ static void __init of_at91sam9n12_clk_usb_setup(struct device_node *np) of_property_read_string(np, "clock-output-names", &name); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; @@ -915,6 +969,7 @@ static void __init of_at91rm9200_clk_usb_setup(struct device_node *np) const char *name = np->name; u32 divisors[4] = {0, 0, 0, 0}; struct regmap *regmap; + struct device_node *parent_np; parent_name = of_clk_get_parent_name(np, 0); if (!parent_name) @@ -926,7 +981,9 @@ static void __init of_at91rm9200_clk_usb_setup(struct device_node *np) of_property_read_string(np, "clock-output-names", &name); - regmap = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap)) return; hw = at91rm9200_clk_register_usb(regmap, name, parent_name, divisors); @@ -946,12 +1003,15 @@ static void __init of_at91sam9x5_clk_utmi_setup(struct device_node *np) const char *parent_name; const char *name = np->name; struct regmap *regmap_pmc, *regmap_sfr; + struct device_node *parent_np; parent_name = of_clk_get_parent_name(np, 0); of_property_read_string(np, "clock-output-names", &name); - regmap_pmc = syscon_node_to_regmap(of_get_parent(np)); + parent_np = of_get_parent(np); + regmap_pmc = syscon_node_to_regmap(parent_np); + of_node_put(parent_np); if (IS_ERR(regmap_pmc)) return; From 337a9cc9dca719ca74ed34620aba06a7655db3ef Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Jul 2022 
13:55:19 -0400 Subject: [PATCH 0350/1250] lockd: set owner when unlocking files Signed-off-by: Jeff Layton --- fs/lockd/svcsubs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 0a22a2faf55224..b2f277727469cd 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -176,7 +176,7 @@ nlm_delete_file(struct nlm_file *file) } } -static int nlm_unlock_files(struct nlm_file *file) +static int nlm_unlock_files(struct nlm_file *file, fl_owner_t owner) { struct file_lock lock; @@ -184,6 +184,7 @@ static int nlm_unlock_files(struct nlm_file *file) lock.fl_type = F_UNLCK; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; + lock.fl_owner = owner; if (file->f_file[O_RDONLY] && vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, &lock, NULL)) goto out_err; @@ -225,7 +226,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file, if (match(lockhost, host)) { spin_unlock(&flctx->flc_lock); - if (nlm_unlock_files(file)) + if (nlm_unlock_files(file, fl->fl_owner)) return 1; goto again; } From 0c94c75e4212084d8dae8594dd150d083ba15d62 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 11 Jul 2022 14:13:07 -0400 Subject: [PATCH 0351/1250] lockd: fix nlm_close_files This loop condition tries a bit too hard to be clever. Just test for the two indexes we care about explicitly. 
Signed-off-by: Jeff Layton --- fs/lockd/svcsubs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index b2f277727469cd..e1c4617de77147 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -283,11 +283,10 @@ nlm_file_inuse(struct nlm_file *file) static void nlm_close_files(struct nlm_file *file) { - struct file *f; - - for (f = file->f_file[0]; f <= file->f_file[1]; f++) - if (f) - nlmsvc_ops->fclose(f); + if (file->f_file[O_RDONLY]) + nlmsvc_ops->fclose(file->f_file[O_RDONLY]); + if (file->f_file[O_WRONLY]) + nlmsvc_ops->fclose(file->f_file[O_WRONLY]); } /* From e23f2d4af5ee05f0d192ef9343709dd6022dfb94 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 11 Jul 2022 13:49:41 +0900 Subject: [PATCH 0352/1250] Revert "scripts/mod/modpost.c: permit '.cranges' secton for sh64 architecture." This reverts commit 4d10c223baab8be8f717df3625cfece5be26dead. Commit 37744feebc08 ("sh: remove sh5 support") removed the sh64 support entirely. Note: .cranges was only used for sh64 ever. Commit 211dc24b8744 ("Remove sh5 and sh64 support") in binutils-gdb already removed the relevant code. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers --- scripts/mod/modpost.c | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index ba7c5a8ad448ca..7735d095338ca5 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -753,7 +753,6 @@ static const char *const section_white_list[] = { ".comment*", ".debug*", - ".cranges", /* sh64 */ ".zdebug*", /* Compressed debug sections. */ ".GCC.command.line", /* record-gcc-switches */ ".mdebug*", /* alpha, score, mips etc. 
*/ From be66c181264a47fb60e1064a9dd38447863c3478 Mon Sep 17 00:00:00 2001 From: Ondrej Mosnacek Date: Mon, 11 Jul 2022 14:09:23 +0200 Subject: [PATCH 0353/1250] kbuild: dummy-tools: avoid tmpdir leak in dummy gcc When passed -print-file-name=plugin, the dummy gcc script creates a temporary directory that is never cleaned up. To avoid cluttering $TMPDIR, instead use a static directory included in the source tree. Fixes: 76426e238834 ("kbuild: add dummy toolchains to enable all cc-option etc. in Kconfig") Signed-off-by: Ondrej Mosnacek Signed-off-by: Masahiro Yamada --- .../dummy-tools/dummy-plugin-dir/include/plugin-version.h | 0 scripts/dummy-tools/gcc | 8 ++------ 2 files changed, 2 insertions(+), 6 deletions(-) create mode 100644 scripts/dummy-tools/dummy-plugin-dir/include/plugin-version.h diff --git a/scripts/dummy-tools/dummy-plugin-dir/include/plugin-version.h b/scripts/dummy-tools/dummy-plugin-dir/include/plugin-version.h new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/scripts/dummy-tools/gcc b/scripts/dummy-tools/gcc index b2483149bbe550..7db82584343559 100755 --- a/scripts/dummy-tools/gcc +++ b/scripts/dummy-tools/gcc @@ -96,12 +96,8 @@ fi # To set GCC_PLUGINS if arg_contain -print-file-name=plugin "$@"; then - plugin_dir=$(mktemp -d) - - mkdir -p $plugin_dir/include - touch $plugin_dir/include/plugin-version.h - - echo $plugin_dir + # Use $0 to find the in-tree dummy directory + echo "$(dirname "$(readlink -f "$0")")/dummy-plugin-dir" exit 0 fi From 6e2456c3492adf7e3bd67e585fe9fa538291d2ce Mon Sep 17 00:00:00 2001 From: Demi Marie Obenour Date: Sun, 10 Jul 2022 19:05:22 -0400 Subject: [PATCH 0354/1250] xen/gntdev: Ignore failure to unmap INVALID_GRANT_HANDLE The error paths of gntdev_mmap() can call unmap_grant_pages() even though not all of the pages have been successfully mapped. This will trigger the WARN_ON()s in __unmap_grant_pages_done(). 
The number of warnings can be very large; I have observed thousands of lines of warnings in the systemd journal. Avoid this problem by only warning on unmapping failure if the handle being unmapped is not INVALID_GRANT_HANDLE. The handle field of any page that was not successfully mapped will be INVALID_GRANT_HANDLE, so this catches all cases where unmapping can legitimately fail. Fixes: dbe97cff7dd9 ("xen/gntdev: Avoid blocking in unmap_grant_pages()") Cc: stable@vger.kernel.org Suggested-by: Juergen Gross Signed-off-by: Demi Marie Obenour Reviewed-by: Oleksandr Tyshchenko Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220710230522.1563-1-demi@invisiblethingslab.com Signed-off-by: Juergen Gross --- drivers/xen/gntdev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 4b56c39f766d4d..84b143eef395b1 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -396,13 +396,15 @@ static void __unmap_grant_pages_done(int result, unsigned int offset = data->unmap_ops - map->unmap_ops; for (i = 0; i < data->count; i++) { - WARN_ON(map->unmap_ops[offset+i].status); + WARN_ON(map->unmap_ops[offset + i].status != GNTST_okay && + map->unmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); pr_debug("unmap handle=%d st=%d\n", map->unmap_ops[offset+i].handle, map->unmap_ops[offset+i].status); map->unmap_ops[offset+i].handle = INVALID_GRANT_HANDLE; if (use_ptemod) { - WARN_ON(map->kunmap_ops[offset+i].status); + WARN_ON(map->kunmap_ops[offset + i].status != GNTST_okay && + map->kunmap_ops[offset + i].handle != INVALID_GRANT_HANDLE); pr_debug("kunmap handle=%u st=%d\n", map->kunmap_ops[offset+i].handle, map->kunmap_ops[offset+i].status); From 375e4dd5be0bf6a8971b238461707ce155c19cd2 Mon Sep 17 00:00:00 2001 From: Ross Lagerwall Date: Mon, 27 Jun 2022 15:28:22 +0100 Subject: [PATCH 0355/1250] xen/manage: Use orderly_reboot() to reboot Currently when the toolstack issues a reboot, it gets 
translated into a call to ctrl_alt_del(). But tying reboot to ctrl-alt-del means rebooting may fail if e.g. the user has masked the ctrl-alt-del.target under systemd. A previous attempt to fix this issue made a change that sets the kernel.ctrl-alt-del sysctl to 1 before ctrl_alt_del() is called. However, this doesn't give userspace the opportunity to block rebooting or even do any cleanup or syncing. Instead, call orderly_reboot() which will call the "reboot" command, giving userspace the opportunity to block it or perform the usual reboot process while being independent of the ctrl-alt-del behaviour. It also matches what happens in the shutdown case. Signed-off-by: Ross Lagerwall Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220627142822.3612106-1-ross.lagerwall@citrix.com Signed-off-by: Juergen Gross --- drivers/xen/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 3d5a384d65f762..c16df629907e13 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -205,7 +205,7 @@ static void do_poweroff(void) static void do_reboot(void) { shutting_down = SHUTDOWN_POWEROFF; /* ? */ - ctrl_alt_del(); + orderly_reboot(); } static struct shutdown_handler shutdown_handlers[] = { From 14b963526313df19e61fc26d303b82049742cdb5 Mon Sep 17 00:00:00 2001 From: Zhang Jiaming Date: Thu, 30 Jun 2022 15:50:27 +0800 Subject: [PATCH 0356/1250] xen: Fix spelling mistake Change 'maped' to 'mapped'. Change 'unmaped' to 'unmapped'. 
Signed-off-by: Zhang Jiaming Reviewed-by: Juergen Gross Link: https://lore.kernel.org/r/20220630075027.68833-1-jiaming@nfschina.com Signed-off-by: Juergen Gross --- drivers/xen/xen-front-pgdir-shbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/xen-front-pgdir-shbuf.c b/drivers/xen/xen-front-pgdir-shbuf.c index bef8d72a6ca699..5c0b5cb5b4195b 100644 --- a/drivers/xen/xen-front-pgdir-shbuf.c +++ b/drivers/xen/xen-front-pgdir-shbuf.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_get_dir_start); * shared by the frontend itself) or map the provided granted * references onto the backing storage (buf->pages). * - * \param buf shared buffer which grants to be maped. + * \param buf shared buffer which grants to be mapped. * \return zero on success or a negative number on failure. */ int xen_front_pgdir_shbuf_map(struct xen_front_pgdir_shbuf *buf) @@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(xen_front_pgdir_shbuf_map); * shared by the frontend itself) or unmap the provided granted * references. * - * \param buf shared buffer which grants to be unmaped. + * \param buf shared buffer which grants to be unmapped. * \return zero on success or a negative number on failure. */ int xen_front_pgdir_shbuf_unmap(struct xen_front_pgdir_shbuf *buf) From 4e4b8d83659635e1c252d33ee9ae4a1d0f5bfff1 Mon Sep 17 00:00:00 2001 From: Vincent Fu Date: Fri, 8 Jul 2022 17:49:49 +0000 Subject: [PATCH 0357/1250] null_blk: add module parameters for 4 options Add as module parameters these options: memory_backed discard mbps cache_size Previously these could only be set via configfs. Still missing is bad_blocks. The kernel test robot found a documentation formatting issue in v1 of this patch. 
Reported-by: kernel test robot Signed-off-by: Vincent Fu Link: https://lore.kernel.org/r/20220708174943.87787-2-vincent.fu@samsung.com Signed-off-by: Jens Axboe --- Documentation/block/null_blk.rst | 22 ++++++++++++++++++++++ drivers/block/null_blk/main.c | 20 ++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/Documentation/block/null_blk.rst b/Documentation/block/null_blk.rst index edbbab2f12f8c7..4dd78f24d10af2 100644 --- a/Documentation/block/null_blk.rst +++ b/Documentation/block/null_blk.rst @@ -72,6 +72,28 @@ submit_queues=[1..nr_cpus]: Default: 1 hw_queue_depth=[0..qdepth]: Default: 64 The hardware queue depth of the device. +memory_backed=[0/1]: Default: 0 + Whether or not to use a memory buffer to respond to IO requests + + = ============================================= + 0 Transfer no data in response to IO requests + 1 Use a memory buffer to respond to IO requests + = ============================================= + +discard=[0/1]: Default: 0 + Support discard operations (requires memory-backed null_blk device). + + = ===================================== + 0 Do not support discard operations + 1 Enable support for discard operations + = ===================================== + +cache_size=[Size in MB]: Default: 0 + Cache size in MB for memory-backed device. + +mbps=[Maximum bandwidth in MB/s]: Default: 0 (no limit) + Bandwidth limit for device performance. + Multi-queue specific parameters ------------------------------- diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 3778df206b0136..8f821fa9431534 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -201,6 +201,22 @@ static bool g_use_per_node_hctx; module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. 
Default: false"); +static bool g_memory_backed; +module_param_named(memory_backed, g_memory_backed, bool, 0444); +MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false"); + +static bool g_discard; +module_param_named(discard, g_discard, bool, 0444); +MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false"); + +static unsigned long g_cache_size; +module_param_named(cache_size, g_cache_size, ulong, 0444); +MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); + +static unsigned int g_mbps; +module_param_named(mbps, g_mbps, uint, 0444); +MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); + static bool g_zoned; module_param_named(zoned, g_zoned, bool, S_IRUGO); MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); @@ -650,6 +666,10 @@ static struct nullb_device *null_alloc_dev(void) dev->irqmode = g_irqmode; dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; + dev->memory_backed = g_memory_backed; + dev->discard = g_discard; + dev->cache_size = g_cache_size; + dev->mbps = g_mbps; dev->use_per_node_hctx = g_use_per_node_hctx; dev->zoned = g_zoned; dev->zone_size = g_zone_size; From 37ae152c7a0dd5993ddab3154f3652b8bc21bc9b Mon Sep 17 00:00:00 2001 From: Vincent Fu Date: Fri, 8 Jul 2022 17:49:49 +0000 Subject: [PATCH 0358/1250] null_blk: add configfs variables for 2 options Allow setting via configfs these two options: no_sched shared_tag_bitmap Previously these could only be activated as module parameters. 
Still missing are: shared_tags timeout requeue init_hctx Signed-off-by: Vincent Fu Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220708174943.87787-3-vincent.fu@samsung.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 18 +++++++++++++++--- drivers/block/null_blk/null_blk.h | 2 ++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 8f821fa9431534..c955a07dba2d4d 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -425,6 +425,8 @@ NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); +NULLB_DEVICE_ATTR(no_sched, bool, NULL); +NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -548,6 +550,8 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_max_active, &nullb_device_attr_virt_boundary, + &nullb_device_attr_no_sched, + &nullb_device_attr_shared_tag_bitmap, NULL, }; @@ -604,7 +608,13 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active,blocksize,max_sectors,virt_boundary\n"); + "badblocks,blocking,blocksize,cache_size," + "completion_nsec,discard,home_node,hw_queue_depth," + "irqmode,max_sectors,mbps,memory_backed,no_sched," + "poll_queues,power,queue_mode,shared_tag_bitmap,size," + "submit_queues,use_per_node_hctx,virt_boundary,zoned," + "zone_capacity,zone_max_active,zone_max_open," + "zone_nr_conv,zone_size\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -678,6 +688,8 @@ static struct 
nullb_device *null_alloc_dev(void) dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; dev->virt_boundary = g_virt_boundary; + dev->no_sched = g_no_sched; + dev->shared_tag_bitmap = g_shared_tag_bitmap; return dev; } @@ -1899,9 +1911,9 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) set->numa_node = nullb ? nullb->dev->home_node : g_home_node; set->cmd_size = sizeof(struct nullb_cmd); set->flags = BLK_MQ_F_SHOULD_MERGE; - if (g_no_sched) + if (nullb->dev->no_sched) set->flags |= BLK_MQ_F_NO_SCHED; - if (g_shared_tag_bitmap) + if (nullb->dev->shared_tag_bitmap) set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; set->driver_data = nullb; if (poll_queues) diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 8359b43842f2f6..ce5c810c1f462b 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -113,6 +113,8 @@ struct nullb_device { bool discard; /* if support discard */ bool zoned; /* if device is zoned */ bool virt_boundary; /* virtual boundary on/off for the device */ + bool no_sched; /* no IO scheduler for the device */ + bool shared_tag_bitmap; /* use hostwide shared tags */ }; struct nullb { From 6c16bb03731017adb66e6bf234e6ebd4a64fa926 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 15 May 2022 18:04:40 +0300 Subject: [PATCH 0359/1250] nvme-rdma: remove timeout for getting RDMA-CM established event In case many controllers start error recovery at the same time (i.e., when port is down and up), they may never succeed to reconnect again. This is because the target can't handle all the connect requests at three seconds (the arbitrary value set today). Even if some of the connections are established, when a single queue fails to connect, all the controller's queues are destroyed as well. So, on the following reconnection attempts the number of connect requests may remain the same. 
To fix this, remove the timeout and wait for RDMA-CM event to abort/complete the connect request. RDMA-CM sends unreachable event when a timeout of ~90 seconds has expired. This approach is used by other RDMA-CM users like SRP and iSER in blocking mode. The commit also renames NVME_RDMA_CONNECT_TIMEOUT_MS to NVME_RDMA_CM_TIMEOUT_MS. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Acked-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 84ce3347d15834..7d01fb7702842f 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -29,7 +29,7 @@ #include "fabrics.h" -#define NVME_RDMA_CONNECT_TIMEOUT_MS 3000 /* 3 second */ +#define NVME_RDMA_CM_TIMEOUT_MS 3000 /* 3 second */ #define NVME_RDMA_MAX_SEGMENTS 256 @@ -248,12 +248,9 @@ static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue) { int ret; - ret = wait_for_completion_interruptible_timeout(&queue->cm_done, - msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1); - if (ret < 0) + ret = wait_for_completion_interruptible(&queue->cm_done); + if (ret) return ret; - if (ret == 0) - return -ETIMEDOUT; WARN_ON_ONCE(queue->cm_error > 0); return queue->cm_error; } @@ -612,7 +609,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, queue->cm_error = -ETIMEDOUT; ret = rdma_resolve_addr(queue->cm_id, src_addr, (struct sockaddr *)&ctrl->addr, - NVME_RDMA_CONNECT_TIMEOUT_MS); + NVME_RDMA_CM_TIMEOUT_MS); if (ret) { dev_info(ctrl->ctrl.device, "rdma_resolve_addr failed (%d).\n", ret); @@ -1887,7 +1884,7 @@ static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) if (ctrl->opts->tos >= 0) rdma_set_service_type(queue->cm_id, ctrl->opts->tos); - ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); + ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CM_TIMEOUT_MS); if (ret) { dev_err(ctrl->device,
"rdma_resolve_route failed (%d).\n", queue->cm_error); From a70c11e1174ee394d50c9d7b4f485b6771a60b88 Mon Sep 17 00:00:00 2001 From: Caleb Sander Date: Thu, 7 Jul 2022 15:12:45 -0600 Subject: [PATCH 0360/1250] nvme-tcp: use in-capsule data for I/O connect Currently, command data is only sent in-capsule for admin or I/O commands on queues that indicate support for it. Send fabrics command data in-capsule for I/O queues as well to avoid needing a separate H2CData PDU for the connect command. This is an optimization. Without this change, we send the connect command capsule and data in separate PDUs (CapsuleCmd and H2CData), and must wait for the controller to respond with an R2T PDU before sending the H2CData. With the change, we send a single CapsuleCmd PDU that includes the data. This reduces the number of bytes (and likely packets) sent across the network, and simplifies the send state machine handling in the driver. Signed-off-by: Caleb Sander Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a7848e430a5c53..d6a37c374e9745 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -209,9 +209,11 @@ static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) return queue->data_digest ? 
NVME_TCP_DIGEST_LENGTH : 0; } -static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue) +static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req) { - return queue->cmnd_capsule_len - sizeof(struct nvme_command); + if (nvme_is_fabrics(req->req.cmd)) + return NVME_TCP_ADMIN_CCSZ; + return req->queue->cmnd_capsule_len - sizeof(struct nvme_command); } static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) @@ -229,7 +231,7 @@ static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) rq = blk_mq_rq_from_pdu(req); return rq_data_dir(rq) == WRITE && req->data_len && - req->data_len <= nvme_tcp_inline_data_size(req->queue); + req->data_len <= nvme_tcp_inline_data_size(req); } static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) @@ -2371,7 +2373,7 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, if (!blk_rq_nr_phys_segments(rq)) nvme_tcp_set_sg_null(c); else if (rq_data_dir(rq) == WRITE && - req->data_len <= nvme_tcp_inline_data_size(queue)) + req->data_len <= nvme_tcp_inline_data_size(req)) nvme_tcp_set_sg_inline(queue, c, req->data_len); else nvme_tcp_set_sg_host_data(c, req->data_len); @@ -2406,7 +2408,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, nvme_tcp_init_iter(req, rq_data_dir(rq)); if (rq_data_dir(rq) == WRITE && - req->data_len <= nvme_tcp_inline_data_size(queue)) + req->data_len <= nvme_tcp_inline_data_size(req)) req->pdu_len = req->data_len; pdu->hdr.type = nvme_tcp_cmd; From 2e099afdcaf50ea9858047030027655426d64b62 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Fri, 8 Jul 2022 11:04:37 +0800 Subject: [PATCH 0361/1250] nvme-pci: use nvme core helper to cancel requests in tagset Use nvme core helper nvme_cancel_tagset and nvme_cancel_admin_tagset instead of same logic code. 
Signed-off-by: Guixin Liu Reviewed-by: Sagi Grimberg Reviewed-by: Ruozhu Li Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d7b24ee1728599..d35401c7906b7f 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2720,10 +2720,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_pci_disable(dev); nvme_reap_pending_cqes(dev); - blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); - blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); - blk_mq_tagset_wait_completed_request(&dev->tagset); - blk_mq_tagset_wait_completed_request(&dev->admin_tagset); + nvme_cancel_tagset(&dev->ctrl); + nvme_cancel_admin_tagset(&dev->ctrl); /* * The driver will not be starting up queues again if shutting down so From 690cc0db1f05f6f75b50c607a74c43063f45d680 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Fri, 8 Jul 2022 11:06:05 +0800 Subject: [PATCH 0362/1250] nvme-apple: use nvme core helper to cancel requests in tagset Use nvme core helper nvme_cancel_tagset and nvme_cancel_admin_tagset instead of same logic code. 
Signed-off-by: Guixin Liu Reviewed-by: Sagi Grimberg Reviewed-by: Ruozhu Li Reviewed-by: Sven Peter Signed-off-by: Christoph Hellwig --- drivers/nvme/host/apple.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index d702d7d60235dc..7816b5a7d0e1b1 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -845,11 +845,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown) apple_nvme_handle_cq(&anv->adminq, true); spin_unlock_irqrestore(&anv->lock, flags); - blk_mq_tagset_busy_iter(&anv->tagset, nvme_cancel_request, &anv->ctrl); - blk_mq_tagset_busy_iter(&anv->admin_tagset, nvme_cancel_request, - &anv->ctrl); - blk_mq_tagset_wait_completed_request(&anv->tagset); - blk_mq_tagset_wait_completed_request(&anv->admin_tagset); + nvme_cancel_tagset(&anv->ctrl); + nvme_cancel_admin_tagset(&anv->ctrl); /* * The driver will not be starting up queues again if shutting down so From 7b20ea4f3911c86bee698f1b23ba7f59ec890ceb Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Tue, 28 Jun 2022 21:10:15 +0200 Subject: [PATCH 0363/1250] nvme-multipath: refactor nvme_mpath_add_disk Pass anagrpid as second argument. This is a prep patch that allows reusing this function for supporting unknown command sets. 
Signed-off-by: Joel Granados Signed-off-by: Kanchan Joshi Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/multipath.c | 6 +++--- drivers/nvme/host/nvme.h | 5 ++--- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8d0089f83cbdc1..eabffbc708cd95 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4167,7 +4167,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (!nvme_ns_head_multipath(ns->head)) nvme_add_ns_cdev(ns); - nvme_mpath_add_disk(ns, id); + nvme_mpath_add_disk(ns, id->anagrpid); nvme_fault_inject_init(&ns->fault_inject, ns->disk->disk_name); kfree(id); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d3e2440d8abb05..94dba4eab4fbbf 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -800,16 +800,16 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl, return -ENXIO; /* just break out of the loop */ } -void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id) +void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) { if (nvme_ctrl_use_ana(ns->ctrl)) { struct nvme_ana_group_desc desc = { - .grpid = id->anagrpid, + .grpid = anagrpid, .state = 0, }; mutex_lock(&ns->ctrl->ana_lock); - ns->ana_grpid = le32_to_cpu(id->anagrpid); + ns->ana_grpid = le32_to_cpu(anagrpid); nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc); mutex_unlock(&ns->ctrl->ana_lock); if (desc.state) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e9350bf7b2d19f..396bb5611900dd 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -845,7 +845,7 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys); void nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); -void 
nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); +void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid); void nvme_mpath_remove_disk(struct nvme_ns_head *head); int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id); void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); @@ -887,8 +887,7 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, { return 0; } -static inline void nvme_mpath_add_disk(struct nvme_ns *ns, - struct nvme_id_ns *id) +static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid) { } static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) From 908903ae8701a0b73cfa71ef203feb6e9261dcf4 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 9 Jul 2022 16:10:52 +0200 Subject: [PATCH 0364/1250] PCI: dwc: Use the bitmap API to allocate bitmaps Use devm_bitmap_zalloc() instead of hand-writing them. It is less verbose and it improves the semantic. Link: https://lore.kernel.org/r/bc6586a603abc0db7d4531308b698fbe7a6d7083.1657375829.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pcie-designware-ep.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 15b8059544e347..0667311e738053 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -713,17 +713,13 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep) ep->phys_base = res->start; ep->addr_size = resource_size(res); - ep->ib_window_map = devm_kcalloc(dev, - BITS_TO_LONGS(pci->num_ib_windows), - sizeof(long), - GFP_KERNEL); + ep->ib_window_map = devm_bitmap_zalloc(dev, pci->num_ib_windows, + GFP_KERNEL); if (!ep->ib_window_map) return -ENOMEM; - ep->ob_window_map = devm_kcalloc(dev, - BITS_TO_LONGS(pci->num_ob_windows), - sizeof(long), - GFP_KERNEL); + 
ep->ob_window_map = devm_bitmap_zalloc(dev, pci->num_ob_windows, + GFP_KERNEL); if (!ep->ob_window_map) return -ENOMEM; From 0cc323d985f97d5fd9a4217c536585a65dae4888 Mon Sep 17 00:00:00 2001 From: Zhengping Jiang Date: Mon, 11 Jul 2022 17:05:30 -0700 Subject: [PATCH 0365/1250] Bluetooth: hci_sync: Fix resuming scan after suspend resume After resuming, remove setting scanning_paused to false, because it is checked and set to false in hci_resume_scan_sync. Also move setting the value to false before updating passive scan, because the value is used when resuming passive scan. Fixes: 3b42055388c30 (Bluetooth: hci_sync: Fix attempting to suspend with unfiltered passive scan) Signed-off-by: Zhengping Jiang Reviewed-by: Abhishek Pandit-Subedi Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 7cb31005187992..212b0cdb25f5ef 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5063,13 +5063,13 @@ static int hci_resume_scan_sync(struct hci_dev *hdev) if (!hdev->scanning_paused) return 0; + hdev->scanning_paused = false; + hci_update_scan_sync(hdev); /* Reset passive scanning to normal */ hci_update_passive_scan_sync(hdev); - hdev->scanning_paused = false; - return 0; } @@ -5088,7 +5088,6 @@ int hci_resume_sync(struct hci_dev *hdev) return 0; hdev->suspended = false; - hdev->scanning_paused = false; /* Restore event mask */ hci_set_event_mask_sync(hdev); From 80ae4b266e1934eed00e2e6d69066cbe2692e482 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 22 Jun 2022 08:38:36 +0200 Subject: [PATCH 0366/1250] virtio: replace restricted mem access flag with callback Instead of having a global flag to require restricted memory access for all virtio devices, introduce a callback which can select that requirement on a per-device basis. 
For convenience add a common function returning always true, which can be used for use cases like SEV. Per default use a callback always returning false. As the callback needs to be set in early init code already, add a virtio anchor which is builtin in case virtio is enabled. Signed-off-by: Juergen Gross Tested-by: Oleksandr Tyshchenko # Arm64 guest using Xen Reviewed-by: Stefano Stabellini Link: https://lore.kernel.org/r/20220622063838.8854-2-jgross@suse.com Signed-off-by: Juergen Gross --- arch/s390/mm/init.c | 4 ++-- arch/x86/mm/mem_encrypt_amd.c | 4 ++-- drivers/virtio/Kconfig | 4 ++++ drivers/virtio/Makefile | 1 + drivers/virtio/virtio.c | 4 ++-- drivers/virtio/virtio_anchor.c | 18 ++++++++++++++++++ include/linux/platform-feature.h | 6 +----- include/linux/virtio_anchor.h | 19 +++++++++++++++++++ include/xen/xen.h | 4 ++-- 9 files changed, 51 insertions(+), 13 deletions(-) create mode 100644 drivers/virtio/virtio_anchor.c create mode 100644 include/linux/virtio_anchor.h diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 6a0ac00d5a42b4..4a154a08496600 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include #include +#include #include pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir"); @@ -175,7 +175,7 @@ static void pv_init(void) if (!is_prot_virt_guest()) return; - platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS); + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); /* make sure bounce buffers are shared */ swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE); diff --git a/arch/x86/mm/mem_encrypt_amd.c b/arch/x86/mm/mem_encrypt_amd.c index f6d038e2cd8e82..97452688f99fee 100644 --- a/arch/x86/mm/mem_encrypt_amd.c +++ b/arch/x86/mm/mem_encrypt_amd.c @@ -20,8 +20,8 @@ #include #include #include +#include #include -#include #include #include @@ -245,7 +245,7 @@ void __init sev_setup_arch(void) 
swiotlb_adjust_size(size); /* Set restricted memory access for virtio. */ - platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS); + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); } static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index e1556d2a355ae0..56c77f63cd224f 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only +config VIRTIO_ANCHOR + bool + config VIRTIO tristate + select VIRTIO_ANCHOR help This option is selected by any driver which implements the virtio bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index 0a82d087324849..8e98d24917cc05 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o +obj-$(CONFIG_VIRTIO_ANCHOR) += virtio_anchor.o obj-$(CONFIG_VIRTIO_PCI_LIB) += virtio_pci_modern_dev.o obj-$(CONFIG_VIRTIO_PCI_LIB_LEGACY) += virtio_pci_legacy_dev.o obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 7deeed30d1f3a0..14c142d77fba1b 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -2,10 +2,10 @@ #include #include #include +#include #include #include #include -#include #include /* Unique numbering for virtio devices. 
*/ @@ -174,7 +174,7 @@ static int virtio_features_ok(struct virtio_device *dev) might_sleep(); - if (platform_has(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS)) { + if (virtio_check_mem_acc_cb(dev)) { if (!virtio_has_feature(dev, VIRTIO_F_VERSION_1)) { dev_warn(&dev->dev, "device must provide VIRTIO_F_VERSION_1\n"); diff --git a/drivers/virtio/virtio_anchor.c b/drivers/virtio/virtio_anchor.c new file mode 100644 index 00000000000000..4d6a5d269b554a --- /dev/null +++ b/drivers/virtio/virtio_anchor.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include + +bool virtio_require_restricted_mem_acc(struct virtio_device *dev) +{ + return true; +} +EXPORT_SYMBOL_GPL(virtio_require_restricted_mem_acc); + +static bool virtio_no_restricted_mem_acc(struct virtio_device *dev) +{ + return false; +} + +bool (*virtio_check_mem_acc_cb)(struct virtio_device *dev) = + virtio_no_restricted_mem_acc; +EXPORT_SYMBOL_GPL(virtio_check_mem_acc_cb); diff --git a/include/linux/platform-feature.h b/include/linux/platform-feature.h index b2f48be999fa4b..6ed859928b9783 100644 --- a/include/linux/platform-feature.h +++ b/include/linux/platform-feature.h @@ -6,11 +6,7 @@ #include /* The platform features are starting with the architecture specific ones. */ - -/* Used to enable platform specific DMA handling for virtio devices. 
*/ -#define PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS (0 + PLATFORM_ARCH_FEAT_N) - -#define PLATFORM_FEAT_N (1 + PLATFORM_ARCH_FEAT_N) +#define PLATFORM_FEAT_N (0 + PLATFORM_ARCH_FEAT_N) void platform_set(unsigned int feature); void platform_clear(unsigned int feature); diff --git a/include/linux/virtio_anchor.h b/include/linux/virtio_anchor.h new file mode 100644 index 00000000000000..432e6c00b3cae0 --- /dev/null +++ b/include/linux/virtio_anchor.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VIRTIO_ANCHOR_H +#define _LINUX_VIRTIO_ANCHOR_H + +#ifdef CONFIG_VIRTIO_ANCHOR +struct virtio_device; + +bool virtio_require_restricted_mem_acc(struct virtio_device *dev); +extern bool (*virtio_check_mem_acc_cb)(struct virtio_device *dev); + +static inline void virtio_set_mem_acc_cb(bool (*func)(struct virtio_device *)) +{ + virtio_check_mem_acc_cb = func; +} +#else +#define virtio_set_mem_acc_cb(func) do { } while (0) +#endif + +#endif /* _LINUX_VIRTIO_ANCHOR_H */ diff --git a/include/xen/xen.h b/include/xen/xen.h index 0780a81e140de4..ac5a144c6a65e9 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -52,12 +52,12 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, extern u64 xen_saved_max_mem_size; #endif -#include +#include static inline void xen_set_restricted_virtio_memory_access(void) { if (IS_ENABLED(CONFIG_XEN_VIRTIO) && xen_domain()) - platform_set(PLATFORM_VIRTIO_RESTRICTED_MEM_ACCESS); + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); } #ifdef CONFIG_XEN_UNPOPULATED_ALLOC From 077814fe064fbdfaf4486b6b4da19c003bf89773 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 22 Jun 2022 08:38:37 +0200 Subject: [PATCH 0367/1250] kernel: remove platform_has() infrastructure The only use case of the platform_has() infrastructure has been removed again, so remove the whole feature. 
Signed-off-by: Juergen Gross Tested-by: Oleksandr Tyshchenko # Arm64 guest using Xen Reviewed-by: Stefano Stabellini Link: https://lore.kernel.org/r/20220622063838.8854-3-jgross@suse.com Signed-off-by: Juergen Gross --- MAINTAINERS | 8 -------- include/asm-generic/Kbuild | 1 - include/asm-generic/platform-feature.h | 8 -------- include/linux/platform-feature.h | 15 -------------- kernel/Makefile | 2 +- kernel/platform-feature.c | 27 -------------------------- 6 files changed, 1 insertion(+), 60 deletions(-) delete mode 100644 include/asm-generic/platform-feature.h delete mode 100644 include/linux/platform-feature.h delete mode 100644 kernel/platform-feature.c diff --git a/MAINTAINERS b/MAINTAINERS index f679152bdbadf9..9608c448cb39ba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15953,14 +15953,6 @@ S: Maintained F: Documentation/devicetree/bindings/iio/chemical/plantower,pms7003.yaml F: drivers/iio/chemical/pms7003.c -PLATFORM FEATURE INFRASTRUCTURE -M: Juergen Gross -S: Maintained -F: arch/*/include/asm/platform-feature.h -F: include/asm-generic/platform-feature.h -F: include/linux/platform-feature.h -F: kernel/platform-feature.c - PLDMFW LIBRARY M: Jacob Keller S: Maintained diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 8e47d483b52407..302506bbc2a4f7 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -44,7 +44,6 @@ mandatory-y += msi.h mandatory-y += pci.h mandatory-y += percpu.h mandatory-y += pgalloc.h -mandatory-y += platform-feature.h mandatory-y += preempt.h mandatory-y += rwonce.h mandatory-y += sections.h diff --git a/include/asm-generic/platform-feature.h b/include/asm-generic/platform-feature.h deleted file mode 100644 index 4b0af3d5158886..00000000000000 --- a/include/asm-generic/platform-feature.h +++ /dev/null @@ -1,8 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_GENERIC_PLATFORM_FEATURE_H -#define _ASM_GENERIC_PLATFORM_FEATURE_H - -/* Number of arch specific feature flags. 
*/ -#define PLATFORM_ARCH_FEAT_N 0 - -#endif /* _ASM_GENERIC_PLATFORM_FEATURE_H */ diff --git a/include/linux/platform-feature.h b/include/linux/platform-feature.h deleted file mode 100644 index 6ed859928b9783..00000000000000 --- a/include/linux/platform-feature.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PLATFORM_FEATURE_H -#define _PLATFORM_FEATURE_H - -#include -#include - -/* The platform features are starting with the architecture specific ones. */ -#define PLATFORM_FEAT_N (0 + PLATFORM_ARCH_FEAT_N) - -void platform_set(unsigned int feature); -void platform_clear(unsigned int feature); -bool platform_has(unsigned int feature); - -#endif /* _PLATFORM_FEATURE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index a7e1f49ab2b3bc..318789c728d329 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -7,7 +7,7 @@ obj-y = fork.o exec_domain.o panic.o \ cpu.o exit.o softirq.o resource.o \ sysctl.o capability.o ptrace.o user.o \ signal.o sys.o umh.o workqueue.o pid.o task_work.o \ - extable.o params.o platform-feature.o \ + extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ async.o range.o smpboot.o ucount.o regset.o diff --git a/kernel/platform-feature.c b/kernel/platform-feature.c deleted file mode 100644 index cb6a6c3e4fed10..00000000000000 --- a/kernel/platform-feature.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include -#include -#include - -#define PLATFORM_FEAT_ARRAY_SZ BITS_TO_LONGS(PLATFORM_FEAT_N) -static unsigned long __read_mostly platform_features[PLATFORM_FEAT_ARRAY_SZ]; - -void platform_set(unsigned int feature) -{ - set_bit(feature, platform_features); -} -EXPORT_SYMBOL_GPL(platform_set); - -void platform_clear(unsigned int feature) -{ - clear_bit(feature, platform_features); -} -EXPORT_SYMBOL_GPL(platform_clear); - -bool platform_has(unsigned int feature) -{ - return test_bit(feature, platform_features); -} 
-EXPORT_SYMBOL_GPL(platform_has); From a47336535f0f7981ac3a8a5af2b705671c5fb6e6 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 22 Jun 2022 08:38:38 +0200 Subject: [PATCH 0368/1250] xen: don't require virtio with grants for non-PV guests Commit fa1f57421e0b ("xen/virtio: Enable restricted memory access using Xen grant mappings") introduced a new requirement for using virtio devices: the backend now needs to support the VIRTIO_F_ACCESS_PLATFORM feature. This is an undue requirement for non-PV guests, as those can be operated with existing backends without any problem, as long as those backends are running in dom0. Per default allow virtio devices without grant support for non-PV guests. On Arm require VIRTIO_F_ACCESS_PLATFORM for devices having been listed in the device tree to use grants. Add a new config item to always force use of grants for virtio. Fixes: fa1f57421e0b ("xen/virtio: Enable restricted memory access using Xen grant mappings") Reported-by: Viresh Kumar Signed-off-by: Juergen Gross Reviewed-by: Oleksandr Tyshchenko Tested-by: Oleksandr Tyshchenko # Arm64 guest using Xen Reviewed-by: Stefano Stabellini Link: https://lore.kernel.org/r/20220622063838.8854-4-jgross@suse.com Signed-off-by: Juergen Gross --- arch/arm/xen/enlighten.c | 4 +++- arch/x86/xen/enlighten_hvm.c | 4 +++- arch/x86/xen/enlighten_pv.c | 5 ++++- drivers/xen/Kconfig | 9 +++++++++ drivers/xen/grant-dma-ops.c | 10 ++++++++++ include/xen/xen-ops.h | 9 +++++++++ include/xen/xen.h | 8 -------- 7 files changed, 38 insertions(+), 11 deletions(-) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 1f9c3ba3283333..93c8ccbf298284 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -34,6 +34,7 @@ #include #include #include +#include #include @@ -443,7 +444,8 @@ static int __init xen_guest_init(void) if (!xen_domain()) return 0; - xen_set_restricted_virtio_memory_access(); + if (IS_ENABLED(CONFIG_XEN_VIRTIO)) + 
virtio_set_mem_acc_cb(xen_virtio_mem_acc); if (!acpi_disabled) xen_acpi_guest_init(); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 8b71b1dd76396f..28762f80059611 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -195,7 +196,8 @@ static void __init xen_hvm_guest_init(void) if (xen_pv_domain()) return; - xen_set_restricted_virtio_memory_access(); + if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); init_hvm_pv_info(); diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 70fb2ea85e9074..0ed2e487a693fa 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -109,7 +110,9 @@ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); static void __init xen_pv_init_platform(void) { - xen_set_restricted_virtio_memory_access(); + /* PV guests can't operate virtio devices without grants. */ + if (IS_ENABLED(CONFIG_XEN_VIRTIO)) + virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP)); diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index bfd5f4f706bcc0..a65bd92121a5d8 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -355,4 +355,13 @@ config XEN_VIRTIO If in doubt, say n. +config XEN_VIRTIO_FORCE_GRANT + bool "Require Xen virtio support to use grants" + depends on XEN_VIRTIO + help + Require virtio for Xen guests to use grant mappings. + This will avoid the need to give the backend the right to map all + of the guest memory. This will need support on the backend side + (e.g. qemu or kernel, depending on the virtio device types used). 
+ endmenu diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index fc01424840017e..8973fc1e9cccd2 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include #include @@ -287,6 +289,14 @@ bool xen_is_grant_dma_device(struct device *dev) return has_iommu; } +bool xen_virtio_mem_acc(struct virtio_device *dev) +{ + if (IS_ENABLED(CONFIG_XEN_VIRTIO_FORCE_GRANT)) + return true; + + return xen_is_grant_dma_device(dev->dev.parent); +} + void xen_grant_setup_dma_ops(struct device *dev) { struct xen_grant_dma_data *data; diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index 80546960f8b77f..dae0f350c6780b 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -217,6 +218,7 @@ static inline void xen_preemptible_hcall_end(void) { } #ifdef CONFIG_XEN_GRANT_DMA_OPS void xen_grant_setup_dma_ops(struct device *dev); bool xen_is_grant_dma_device(struct device *dev); +bool xen_virtio_mem_acc(struct virtio_device *dev); #else static inline void xen_grant_setup_dma_ops(struct device *dev) { @@ -225,6 +227,13 @@ static inline bool xen_is_grant_dma_device(struct device *dev) { return false; } + +struct virtio_device; + +static inline bool xen_virtio_mem_acc(struct virtio_device *dev) +{ + return false; +} #endif /* CONFIG_XEN_GRANT_DMA_OPS */ #endif /* INCLUDE_XEN_OPS_H */ diff --git a/include/xen/xen.h b/include/xen/xen.h index ac5a144c6a65e9..a99bab8175234e 100644 --- a/include/xen/xen.h +++ b/include/xen/xen.h @@ -52,14 +52,6 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, extern u64 xen_saved_max_mem_size; #endif -#include - -static inline void xen_set_restricted_virtio_memory_access(void) -{ - if (IS_ENABLED(CONFIG_XEN_VIRTIO) && xen_domain()) - virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc); -} - #ifdef CONFIG_XEN_UNPOPULATED_ALLOC int 
xen_alloc_unpopulated_pages(unsigned int nr_pages, struct page **pages); void xen_free_unpopulated_pages(unsigned int nr_pages, struct page **pages); From 5a044eef1265581683530e75351c19e29ee33a11 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Jul 2022 08:32:54 -0700 Subject: [PATCH 0369/1250] block: ensure iov_iter advances for added pages There are cases where a bio may not accept additional pages, and the iov needs to advance to the last data length that was accepted. The zone append used to handle this correctly, but was inadvertently broken when the setup was made common with the normal r/w case. Fixes: 576ed9135489c ("block: use bio_add_page in bio_iov_iter_get_pages") Fixes: c58c0074c54c2 ("block/bio: remove duplicate append pages code") Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220712153256.2202024-1-kbusch@fb.com Signed-off-by: Jens Axboe --- block/bio.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/block/bio.c b/block/bio.c index 933ea321095474..fdd58461b78f96 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1211,6 +1211,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ssize_t size, left; unsigned len, i; size_t offset; + int ret = 0; /* * Move page array up in the allocated memory for the bio vecs as far as @@ -1235,7 +1236,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) for (left = size, i = 0; left > 0; left -= len, i++) { struct page *page = pages[i]; - int ret; len = min_t(size_t, PAGE_SIZE - offset, left); if (bio_op(bio) == REQ_OP_ZONE_APPEND) @@ -1246,13 +1246,13 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (ret) { bio_put_pages(pages + i, left, offset); - return ret; + break; } offset = 0; } - iov_iter_advance(iter, size); - return 0; + iov_iter_advance(iter, size - left); + return ret; } /** From ac3c48e32c047a3781d6bc28bb5013e4431350fd Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 
Jul 2022 08:32:55 -0700 Subject: [PATCH 0370/1250] block: ensure bio_iov_add_page can't fail Adding the page could fail on the bio_full() condition, which checks for either exceeding the bio's max segments or total size exceeding UINT_MAX. We already ensure the max segments can't be exceeded, so just ensure the total size won't reach the limit. This simplifies error handling and removes unnecessary repeated bio_full() checks. Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220712153256.2202024-2-kbusch@fb.com Signed-off-by: Jens Axboe --- block/bio.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/block/bio.c b/block/bio.c index fdd58461b78f96..01223f8086ed58 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1165,8 +1165,6 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, bool same_page = false; if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (WARN_ON_ONCE(bio_full(bio, len))) - return -EINVAL; __bio_add_page(bio, page, len, offset); return 0; } @@ -1228,7 +1226,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * result to ensure the bio's total size is correct. The remainder of * the iov data will be picked up in the next bio iteration. 
*/ - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + size = iov_iter_get_pages(iter, pages, UINT_MAX - bio->bi_iter.bi_size, + nr_pages, &offset); if (size > 0) size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); if (unlikely(size <= 0)) @@ -1238,16 +1237,16 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page *page = pages[i]; len = min_t(size_t, PAGE_SIZE - offset, left); - if (bio_op(bio) == REQ_OP_ZONE_APPEND) + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { ret = bio_iov_add_zone_append_page(bio, page, len, offset); - else - ret = bio_iov_add_page(bio, page, len, offset); + if (ret) { + bio_put_pages(pages + i, left, offset); + break; + } + } else + bio_iov_add_page(bio, page, len, offset); - if (ret) { - bio_put_pages(pages + i, left, offset); - break; - } offset = 0; } From 44b6b0b0e980d99d24de7e5d57baae48a78db3b6 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 12 Jul 2022 08:32:56 -0700 Subject: [PATCH 0371/1250] block: fix leaking page ref on truncated direct io The size being added to a bio from an iov is aligned to a block size after the pages were gotten. If the new aligned size truncates the last page, its reference was being leaked. Ensure all pages that were not added to the bio have their reference released. Since this essentially requires doing the same as bio_put_pages(), and there was only one caller for that function, this patch makes the put_page() loop common for everyone.
Fixes: b1a000d3b8ec5 ("block: relax direct io memory alignment") Reported-by: Al Viro Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20220712153256.2202024-3-kbusch@fb.com Signed-off-by: Jens Axboe --- block/bio.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/block/bio.c b/block/bio.c index 01223f8086ed58..de345a9b52dbd6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1151,14 +1151,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio_set_flag(bio, BIO_CLONED); } -static void bio_put_pages(struct page **pages, size_t size, size_t off) -{ - size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); - - for (i = 0; i < nr; i++) - put_page(pages[i]); -} - static int bio_iov_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { @@ -1207,7 +1199,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; ssize_t size, left; - unsigned len, i; + unsigned len, i = 0; size_t offset; int ret = 0; @@ -1228,10 +1220,16 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) */ size = iov_iter_get_pages(iter, pages, UINT_MAX - bio->bi_iter.bi_size, nr_pages, &offset); - if (size > 0) + if (size > 0) { + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); - if (unlikely(size <= 0)) - return size ? size : -EFAULT; + } else + nr_pages = 0; + + if (unlikely(size <= 0)) { + ret = size ? 
size : -EFAULT; + goto out; + } for (left = size, i = 0; left > 0; left -= len, i++) { struct page *page = pages[i]; @@ -1240,10 +1238,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) if (bio_op(bio) == REQ_OP_ZONE_APPEND) { ret = bio_iov_add_zone_append_page(bio, page, len, offset); - if (ret) { - bio_put_pages(pages + i, left, offset); + if (ret) break; - } } else bio_iov_add_page(bio, page, len, offset); @@ -1251,6 +1247,10 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) } iov_iter_advance(iter, size - left); +out: + while (i < nr_pages) + put_page(pages[i++]); + return ret; } From d118a4943c5853b536151992ceb24dd60186e064 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Thu, 14 Jul 2022 07:25:46 +0300 Subject: [PATCH 0372/1250] docs: kbuild: fix typo on -> one. Signed-off-by: Baruch Siach Signed-off-by: Masahiro Yamada --- Documentation/kbuild/kconfig-language.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/kbuild/kconfig-language.rst b/Documentation/kbuild/kconfig-language.rst index a7173843a29409..7fb398649f5108 100644 --- a/Documentation/kbuild/kconfig-language.rst +++ b/Documentation/kbuild/kconfig-language.rst @@ -672,7 +672,7 @@ Future kconfig work Work on kconfig is welcomed on both areas of clarifying semantics and on evaluating the use of a full SAT solver for it. A full SAT solver can be desirable to enable more complex dependency mappings and / or queries, -for instance on possible use case for a SAT solver could be that of handling +for instance one possible use case for a SAT solver could be that of handling the current known recursive dependency issues. It is not known if this would address such issues but such evaluation is desirable. 
If support for a full SAT solver proves too complex or that it cannot address recursive dependency issues From d6ed6f570513c4c8d869d1d4e62890de19600420 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 31 Mar 2022 16:29:00 -0400 Subject: [PATCH 0373/1250] fs: change test in inode_insert5 for adding to the sb list inode_insert5 currently looks at I_CREATING to decide whether to insert the inode into the sb list. This test is a bit ambiguous, as I_CREATING state is not directly related to that list. This test is also problematic for some upcoming ceph changes to add fscrypt support. We need to be able to allocate an inode using new_inode and insert it into the hash later iff we end up using it, and doing that now means that we double add it and corrupt the list. What we really want to know in this test is whether the inode is already in its superblock list, and then add it if it isn't. Have it test for list_empty instead and ensure that we always initialize the list by doing it in inode_init_once. It's only ever removed from the list with list_del_init, so that should be sufficient. 
Suggested-by: Al Viro Reviewed-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/inode.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index bd4da9c5207eab..d5db55df442b78 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -422,6 +422,7 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); + INIT_LIST_HEAD(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } @@ -1021,7 +1022,6 @@ struct inode *new_inode_pseudo(struct super_block *sb) spin_lock(&inode->i_lock); inode->i_state = 0; spin_unlock(&inode->i_lock); - INIT_LIST_HEAD(&inode->i_sb_list); } return inode; } @@ -1165,7 +1165,6 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, { struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); struct inode *old; - bool creating = inode->i_state & I_CREATING; again: spin_lock(&inode_hash_lock); @@ -1199,7 +1198,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, inode->i_state |= I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); - if (!creating) + + /* + * Add inode to the sb list if it's not already. It has I_NEW at this + * point, so it should be safe to test i_sb_list locklessly. + */ + if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: spin_unlock(&inode_hash_lock); From 7f46dc5808710d261600d715402c839033d61902 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 8 Jan 2021 15:34:38 -0500 Subject: [PATCH 0374/1250] fscrypt: export fscrypt_fname_encrypt and fscrypt_fname_encrypted_size For ceph, we want to use our own scheme for handling filenames that are longer than NAME_MAX after encryption and Base64 encoding.
This allows us to have a consistent view of the encrypted filenames for clients that don't support fscrypt and clients that do but that don't have the key. Currently, fs/crypto only supports encrypting filenames using fscrypt_setup_filename, but that also handles encoding nokey names. Ceph can't use that because it handles nokey names in a different way. Export fscrypt_fname_encrypt. Rename fscrypt_fname_encrypted_size to __fscrypt_fname_encrypted_size and add a new wrapper called fscrypt_fname_encrypted_size that takes an inode argument rather than a pointer to a fscrypt_policy union. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Acked-by: Eric Biggers Signed-off-by: Ilya Dryomov --- fs/crypto/fname.c | 36 ++++++++++++++++++++++++++++++------ fs/crypto/fscrypt_private.h | 9 +++------ fs/crypto/hooks.c | 6 +++--- include/linux/fscrypt.h | 4 ++++ 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 14e0ef5e9a20ae..12bd61d20f6940 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -86,7 +86,8 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) /** * fscrypt_fname_encrypt() - encrypt a filename * @inode: inode of the parent directory (for regular filenames) - * or of the symlink (for symlink targets) + * or of the symlink (for symlink targets). Key must already be + * set up. * @iname: the filename to encrypt * @out: (output) the encrypted filename * @olen: size of the encrypted filename. It must be at least @iname->len. 
@@ -137,6 +138,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, return 0; } +EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); /** * fname_decrypt() - decrypt a filename @@ -264,9 +266,9 @@ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst) return bp - dst; } -bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret) +bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret) { int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); @@ -280,6 +282,29 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, return true; } +/** + * fscrypt_fname_encrypted_size() - calculate length of encrypted filename + * @inode: parent inode of dentry name being encrypted. Key must + * already be set up. + * @orig_len: length of the original filename + * @max_len: maximum length to return + * @encrypted_len_ret: where calculated length should be returned (on success) + * + * Filenames that are shorter than the maximum length may have their lengths + * increased slightly by encryption, due to padding that is applied. + * + * Return: false if the orig_len is greater than max_len. Otherwise, true and + * fill out encrypted_len_ret with the length (up to max_len). 
+ */ +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret) +{ + return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, + orig_len, max_len, + encrypted_len_ret); +} +EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); + /** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames * @max_encrypted_len: maximum length of encrypted filenames the buffer will be @@ -435,8 +460,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; if (fscrypt_has_encryption_key(dir)) { - if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, - iname->len, NAME_MAX, + if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 6b4c8094cc7b08..11fe9d213ae14d 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -297,14 +297,11 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, - u8 *out, unsigned int olen); -bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, - u32 orig_len, u32 max_len, - u32 *encrypted_len_ret); +bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, + u32 orig_len, u32 max_len, + u32 *encrypted_len_ret); /* hkdf.c */ - struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index af74599ae1cf06..7c01025879b38f 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -228,9 +228,9 @@ int fscrypt_prepare_symlink(struct inode *dir, const char *target, * counting it (even though it is meaningless for ciphertext) is simpler * for now since filesystems will assume it is there and subtract it. 
*/ - if (!fscrypt_fname_encrypted_size(policy, len, - max_len - sizeof(struct fscrypt_symlink_data), - &disk_link->len)) + if (!__fscrypt_fname_encrypted_size(policy, len, + max_len - sizeof(struct fscrypt_symlink_data), + &disk_link->len)) return -ENAMETOOLONG; disk_link->len += sizeof(struct fscrypt_symlink_data); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index e60d57c99cb6f2..5926a4081c6d9b 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -327,6 +327,10 @@ void fscrypt_free_inode(struct inode *inode); int fscrypt_drop_inode(struct inode *inode); /* fname.c */ +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen); +bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, + u32 max_len, u32 *encrypted_len_ret); int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, int lookup, struct fscrypt_name *fname); From 91c08da8e7fc4269de542500519c4df1d93a2731 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 1 Sep 2020 12:56:42 -0400 Subject: [PATCH 0375/1250] fscrypt: add fscrypt_context_for_new_inode Most filesystems just call fscrypt_set_context on new inodes, which usually causes a setxattr. That's a bit late for ceph, which can send along a full set of attributes with the create request. Doing so allows it to avoid race windows where the new inode could be seen by other clients without the crypto context attached. It also avoids the separate round trip to the server. Refactor the fscrypt code a bit to allow us to create a new crypto context, attach it to the inode, and write it to the buffer, but without calling set_context on it. ceph can later use this to marshal the context into the attributes we send along with the create request.
Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Acked-by: Eric Biggers Signed-off-by: Ilya Dryomov --- fs/crypto/policy.c | 35 +++++++++++++++++++++++++++++------ include/linux/fscrypt.h | 1 + 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 5f858cee1e3b04..a450189565e329 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -685,6 +685,32 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) return fscrypt_get_dummy_policy(dir->i_sb); } +/** + * fscrypt_context_for_new_inode() - create an encryption context for a new inode + * @ctx: where context should be written + * @inode: inode from which to fetch policy and nonce + * + * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode, + * generate a new context and write it to ctx. ctx _must_ be at least + * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes. + * + * Return: size of the resulting context or a negative error code. + */ +int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) +{ + struct fscrypt_info *ci = inode->i_crypt_info; + + BUILD_BUG_ON(sizeof(union fscrypt_context) != + FSCRYPT_SET_CONTEXT_MAX_SIZE); + + /* fscrypt_prepare_new_inode() should have set up the key already. */ + if (WARN_ON_ONCE(!ci)) + return -ENOKEY; + + return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce); +} +EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); + /** * fscrypt_set_context() - Set the fscrypt context of a new inode * @inode: a new inode @@ -701,12 +727,9 @@ int fscrypt_set_context(struct inode *inode, void *fs_data) union fscrypt_context ctx; int ctxsize; - /* fscrypt_prepare_new_inode() should have set up the key already. 
*/ - if (WARN_ON_ONCE(!ci)) - return -ENOKEY; - - BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); - ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); + ctxsize = fscrypt_context_for_new_inode(&ctx, inode); + if (ctxsize < 0) + return ctxsize; /* * This may be the first time the inode number is available, so do any diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 5926a4081c6d9b..7d2f1e0f23b1fe 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -284,6 +284,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg); int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); +int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); int fscrypt_set_context(struct inode *inode, void *fs_data); struct fscrypt_dummy_policy { From b28a31ebc74fc72acbc9cb9d865c14000d420773 Mon Sep 17 00:00:00 2001 From: Tamas Koczka Date: Thu, 14 Jul 2022 10:48:14 +0000 Subject: [PATCH 0376/1250] Bluetooth: Collect kcov coverage from hci_rx_work Annotate hci_rx_work() with kcov_remote_start() and kcov_remote_stop() calls, so remote KCOV coverage is collected while processing the rx_q queue which is the main incoming Bluetooth packet queue. Coverage is associated with the thread which created the packet skb. The collected extra coverage helps kernel fuzzing efforts in finding vulnerabilities. This change only has effect if the kernel is compiled with CONFIG_KCOV, otherwise kcov_ functions don't do anything. 
Signed-off-by: Tamas Koczka Tested-by: Aleksandr Nogikh Reviewed-by: Dmitry Vyukov Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 27e90eb4bf4c30..0a51858d863af3 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -3804,7 +3805,14 @@ static void hci_rx_work(struct work_struct *work) BT_DBG("%s", hdev->name); - while ((skb = skb_dequeue(&hdev->rx_q))) { + /* The kcov_remote functions used for collecting packet parsing + * coverage information from this background thread and associate + * the coverage with the syscall's thread which originally injected + * the packet. This helps fuzzing the kernel. + */ + for (; (skb = skb_dequeue(&hdev->rx_q)); kcov_remote_stop()) { + kcov_remote_start_common(skb_get_kcov_handle(skb)); + /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); From c86c8360959ec706576baf17237dec3004154d4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Thu, 14 Jul 2022 20:57:00 +0200 Subject: [PATCH 0377/1250] arm: ioremap: Fix pci_remap_iospace() when CONFIG_MMU unset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The custom ARM version of pci_remap_iospace() is defined in arch/arm/mm/ioremap.c, which is only built when CONFIG_MMU=y. 
When CONFIG_MMU is unset there's no implementation, which causes errors: arm-linux-gnueabi-ld: arch/arm/mach-dove/pcie.o: in function `dove_pcie_setup': >> pcie.c:(.init.text+0x170): undefined reference to `pci_remap_iospace' arm-linux-gnueabi-ld: drivers/pci/pci.o: in function `devm_pci_remap_iospace': (.text+0x1a84): undefined reference to `pci_remap_iospace' When CONFIG_MMU is unset, leave the pci_remap_iospace macro undefined so we use the default pci_remap_iospace() implementation from drivers/pci/pci.c, which just returns failure. [bhelgaas: commit log] Fixes: bc02973a06a6 ("arm: ioremap: Implement standard PCI function pci_remap_iospace()") Link: https://lore.kernel.org/r/20220714185700.6137-1-pali@kernel.org Reported-by: kernel test robot Signed-off-by: Pali Rohár Signed-off-by: Bjorn Helgaas --- arch/arm/include/asm/io.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index eba7cbc93b8698..47cf79229b7cf5 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -180,10 +180,12 @@ void pci_ioremap_set_mem_type(int mem_type); static inline void pci_ioremap_set_mem_type(int mem_type) {} #endif +#ifdef CONFIG_MMU struct resource; #define pci_remap_iospace pci_remap_iospace int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); +#endif /* * PCI configuration space mapping function. From abe27d64086158f4dafb9fb4c08902f91d4af4c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= Date: Wed, 13 Jul 2022 22:03:50 +0200 Subject: [PATCH 0378/1250] dt-bindings: arm: Add Asus GT-AX6000 based on BCM4912 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's a home router, the first BCM4912 SoC based public device. 
Signed-off-by: Rafał Miłecki Acked-by: William Zhang Link: https://lore.kernel.org/r/20220713200351.28526-1-zajec5@gmail.com Signed-off-by: Florian Fainelli --- Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml b/Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml index d9dc4f22f4a529..324e591043609d 100644 --- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml +++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcmbca.yaml @@ -31,6 +31,7 @@ properties: - description: BCM4912 based boards items: - enum: + - asus,gt-ax6000 - brcm,bcm94912 - const: brcm,bcm4912 - const: brcm,bcmbca From dec7e933d65dbc6eaa6c7fd8f960df164a20dd4d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Jul 2022 22:28:47 +0800 Subject: [PATCH 0379/1250] null_blk: cleanup null_init_tag_set The passed 'nullb' can be NULL, which causes a NULL pointer dereference. Fix the issue, and meanwhile clean up null_init_tag_set to avoid introducing similar issues in the future. Also set BLK_MQ_F_NO_SCHED if g_no_sched is true in the case of a NULL device, the same as for BLK_MQ_F_TAG_HCTX_SHARED.
Cc: Vincent Fu Fixes: 37ae152c7a0d ("null_blk: add configfs variables for 2 options") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220715142847.188275-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 53 +++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index c955a07dba2d4d..1501c85fc9e4b3 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1898,31 +1898,48 @@ static int null_gendisk_register(struct nullb *nullb) static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) { + unsigned int flags = BLK_MQ_F_SHOULD_MERGE; + int hw_queues, numa_node; + unsigned int queue_depth; int poll_queues; + if (nullb) { + hw_queues = nullb->dev->submit_queues; + poll_queues = nullb->dev->poll_queues; + queue_depth = nullb->dev->hw_queue_depth; + numa_node = nullb->dev->home_node; + if (nullb->dev->no_sched) + flags |= BLK_MQ_F_NO_SCHED; + if (nullb->dev->shared_tag_bitmap) + flags |= BLK_MQ_F_TAG_HCTX_SHARED; + if (nullb->dev->blocking) + flags |= BLK_MQ_F_BLOCKING; + } else { + hw_queues = g_submit_queues; + poll_queues = g_poll_queues; + queue_depth = g_hw_queue_depth; + numa_node = g_home_node; + if (g_no_sched) + flags |= BLK_MQ_F_NO_SCHED; + if (g_shared_tag_bitmap) + flags |= BLK_MQ_F_TAG_HCTX_SHARED; + if (g_blocking) + flags |= BLK_MQ_F_BLOCKING; + } + set->ops = &null_mq_ops; - set->nr_hw_queues = nullb ? nullb->dev->submit_queues : - g_submit_queues; - poll_queues = nullb ? nullb->dev->poll_queues : g_poll_queues; - if (poll_queues) - set->nr_hw_queues += poll_queues; - set->queue_depth = nullb ? nullb->dev->hw_queue_depth : - g_hw_queue_depth; - set->numa_node = nullb ? 
nullb->dev->home_node : g_home_node; set->cmd_size = sizeof(struct nullb_cmd); - set->flags = BLK_MQ_F_SHOULD_MERGE; - if (nullb->dev->no_sched) - set->flags |= BLK_MQ_F_NO_SCHED; - if (nullb->dev->shared_tag_bitmap) - set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; + set->flags = flags; set->driver_data = nullb; - if (poll_queues) + set->nr_hw_queues = hw_queues; + set->queue_depth = queue_depth; + set->numa_node = numa_node; + if (poll_queues) { + set->nr_hw_queues += poll_queues; set->nr_maps = 3; - else + } else { set->nr_maps = 1; - - if ((nullb && nullb->dev->blocking) || g_blocking) - set->flags |= BLK_MQ_F_BLOCKING; + } return blk_mq_alloc_tag_set(set); } From 8c740c6bf12dec03b6f35b19fe6c183929d0b88a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 15 Jul 2022 11:12:14 +0300 Subject: [PATCH 0380/1250] null_blk: fix ida error handling in null_add_dev() There needs to be some error checking if ida_simple_get() fails. Also call ida_free() if there are errors later. Fixes: 94bc02e30fb8 ("nullb: use ida to manage index") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/YtEhXsr6vJeoiYhd@kili Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 1501c85fc9e4b3..bdb522d512cac3 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2091,8 +2091,13 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); mutex_lock(&lock); - nullb->index = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); - dev->index = nullb->index; + rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); + if (rv < 0) { + mutex_unlock(&lock); + goto out_cleanup_zone; + } + nullb->index = rv; + dev->index = rv; mutex_unlock(&lock); blk_queue_logical_block_size(nullb->q, dev->blocksize); @@ -2118,7 +2123,7 @@ static int null_add_dev(struct nullb_device *dev) 
rv = null_gendisk_register(nullb); if (rv) - goto out_cleanup_zone; + goto out_ida_free; mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); @@ -2127,6 +2132,9 @@ static int null_add_dev(struct nullb_device *dev) pr_info("disk %s created\n", nullb->disk_name); return 0; + +out_ida_free: + ida_free(&nullb_indexes, nullb->index); out_cleanup_zone: null_free_zoned_dev(dev); out_cleanup_disk: From 7d57337b2194141704785f2af1266bd64c82cb37 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Jul 2022 21:11:26 +0200 Subject: [PATCH 0381/1250] intel: thermal: PCH: Drop ACPI_FADT_LOW_POWER_S0 check If ACPI_FADT_LOW_POWER_S0 is not set, this doesn't mean that low-power S0 idle is not usable. It merely means that using S3 on the given system is more beneficial from the energy saving perspective than using low-power S0 idle, as long as S3 is supported. Suspend-to-idle is still a valid suspend mode if ACPI_FADT_LOW_POWER_S0 is not set and the pm_suspend_via_firmware() check in pch_wpt_suspend() is sufficient to distinguish suspend-to-idle from S3, so drop the confusing ACPI_FADT_LOW_POWER_S0 check. Signed-off-by: Rafael J. 
Wysocki --- drivers/thermal/intel/intel_pch_thermal.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/thermal/intel/intel_pch_thermal.c b/drivers/thermal/intel/intel_pch_thermal.c index c1fa2b29b153b8..dabf11a687a153 100644 --- a/drivers/thermal/intel/intel_pch_thermal.c +++ b/drivers/thermal/intel/intel_pch_thermal.c @@ -207,14 +207,6 @@ static int pch_wpt_suspend(struct pch_thermal_device *ptd) return 0; } - /* Do not check temperature if it is not a S0ix capable platform */ -#ifdef CONFIG_ACPI - if (!(acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0)) - return 0; -#else - return 0; -#endif - /* Do not check temperature if it is not s2idle */ if (pm_suspend_via_firmware()) return 0; From cd5fc01e17e1a63f87a1f7c7b951dde6ef4c4d3c Mon Sep 17 00:00:00 2001 From: Micah Morton Date: Thu, 16 Jun 2022 21:46:08 +0000 Subject: [PATCH 0382/1250] LSM: SafeSetID: fix bug during GID policy check A bug exists in the GID transition policy checking code that always checks the policy against the UID of the process rather than GID. --- security/safesetid/lsm.c | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/security/safesetid/lsm.c b/security/safesetid/lsm.c index e806739f78684d..cd259ae38d83ce 100644 --- a/security/safesetid/lsm.c +++ b/security/safesetid/lsm.c @@ -148,37 +148,33 @@ static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum set { bool permitted; - /* If our old creds already had this ID in it, it's fine. */ + /* + * If our old creds already had this ID in it, it's fine. Otherwise need + * to check against the policy of the old ID. 
+ */ if (new_type == UID) { if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) || uid_eq(new_id.uid, old->suid)) return true; + permitted = + setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; + if (!permitted) + pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", + __kuid_val(old->uid), __kuid_val(old->euid), + __kuid_val(old->suid), __kuid_val(new_id.uid)); } else if (new_type == GID){ if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) || gid_eq(new_id.gid, old->sgid)) return true; - } else /* Error, new_type is an invalid type */ - return false; - - /* - * Transitions to new UIDs require a check against the policy of the old - * RUID. - */ - permitted = - setid_policy_lookup((kid_t){.uid = old->uid}, new_id, new_type) != SIDPOL_CONSTRAINED; - - if (!permitted) { - if (new_type == UID) { - pr_warn("UID transition ((%d,%d,%d) -> %d) blocked\n", - __kuid_val(old->uid), __kuid_val(old->euid), - __kuid_val(old->suid), __kuid_val(new_id.uid)); - } else if (new_type == GID) { + permitted = + setid_policy_lookup((kid_t){.gid = old->gid}, new_id, new_type) != SIDPOL_CONSTRAINED; + if (!permitted) pr_warn("GID transition ((%d,%d,%d) -> %d) blocked\n", __kgid_val(old->gid), __kgid_val(old->egid), __kgid_val(old->sgid), __kgid_val(new_id.gid)); - } else /* Error, new_type is an invalid type */ - return false; - } + } else /* Error, new_type is an invalid type */ + return false; + return permitted; } From 05b8962d374fdf5ad6b72af51d5f213c210744d2 Mon Sep 17 00:00:00 2001 From: Jianmin Lv Date: Thu, 14 Jul 2022 20:42:16 +0800 Subject: [PATCH 0383/1250] PCI: loongson: Work around LS7A incorrect Interrupt Pin registers Several devices integrated into LS7A report 0 (which means they do not use legacy INTx) in their Interrupt Pin registers even though they *do* use INTx. Add a quirk to override the incorrect Interrupt Pin values. This is only needed by ACPI-based systems. 
For DT-based systems, the IRQ mappings are defined in .dts files and handled by of_irq_parse_pci(). Link: https://lore.kernel.org/r/20220714124216.1489304-8-chenhuacai@loongson.cn Signed-off-by: Jianmin Lv Signed-off-by: Huacai Chen Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/pci-loongson.c | 32 +++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/drivers/pci/controller/pci-loongson.c b/drivers/pci/controller/pci-loongson.c index 6ed125c7d86a38..95b08219295d94 100644 --- a/drivers/pci/controller/pci-loongson.c +++ b/drivers/pci/controller/pci-loongson.c @@ -22,6 +22,13 @@ #define DEV_LS2K_APB 0x7a02 #define DEV_LS7A_CONF 0x7a10 #define DEV_LS7A_LPC 0x7a0c +#define DEV_LS7A_GMAC 0x7a03 +#define DEV_LS7A_DC1 0x7a06 +#define DEV_LS7A_DC2 0x7a36 +#define DEV_LS7A_GPU 0x7a15 +#define DEV_LS7A_AHCI 0x7a08 +#define DEV_LS7A_EHCI 0x7a14 +#define DEV_LS7A_OHCI 0x7a24 #define FLAG_CFG0 BIT(0) #define FLAG_CFG1 BIT(1) @@ -100,6 +107,31 @@ static void loongson_mrrs_quirk(struct pci_dev *dev) } DECLARE_PCI_FIXUP_ENABLE(PCI_ANY_ID, PCI_ANY_ID, loongson_mrrs_quirk); +static void loongson_pci_pin_quirk(struct pci_dev *pdev) +{ + pdev->pin = 1 + (PCI_FUNC(pdev->devfn) & 3); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + DEV_LS7A_DC1, loongson_pci_pin_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + DEV_LS7A_DC2, loongson_pci_pin_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + DEV_LS7A_GPU, loongson_pci_pin_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + DEV_LS7A_GMAC, loongson_pci_pin_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + DEV_LS7A_AHCI, loongson_pci_pin_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + DEV_LS7A_EHCI, loongson_pci_pin_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + DEV_LS7A_OHCI, loongson_pci_pin_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + DEV_PCIE_PORT_0, loongson_pci_pin_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + 
DEV_PCIE_PORT_1, loongson_pci_pin_quirk); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_LOONGSON, + DEV_PCIE_PORT_2, loongson_pci_pin_quirk); + static struct loongson_pci *pci_bus_to_loongson_pci(struct pci_bus *bus) { struct pci_config_window *cfg; From e05f33c3eb8b8269c3dfdd45e5065022b2e158d6 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 14 Jul 2022 15:30:53 +0800 Subject: [PATCH 0384/1250] PCI: imx6: Move imx6_pcie_grp_offset(), imx6_pcie_configure_type() earlier Move imx6_pcie_grp_offset() and imx6_pcie_configure_type() earlier in the file since they depend on nothing and are used by several other functions that will be moved earlier. No functional change intended. Link: https://lore.kernel.org/r/1657783869-19194-2-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach Acked-by: Richard Zhu --- drivers/pci/controller/dwc/pci-imx6.c | 50 +++++++++++++-------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 5ea01ed4674dbd..66b9134354de52 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -146,6 +146,31 @@ struct imx6_pcie { #define PHY_RX_OVRD_IN_LO_RX_DATA_EN BIT(5) #define PHY_RX_OVRD_IN_LO_RX_PLL_EN BIT(3) +static unsigned int imx6_pcie_grp_offset(const struct imx6_pcie *imx6_pcie) +{ + WARN_ON(imx6_pcie->drvdata->variant != IMX8MQ && + imx6_pcie->drvdata->variant != IMX8MM); + return imx6_pcie->controller_id == 1 ? 
IOMUXC_GPR16 : IOMUXC_GPR14; +} + +static void imx6_pcie_configure_type(struct imx6_pcie *imx6_pcie) +{ + unsigned int mask, val; + + if (imx6_pcie->drvdata->variant == IMX8MQ && + imx6_pcie->controller_id == 1) { + mask = IMX8MQ_GPR12_PCIE2_CTRL_DEVICE_TYPE; + val = FIELD_PREP(IMX8MQ_GPR12_PCIE2_CTRL_DEVICE_TYPE, + PCI_EXP_TYPE_ROOT_PORT); + } else { + mask = IMX6Q_GPR12_DEVICE_TYPE; + val = FIELD_PREP(IMX6Q_GPR12_DEVICE_TYPE, + PCI_EXP_TYPE_ROOT_PORT); + } + + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, mask, val); +} + static int pcie_phy_poll_ack(struct imx6_pcie *imx6_pcie, bool exp_val) { struct dw_pcie *pci = imx6_pcie->pci; @@ -415,13 +440,6 @@ static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) imx6_pcie->gpio_active_high); } -static unsigned int imx6_pcie_grp_offset(const struct imx6_pcie *imx6_pcie) -{ - WARN_ON(imx6_pcie->drvdata->variant != IMX8MQ && - imx6_pcie->drvdata->variant != IMX8MM); - return imx6_pcie->controller_id == 1 ? IOMUXC_GPR16 : IOMUXC_GPR14; -} - static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie) { struct dw_pcie *pci = imx6_pcie->pci; @@ -617,24 +635,6 @@ static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) } } -static void imx6_pcie_configure_type(struct imx6_pcie *imx6_pcie) -{ - unsigned int mask, val; - - if (imx6_pcie->drvdata->variant == IMX8MQ && - imx6_pcie->controller_id == 1) { - mask = IMX8MQ_GPR12_PCIE2_CTRL_DEVICE_TYPE; - val = FIELD_PREP(IMX8MQ_GPR12_PCIE2_CTRL_DEVICE_TYPE, - PCI_EXP_TYPE_ROOT_PORT); - } else { - mask = IMX6Q_GPR12_DEVICE_TYPE; - val = FIELD_PREP(IMX6Q_GPR12_DEVICE_TYPE, - PCI_EXP_TYPE_ROOT_PORT); - } - - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, mask, val); -} - static void imx6_pcie_init_phy(struct imx6_pcie *imx6_pcie) { switch (imx6_pcie->drvdata->variant) { From 8b2a017eaa432ae72bbbf98a3bc16547953342b5 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 14 Jul 2022 15:30:54 +0800 Subject: [PATCH 0385/1250] PCI: imx6: 
Move PHY management functions together Collect imx6_pcie_init_phy(), imx7d_pcie_wait_for_phy_pll_lock(), and imx6_setup_phy_mpll() earlier with other PHY-related code. No functional change intended. Link: https://lore.kernel.org/r/1657783869-19194-3-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach Acked-by: Richard Zhu --- drivers/pci/controller/dwc/pci-imx6.c | 256 +++++++++++++------------- 1 file changed, 128 insertions(+), 128 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 66b9134354de52..a7d2f07d61c617 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -296,6 +296,134 @@ static int pcie_phy_write(struct imx6_pcie *imx6_pcie, int addr, u16 data) return 0; } +static void imx6_pcie_init_phy(struct imx6_pcie *imx6_pcie) +{ + switch (imx6_pcie->drvdata->variant) { + case IMX8MM: + /* + * The PHY initialization had been done in the PHY + * driver, break here directly. + */ + break; + case IMX8MQ: + /* + * TODO: Currently this code assumes external + * oscillator is being used + */ + regmap_update_bits(imx6_pcie->iomuxc_gpr, + imx6_pcie_grp_offset(imx6_pcie), + IMX8MQ_GPR_PCIE_REF_USE_PAD, + IMX8MQ_GPR_PCIE_REF_USE_PAD); + /* + * Regarding the datasheet, the PCIE_VPH is suggested + * to be 1.8V. If the PCIE_VPH is supplied by 3.3V, the + * VREG_BYPASS should be cleared to zero. 
+ */ + if (imx6_pcie->vph && + regulator_get_voltage(imx6_pcie->vph) > 3000000) + regmap_update_bits(imx6_pcie->iomuxc_gpr, + imx6_pcie_grp_offset(imx6_pcie), + IMX8MQ_GPR_PCIE_VREG_BYPASS, + 0); + break; + case IMX7D: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX7D_GPR12_PCIE_PHY_REFCLK_SEL, 0); + break; + case IMX6SX: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX6SX_GPR12_PCIE_RX_EQ_MASK, + IMX6SX_GPR12_PCIE_RX_EQ_2); + fallthrough; + default: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX6Q_GPR12_PCIE_CTL_2, 0 << 10); + + /* configure constant input signal to the pcie ctrl and phy */ + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX6Q_GPR12_LOS_LEVEL, 9 << 4); + + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_DEEMPH_GEN1, + imx6_pcie->tx_deemph_gen1 << 0); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_DEEMPH_GEN2_3P5DB, + imx6_pcie->tx_deemph_gen2_3p5db << 6); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_DEEMPH_GEN2_6DB, + imx6_pcie->tx_deemph_gen2_6db << 12); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_SWING_FULL, + imx6_pcie->tx_swing_full << 18); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, + IMX6Q_GPR8_TX_SWING_LOW, + imx6_pcie->tx_swing_low << 25); + break; + } + + imx6_pcie_configure_type(imx6_pcie); +} + +static void imx7d_pcie_wait_for_phy_pll_lock(struct imx6_pcie *imx6_pcie) +{ + u32 val; + struct device *dev = imx6_pcie->pci->dev; + + if (regmap_read_poll_timeout(imx6_pcie->iomuxc_gpr, + IOMUXC_GPR22, val, + val & IMX7D_GPR22_PCIE_PHY_PLL_LOCKED, + PHY_PLL_LOCK_WAIT_USLEEP_MAX, + PHY_PLL_LOCK_WAIT_TIMEOUT)) + dev_err(dev, "PCIe PLL lock timeout\n"); +} + +static int imx6_setup_phy_mpll(struct imx6_pcie *imx6_pcie) +{ + unsigned long phy_rate = clk_get_rate(imx6_pcie->pcie_phy); + int mult, div; + u16 val; + + if (!(imx6_pcie->drvdata->flags & 
IMX6_PCIE_FLAG_IMX6_PHY)) + return 0; + + switch (phy_rate) { + case 125000000: + /* + * The default settings of the MPLL are for a 125MHz input + * clock, so no need to reconfigure anything in that case. + */ + return 0; + case 100000000: + mult = 25; + div = 0; + break; + case 200000000: + mult = 25; + div = 1; + break; + default: + dev_err(imx6_pcie->pci->dev, + "Unsupported PHY reference clock rate %lu\n", phy_rate); + return -EINVAL; + } + + pcie_phy_read(imx6_pcie, PCIE_PHY_MPLL_OVRD_IN_LO, &val); + val &= ~(PCIE_PHY_MPLL_MULTIPLIER_MASK << + PCIE_PHY_MPLL_MULTIPLIER_SHIFT); + val |= mult << PCIE_PHY_MPLL_MULTIPLIER_SHIFT; + val |= PCIE_PHY_MPLL_MULTIPLIER_OVRD; + pcie_phy_write(imx6_pcie, PCIE_PHY_MPLL_OVRD_IN_LO, val); + + pcie_phy_read(imx6_pcie, PCIE_PHY_ATEOVRD, &val); + val &= ~(PCIE_PHY_ATEOVRD_REF_CLKDIV_MASK << + PCIE_PHY_ATEOVRD_REF_CLKDIV_SHIFT); + val |= div << PCIE_PHY_ATEOVRD_REF_CLKDIV_SHIFT; + val |= PCIE_PHY_ATEOVRD_EN; + pcie_phy_write(imx6_pcie, PCIE_PHY_ATEOVRD, val); + + return 0; +} + static void imx6_pcie_reset_phy(struct imx6_pcie *imx6_pcie) { u16 tmp; @@ -500,19 +628,6 @@ static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie) return ret; } -static void imx7d_pcie_wait_for_phy_pll_lock(struct imx6_pcie *imx6_pcie) -{ - u32 val; - struct device *dev = imx6_pcie->pci->dev; - - if (regmap_read_poll_timeout(imx6_pcie->iomuxc_gpr, - IOMUXC_GPR22, val, - val & IMX7D_GPR22_PCIE_PHY_PLL_LOCKED, - PHY_PLL_LOCK_WAIT_USLEEP_MAX, - PHY_PLL_LOCK_WAIT_TIMEOUT)) - dev_err(dev, "PCIe PLL lock timeout\n"); -} - static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) { struct dw_pcie *pci = imx6_pcie->pci; @@ -635,121 +750,6 @@ static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) } } -static void imx6_pcie_init_phy(struct imx6_pcie *imx6_pcie) -{ - switch (imx6_pcie->drvdata->variant) { - case IMX8MM: - /* - * The PHY initialization had been done in the PHY - * driver, break here directly. 
- */ - break; - case IMX8MQ: - /* - * TODO: Currently this code assumes external - * oscillator is being used - */ - regmap_update_bits(imx6_pcie->iomuxc_gpr, - imx6_pcie_grp_offset(imx6_pcie), - IMX8MQ_GPR_PCIE_REF_USE_PAD, - IMX8MQ_GPR_PCIE_REF_USE_PAD); - /* - * Regarding the datasheet, the PCIE_VPH is suggested - * to be 1.8V. If the PCIE_VPH is supplied by 3.3V, the - * VREG_BYPASS should be cleared to zero. - */ - if (imx6_pcie->vph && - regulator_get_voltage(imx6_pcie->vph) > 3000000) - regmap_update_bits(imx6_pcie->iomuxc_gpr, - imx6_pcie_grp_offset(imx6_pcie), - IMX8MQ_GPR_PCIE_VREG_BYPASS, - 0); - break; - case IMX7D: - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX7D_GPR12_PCIE_PHY_REFCLK_SEL, 0); - break; - case IMX6SX: - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX6SX_GPR12_PCIE_RX_EQ_MASK, - IMX6SX_GPR12_PCIE_RX_EQ_2); - fallthrough; - default: - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX6Q_GPR12_PCIE_CTL_2, 0 << 10); - - /* configure constant input signal to the pcie ctrl and phy */ - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX6Q_GPR12_LOS_LEVEL, 9 << 4); - - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_DEEMPH_GEN1, - imx6_pcie->tx_deemph_gen1 << 0); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_DEEMPH_GEN2_3P5DB, - imx6_pcie->tx_deemph_gen2_3p5db << 6); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_DEEMPH_GEN2_6DB, - imx6_pcie->tx_deemph_gen2_6db << 12); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_SWING_FULL, - imx6_pcie->tx_swing_full << 18); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR8, - IMX6Q_GPR8_TX_SWING_LOW, - imx6_pcie->tx_swing_low << 25); - break; - } - - imx6_pcie_configure_type(imx6_pcie); -} - -static int imx6_setup_phy_mpll(struct imx6_pcie *imx6_pcie) -{ - unsigned long phy_rate = clk_get_rate(imx6_pcie->pcie_phy); - int mult, div; - u16 val; - - if 
(!(imx6_pcie->drvdata->flags & IMX6_PCIE_FLAG_IMX6_PHY)) - return 0; - - switch (phy_rate) { - case 125000000: - /* - * The default settings of the MPLL are for a 125MHz input - * clock, so no need to reconfigure anything in that case. - */ - return 0; - case 100000000: - mult = 25; - div = 0; - break; - case 200000000: - mult = 25; - div = 1; - break; - default: - dev_err(imx6_pcie->pci->dev, - "Unsupported PHY reference clock rate %lu\n", phy_rate); - return -EINVAL; - } - - pcie_phy_read(imx6_pcie, PCIE_PHY_MPLL_OVRD_IN_LO, &val); - val &= ~(PCIE_PHY_MPLL_MULTIPLIER_MASK << - PCIE_PHY_MPLL_MULTIPLIER_SHIFT); - val |= mult << PCIE_PHY_MPLL_MULTIPLIER_SHIFT; - val |= PCIE_PHY_MPLL_MULTIPLIER_OVRD; - pcie_phy_write(imx6_pcie, PCIE_PHY_MPLL_OVRD_IN_LO, val); - - pcie_phy_read(imx6_pcie, PCIE_PHY_ATEOVRD, &val); - val &= ~(PCIE_PHY_ATEOVRD_REF_CLKDIV_MASK << - PCIE_PHY_ATEOVRD_REF_CLKDIV_SHIFT); - val |= div << PCIE_PHY_ATEOVRD_REF_CLKDIV_SHIFT; - val |= PCIE_PHY_ATEOVRD_EN; - pcie_phy_write(imx6_pcie, PCIE_PHY_ATEOVRD, val); - - return 0; -} - static int imx6_pcie_wait_for_speed_change(struct imx6_pcie *imx6_pcie) { struct dw_pcie *pci = imx6_pcie->pci; From b805cf0a70d2089cf7ca49bdf2b2e274bf02c9d9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 14 Jul 2022 15:30:55 +0800 Subject: [PATCH 0386/1250] PCI: imx6: Move imx6_pcie_enable_ref_clk() earlier Move imx6_pcie_enable_ref_clk() earlier so it's not in the middle between imx6_pcie_assert_core_reset() and imx6_pcie_deassert_core_reset(). No functional change intended. 
Link: https://lore.kernel.org/r/1657783869-19194-4-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach Acked-by: Richard Zhu --- drivers/pci/controller/dwc/pci-imx6.c | 96 +++++++++++++-------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index a7d2f07d61c617..02cdffbc72b191 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -520,54 +520,6 @@ static int imx6_pcie_attach_pd(struct device *dev) return 0; } -static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) -{ - struct device *dev = imx6_pcie->pci->dev; - - switch (imx6_pcie->drvdata->variant) { - case IMX7D: - case IMX8MQ: - reset_control_assert(imx6_pcie->pciephy_reset); - fallthrough; - case IMX8MM: - reset_control_assert(imx6_pcie->apps_reset); - break; - case IMX6SX: - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX6SX_GPR12_PCIE_TEST_POWERDOWN, - IMX6SX_GPR12_PCIE_TEST_POWERDOWN); - /* Force PCIe PHY reset */ - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR5, - IMX6SX_GPR5_PCIE_BTNRST_RESET, - IMX6SX_GPR5_PCIE_BTNRST_RESET); - break; - case IMX6QP: - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, - IMX6Q_GPR1_PCIE_SW_RST, - IMX6Q_GPR1_PCIE_SW_RST); - break; - case IMX6Q: - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, - IMX6Q_GPR1_PCIE_TEST_PD, 1 << 18); - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, - IMX6Q_GPR1_PCIE_REF_CLK_EN, 0 << 16); - break; - } - - if (imx6_pcie->vpcie && regulator_is_enabled(imx6_pcie->vpcie) > 0) { - int ret = regulator_disable(imx6_pcie->vpcie); - - if (ret) - dev_err(dev, "failed to disable vpcie regulator: %d\n", - ret); - } - - /* Some boards don't have PCIe reset GPIO. 
*/ - if (gpio_is_valid(imx6_pcie->reset_gpio)) - gpio_set_value_cansleep(imx6_pcie->reset_gpio, - imx6_pcie->gpio_active_high); -} - static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie) { struct dw_pcie *pci = imx6_pcie->pci; @@ -628,6 +580,54 @@ static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie) return ret; } +static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) +{ + struct device *dev = imx6_pcie->pci->dev; + + switch (imx6_pcie->drvdata->variant) { + case IMX7D: + case IMX8MQ: + reset_control_assert(imx6_pcie->pciephy_reset); + fallthrough; + case IMX8MM: + reset_control_assert(imx6_pcie->apps_reset); + break; + case IMX6SX: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX6SX_GPR12_PCIE_TEST_POWERDOWN, + IMX6SX_GPR12_PCIE_TEST_POWERDOWN); + /* Force PCIe PHY reset */ + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR5, + IMX6SX_GPR5_PCIE_BTNRST_RESET, + IMX6SX_GPR5_PCIE_BTNRST_RESET); + break; + case IMX6QP: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, + IMX6Q_GPR1_PCIE_SW_RST, + IMX6Q_GPR1_PCIE_SW_RST); + break; + case IMX6Q: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, + IMX6Q_GPR1_PCIE_TEST_PD, 1 << 18); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, + IMX6Q_GPR1_PCIE_REF_CLK_EN, 0 << 16); + break; + } + + if (imx6_pcie->vpcie && regulator_is_enabled(imx6_pcie->vpcie) > 0) { + int ret = regulator_disable(imx6_pcie->vpcie); + + if (ret) + dev_err(dev, "failed to disable vpcie regulator: %d\n", + ret); + } + + /* Some boards don't have PCIe reset GPIO. 
*/ + if (gpio_is_valid(imx6_pcie->reset_gpio)) + gpio_set_value_cansleep(imx6_pcie->reset_gpio, + imx6_pcie->gpio_active_high); +} + static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) { struct dw_pcie *pci = imx6_pcie->pci; From 4b88d2da2b9d78510bd7e6ea86104b976a57aa05 Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:30:56 +0800 Subject: [PATCH 0387/1250] PCI: imx6: Move imx6_pcie_clk_disable() earlier Move imx6_pcie_clk_disable() earlier to be near other clock-related functions. No functional change intended. [bhelgaas: reorder patch so pure moves are earlier] Link: https://lore.kernel.org/r/1657783869-19194-5-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 48 +++++++++++++-------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 02cdffbc72b191..8aafc588a4f32d 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -580,6 +580,30 @@ static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie) return ret; } +static void imx6_pcie_clk_disable(struct imx6_pcie *imx6_pcie) +{ + clk_disable_unprepare(imx6_pcie->pcie); + clk_disable_unprepare(imx6_pcie->pcie_phy); + clk_disable_unprepare(imx6_pcie->pcie_bus); + + switch (imx6_pcie->drvdata->variant) { + case IMX6SX: + clk_disable_unprepare(imx6_pcie->pcie_inbound_axi); + break; + case IMX7D: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX7D_GPR12_PCIE_PHY_REFCLK_SEL, + IMX7D_GPR12_PCIE_PHY_REFCLK_SEL); + break; + case IMX8MQ: + case IMX8MM: + clk_disable_unprepare(imx6_pcie->pcie_aux); + break; + default: + break; + } +} + static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) { struct device *dev = imx6_pcie->pci->dev; @@ -941,30 +965,6 @@ static void imx6_pcie_pm_turnoff(struct imx6_pcie 
*imx6_pcie) usleep_range(1000, 10000); } -static void imx6_pcie_clk_disable(struct imx6_pcie *imx6_pcie) -{ - clk_disable_unprepare(imx6_pcie->pcie); - clk_disable_unprepare(imx6_pcie->pcie_phy); - clk_disable_unprepare(imx6_pcie->pcie_bus); - - switch (imx6_pcie->drvdata->variant) { - case IMX6SX: - clk_disable_unprepare(imx6_pcie->pcie_inbound_axi); - break; - case IMX7D: - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX7D_GPR12_PCIE_PHY_REFCLK_SEL, - IMX7D_GPR12_PCIE_PHY_REFCLK_SEL); - break; - case IMX8MQ: - case IMX8MM: - clk_disable_unprepare(imx6_pcie->pcie_aux); - break; - default: - break; - } -} - static int imx6_pcie_suspend_noirq(struct device *dev) { struct imx6_pcie *imx6_pcie = dev_get_drvdata(dev); From 8a5834a6f94422dfa7a04f02dfa8b6a368163fe9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 14 Jul 2022 15:30:57 +0800 Subject: [PATCH 0388/1250] PCI: imx6: Factor out ref clock disable to match enable The PCIe ref clocks are specific to different variants. The enables are already split out into imx6_pcie_enable_ref_clk(), but the disables were combined with the more generic bus/phy/pcie clock disables in imx6_pcie_clk_disable(). Split out the variant-specific disables into imx6_pcie_disable_ref_clk() to match imx6_pcie_enable_ref_clk(). No functional change intended. 
Link: https://lore.kernel.org/r/1657783869-19194-6-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach Acked-by: Richard Zhu --- drivers/pci/controller/dwc/pci-imx6.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 8aafc588a4f32d..a58642af2e16c8 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -580,12 +580,8 @@ static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie) return ret; } -static void imx6_pcie_clk_disable(struct imx6_pcie *imx6_pcie) +static void imx6_pcie_disable_ref_clk(struct imx6_pcie *imx6_pcie) { - clk_disable_unprepare(imx6_pcie->pcie); - clk_disable_unprepare(imx6_pcie->pcie_phy); - clk_disable_unprepare(imx6_pcie->pcie_bus); - switch (imx6_pcie->drvdata->variant) { case IMX6SX: clk_disable_unprepare(imx6_pcie->pcie_inbound_axi); @@ -595,8 +591,8 @@ static void imx6_pcie_clk_disable(struct imx6_pcie *imx6_pcie) IMX7D_GPR12_PCIE_PHY_REFCLK_SEL, IMX7D_GPR12_PCIE_PHY_REFCLK_SEL); break; - case IMX8MQ: case IMX8MM: + case IMX8MQ: clk_disable_unprepare(imx6_pcie->pcie_aux); break; default: @@ -604,6 +600,14 @@ static void imx6_pcie_clk_disable(struct imx6_pcie *imx6_pcie) } } +static void imx6_pcie_clk_disable(struct imx6_pcie *imx6_pcie) +{ + clk_disable_unprepare(imx6_pcie->pcie); + clk_disable_unprepare(imx6_pcie->pcie_phy); + clk_disable_unprepare(imx6_pcie->pcie_bus); + imx6_pcie_disable_ref_clk(imx6_pcie); +} + static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) { struct device *dev = imx6_pcie->pci->dev; From e3334dfafb2a0423ae9740d2b6c54fc0de5c61a5 Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:30:58 +0800 Subject: [PATCH 0389/1250] PCI: imx6: Collect clock enables in imx6_pcie_clk_enable() Encapsulate the i.MX PCIe clock enable operations into one standalone function, imx6_pcie_clk_enable(). 
No functional change intended. [bhelgaas: split pure code moves into separate patches] Link: https://lore.kernel.org/r/1657783869-19194-7-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 95 ++++++++++++++++----------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index a58642af2e16c8..cbf018fb0511dc 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -600,6 +600,58 @@ static void imx6_pcie_disable_ref_clk(struct imx6_pcie *imx6_pcie) } } +static int imx6_pcie_clk_enable(struct imx6_pcie *imx6_pcie) +{ + struct dw_pcie *pci = imx6_pcie->pci; + struct device *dev = pci->dev; + int ret; + + ret = clk_prepare_enable(imx6_pcie->pcie_phy); + if (ret) { + dev_err(dev, "unable to enable pcie_phy clock\n"); + return ret; + } + + ret = clk_prepare_enable(imx6_pcie->pcie_bus); + if (ret) { + dev_err(dev, "unable to enable pcie_bus clock\n"); + goto err_pcie_bus; + } + + ret = clk_prepare_enable(imx6_pcie->pcie); + if (ret) { + dev_err(dev, "unable to enable pcie clock\n"); + goto err_pcie; + } + + ret = imx6_pcie_enable_ref_clk(imx6_pcie); + if (ret) { + dev_err(dev, "unable to enable pcie ref clock\n"); + goto err_ref_clk; + } + + switch (imx6_pcie->drvdata->variant) { + case IMX8MM: + if (phy_power_on(imx6_pcie->phy)) + dev_err(dev, "unable to power on PHY\n"); + break; + default: + break; + } + /* allow the clocks to stabilize */ + usleep_range(200, 500); + return 0; + +err_ref_clk: + clk_disable_unprepare(imx6_pcie->pcie); +err_pcie: + clk_disable_unprepare(imx6_pcie->pcie_bus); +err_pcie_bus: + clk_disable_unprepare(imx6_pcie->pcie_phy); + + return ret; +} + static void imx6_pcie_clk_disable(struct imx6_pcie *imx6_pcie) { clk_disable_unprepare(imx6_pcie->pcie); @@ -671,40 +723,11 @@ static void 
imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) } } - ret = clk_prepare_enable(imx6_pcie->pcie_phy); - if (ret) { - dev_err(dev, "unable to enable pcie_phy clock\n"); - goto err_pcie_phy; - } - - ret = clk_prepare_enable(imx6_pcie->pcie_bus); - if (ret) { - dev_err(dev, "unable to enable pcie_bus clock\n"); - goto err_pcie_bus; - } - - ret = clk_prepare_enable(imx6_pcie->pcie); - if (ret) { - dev_err(dev, "unable to enable pcie clock\n"); - goto err_pcie; - } - - ret = imx6_pcie_enable_ref_clk(imx6_pcie); + ret = imx6_pcie_clk_enable(imx6_pcie); if (ret) { - dev_err(dev, "unable to enable pcie ref clock\n"); - goto err_ref_clk; - } - - switch (imx6_pcie->drvdata->variant) { - case IMX8MM: - if (phy_power_on(imx6_pcie->phy)) - dev_err(dev, "unable to power on PHY\n"); - break; - default: - break; + dev_err(dev, "unable to enable pcie clocks: %d\n", ret); + goto err_clks; } - /* allow the clocks to stabilize */ - usleep_range(200, 500); switch (imx6_pcie->drvdata->variant) { case IMX8MQ: @@ -763,13 +786,7 @@ static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) return; -err_ref_clk: - clk_disable_unprepare(imx6_pcie->pcie); -err_pcie: - clk_disable_unprepare(imx6_pcie->pcie_bus); -err_pcie_bus: - clk_disable_unprepare(imx6_pcie->pcie_phy); -err_pcie_phy: +err_clks: if (imx6_pcie->vpcie && regulator_is_enabled(imx6_pcie->vpcie) > 0) { ret = regulator_disable(imx6_pcie->vpcie); if (ret) From a5bea9a09d089095ad3663f86256d62d0fd06b7c Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:30:59 +0800 Subject: [PATCH 0390/1250] PCI: imx6: Propagate .host_init() errors to caller Since dw_pcie_host_init() checks for errors from ops->host_init(), check for errors when enabling power regulators and clocks and return them. 
[bhelgaas: commit log] Link: https://lore.kernel.org/r/1657783869-19194-8-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index cbf018fb0511dc..bfa470c46906ba 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -708,7 +708,7 @@ static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) imx6_pcie->gpio_active_high); } -static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) +static int imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) { struct dw_pcie *pci = imx6_pcie->pci; struct device *dev = pci->dev; @@ -719,7 +719,7 @@ static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) if (ret) { dev_err(dev, "failed to enable vpcie regulator: %d\n", ret); - return; + return ret; } } @@ -784,7 +784,7 @@ static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) msleep(100); } - return; + return 0; err_clks: if (imx6_pcie->vpcie && regulator_is_enabled(imx6_pcie->vpcie) > 0) { @@ -793,6 +793,7 @@ static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) dev_err(dev, "failed to disable vpcie regulator: %d\n", ret); } + return ret; } static int imx6_pcie_wait_for_speed_change(struct imx6_pcie *imx6_pcie) @@ -911,11 +912,18 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) static int imx6_pcie_host_init(struct dw_pcie_rp *pp) { struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct device *dev = pci->dev; struct imx6_pcie *imx6_pcie = to_imx6_pcie(pci); + int ret; imx6_pcie_assert_core_reset(imx6_pcie); imx6_pcie_init_phy(imx6_pcie); - imx6_pcie_deassert_core_reset(imx6_pcie); + ret = imx6_pcie_deassert_core_reset(imx6_pcie); + if (ret < 0) { + dev_err(dev, "pcie deassert core reset 
failed: %d\n", ret); + return ret; + } + imx6_setup_phy_mpll(imx6_pcie); return 0; From 2b5b48d64a45b877474080d97df4d47e4a69374d Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:31:00 +0800 Subject: [PATCH 0391/1250] PCI: imx6: Disable i.MX6QDL clock when disabling ref clocks When disabling PCIe clocks, disable the i.MX6QDL ref clock too. Link: https://lore.kernel.org/r/1657783869-19194-9-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index bfa470c46906ba..bd3ef1f4bd94c0 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -586,6 +586,14 @@ static void imx6_pcie_disable_ref_clk(struct imx6_pcie *imx6_pcie) case IMX6SX: clk_disable_unprepare(imx6_pcie->pcie_inbound_axi); break; + case IMX6QP: + case IMX6Q: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, + IMX6Q_GPR1_PCIE_REF_CLK_EN, 0); + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR1, + IMX6Q_GPR1_PCIE_TEST_PD, + IMX6Q_GPR1_PCIE_TEST_PD); + break; case IMX7D: regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, IMX7D_GPR12_PCIE_PHY_REFCLK_SEL, From 67052832be7eede0b9bc2c56afa899a4f5f42152 Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:31:01 +0800 Subject: [PATCH 0392/1250] PCI: imx6: Call host init function directly in resume Call imx6_pcie_host_init() instead of duplicating code in resume. Note that this also means we do MPLL setup again during resume, which we didn't do before.
[bhelgaas: add MPLL setup note, pointed out by Lucas] Link: https://lore.kernel.org/r/1657783869-19194-10-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index bd3ef1f4bd94c0..45671d717b8c01 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -1034,9 +1034,9 @@ static int imx6_pcie_resume_noirq(struct device *dev) if (!(imx6_pcie->drvdata->flags & IMX6_PCIE_FLAG_SUPPORTS_SUSPEND)) return 0; - imx6_pcie_assert_core_reset(imx6_pcie); - imx6_pcie_init_phy(imx6_pcie); - imx6_pcie_deassert_core_reset(imx6_pcie); + ret = imx6_pcie_host_init(pp); + if (ret) + return ret; dw_pcie_setup_rc(pp); ret = imx6_pcie_start_link(imx6_pcie->pci); From 41de2be1a14e414712c8b46745ca9460e435bb16 Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:31:02 +0800 Subject: [PATCH 0393/1250] PCI: imx6: Turn off regulator when system is in suspend mode The driver should undo any enables it did itself. The regulator disable shouldn't be basing decisions on regulator_is_enabled(). Move the regulator_disable() call to the suspend function so that the regulator is turned off when the system is in suspend mode.
Link: https://lore.kernel.org/r/1657783869-19194-11-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pci-imx6.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 45671d717b8c01..a17cefa7497c5b 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -670,8 +670,6 @@ static void imx6_pcie_clk_disable(struct imx6_pcie *imx6_pcie) static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) { - struct device *dev = imx6_pcie->pci->dev; - switch (imx6_pcie->drvdata->variant) { case IMX7D: case IMX8MQ: @@ -702,14 +700,6 @@ static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) break; } - if (imx6_pcie->vpcie && regulator_is_enabled(imx6_pcie->vpcie) > 0) { - int ret = regulator_disable(imx6_pcie->vpcie); - - if (ret) - dev_err(dev, "failed to disable vpcie regulator: %d\n", - ret); - } - /* Some boards don't have PCIe reset GPIO. 
*/ if (gpio_is_valid(imx6_pcie->reset_gpio)) gpio_set_value_cansleep(imx6_pcie->reset_gpio, @@ -722,7 +712,7 @@ static int imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) struct device *dev = pci->dev; int ret; - if (imx6_pcie->vpcie && !regulator_is_enabled(imx6_pcie->vpcie)) { + if (imx6_pcie->vpcie) { ret = regulator_enable(imx6_pcie->vpcie); if (ret) { dev_err(dev, "failed to enable vpcie regulator: %d\n", @@ -795,7 +785,7 @@ static int imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) return 0; err_clks: - if (imx6_pcie->vpcie && regulator_is_enabled(imx6_pcie->vpcie) > 0) { + if (imx6_pcie->vpcie) { ret = regulator_disable(imx6_pcie->vpcie); if (ret) dev_err(dev, "failed to disable vpcie regulator: %d\n", @@ -1022,6 +1012,9 @@ static int imx6_pcie_suspend_noirq(struct device *dev) break; } + if (imx6_pcie->vpcie) + regulator_disable(imx6_pcie->vpcie); + return 0; } From 8e014add498a101897476ea0fa1c71b2ba4e2fbf Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:31:03 +0800 Subject: [PATCH 0394/1250] PCI: imx6: Move regulator enable out of imx6_pcie_deassert_core_reset() Move regulator enable out of imx6_pcie_deassert_core_reset(), since the regulator_enable() has nothing to do with imx6_pcie_deassert_core_reset(). 
Link: https://lore.kernel.org/r/1657783869-19194-12-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pci-imx6.c | 36 ++++++++++++--------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index a17cefa7497c5b..873983ba19ab2c 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -712,19 +712,10 @@ static int imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) struct device *dev = pci->dev; int ret; - if (imx6_pcie->vpcie) { - ret = regulator_enable(imx6_pcie->vpcie); - if (ret) { - dev_err(dev, "failed to enable vpcie regulator: %d\n", - ret); - return ret; - } - } - ret = imx6_pcie_clk_enable(imx6_pcie); if (ret) { dev_err(dev, "unable to enable pcie clocks: %d\n", ret); - goto err_clks; + return ret; } switch (imx6_pcie->drvdata->variant) { @@ -783,15 +774,6 @@ static int imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) } return 0; - -err_clks: - if (imx6_pcie->vpcie) { - ret = regulator_disable(imx6_pcie->vpcie); - if (ret) - dev_err(dev, "failed to disable vpcie regulator: %d\n", - ret); - } - return ret; } static int imx6_pcie_wait_for_speed_change(struct imx6_pcie *imx6_pcie) @@ -914,17 +896,31 @@ static int imx6_pcie_host_init(struct dw_pcie_rp *pp) struct imx6_pcie *imx6_pcie = to_imx6_pcie(pci); int ret; + if (imx6_pcie->vpcie) { + ret = regulator_enable(imx6_pcie->vpcie); + if (ret) { + dev_err(dev, "failed to enable vpcie regulator: %d\n", + ret); + return ret; + } + } + imx6_pcie_assert_core_reset(imx6_pcie); imx6_pcie_init_phy(imx6_pcie); ret = imx6_pcie_deassert_core_reset(imx6_pcie); if (ret < 0) { dev_err(dev, "pcie deassert core reset failed: %d\n", ret); - return ret; + goto err_reg_disable; } imx6_setup_phy_mpll(imx6_pcie); return 0; + +err_reg_disable: + if (imx6_pcie->vpcie) + regulator_disable(imx6_pcie->vpcie); + 
return ret; } static const struct dw_pcie_host_ops imx6_pcie_host_ops = { From 38c00d4ae6717a43d33882aa54ac0049658bb050 Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:31:04 +0800 Subject: [PATCH 0395/1250] PCI: imx6: Mark the link down as non-fatal error If the PCIe link is down, return zero from imx6_pcie_start_link() so the driver will probe successfully. Link: https://lore.kernel.org/r/1657783869-19194-13-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 873983ba19ab2c..0870c67f90d069 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -836,7 +836,9 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) /* Start LTSSM. */ imx6_pcie_ltssm_enable(dev); - dw_pcie_wait_for_link(pci); + ret = dw_pcie_wait_for_link(pci); + if (ret) + goto err_reset_phy; if (pci->link_gen == 2) { /* Allow Gen2 mode after the link is up. */ @@ -872,7 +874,9 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) } /* Make sure link training is finished as well! 
*/ - dw_pcie_wait_for_link(pci); + ret = dw_pcie_wait_for_link(pci); + if (ret) + goto err_reset_phy; } else { dev_info(dev, "Link: Gen2 disabled\n"); } @@ -886,7 +890,7 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) dw_pcie_readl_dbi(pci, PCIE_PORT_DEBUG0), dw_pcie_readl_dbi(pci, PCIE_PORT_DEBUG1)); imx6_pcie_reset_phy(imx6_pcie); - return ret; + return 0; } static int imx6_pcie_host_init(struct dw_pcie_rp *pp) @@ -1028,10 +1032,7 @@ static int imx6_pcie_resume_noirq(struct device *dev) return ret; dw_pcie_setup_rc(pp); - ret = imx6_pcie_start_link(imx6_pcie->pci); - if (ret < 0) - dev_info(dev, "pcie link is down after resume.\n"); - + imx6_pcie_start_link(imx6_pcie->pci); return 0; } #endif From 034a46afcb9ba790c39a52c2430c8b8e7efdf86f Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:31:05 +0800 Subject: [PATCH 0396/1250] PCI: imx6: Reduce resume time by only starting link if it was up before suspend i.MX PCIe doesn't support hotplug. During resume, only start PCIe link training when the link was up before system suspend to avoid the long latency in the link training period. 
Link: https://lore.kernel.org/r/1657783869-19194-14-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pci-imx6.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 0870c67f90d069..264f786fb0a53f 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -67,6 +67,7 @@ struct imx6_pcie { struct dw_pcie *pci; int reset_gpio; bool gpio_active_high; + bool link_is_up; struct clk *pcie_bus; struct clk *pcie_phy; struct clk *pcie_inbound_axi; @@ -881,11 +882,13 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) dev_info(dev, "Link: Gen2 disabled\n"); } + imx6_pcie->link_is_up = true; tmp = dw_pcie_readw_dbi(pci, offset + PCI_EXP_LNKSTA); dev_info(dev, "Link up, Gen%i\n", tmp & PCI_EXP_LNKSTA_CLS); return 0; err_reset_phy: + imx6_pcie->link_is_up = false; dev_dbg(dev, "PHY DEBUG_R0=0x%08x DEBUG_R1=0x%08x\n", dw_pcie_readl_dbi(pci, PCIE_PORT_DEBUG0), dw_pcie_readl_dbi(pci, PCIE_PORT_DEBUG1)); @@ -1032,7 +1035,9 @@ static int imx6_pcie_resume_noirq(struct device *dev) return ret; dw_pcie_setup_rc(pp); - imx6_pcie_start_link(imx6_pcie->pci); + if (imx6_pcie->link_is_up) + imx6_pcie_start_link(imx6_pcie->pci); + return 0; } #endif From fc59b59e157bb007f6a6e6ee49523105366ea303 Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:31:06 +0800 Subject: [PATCH 0397/1250] PCI: imx6: Do not hide PHY driver callbacks and refine the error handling Move the phy_power_on() to host_init from imx6_pcie_clk_enable(). Move the phy_init() to host_init from imx6_pcie_deassert_core_reset(). Refine the error handling in imx6_pcie_host_init() accordingly. 
Link: https://lore.kernel.org/r/1657783869-19194-15-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pci-imx6.c | 36 +++++++++++++++++---------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 264f786fb0a53f..b6c090e0f2cac3 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -639,14 +639,6 @@ static int imx6_pcie_clk_enable(struct imx6_pcie *imx6_pcie) goto err_ref_clk; } - switch (imx6_pcie->drvdata->variant) { - case IMX8MM: - if (phy_power_on(imx6_pcie->phy)) - dev_err(dev, "unable to power on PHY\n"); - break; - default: - break; - } /* allow the clocks to stabilize */ usleep_range(200, 500); return 0; @@ -723,10 +715,6 @@ static int imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) case IMX8MQ: reset_control_deassert(imx6_pcie->pciephy_reset); break; - case IMX8MM: - if (phy_init(imx6_pcie->phy)) - dev_err(dev, "waiting for phy ready timeout!\n"); - break; case IMX7D: reset_control_deassert(imx6_pcie->pciephy_reset); @@ -762,6 +750,7 @@ static int imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) usleep_range(200, 500); break; case IMX6Q: /* Nothing to do */ + case IMX8MM: break; } @@ -914,16 +903,37 @@ static int imx6_pcie_host_init(struct dw_pcie_rp *pp) imx6_pcie_assert_core_reset(imx6_pcie); imx6_pcie_init_phy(imx6_pcie); + + if (imx6_pcie->phy) { + ret = phy_power_on(imx6_pcie->phy); + if (ret) { + dev_err(dev, "pcie PHY power up failed\n"); + goto err_reg_disable; + } + } + ret = imx6_pcie_deassert_core_reset(imx6_pcie); if (ret < 0) { dev_err(dev, "pcie deassert core reset failed: %d\n", ret); - goto err_reg_disable; + goto err_phy_off; } + if (imx6_pcie->phy) { + ret = phy_init(imx6_pcie->phy); + if (ret) { + dev_err(dev, "waiting for PHY ready timeout!\n"); + goto err_clk_disable; + } + } 
imx6_setup_phy_mpll(imx6_pcie); return 0; +err_clk_disable: + imx6_pcie_clk_disable(imx6_pcie); +err_phy_off: + if (imx6_pcie->phy) + phy_power_off(imx6_pcie->phy); err_reg_disable: if (imx6_pcie->vpcie) regulator_disable(imx6_pcie->vpcie); From 5af501602ec7d58b8a759a591db1b17483acd4b1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Thu, 14 Jul 2022 15:31:07 +0800 Subject: [PATCH 0398/1250] PCI: imx6: Disable clocks in reverse order of enable imx6_pcie_clk_enable() enables clocks in the order: pcie_phy, pcie_bus, pcie, imx6_pcie_enable_ref_clk. Change imx6_pcie_clk_disable() to disable them in the reverse order. Link: https://lore.kernel.org/r/1657783869-19194-16-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach Acked-by: Richard Zhu --- drivers/pci/controller/dwc/pci-imx6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index b6c090e0f2cac3..a2966a741e3fec 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -655,10 +655,10 @@ static int imx6_pcie_clk_enable(struct imx6_pcie *imx6_pcie) static void imx6_pcie_clk_disable(struct imx6_pcie *imx6_pcie) { + imx6_pcie_disable_ref_clk(imx6_pcie); clk_disable_unprepare(imx6_pcie->pcie); - clk_disable_unprepare(imx6_pcie->pcie_phy); clk_disable_unprepare(imx6_pcie->pcie_bus); - imx6_pcie_disable_ref_clk(imx6_pcie); + clk_disable_unprepare(imx6_pcie->pcie_phy); } static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie) From 1d193057de55e8560264a1671e0f276dc55ef650 Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:31:08 +0800 Subject: [PATCH 0399/1250] PCI: imx6: Move the imx6_pcie_ltssm_disable() earlier Move imx6_pcie_ltssm_disable() earlier and place it right after imx6_pcie_ltssm_enable(), since it may be used by more than just the suspend callback.
To be symmetric with imx6_pcie_ltssm_enable(), add the IMX6Q and IMX8MQ switch cases in imx6_pcie_ltssm_disable(). Link: https://lore.kernel.org/r/1657783869-19194-17-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pci-imx6.c | 38 +++++++++++++-------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index a2966a741e3fec..84686aa8c9125d 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -805,6 +805,25 @@ static void imx6_pcie_ltssm_enable(struct device *dev) } } +static void imx6_pcie_ltssm_disable(struct device *dev) +{ + struct imx6_pcie *imx6_pcie = dev_get_drvdata(dev); + + switch (imx6_pcie->drvdata->variant) { + case IMX6Q: + case IMX6SX: + case IMX6QP: + regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, + IMX6Q_GPR12_PCIE_CTL_2, 0); + break; + case IMX7D: + case IMX8MQ: + case IMX8MM: + reset_control_assert(imx6_pcie->apps_reset); + break; + } +} + static int imx6_pcie_start_link(struct dw_pcie *pci) { struct imx6_pcie *imx6_pcie = to_imx6_pcie(pci); @@ -949,25 +968,6 @@ static const struct dw_pcie_ops dw_pcie_ops = { }; #ifdef CONFIG_PM_SLEEP -static void imx6_pcie_ltssm_disable(struct device *dev) -{ - struct imx6_pcie *imx6_pcie = dev_get_drvdata(dev); - - switch (imx6_pcie->drvdata->variant) { - case IMX6SX: - case IMX6QP: - regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12, - IMX6Q_GPR12_PCIE_CTL_2, 0); - break; - case IMX7D: - case IMX8MM: - reset_control_assert(imx6_pcie->apps_reset); - break; - default: - dev_err(dev, "ltssm_disable not supported\n"); - } -} - static void imx6_pcie_pm_turnoff(struct imx6_pcie *imx6_pcie) { struct device *dev = imx6_pcie->pci->dev; From 25ae5434c3de67ae316e2663f5b277fe4560a8b6 Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Thu, 14 Jul 2022 15:31:09 +0800 Subject: [PATCH 0400/1250] PCI: 
imx6: Reformat suspend callback to keep symmetric with resume Create imx6_pcie_stop_link() and imx6_pcie_host_exit() functions. Encapsulate the clock and regulator disables and the PHY uninitialization in imx6_pcie_host_exit(). To keep suspend/resume as symmetric as possible, invoke these two newly created functions in the suspend callback. To be symmetric with imx6_pcie_host_exit(), move imx6_pcie_clk_enable() to imx6_pcie_host_init() from imx6_pcie_deassert_core_reset(). Link: https://lore.kernel.org/r/1657783869-19194-18-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas Reviewed-by: Lucas Stach --- drivers/pci/controller/dwc/pci-imx6.c | 62 ++++++++++++++++----------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 84686aa8c9125d..0a81e649b213cd 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -703,13 +703,6 @@ static int imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie) { struct dw_pcie *pci = imx6_pcie->pci; struct device *dev = pci->dev; - int ret; - - ret = imx6_pcie_clk_enable(imx6_pcie); - if (ret) { - dev_err(dev, "unable to enable pcie clocks: %d\n", ret); - return ret; - } switch (imx6_pcie->drvdata->variant) { case IMX8MQ: @@ -904,6 +897,14 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) return 0; } +static void imx6_pcie_stop_link(struct dw_pcie *pci) +{ + struct device *dev = pci->dev; + + /* Turn off PCIe LTSSM */ + imx6_pcie_ltssm_disable(dev); +} + static int imx6_pcie_host_init(struct dw_pcie_rp *pp) { struct dw_pcie *pci = to_dw_pcie_from_pp(pp); @@ -923,11 +924,17 @@ static int imx6_pcie_host_init(struct dw_pcie_rp *pp) imx6_pcie_assert_core_reset(imx6_pcie); imx6_pcie_init_phy(imx6_pcie); + ret = imx6_pcie_clk_enable(imx6_pcie); + if (ret) { + dev_err(dev, "unable to enable pcie clocks: %d\n", ret); + goto err_reg_disable; + } + if (imx6_pcie->phy)
{ ret = phy_power_on(imx6_pcie->phy); if (ret) { dev_err(dev, "pcie PHY power up failed\n"); - goto err_reg_disable; + goto err_clk_disable; } } @@ -941,24 +948,40 @@ static int imx6_pcie_host_init(struct dw_pcie_rp *pp) ret = phy_init(imx6_pcie->phy); if (ret) { dev_err(dev, "waiting for PHY ready timeout!\n"); - goto err_clk_disable; + goto err_phy_off; } } imx6_setup_phy_mpll(imx6_pcie); return 0; -err_clk_disable: - imx6_pcie_clk_disable(imx6_pcie); err_phy_off: if (imx6_pcie->phy) phy_power_off(imx6_pcie->phy); +err_clk_disable: + imx6_pcie_clk_disable(imx6_pcie); err_reg_disable: if (imx6_pcie->vpcie) regulator_disable(imx6_pcie->vpcie); return ret; } +static void imx6_pcie_host_exit(struct dw_pcie_rp *pp) +{ + struct dw_pcie *pci = to_dw_pcie_from_pp(pp); + struct imx6_pcie *imx6_pcie = to_imx6_pcie(pci); + + if (imx6_pcie->phy) { + if (phy_power_off(imx6_pcie->phy)) + dev_err(pci->dev, "unable to power off PHY\n"); + phy_exit(imx6_pcie->phy); + } + imx6_pcie_clk_disable(imx6_pcie); + + if (imx6_pcie->vpcie) + regulator_disable(imx6_pcie->vpcie); +} + static const struct dw_pcie_host_ops imx6_pcie_host_ops = { .host_init = imx6_pcie_host_init, }; @@ -1008,25 +1031,14 @@ static void imx6_pcie_pm_turnoff(struct imx6_pcie *imx6_pcie) static int imx6_pcie_suspend_noirq(struct device *dev) { struct imx6_pcie *imx6_pcie = dev_get_drvdata(dev); + struct dw_pcie_rp *pp = &imx6_pcie->pci->pp; if (!(imx6_pcie->drvdata->flags & IMX6_PCIE_FLAG_SUPPORTS_SUSPEND)) return 0; imx6_pcie_pm_turnoff(imx6_pcie); - imx6_pcie_ltssm_disable(dev); - imx6_pcie_clk_disable(imx6_pcie); - switch (imx6_pcie->drvdata->variant) { - case IMX8MM: - if (phy_power_off(imx6_pcie->phy)) - dev_err(dev, "unable to power off PHY\n"); - phy_exit(imx6_pcie->phy); - break; - default: - break; - } - - if (imx6_pcie->vpcie) - regulator_disable(imx6_pcie->vpcie); + imx6_pcie_stop_link(imx6_pcie->pci); + imx6_pcie_host_exit(pp); return 0; } From 87f1cecb09ae80ebdab6e4e791a3849a4a0e57f8 Mon Sep 17 
00:00:00 2001 From: Richard Zhu Date: Wed, 18 May 2022 17:35:27 +0800 Subject: [PATCH 0401/1250] PCI: imx6: Set PCIE_DBI_RO_WR_EN before writing DBI registers The PCIE_DBI_RO_WR_EN bit should be set when writing some DBI registers. To make sure that the DBI registers are writable, set the PCIE_DBI_RO_WR_EN properly when writing the DBI registers. Link: https://lore.kernel.org/r/1652866528-13220-1-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pci-imx6.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index 0a81e649b213cd..ce60e87ffbb373 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -830,10 +830,12 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) * started in Gen2 mode, there is a possibility the devices on the * bus will not be detected at all. This happens with PCIe switches. */ + dw_pcie_dbi_ro_wr_en(pci); tmp = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); tmp &= ~PCI_EXP_LNKCAP_SLS; tmp |= PCI_EXP_LNKCAP_SLS_2_5GB; dw_pcie_writel_dbi(pci, offset + PCI_EXP_LNKCAP, tmp); + dw_pcie_dbi_ro_wr_dis(pci); /* Start LTSSM. */ imx6_pcie_ltssm_enable(dev); @@ -844,6 +846,7 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) if (pci->link_gen == 2) { /* Allow Gen2 mode after the link is up. 
*/ + dw_pcie_dbi_ro_wr_en(pci); tmp = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); tmp &= ~PCI_EXP_LNKCAP_SLS; tmp |= PCI_EXP_LNKCAP_SLS_5_0GB; @@ -856,6 +859,7 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) tmp = dw_pcie_readl_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL); tmp |= PORT_LOGIC_SPEED_CHANGE; dw_pcie_writel_dbi(pci, PCIE_LINK_WIDTH_SPEED_CONTROL, tmp); + dw_pcie_dbi_ro_wr_dis(pci); if (imx6_pcie->drvdata->flags & IMX6_PCIE_FLAG_IMX6_SPEED_CHANGE) { From 19f5e788ff02e7401740b73ae4e285bd5bc0ac05 Mon Sep 17 00:00:00 2001 From: Richard Zhu Date: Tue, 12 Jul 2022 14:20:02 -0500 Subject: [PATCH 0402/1250] PCI: imx6: Support more than Gen2 speed link mode Support more than Gen2 speed link mode, since i.MX8MP PCIe supports up to Gen3 link speed. Link: https://lore.kernel.org/r/1652866528-13220-2-git-send-email-hongxing.zhu@nxp.com Signed-off-by: Richard Zhu Signed-off-by: Bjorn Helgaas --- drivers/pci/controller/dwc/pci-imx6.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-imx6.c b/drivers/pci/controller/dwc/pci-imx6.c index ce60e87ffbb373..55e109d1ab27f5 100644 --- a/drivers/pci/controller/dwc/pci-imx6.c +++ b/drivers/pci/controller/dwc/pci-imx6.c @@ -844,8 +844,8 @@ static int imx6_pcie_start_link(struct dw_pcie *pci) if (ret) goto err_reset_phy; - if (pci->link_gen == 2) { - /* Allow Gen2 mode after the link is up. */ + if (pci->link_gen > 1) { + /* Allow faster modes after the link is up */ dw_pcie_dbi_ro_wr_en(pci); tmp = dw_pcie_readl_dbi(pci, offset + PCI_EXP_LNKCAP); tmp &= ~PCI_EXP_LNKCAP_SLS; From 3a1e907afbc25feeab763ae6fd1b03a9aea08cae Mon Sep 17 00:00:00 2001 From: Bryan Brattlof Date: Wed, 25 May 2022 16:36:17 -0500 Subject: [PATCH 0403/1250] thermal/drivers/k3_j72xx_bandgap: Fix ref_table memory leak during probe If an error occurs in the k3_j72xx_bandgap_probe() function the memory allocated to the 'ref_table' will not be released. 
Add a err_free_ref_table step to the error path to free 'ref_table' Fixes: 72b3fc61c752 ("thermal: k3_j72xx_bandgap: Add the bandgap driver support") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Bryan Brattlof Reviewed-by: Keerthy Link: https://lore.kernel.org/r/20220525213617.30002-1-bb@ti.com Signed-off-by: Daniel Lezcano --- drivers/thermal/k3_j72xx_bandgap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index 64e32315895271..3a35aa38ff5120 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -433,7 +433,7 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) GFP_KERNEL); if (!derived_table) { ret = -ENOMEM; - goto err_alloc; + goto err_free_ref_table; } /* Workaround not needed if bit30/bit31 is set even for J721e */ @@ -483,7 +483,7 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) if (IS_ERR(ti_thermal)) { dev_err(bgp->dev, "thermal zone device is NULL\n"); ret = PTR_ERR(ti_thermal); - goto err_alloc; + goto err_free_ref_table; } } @@ -514,6 +514,9 @@ static int k3_j72xx_bandgap_probe(struct platform_device *pdev) return 0; +err_free_ref_table: + kfree(ref_table); + err_alloc: pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); From b4ebc59642b84479cda29111a072963e0865329d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 20 May 2022 18:02:39 +0300 Subject: [PATCH 0404/1250] thermal/drivers/k3_j72xx_bandgap: Fix array underflow in prep_lookup_table() This while loop exits with "i" set to -1 and so then it sets: derived_table[-1] = derived_table[0] - 300; There is no need for this assignment at all. Just delete it. 
Fixes: 72b3fc61c752 ("thermal: k3_j72xx_bandgap: Add the bandgap driver support") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/YoetjwcOEzYEFp9b@kili Signed-off-by: Daniel Lezcano --- drivers/thermal/k3_j72xx_bandgap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index 3a35aa38ff5120..27d4cae44aa730 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -151,8 +151,6 @@ static int prep_lookup_table(struct err_values *err_vals, int *ref_table) /* 300 milli celsius steps */ while (i--) derived_table[i] = derived_table[i + 1] - 300; - /* case 0 */ - derived_table[i] = derived_table[i + 1] - 300; } /* From 5caed9894e60575bdb85b31ed741cbc23bedd8ac Mon Sep 17 00:00:00 2001 From: Jin Xiaoyun Date: Mon, 13 Jun 2022 14:31:11 +0800 Subject: [PATCH 0405/1250] thermal/drivers/k3_j72xx_bandgap: Make k3_j72xx_bandgap_j721e_data and k3_j72xx_bandgap_j7200_data static Fix sparse warnings: drivers/thermal/k3_j72xx_bandgap.c:532:36: sparse: sparse: symbol 'k3_j72xx_bandgap_j721e_data' was not declared. Should it be static? drivers/thermal/k3_j72xx_bandgap.c:536:36: sparse: sparse: symbol 'k3_j72xx_bandgap_j7200_data' was not declared. Should it be static? 
Reported-by: Hulk Robot Signed-off-by: Jin Xiaoyun Link: https://lore.kernel.org/r/20220613063111.654893-1-jinxiaoyun2@huawei.com Signed-off-by: Daniel Lezcano --- drivers/thermal/k3_j72xx_bandgap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/k3_j72xx_bandgap.c b/drivers/thermal/k3_j72xx_bandgap.c index 27d4cae44aa730..115a44eb4fbfbf 100644 --- a/drivers/thermal/k3_j72xx_bandgap.c +++ b/drivers/thermal/k3_j72xx_bandgap.c @@ -530,11 +530,11 @@ static int k3_j72xx_bandgap_remove(struct platform_device *pdev) return 0; } -const struct k3_j72xx_bandgap_data k3_j72xx_bandgap_j721e_data = { +static const struct k3_j72xx_bandgap_data k3_j72xx_bandgap_j721e_data = { .has_errata_i2128 = 1, }; -const struct k3_j72xx_bandgap_data k3_j72xx_bandgap_j7200_data = { +static const struct k3_j72xx_bandgap_data k3_j72xx_bandgap_j7200_data = { .has_errata_i2128 = 0, }; From 95883cb87ce2c467a9e1fe853b5aa73c3eb2d522 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 21 May 2022 13:10:46 +0200 Subject: [PATCH 0406/1250] thermal/drivers/sun8i: Fix typo in comment Spelling mistake (triple letters) in comment. Detected with the help of Coccinelle. Signed-off-by: Julia Lawall Acked-by: Vasily Khoruzhick Reviewed-by: Jernej Skrabec Link: https://lore.kernel.org/r/20220521111145.81697-36-Julia.Lawall@inria.fr Signed-off-by: Daniel Lezcano --- drivers/thermal/sun8i_thermal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c index d9cd23cbb6717e..212c87e63a667c 100644 --- a/drivers/thermal/sun8i_thermal.c +++ b/drivers/thermal/sun8i_thermal.c @@ -237,7 +237,7 @@ static int sun50i_h6_ths_calibrate(struct ths_device *tmdev, * The calibration data on the H6 is the ambient temperature and * sensor values that are filled during the factory test stage. * - * The unit of stored FT temperature is 0.1 degreee celusis. + * The unit of stored FT temperature is 0.1 degree celsius. 
* * We need to calculate a delta between measured and caluclated * register values and this will become a calibration offset. From ac7d746be8b9ad03bfb05f7fc1327b1060061338 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 29 Jun 2022 00:04:08 +0200 Subject: [PATCH 0407/1250] phy: samsung: phy-exynos-pcie: sanitize init/power_on callbacks The exynos-pcie driver called phy_power_on() before phy_init() for some historical reasons. However the generic PHY framework assumes that the proper sequence is to call phy_init() first, then phy_power_on(). The operations done by both functions should be considered as one action and as such they are called by the exynos-pcie driver (without doing anything between them). The initialization is just a sequence of register writes, which cannot be altered without breaking the hardware operation. To match the generic PHY framework requirement, simply move all register writes to the phy_init()/phy_exit() and drop power_on()/power_off() callbacks. This way the driver will also work with the old (incorrect) PHY initialization call sequence. 
Link: https://lore.kernel.org/r/20220628220409.26545-1-m.szyprowski@samsung.com Reported-by: Bjorn Helgaas Signed-off-by: Marek Szyprowski Signed-off-by: Bjorn Helgaas Reviewed-by: Chanho Park Acked-by: Krzysztof Kozlowski Acked-By: Vinod Koul --- drivers/phy/samsung/phy-exynos-pcie.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/drivers/phy/samsung/phy-exynos-pcie.c b/drivers/phy/samsung/phy-exynos-pcie.c index 578cfe07d07abf..53c9230c290783 100644 --- a/drivers/phy/samsung/phy-exynos-pcie.c +++ b/drivers/phy/samsung/phy-exynos-pcie.c @@ -51,6 +51,13 @@ static int exynos5433_pcie_phy_init(struct phy *phy) { struct exynos_pcie_phy *ep = phy_get_drvdata(phy); + regmap_update_bits(ep->pmureg, EXYNOS5433_PMU_PCIE_PHY_OFFSET, + BIT(0), 1); + regmap_update_bits(ep->fsysreg, PCIE_EXYNOS5433_PHY_GLOBAL_RESET, + PCIE_APP_REQ_EXIT_L1_MODE, 0); + regmap_update_bits(ep->fsysreg, PCIE_EXYNOS5433_PHY_L1SUB_CM_CON, + PCIE_REFCLK_GATING_EN, 0); + regmap_update_bits(ep->fsysreg, PCIE_EXYNOS5433_PHY_COMMON_RESET, PCIE_PHY_RESET, 1); regmap_update_bits(ep->fsysreg, PCIE_EXYNOS5433_PHY_MAC_RESET, @@ -109,20 +116,7 @@ static int exynos5433_pcie_phy_init(struct phy *phy) return 0; } -static int exynos5433_pcie_phy_power_on(struct phy *phy) -{ - struct exynos_pcie_phy *ep = phy_get_drvdata(phy); - - regmap_update_bits(ep->pmureg, EXYNOS5433_PMU_PCIE_PHY_OFFSET, - BIT(0), 1); - regmap_update_bits(ep->fsysreg, PCIE_EXYNOS5433_PHY_GLOBAL_RESET, - PCIE_APP_REQ_EXIT_L1_MODE, 0); - regmap_update_bits(ep->fsysreg, PCIE_EXYNOS5433_PHY_L1SUB_CM_CON, - PCIE_REFCLK_GATING_EN, 0); - return 0; -} - -static int exynos5433_pcie_phy_power_off(struct phy *phy) +static int exynos5433_pcie_phy_exit(struct phy *phy) { struct exynos_pcie_phy *ep = phy_get_drvdata(phy); @@ -135,8 +129,7 @@ static int exynos5433_pcie_phy_power_off(struct phy *phy) static const struct phy_ops exynos5433_phy_ops = { .init = exynos5433_pcie_phy_init, - .power_on = 
exynos5433_pcie_phy_power_on, - .power_off = exynos5433_pcie_phy_power_off, + .exit = exynos5433_pcie_phy_exit, .owner = THIS_MODULE, }; From 1357da5bfff7321490010d21b3e06ec1721b3513 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 29 Jun 2022 00:04:09 +0200 Subject: [PATCH 0408/1250] PCI: exynos: Correct generic PHY usage The proper initialization for generic PHYs is to call first phy_init(), then phy_power_on(). While touching this, remove the phy_reset() call. It is just a left-over from the obsoleted Exynos5440 support and the current exynos-pcie PHY driver doesn't even support this function. It is also rarely used by other drivers. Link: https://lore.kernel.org/r/20220628220409.26545-2-m.szyprowski@samsung.com Reported-by: Bjorn Helgaas Signed-off-by: Marek Szyprowski Signed-off-by: Bjorn Helgaas Reviewed-by: Chanho Park Acked-by: Krzysztof Kozlowski --- drivers/pci/controller/dwc/pci-exynos.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/controller/dwc/pci-exynos.c b/drivers/pci/controller/dwc/pci-exynos.c index 2044d191fba6b6..6d0742207f43e7 100644 --- a/drivers/pci/controller/dwc/pci-exynos.c +++ b/drivers/pci/controller/dwc/pci-exynos.c @@ -258,9 +258,8 @@ static int exynos_pcie_host_init(struct dw_pcie_rp *pp) exynos_pcie_assert_core_reset(ep); - phy_reset(ep->phy); - phy_power_on(ep->phy); phy_init(ep->phy); + phy_power_on(ep->phy); exynos_pcie_deassert_core_reset(ep); exynos_pcie_enable_irq_pulse(ep); From 5fe24f83038168b28cd2eeb3dbc9744033ce2662 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 14 Jul 2022 14:02:40 +0900 Subject: [PATCH 0409/1250] kbuild: rpm-pkg: fix build error when _arch is undefined Cross-building (bin)rpm-pkg fails on several architectures. For example, 'make ARCH=arm binrpm-pkg' fails like follows: sh ./scripts/package/mkspec prebuilt > ./binkernel.spec rpmbuild --define "_builddir ." 
--target \ arm -bb ./binkernel.spec Building target platforms: arm Building for target arm warning: line 19: It's not recommended to have unversioned Obsoletes: Obsoletes: kernel-headers Executing(%install): /bin/sh -e /var/tmp/rpm-tmp.0S8t2F + umask 022 + cd . + mkdir -p /home/masahiro/rpmbuild/BUILDROOT/kernel-5.19.0_rc6-19.%{_arch}/boot + make -f ./Makefile image_name + cp arch/arm/boot/zImage /home/masahiro/rpmbuild/BUILDROOT/kernel-5.19.0_rc6-19.%{_arch}/boot/vmlinuz-5.19.0-rc6 + make -f ./Makefile INSTALL_MOD_PATH=/home/masahiro/rpmbuild/BUILDROOT/kernel-5.19.0_rc6-19.%{_arch} modules_install make[3]: *** No rule to make target '/home/masahiro/rpmbuild/BUILDROOT/kernel-5.19.0_rc6-19.arch/arm/crypto/aes-arm-bs.ko{_arch}/lib/modules/5.19.0-rc6/kernel/%', needed by '__modinst'. Stop. make[2]: *** [Makefile:1768: modules_install] Error 2 error: Bad exit status from /var/tmp/rpm-tmp.0S8t2F (%install) By default, 'buildroot' contains %{_arch} (see /usr/lib/rpm/macros). _arch is generally defined in /usr/lib/rpm/platforms/*/macros, where the platform sub-directory is specified by --target= option for cross builds. If the given arch does not exist, %{_arch} is not expanded. In the example above, --target=arm is passed to rpmbuild, but /usr/lib/rpm/platforms/arm-linux/ does not exist. The '%' character in the path confuses GNU make and rpmbuild. The same occurs for such architectures as csky, microblaze, nios2, etc. Define _arch if it has not been defined. 
Reported-by: Jason Self Signed-off-by: Masahiro Yamada --- scripts/package/mkspec | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/package/mkspec b/scripts/package/mkspec index 7c477ca7dc9826..8fa7c5b8a1a15d 100755 --- a/scripts/package/mkspec +++ b/scripts/package/mkspec @@ -49,6 +49,9 @@ sed -e '/^DEL/d' -e 's/^\t*//' < Date: Thu, 14 Jul 2022 14:02:41 +0900 Subject: [PATCH 0410/1250] kbuild: rpm-pkg: pass 'linux' to --target option of rpmbuild Presumably, _target_os is defined even if the --target flag does not specify it, but it is better to make it explicit. Signed-off-by: Masahiro Yamada --- scripts/Makefile.package | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/Makefile.package b/scripts/Makefile.package index 77b612183c082a..5017f6b2da809c 100644 --- a/scripts/Makefile.package +++ b/scripts/Makefile.package @@ -56,7 +56,7 @@ rpm-pkg: $(MAKE) clean $(CONFIG_SHELL) $(MKSPEC) >$(objtree)/kernel.spec $(call cmd,src_tar,$(KERNELPATH),kernel.spec) - +rpmbuild $(RPMOPTS) --target $(UTS_MACHINE) -ta $(KERNELPATH).tar.gz \ + +rpmbuild $(RPMOPTS) --target $(UTS_MACHINE)-linux -ta $(KERNELPATH).tar.gz \ --define='_smp_mflags %{nil}' # binrpm-pkg @@ -66,7 +66,7 @@ binrpm-pkg: $(MAKE) -f $(srctree)/Makefile $(CONFIG_SHELL) $(MKSPEC) prebuilt > $(objtree)/binkernel.spec +rpmbuild $(RPMOPTS) --define "_builddir $(objtree)" --target \ - $(UTS_MACHINE) -bb $(objtree)/binkernel.spec + $(UTS_MACHINE)-linux -bb $(objtree)/binkernel.spec PHONY += deb-pkg deb-pkg: From 058b3d34eb21bd7faf0fa03c6b9652dfd6152bb8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 14 Jul 2022 14:02:42 +0900 Subject: [PATCH 0411/1250] kbuild: error out if $(KBUILD_EXTMOD) contains % or : If the directory path given to KBUILD_EXTMOD (or M=) contains % or :, the module fails to build. % is used in pattern rules, and : as the separator of dependencies. Bail out with a clearer error message. 
Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index f8e325709bf37e..dee707c98bbe9d 100644 --- a/Makefile +++ b/Makefile @@ -129,6 +129,9 @@ endif $(if $(word 2, $(KBUILD_EXTMOD)), \ $(error building multiple external modules is not supported)) +$(foreach x, % :, $(if $(findstring $x, $(KBUILD_EXTMOD)), \ + $(error module directory path cannot contain '$x'))) + # Remove trailing slashes ifneq ($(filter %/, $(KBUILD_EXTMOD)),) KBUILD_EXTMOD := $(shell dirname $(KBUILD_EXTMOD).) From e8c79d98b73cb7d17b1504ba966cdaebc369c13e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 14 Jul 2022 14:02:43 +0900 Subject: [PATCH 0412/1250] kbuild: error out if $(INSTALL_MOD_PATH) contains % or : If the directory path given to INSTALL_MOD_PATH contains % or :, the module_install fails. % is used in pattern rules, and : as the separator of dependencies. Bail out with a clearer error message. Signed-off-by: Masahiro Yamada Reviewed-by: Nicolas Schier --- scripts/Makefile.modinst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/Makefile.modinst b/scripts/Makefile.modinst index 16a02e9237d360..a4c987c23750f6 100644 --- a/scripts/Makefile.modinst +++ b/scripts/Makefile.modinst @@ -18,6 +18,9 @@ INSTALL_MOD_DIR ?= extra dst := $(MODLIB)/$(INSTALL_MOD_DIR) endif +$(foreach x, % :, $(if $(findstring $x, $(dst)), \ + $(error module installation path cannot contain '$x'))) + suffix-y := suffix-$(CONFIG_MODULE_COMPRESS_GZIP) := .gz suffix-$(CONFIG_MODULE_COMPRESS_XZ) := .xz From 4d4bf485cca92edd1a77c9022f66a7657ec2b8fe Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:00 +0200 Subject: [PATCH 0413/1250] thermal/core: Remove duplicate information when an error occurs The pr_err already tells it is an error, it is pointless to add the 'Error:' string in the messages. Remove them. 
Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20220710123512.1714714-2-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index cdc0552e8c42e1..e22e7d939c5485 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1198,23 +1198,23 @@ thermal_zone_device_register(const char *type, int trips, int mask, struct thermal_governor *governor; if (!type || strlen(type) == 0) { - pr_err("Error: No thermal zone type defined\n"); + pr_err("No thermal zone type defined\n"); return ERR_PTR(-EINVAL); } if (type && strlen(type) >= THERMAL_NAME_LENGTH) { - pr_err("Error: Thermal zone name (%s) too long, should be under %d chars\n", + pr_err("Thermal zone name (%s) too long, should be under %d chars\n", type, THERMAL_NAME_LENGTH); return ERR_PTR(-EINVAL); } if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips) { - pr_err("Error: Incorrect number of thermal trips\n"); + pr_err("Incorrect number of thermal trips\n"); return ERR_PTR(-EINVAL); } if (!ops) { - pr_err("Error: Thermal zone device ops not defined\n"); + pr_err("Thermal zone device ops not defined\n"); return ERR_PTR(-EINVAL); } From 853881e4395ba3432d1d67a15e35badd4acfd15f Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:01 +0200 Subject: [PATCH 0414/1250] thermal/of: Replace device node match with device node search The thermal_of code builds a trip array associated with the node pointer in order to compare the trip point phandle with the list. The thermal trip is a thermal zone property and should be moved there. If some sensors have hardcoded trip points, they should use the exported structure instead of redefining again and again their own structure and data to describe exactly the same things. 
In order to move this to the thermal.h header and allow more cleanup, we need to remove the node pointer from the structure. Instead of storing the device node, we search the device tree directly for the corresponding node. That simplifies the code and allows the structure to be moved to thermal.h. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220710123512.1714714-3-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_of.c | 64 +++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index b65d435cb92f63..9295bc59998f7c 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -671,6 +671,35 @@ EXPORT_SYMBOL_GPL(devm_thermal_zone_of_sensor_unregister); /*** functions parsing device tree nodes ***/ +static int of_find_trip_id(struct device_node *np, struct device_node *trip) +{ + struct device_node *trips; + struct device_node *t; + int i = 0; + + trips = of_get_child_by_name(np, "trips"); + if (!trips) { + pr_err("Failed to find 'trips' node\n"); + return -EINVAL; + } + + /* + * Find the trip id point associated with the cooling device map + */ + for_each_child_of_node(trips, t) { + + if (t == trip) + goto out; + i++; + } + + i = -ENXIO; +out: + of_node_put(trips); + + return i; +} + /** * thermal_of_populate_bind_params - parse and fill cooling map data * @np: DT node containing a cooling-map node @@ -686,14 +715,13 @@ EXPORT_SYMBOL_GPL(devm_thermal_zone_of_sensor_unregister); * Return: 0 on success, proper error code otherwise */ static int thermal_of_populate_bind_params(struct device_node *np, - struct __thermal_bind_params *__tbp, - struct thermal_trip *trips, - int ntrips) + struct __thermal_bind_params *__tbp) { struct of_phandle_args cooling_spec; struct __thermal_cooling_bind_param *__tcbp; struct device_node *trip; int ret, 
i, count; + int trip_id; u32 prop; /* Default weight. Usage is optional */ @@ -708,18 +736,14 @@ static int thermal_of_populate_bind_params(struct device_node *np, return -ENODEV; } - /* match using device_node */ - for (i = 0; i < ntrips; i++) - if (trip == trips[i].np) { - __tbp->trip_id = i; - break; - } - - if (i == ntrips) { - ret = -ENODEV; + trip_id = of_find_trip_id(np, trip); + if (trip_id < 0) { + ret = trip_id; goto end; } + __tbp->trip_id = trip_id; + count = of_count_phandle_with_args(np, "cooling-device", "#cooling-cells"); if (count <= 0) { @@ -868,6 +892,7 @@ static struct __thermal_zone __init *thermal_of_build_thermal_zone(struct device_node *np) { struct device_node *child = NULL, *gchild; + struct device_node *trips; struct __thermal_zone *tz; int ret, i; u32 prop, coef[2]; @@ -910,13 +935,13 @@ __init *thermal_of_build_thermal_zone(struct device_node *np) } /* trips */ - child = of_get_child_by_name(np, "trips"); + trips = of_get_child_by_name(np, "trips"); /* No trips provided */ - if (!child) + if (!trips) goto finish; - tz->ntrips = of_get_child_count(child); + tz->ntrips = of_get_child_count(trips); if (tz->ntrips == 0) /* must have at least one child */ goto finish; @@ -927,14 +952,12 @@ __init *thermal_of_build_thermal_zone(struct device_node *np) } i = 0; - for_each_child_of_node(child, gchild) { + for_each_child_of_node(trips, gchild) { ret = thermal_of_populate_trip(gchild, &tz->trips[i++]); if (ret) goto free_trips; } - of_node_put(child); - /* cooling-maps */ child = of_get_child_by_name(np, "cooling-maps"); @@ -954,13 +977,13 @@ __init *thermal_of_build_thermal_zone(struct device_node *np) i = 0; for_each_child_of_node(child, gchild) { - ret = thermal_of_populate_bind_params(gchild, &tz->tbps[i++], - tz->trips, tz->ntrips); + ret = thermal_of_populate_bind_params(gchild, &tz->tbps[i++]); if (ret) goto free_tbps; } finish: + of_node_put(trips); of_node_put(child); return tz; @@ -981,6 +1004,7 @@ __init 
*thermal_of_build_thermal_zone(struct device_node *np) for (i = 0; i < tz->ntrips; i++) of_node_put(tz->trips[i].np); kfree(tz->trips); + of_node_put(trips); of_node_put(gchild); free_tz: kfree(tz); From 0401713606abd65c7d40c7a9fa14b4a5bebae86f Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:02 +0200 Subject: [PATCH 0415/1250] thermal/of: Remove the device node pointer for thermal_trip The device node pointer is no longer needed in the thermal trip structure, remove it. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20220710123512.1714714-4-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_core.h | 2 -- drivers/thermal/thermal_of.c | 8 -------- 2 files changed, 10 deletions(-) diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 726e327b4205fa..ff10cdda056c3f 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -70,13 +70,11 @@ void __thermal_cdev_update(struct thermal_cooling_device *cdev); /** * struct thermal_trip - representation of a point in temperature domain - * @np: pointer to struct device_node that this trip point was created from * @temperature: temperature value in miliCelsius * @hysteresis: relative hysteresis in miliCelsius * @type: trip point type */ struct thermal_trip { - struct device_node *np; int temperature; int hysteresis; enum thermal_trip_type type; diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index 9295bc59998f7c..0d04474ed951cf 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -867,10 +867,6 @@ static int thermal_of_populate_trip(struct device_node *np, return ret; } - /* Required for cooling map matching */ - trip->np = np; - of_node_get(np); - return 0; } @@ -1001,8 +997,6 @@ __init *thermal_of_build_thermal_zone(struct device_node *np) kfree(tz->tbps); free_trips: - for (i = 0; i < 
tz->ntrips; i++) - of_node_put(tz->trips[i].np); kfree(tz->trips); of_node_put(trips); of_node_put(gchild); @@ -1028,8 +1022,6 @@ static __init void of_thermal_free_zone(struct __thermal_zone *tz) } kfree(tz->tbps); - for (i = 0; i < tz->ntrips; i++) - of_node_put(tz->trips[i].np); kfree(tz->trips); kfree(tz); } From 18c51d1fea6d3c00eb71321923e1e0b09d60ee47 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:03 +0200 Subject: [PATCH 0416/1250] thermal/of: Move thermal_trip structure to thermal.h The structure thermal_trip is now generic and will be usable by the different sensor drivers in place of their own structure. Move its definition to thermal.h to make it accessible. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20220710123512.1714714-5-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_core.h | 12 ------------ include/linux/thermal.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index ff10cdda056c3f..60844e2d59bbe3 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -68,18 +68,6 @@ static inline bool cdev_is_power_actor(struct thermal_cooling_device *cdev) void thermal_cdev_update(struct thermal_cooling_device *); void __thermal_cdev_update(struct thermal_cooling_device *cdev); -/** - * struct thermal_trip - representation of a point in temperature domain - * @temperature: temperature value in miliCelsius - * @hysteresis: relative hysteresis in miliCelsius - * @type: trip point type - */ -struct thermal_trip { - int temperature; - int hysteresis; - enum thermal_trip_type type; -}; - int get_tz_trend(struct thermal_zone_device *tz, int trip); struct thermal_instance * diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 231bac2768fb79..7e66970f0464da 100644 --- 
a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -78,6 +78,18 @@ struct thermal_zone_device_ops { void (*critical)(struct thermal_zone_device *); }; +/** + * struct thermal_trip - representation of a point in temperature domain + * @temperature: temperature value in miliCelsius + * @hysteresis: relative hysteresis in miliCelsius + * @type: trip point type + */ +struct thermal_trip { + int temperature; + int hysteresis; + enum thermal_trip_type type; +}; + struct thermal_cooling_device_ops { int (*get_max_state) (struct thermal_cooling_device *, unsigned long *); int (*get_cur_state) (struct thermal_cooling_device *, unsigned long *); From 84cf997c6de5cd78516a05b7a1e43e550284fd40 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:04 +0200 Subject: [PATCH 0417/1250] thermal/core: Remove unneeded EXPORT_SYMBOLS Different functions are exporting the symbols but are actually only used by the thermal framework internals. Remove these EXPORT_SYMBOLS. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220710123512.1714714-6-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_helpers.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c index 3edd047e144f01..f4c1e87ef040e0 100644 --- a/drivers/thermal/thermal_helpers.c +++ b/drivers/thermal/thermal_helpers.c @@ -39,7 +39,6 @@ int get_tz_trend(struct thermal_zone_device *tz, int trip) return trend; } -EXPORT_SYMBOL(get_tz_trend); struct thermal_instance * get_thermal_instance(struct thermal_zone_device *tz, @@ -228,7 +227,6 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev) } mutex_unlock(&cdev->lock); } -EXPORT_SYMBOL(thermal_cdev_update); /** * thermal_zone_get_slope - return the slope attribute of the thermal zone From 44bfc6c5a7783c4f4e484a0473aacc3e58923cfd Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 
2022 14:35:05 +0200 Subject: [PATCH 0418/1250] thermal/core: Move thermal_set_delay_jiffies to static The function 'thermal_set_delay_jiffies' is only used in thermal_core.c but it is defined and implemented in a separate file. Move the function to thermal_core.c and make it static. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20220710123512.1714714-7-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_core.c | 7 +++++++ drivers/thermal/thermal_core.h | 1 - drivers/thermal/thermal_helpers.c | 7 ------- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index e22e7d939c5485..a8b1628937c62f 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1159,6 +1159,13 @@ static void bind_tz(struct thermal_zone_device *tz) mutex_unlock(&thermal_list_lock); } +static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms) +{ + *delay_jiffies = msecs_to_jiffies(delay_ms); + if (delay_ms > 1000) + *delay_jiffies = round_jiffies(*delay_jiffies); +} + /** * thermal_zone_device_register() - register a new thermal zone device * @type: the thermal zone device type diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 60844e2d59bbe3..c991bb290512ca 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -112,7 +112,6 @@ int thermal_build_list_of_policies(char *buf); /* Helpers */ void thermal_zone_set_trips(struct thermal_zone_device *tz); -void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms); /* sysfs I/F */ int thermal_zone_create_device_groups(struct thermal_zone_device *, int); diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c index f4c1e87ef040e0..60bfda1a1db781 100644 --- a/drivers/thermal/thermal_helpers.c +++ 
b/drivers/thermal/thermal_helpers.c @@ -174,13 +174,6 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz) mutex_unlock(&tz->lock); } -void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms) -{ - *delay_jiffies = msecs_to_jiffies(delay_ms); - if (delay_ms > 1000) - *delay_jiffies = round_jiffies(*delay_jiffies); -} - static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev, int target) { From c42ceda3d1a9de0222b38d61a8db9902da93f083 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:06 +0200 Subject: [PATCH 0419/1250] thermal/core: Rename trips to num_trips In order to use thermal trips defined in the thermal structure, rename the 'trips' field to 'num_trips' so that the 'trips' name is free to hold the thermal trip points. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220710123512.1714714-8-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/gov_fair_share.c | 6 +++--- drivers/thermal/gov_power_allocator.c | 4 ++-- drivers/thermal/tegra/tegra30-tsensor.c | 2 +- drivers/thermal/thermal_core.c | 20 ++++++++++---------- drivers/thermal/thermal_helpers.c | 4 ++-- drivers/thermal/thermal_netlink.c | 2 +- drivers/thermal/thermal_sysfs.c | 22 +++++++++++----------- include/linux/thermal.h | 4 ++-- 8 files changed, 32 insertions(+), 32 deletions(-) diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c index 1e5abf4822bed2..6a2abcfc648f7f 100644 --- a/drivers/thermal/gov_fair_share.c +++ b/drivers/thermal/gov_fair_share.c @@ -25,10 +25,10 @@ static int get_trip_level(struct thermal_zone_device *tz) int trip_temp; enum thermal_trip_type trip_type; - if (tz->trips == 0 || !tz->ops->get_trip_temp) + if (tz->num_trips == 0 || !tz->ops->get_trip_temp) return 0; - for (count = 0; count < tz->trips; count++) { + for (count = 0; count < tz->num_trips; count++) { tz->ops->get_trip_temp(tz, count, &trip_temp); if
(tz->temperature < trip_temp) break; @@ -53,7 +53,7 @@ static long get_target_state(struct thermal_zone_device *tz, cdev->ops->get_max_state(cdev, &max_state); - return (long)(percentage * level * max_state) / (100 * tz->trips); + return (long)(percentage * level * max_state) / (100 * tz->num_trips); } /** diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c index 13e375751d2296..1d505247096728 100644 --- a/drivers/thermal/gov_power_allocator.c +++ b/drivers/thermal/gov_power_allocator.c @@ -527,7 +527,7 @@ static void get_governor_trips(struct thermal_zone_device *tz, last_active = INVALID_TRIP; last_passive = INVALID_TRIP; - for (i = 0; i < tz->trips; i++) { + for (i = 0; i < tz->num_trips; i++) { enum thermal_trip_type type; int ret; @@ -668,7 +668,7 @@ static int power_allocator_bind(struct thermal_zone_device *tz) get_governor_trips(tz, params); - if (tz->trips > 0) { + if (tz->num_trips > 0) { ret = tz->ops->get_trip_temp(tz, params->trip_max_desired_temperature, &control_temp); diff --git a/drivers/thermal/tegra/tegra30-tsensor.c b/drivers/thermal/tegra/tegra30-tsensor.c index 9b6b693cbcf850..05886684f42956 100644 --- a/drivers/thermal/tegra/tegra30-tsensor.c +++ b/drivers/thermal/tegra/tegra30-tsensor.c @@ -316,7 +316,7 @@ static void tegra_tsensor_get_hw_channel_trips(struct thermal_zone_device *tzd, *hot_trip = 85000; *crit_trip = 90000; - for (i = 0; i < tzd->trips; i++) { + for (i = 0; i < tzd->num_trips; i++) { enum thermal_trip_type type; int trip_temp; diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index a8b1628937c62f..cb9b1bd03bd33c 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -505,7 +505,7 @@ void thermal_zone_device_update(struct thermal_zone_device *tz, tz->notify_event = event; - for (count = 0; count < tz->trips; count++) + for (count = 0; count < tz->num_trips; count++) handle_thermal_trip(tz, count); } 
EXPORT_SYMBOL_GPL(thermal_zone_device_update); @@ -630,7 +630,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, unsigned long max_state; int result, ret; - if (trip >= tz->trips || trip < 0) + if (trip >= tz->num_trips || trip < 0) return -EINVAL; list_for_each_entry(pos1, &thermal_tz_list, node) { @@ -811,7 +811,7 @@ static void __bind(struct thermal_zone_device *tz, int mask, { int i, ret; - for (i = 0; i < tz->trips; i++) { + for (i = 0; i < tz->num_trips; i++) { if (mask & (1 << i)) { unsigned long upper, lower; @@ -1057,7 +1057,7 @@ static void __unbind(struct thermal_zone_device *tz, int mask, { int i; - for (i = 0; i < tz->trips; i++) + for (i = 0; i < tz->num_trips; i++) if (mask & (1 << i)) thermal_zone_unbind_cooling_device(tz, i, cdev); } @@ -1169,7 +1169,7 @@ static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms /** * thermal_zone_device_register() - register a new thermal zone device * @type: the thermal zone device type - * @trips: the number of trip points the thermal zone support + * @num_trips: the number of trip points the thermal zone support * @mask: a bit string indicating the writeablility of trip points * @devdata: private device data * @ops: standard thermal zone device callbacks @@ -1191,7 +1191,7 @@ static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms * IS_ERR*() helpers. 
*/ struct thermal_zone_device * -thermal_zone_device_register(const char *type, int trips, int mask, +thermal_zone_device_register(const char *type, int num_trips, int mask, void *devdata, struct thermal_zone_device_ops *ops, struct thermal_zone_params *tzp, int passive_delay, int polling_delay) @@ -1215,7 +1215,7 @@ thermal_zone_device_register(const char *type, int trips, int mask, return ERR_PTR(-EINVAL); } - if (trips > THERMAL_MAX_TRIPS || trips < 0 || mask >> trips) { + if (num_trips > THERMAL_MAX_TRIPS || num_trips < 0 || mask >> num_trips) { pr_err("Incorrect number of thermal trips\n"); return ERR_PTR(-EINVAL); } @@ -1225,7 +1225,7 @@ thermal_zone_device_register(const char *type, int trips, int mask, return ERR_PTR(-EINVAL); } - if (trips > 0 && (!ops->get_trip_type || !ops->get_trip_temp)) + if (num_trips > 0 && (!ops->get_trip_type || !ops->get_trip_temp)) return ERR_PTR(-EINVAL); tz = kzalloc(sizeof(*tz), GFP_KERNEL); @@ -1255,7 +1255,7 @@ thermal_zone_device_register(const char *type, int trips, int mask, tz->tzp = tzp; tz->device.class = &thermal_class; tz->devdata = devdata; - tz->trips = trips; + tz->num_trips = num_trips; thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay); thermal_set_delay_jiffies(&tz->polling_delay_jiffies, polling_delay); @@ -1273,7 +1273,7 @@ thermal_zone_device_register(const char *type, int trips, int mask, if (result) goto release_device; - for (count = 0; count < trips; count++) { + for (count = 0; count < num_trips; count++) { if (tz->ops->get_trip_type(tz, count, &trip_type) || tz->ops->get_trip_temp(tz, count, &trip_temp) || !trip_temp) diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c index 60bfda1a1db781..690890f054a397 100644 --- a/drivers/thermal/thermal_helpers.c +++ b/drivers/thermal/thermal_helpers.c @@ -89,7 +89,7 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp) ret = tz->ops->get_temp(tz, temp); if (IS_ENABLED(CONFIG_THERMAL_EMULATION) && 
tz->emul_temperature) { - for (count = 0; count < tz->trips; count++) { + for (count = 0; count < tz->num_trips; count++) { ret = tz->ops->get_trip_type(tz, count, &type); if (!ret && type == THERMAL_TRIP_CRITICAL) { ret = tz->ops->get_trip_temp(tz, count, @@ -137,7 +137,7 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz) if (!tz->ops->set_trips || !tz->ops->get_trip_hyst) goto exit; - for (i = 0; i < tz->trips; i++) { + for (i = 0; i < tz->num_trips; i++) { int trip_low; tz->ops->get_trip_temp(tz, i, &trip_temp); diff --git a/drivers/thermal/thermal_netlink.c b/drivers/thermal/thermal_netlink.c index 32fea5174cc0d7..050d243a5fa1f9 100644 --- a/drivers/thermal/thermal_netlink.c +++ b/drivers/thermal/thermal_netlink.c @@ -469,7 +469,7 @@ static int thermal_genl_cmd_tz_get_trip(struct param *p) mutex_lock(&tz->lock); - for (i = 0; i < tz->trips; i++) { + for (i = 0; i < tz->num_trips; i++) { enum thermal_trip_type type; int temp, hyst = 0; diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c index 1c4aac8464a709..5018459e8dd940 100644 --- a/drivers/thermal/thermal_sysfs.c +++ b/drivers/thermal/thermal_sysfs.c @@ -416,15 +416,15 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask) int indx; /* This function works only for zones with at least one trip */ - if (tz->trips <= 0) + if (tz->num_trips <= 0) return -EINVAL; - tz->trip_type_attrs = kcalloc(tz->trips, sizeof(*tz->trip_type_attrs), + tz->trip_type_attrs = kcalloc(tz->num_trips, sizeof(*tz->trip_type_attrs), GFP_KERNEL); if (!tz->trip_type_attrs) return -ENOMEM; - tz->trip_temp_attrs = kcalloc(tz->trips, sizeof(*tz->trip_temp_attrs), + tz->trip_temp_attrs = kcalloc(tz->num_trips, sizeof(*tz->trip_temp_attrs), GFP_KERNEL); if (!tz->trip_temp_attrs) { kfree(tz->trip_type_attrs); @@ -432,7 +432,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask) } if (tz->ops->get_trip_hyst) { - tz->trip_hyst_attrs = kcalloc(tz->trips, + 
tz->trip_hyst_attrs = kcalloc(tz->num_trips, sizeof(*tz->trip_hyst_attrs), GFP_KERNEL); if (!tz->trip_hyst_attrs) { @@ -442,7 +442,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask) } } - attrs = kcalloc(tz->trips * 3 + 1, sizeof(*attrs), GFP_KERNEL); + attrs = kcalloc(tz->num_trips * 3 + 1, sizeof(*attrs), GFP_KERNEL); if (!attrs) { kfree(tz->trip_type_attrs); kfree(tz->trip_temp_attrs); @@ -451,7 +451,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask) return -ENOMEM; } - for (indx = 0; indx < tz->trips; indx++) { + for (indx = 0; indx < tz->num_trips; indx++) { /* create trip type attribute */ snprintf(tz->trip_type_attrs[indx].name, THERMAL_NAME_LENGTH, "trip_point_%d_type", indx); @@ -478,7 +478,7 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask) tz->trip_temp_attrs[indx].attr.store = trip_point_temp_store; } - attrs[indx + tz->trips] = &tz->trip_temp_attrs[indx].attr.attr; + attrs[indx + tz->num_trips] = &tz->trip_temp_attrs[indx].attr.attr; /* create Optional trip hyst attribute */ if (!tz->ops->get_trip_hyst) @@ -496,10 +496,10 @@ static int create_trip_attrs(struct thermal_zone_device *tz, int mask) tz->trip_hyst_attrs[indx].attr.store = trip_point_hyst_store; } - attrs[indx + tz->trips * 2] = + attrs[indx + tz->num_trips * 2] = &tz->trip_hyst_attrs[indx].attr.attr; } - attrs[tz->trips * 3] = NULL; + attrs[tz->num_trips * 3] = NULL; tz->trips_attribute_group.attrs = attrs; @@ -540,7 +540,7 @@ int thermal_zone_create_device_groups(struct thermal_zone_device *tz, for (i = 0; i < size - 2; i++) groups[i] = thermal_zone_attribute_groups[i]; - if (tz->trips) { + if (tz->num_trips) { result = create_trip_attrs(tz, mask); if (result) { kfree(groups); @@ -561,7 +561,7 @@ void thermal_zone_destroy_device_groups(struct thermal_zone_device *tz) if (!tz) return; - if (tz->trips) + if (tz->num_trips) destroy_trip_attrs(tz); kfree(tz->device.groups); diff --git a/include/linux/thermal.h 
b/include/linux/thermal.h index 7e66970f0464da..ae579a70cc1a33 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -123,7 +123,7 @@ struct thermal_cooling_device { * @trip_hyst_attrs: attributes for trip points for sysfs: trip hysteresis * @mode: current mode of this thermal zone * @devdata: private pointer for device private data - * @trips: number of trip points the thermal zone supports + * @num_trips: number of trip points the thermal zone supports * @trips_disabled; bitmap for disabled trips * @passive_delay_jiffies: number of jiffies to wait between polls when * performing passive cooling. @@ -163,7 +163,7 @@ struct thermal_zone_device { struct thermal_attr *trip_hyst_attrs; enum thermal_device_mode mode; void *devdata; - int trips; + int num_trips; unsigned long trips_disabled; /* bitmap for disabled trips */ unsigned long passive_delay_jiffies; unsigned long polling_delay_jiffies; From b0e0e608c1de79085f483874308a5a3983fffc86 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:08 +0200 Subject: [PATCH 0420/1250] thermal/core: Add thermal_trip in thermal_zone The thermal trip points are properties of a thermal zone and the different sub systems should be able to save them in the thermal zone structure instead of having their own definition. Give the opportunity to the drivers to create a thermal zone with thermal trips which will be accessible directly from the thermal core framework. 
Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220710123512.1714714-10-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_core.h | 10 ++++++++++ include/linux/thermal.h | 2 ++ 2 files changed, 12 insertions(+) diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index c991bb290512ca..a4e730391cab4d 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -113,6 +113,16 @@ int thermal_build_list_of_policies(char *buf); /* Helpers */ void thermal_zone_set_trips(struct thermal_zone_device *tz); +static inline struct thermal_trip *thermal_zone_get_trips(struct thermal_zone_device *tz) +{ + return tz->trips; +} + +static inline int thermal_zone_get_num_trips(struct thermal_zone_device *tz) +{ + return tz->num_trips; +} + /* sysfs I/F */ int thermal_zone_create_device_groups(struct thermal_zone_device *, int); void thermal_zone_destroy_device_groups(struct thermal_zone_device *); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index ae579a70cc1a33..3b2d31c8ea2011 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -123,6 +123,7 @@ struct thermal_cooling_device { * @trip_hyst_attrs: attributes for trip points for sysfs: trip hysteresis * @mode: current mode of this thermal zone * @devdata: private pointer for device private data + * @trips: an array of struct thermal_trip * @num_trips: number of trip points the thermal zone supports * @trips_disabled; bitmap for disabled trips * @passive_delay_jiffies: number of jiffies to wait between polls when @@ -163,6 +164,7 @@ struct thermal_zone_device { struct thermal_attr *trip_hyst_attrs; enum thermal_device_mode mode; void *devdata; + struct thermal_trip *trips; int num_trips; unsigned long trips_disabled; /* bitmap for disabled trips */ unsigned long passive_delay_jiffies; From 2a1c450038f45a3bce8c69f6a319e3ac22fb8873 Mon Sep 17 00:00:00 2001 From: Daniel 
Lezcano Date: Sun, 10 Jul 2022 14:35:09 +0200 Subject: [PATCH 0421/1250] thermal/core: Register with the trip points As we added the thermal trip points structure in the thermal zone, let's extend the thermal zone register function to have the thermal trip structures as a parameter and store it in the 'trips' field of the thermal zone structure. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220710123512.1714714-11-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_core.c | 22 +++++++++++++++++----- include/linux/thermal.h | 6 ++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index cb9b1bd03bd33c..0d9e9b175f9399 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1167,8 +1167,9 @@ static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms } /** - * thermal_zone_device_register() - register a new thermal zone device + * thermal_zone_device_register_with_trips() - register a new thermal zone device * @type: the thermal zone device type + * @trips: a pointer to an array of thermal trips * @num_trips: the number of trip points the thermal zone support * @mask: a bit string indicating the writeablility of trip points * @devdata: private device data @@ -1191,10 +1192,10 @@ static void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms * IS_ERR*() helpers. 
*/ struct thermal_zone_device * -thermal_zone_device_register(const char *type, int num_trips, int mask, - void *devdata, struct thermal_zone_device_ops *ops, - struct thermal_zone_params *tzp, int passive_delay, - int polling_delay) +thermal_zone_device_register_with_trips(const char *type, struct thermal_trip *trips, int num_trips, int mask, + void *devdata, struct thermal_zone_device_ops *ops, + struct thermal_zone_params *tzp, int passive_delay, + int polling_delay) { struct thermal_zone_device *tz; enum thermal_trip_type trip_type; @@ -1255,6 +1256,7 @@ thermal_zone_device_register(const char *type, int num_trips, int mask, tz->tzp = tzp; tz->device.class = &thermal_class; tz->devdata = devdata; + tz->trips = trips; tz->num_trips = num_trips; thermal_set_delay_jiffies(&tz->passive_delay_jiffies, passive_delay); @@ -1331,6 +1333,16 @@ thermal_zone_device_register(const char *type, int num_trips, int mask, kfree(tz); return ERR_PTR(result); } + +struct thermal_zone_device *thermal_zone_device_register(const char *type, int ntrips, int mask, + void *devdata, struct thermal_zone_device_ops *ops, + struct thermal_zone_params *tzp, int passive_delay, + int polling_delay) +{ + return thermal_zone_device_register_with_trips(type, NULL, ntrips, mask, + devdata, ops, tzp, + passive_delay, polling_delay); +} EXPORT_SYMBOL_GPL(thermal_zone_device_register); /** diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 3b2d31c8ea2011..1386c713885d87 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -378,8 +378,14 @@ void devm_thermal_zone_of_sensor_unregister(struct device *dev, struct thermal_zone_device *thermal_zone_device_register(const char *, int, int, void *, struct thermal_zone_device_ops *, struct thermal_zone_params *, int, int); + void thermal_zone_device_unregister(struct thermal_zone_device *); +struct thermal_zone_device * +thermal_zone_device_register_with_trips(const char *, struct thermal_trip *, int, int, + void *, struct 
thermal_zone_device_ops *, + struct thermal_zone_params *, int, int); + int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int, struct thermal_cooling_device *, unsigned long, unsigned long, From 45acd85edf2c4562421b94e9a504a311277647d4 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:10 +0200 Subject: [PATCH 0422/1250] thermal/of: Store the trips in the thermal zone As the thermal zone contains the trip points, we can store them directly when registering the thermal zone. That will allow another step forward to remove the duplicate thermal zone structure we find in the thermal_of code. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220710123512.1714714-12-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_of.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index 0d04474ed951cf..d7ff6d558a82ae 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -1119,11 +1119,9 @@ int __init of_parse_thermal_zones(void) tzp->slope = tz->slope; tzp->offset = tz->offset; - zone = thermal_zone_device_register(child->name, tz->ntrips, - mask, tz, - ops, tzp, - tz->passive_delay, - tz->polling_delay); + zone = thermal_zone_device_register_with_trips(child->name, tz->trips, tz->ntrips, + mask, tz, ops, tzp, tz->passive_delay, + tz->polling_delay); if (IS_ERR(zone)) { pr_err("Failed to build %pOFn zone %ld\n", child, PTR_ERR(zone)); From 2c32c87fa8fc3591f311c5f7ca00a82efb0f806c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:11 +0200 Subject: [PATCH 0423/1250] thermal/of: Use thermal trips stored in the thermal zone Now that we have the thermal trips stored in the thermal zone in a generic way, we can rely on them and remove one indirection we found in the thermal_of code and take one more step towards the removal of the
duplicated structures. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220710123512.1714714-13-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_of.c | 53 +++++++++++------------------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index d7ff6d558a82ae..dbc07f2f77d187 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -118,12 +118,7 @@ static int of_thermal_set_trips(struct thermal_zone_device *tz, */ int of_thermal_get_ntrips(struct thermal_zone_device *tz) { - struct __thermal_zone *data = tz->devdata; - - if (!data || IS_ERR(data)) - return -ENODEV; - - return data->ntrips; + return tz->ntrips; } EXPORT_SYMBOL_GPL(of_thermal_get_ntrips); @@ -139,9 +134,7 @@ EXPORT_SYMBOL_GPL(of_thermal_get_ntrips); */ bool of_thermal_is_trip_valid(struct thermal_zone_device *tz, int trip) { - struct __thermal_zone *data = tz->devdata; - - if (!data || trip >= data->ntrips || trip < 0) + if (trip >= tz->ntrips || trip < 0) return false; return true; @@ -161,12 +154,7 @@ EXPORT_SYMBOL_GPL(of_thermal_is_trip_valid); const struct thermal_trip * of_thermal_get_trip_points(struct thermal_zone_device *tz) { - struct __thermal_zone *data = tz->devdata; - - if (!data) - return NULL; - - return data->trips; + return tz->trips; } EXPORT_SYMBOL_GPL(of_thermal_get_trip_points); @@ -281,12 +269,10 @@ static int of_thermal_unbind(struct thermal_zone_device *thermal, static int of_thermal_get_trip_type(struct thermal_zone_device *tz, int trip, enum thermal_trip_type *type) { - struct __thermal_zone *data = tz->devdata; - - if (trip >= data->ntrips || trip < 0) + if (trip >= tz->ntrips || trip < 0) return -EDOM; - *type = data->trips[trip].type; + *type = tz->trips[trip].type; return 0; } @@ -294,12 +280,10 @@ static int of_thermal_get_trip_type(struct thermal_zone_device *tz, int trip, static int 
of_thermal_get_trip_temp(struct thermal_zone_device *tz, int trip, int *temp) { - struct __thermal_zone *data = tz->devdata; - - if (trip >= data->ntrips || trip < 0) + if (trip >= tz->ntrips || trip < 0) return -EDOM; - *temp = data->trips[trip].temperature; + *temp = tz->trips[trip].temperature; return 0; } @@ -309,7 +293,7 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip, { struct __thermal_zone *data = tz->devdata; - if (trip >= data->ntrips || trip < 0) + if (trip >= tz->ntrips || trip < 0) return -EDOM; if (data->ops && data->ops->set_trip_temp) { @@ -321,7 +305,7 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip, } /* thermal framework should take care of data->mask & (1 << trip) */ - data->trips[trip].temperature = temp; + tz->trips[trip].temperature = temp; return 0; } @@ -329,12 +313,10 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip, static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip, int *hyst) { - struct __thermal_zone *data = tz->devdata; - - if (trip >= data->ntrips || trip < 0) + if (trip >= tz->ntrips || trip < 0) return -EDOM; - *hyst = data->trips[trip].hysteresis; + *hyst = tz->trips[trip].hysteresis; return 0; } @@ -342,13 +324,11 @@ static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip, static int of_thermal_set_trip_hyst(struct thermal_zone_device *tz, int trip, int hyst) { - struct __thermal_zone *data = tz->devdata; - - if (trip >= data->ntrips || trip < 0) + if (trip >= tz->ntrips || trip < 0) return -EDOM; /* thermal framework should take care of data->mask & (1 << trip) */ - data->trips[trip].hysteresis = hyst; + tz->trips[trip].hysteresis = hyst; return 0; } @@ -356,12 +336,11 @@ static int of_thermal_set_trip_hyst(struct thermal_zone_device *tz, int trip, static int of_thermal_get_crit_temp(struct thermal_zone_device *tz, int *temp) { - struct __thermal_zone *data = tz->devdata; int i; - for (i = 
0; i < data->ntrips; i++) - if (data->trips[i].type == THERMAL_TRIP_CRITICAL) { - *temp = data->trips[i].temperature; + for (i = 0; i < tz->ntrips; i++) + if (tz->trips[i].type == THERMAL_TRIP_CRITICAL) { + *temp = tz->trips[i].temperature; return 0; } From 33fe964a5a8ba03030548d79047b402dbc2b31c2 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sun, 10 Jul 2022 14:35:12 +0200 Subject: [PATCH 0424/1250] thermal/of: Initialize trip points separately Self contain the trip initialization from the device tree in a single function for the sake of making the code flow more clear. Cc: Alexandre Bailon Cc: Kevin Hilman Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220710123512.1714714-14-daniel.lezcano@linexp.org Signed-off-by: Daniel Lezcano --- drivers/thermal/thermal_of.c | 102 ++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 37 deletions(-) diff --git a/drivers/thermal/thermal_of.c b/drivers/thermal/thermal_of.c index dbc07f2f77d187..802c30b72a925d 100644 --- a/drivers/thermal/thermal_of.c +++ b/drivers/thermal/thermal_of.c @@ -118,7 +118,7 @@ static int of_thermal_set_trips(struct thermal_zone_device *tz, */ int of_thermal_get_ntrips(struct thermal_zone_device *tz) { - return tz->ntrips; + return tz->num_trips; } EXPORT_SYMBOL_GPL(of_thermal_get_ntrips); @@ -134,7 +134,7 @@ EXPORT_SYMBOL_GPL(of_thermal_get_ntrips); */ bool of_thermal_is_trip_valid(struct thermal_zone_device *tz, int trip) { - if (trip >= tz->ntrips || trip < 0) + if (trip >= tz->num_trips || trip < 0) return false; return true; @@ -269,7 +269,7 @@ static int of_thermal_unbind(struct thermal_zone_device *thermal, static int of_thermal_get_trip_type(struct thermal_zone_device *tz, int trip, enum thermal_trip_type *type) { - if (trip >= tz->ntrips || trip < 0) + if (trip >= tz->num_trips || trip < 0) return -EDOM; *type = tz->trips[trip].type; @@ -280,7 +280,7 @@ static int of_thermal_get_trip_type(struct thermal_zone_device *tz, int trip, static int 
of_thermal_get_trip_temp(struct thermal_zone_device *tz, int trip, int *temp) { - if (trip >= tz->ntrips || trip < 0) + if (trip >= tz->num_trips || trip < 0) return -EDOM; *temp = tz->trips[trip].temperature; @@ -293,7 +293,7 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip, { struct __thermal_zone *data = tz->devdata; - if (trip >= tz->ntrips || trip < 0) + if (trip >= tz->num_trips || trip < 0) return -EDOM; if (data->ops && data->ops->set_trip_temp) { @@ -313,7 +313,7 @@ static int of_thermal_set_trip_temp(struct thermal_zone_device *tz, int trip, static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip, int *hyst) { - if (trip >= tz->ntrips || trip < 0) + if (trip >= tz->num_trips || trip < 0) return -EDOM; *hyst = tz->trips[trip].hysteresis; @@ -324,7 +324,7 @@ static int of_thermal_get_trip_hyst(struct thermal_zone_device *tz, int trip, static int of_thermal_set_trip_hyst(struct thermal_zone_device *tz, int trip, int hyst) { - if (trip >= tz->ntrips || trip < 0) + if (trip >= tz->num_trips || trip < 0) return -EDOM; /* thermal framework should take care of data->mask & (1 << trip) */ @@ -338,7 +338,7 @@ static int of_thermal_get_crit_temp(struct thermal_zone_device *tz, { int i; - for (i = 0; i < tz->ntrips; i++) + for (i = 0; i < tz->num_trips; i++) if (tz->trips[i].type == THERMAL_TRIP_CRITICAL) { *temp = tz->trips[i].temperature; return 0; @@ -693,7 +693,8 @@ static int of_find_trip_id(struct device_node *np, struct device_node *trip) * * Return: 0 on success, proper error code otherwise */ -static int thermal_of_populate_bind_params(struct device_node *np, +static int thermal_of_populate_bind_params(struct device_node *tz_np, + struct device_node *np, struct __thermal_bind_params *__tbp) { struct of_phandle_args cooling_spec; @@ -715,7 +716,7 @@ static int thermal_of_populate_bind_params(struct device_node *np, return -ENODEV; } - trip_id = of_find_trip_id(np, trip); + trip_id = of_find_trip_id(tz_np, 
trip); if (trip_id < 0) { ret = trip_id; goto end; @@ -849,6 +850,53 @@ static int thermal_of_populate_trip(struct device_node *np, return 0; } +static struct thermal_trip *thermal_of_trips_init(struct device_node *np, int *ntrips) +{ + struct thermal_trip *tt; + struct device_node *trips, *trip; + int ret, count; + + trips = of_get_child_by_name(np, "trips"); + if (!trips) { + pr_err("Failed to find 'trips' node\n"); + return ERR_PTR(-EINVAL); + } + + count = of_get_child_count(trips); + if (!count) { + pr_err("No trip point defined\n"); + ret = -EINVAL; + goto out_of_node_put; + } + + tt = kzalloc(sizeof(*tt) * count, GFP_KERNEL); + if (!tt) { + ret = -ENOMEM; + goto out_of_node_put; + } + + *ntrips = count; + + count = 0; + for_each_child_of_node(trips, trip) { + ret = thermal_of_populate_trip(trip, &tt[count++]); + if (ret) + goto out_kfree; + } + + of_node_put(trips); + + return tt; + +out_kfree: + kfree(tt); + *ntrips = 0; +out_of_node_put: + of_node_put(trips); + + return ERR_PTR(ret); +} + /** * thermal_of_build_thermal_zone - parse and fill one thermal zone data * @np: DT node containing a thermal zone node @@ -867,7 +915,6 @@ static struct __thermal_zone __init *thermal_of_build_thermal_zone(struct device_node *np) { struct device_node *child = NULL, *gchild; - struct device_node *trips; struct __thermal_zone *tz; int ret, i; u32 prop, coef[2]; @@ -909,28 +956,10 @@ __init *thermal_of_build_thermal_zone(struct device_node *np) tz->offset = 0; } - /* trips */ - trips = of_get_child_by_name(np, "trips"); - - /* No trips provided */ - if (!trips) + tz->trips = thermal_of_trips_init(np, &tz->ntrips); + if (IS_ERR(tz->trips)) { + ret = PTR_ERR(tz->trips); goto finish; - - tz->ntrips = of_get_child_count(trips); - if (tz->ntrips == 0) /* must have at least one child */ - goto finish; - - tz->trips = kcalloc(tz->ntrips, sizeof(*tz->trips), GFP_KERNEL); - if (!tz->trips) { - ret = -ENOMEM; - goto free_tz; - } - - i = 0; - for_each_child_of_node(trips, gchild) { - 
ret = thermal_of_populate_trip(gchild, &tz->trips[i++]); - if (ret) - goto free_trips; } /* cooling-maps */ @@ -952,13 +981,14 @@ __init *thermal_of_build_thermal_zone(struct device_node *np) i = 0; for_each_child_of_node(child, gchild) { - ret = thermal_of_populate_bind_params(gchild, &tz->tbps[i++]); - if (ret) + ret = thermal_of_populate_bind_params(np, gchild, &tz->tbps[i++]); + if (ret) { + of_node_put(gchild); goto free_tbps; + } } finish: - of_node_put(trips); of_node_put(child); return tz; @@ -977,8 +1007,6 @@ __init *thermal_of_build_thermal_zone(struct device_node *np) kfree(tz->tbps); free_trips: kfree(tz->trips); - of_node_put(trips); - of_node_put(gchild); free_tz: kfree(tz); of_node_put(child); From 5d07c987b485177fa3db89f7c351d17fe0eef7c3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 30 May 2022 19:17:12 -0700 Subject: [PATCH 0425/1250] m68k: coldfire/device.c: protect FLEXCAN blocks When CAN_FLEXCAN=y and M5441x is not set/enabled, there are build errors in coldfire/device.c: ../arch/m68k/coldfire/device.c:595:26: error: 'MCFFLEXCAN_BASE0' undeclared here (not in a function); did you mean 'MCFDMA_BASE0'? 595 | .start = MCFFLEXCAN_BASE0, ../arch/m68k/coldfire/device.c:596:43: error: 'MCFFLEXCAN_SIZE' undeclared here (not in a function) 596 | .end = MCFFLEXCAN_BASE0 + MCFFLEXCAN_SIZE, ../arch/m68k/coldfire/device.c:600:26: error: 'MCF_IRQ_IFL0' undeclared here (not in a function); did you mean 'MCF_IRQ_I2C0'? 600 | .start = MCF_IRQ_IFL0, ../arch/m68k/coldfire/device.c:605:26: error: 'MCF_IRQ_BOFF0' undeclared here (not in a function); did you mean 'MCF_IRQ_I2C0'? 605 | .start = MCF_IRQ_BOFF0, ../arch/m68k/coldfire/device.c:610:26: error: 'MCF_IRQ_ERR0' undeclared here (not in a function); did you mean 'MCF_IRQ_I2C0'? 610 | .start = MCF_IRQ_ERR0, Protect the FLEXCAN code blocks by checking if MCFFLEXCAN_SIZE is defined. 
Fixes: 35a9f9363a89 ("m68k: m5441x: add flexcan support") Signed-off-by: Randy Dunlap Cc: Greg Ungerer Cc: Geert Uytterhoeven Cc: linux-m68k@lists.linux-m68k.org Cc: uclinux-dev@uclinux.org Cc: Angelo Dureghello Signed-off-by: Greg Ungerer --- arch/m68k/coldfire/device.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/m68k/coldfire/device.c b/arch/m68k/coldfire/device.c index 4218750414bbfd..7dab46728aedaf 100644 --- a/arch/m68k/coldfire/device.c +++ b/arch/m68k/coldfire/device.c @@ -581,7 +581,7 @@ static struct platform_device mcf_esdhc = { }; #endif /* MCFSDHC_BASE */ -#if IS_ENABLED(CONFIG_CAN_FLEXCAN) +#ifdef MCFFLEXCAN_SIZE #include @@ -620,7 +620,7 @@ static struct platform_device mcf_flexcan0 = { .resource = mcf5441x_flexcan0_resource, .dev.platform_data = &mcf5441x_flexcan_info, }; -#endif /* IS_ENABLED(CONFIG_CAN_FLEXCAN) */ +#endif /* MCFFLEXCAN_SIZE */ static struct platform_device *mcf_devices[] __initdata = { &mcf_uart, @@ -657,7 +657,7 @@ static struct platform_device *mcf_devices[] __initdata = { #ifdef MCFSDHC_BASE &mcf_esdhc, #endif -#if IS_ENABLED(CONFIG_CAN_FLEXCAN) +#ifdef MCFFLEXCAN_SIZE &mcf_flexcan0, #endif }; From f57966e40d63099f48b3ea8ad629f219544a4310 Mon Sep 17 00:00:00 2001 From: Wang Jingjin Date: Fri, 10 Jun 2022 10:07:55 +0800 Subject: [PATCH 0426/1250] m68k: coldfire: make symbol m523x_clk_lookup static Fix sparse warnings: arch/m68k/coldfire/m523x.c:31:19: sparse: sparse: symbol 'm523x_clk_lookup' was not declared. Should it be static? 
Reported-by: Hulk Robot Signed-off-by: Wang Jingjin Signed-off-by: Greg Ungerer --- arch/m68k/coldfire/m523x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/coldfire/m523x.c b/arch/m68k/coldfire/m523x.c index 193c178162c1ce..83a997313393c4 100644 --- a/arch/m68k/coldfire/m523x.c +++ b/arch/m68k/coldfire/m523x.c @@ -28,7 +28,7 @@ DEFINE_CLK(pll, "pll.0", MCF_CLK); DEFINE_CLK(sys, "sys.0", MCF_BUSCLK); -struct clk_lookup m523x_clk_lookup[] = { +static struct clk_lookup m523x_clk_lookup[] = { CLKDEV_INIT(NULL, "pll.0", &clk_pll), CLKDEV_INIT(NULL, "sys.0", &clk_sys), CLKDEV_INIT("mcfpit.0", NULL, &clk_pll), From 483e7343bd404b63ac5e63b6a9ce48e2e47ce2e5 Mon Sep 17 00:00:00 2001 From: Xiang wangx Date: Fri, 10 Jun 2022 16:34:20 +0800 Subject: [PATCH 0427/1250] m68k: Fix syntax errors in comments comments "the the" should be replaced by "of the" instead. Signed-off-by: Xiang wangx Signed-off-by: Greg Ungerer --- arch/m68k/coldfire/intc-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/m68k/coldfire/intc-2.c b/arch/m68k/coldfire/intc-2.c index 995093357c59fd..f74f0e47311938 100644 --- a/arch/m68k/coldfire/intc-2.c +++ b/arch/m68k/coldfire/intc-2.c @@ -7,7 +7,7 @@ * family, the 5270, 5271, 5274, 5275, and the 528x family which have two such * controllers, and the 547x and 548x families which have only one of them. * - * The external 7 fixed interrupts are part the the Edge Port unit of these + * The external 7 fixed interrupts are part of the Edge Port unit of these * ColdFire parts. They can be configured as level or edge triggered. * * (C) Copyright 2009-2011, Greg Ungerer From 45d9321d646adf1586c6ed870d6612cf7a5d2ea1 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 12 Jul 2022 11:43:44 -0500 Subject: [PATCH 0428/1250] smb3: check xattr value length earlier Coverity complains about assigning a pointer based on value length before checking that value length goes beyond the end of the SMB. 
Although this is even more unlikely as value length is a single byte, and the pointer is not dereferenced until later, it is clearer to check the lengths first. Addresses-Coverity: 1467704 ("Speculative execution data leak") Reviewed-by: Ronnie Sahlberg Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 8802995b2d3d63..aa4c1d403708fa 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1145,9 +1145,7 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size, size_t name_len, value_len, user_name_len; while (src_size > 0) { - name = &src->ea_data[0]; name_len = (size_t)src->ea_name_length; - value = &src->ea_data[src->ea_name_length + 1]; value_len = (size_t)le16_to_cpu(src->ea_value_length); if (name_len == 0) @@ -1159,6 +1157,9 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size, goto out; } + name = &src->ea_data[0]; + value = &src->ea_data[src->ea_name_length + 1]; + if (ea_name) { if (ea_name_len == name_len && memcmp(ea_name, name, name_len) == 0) { From 9f727eba3c151c1d7e3122d527eccf703ca1c8fb Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 28 Jun 2022 22:32:29 +0100 Subject: [PATCH 0429/1250] cifs: remove redundant initialization to variable mnt_sign_enabled Variable mnt_sign_enabled is being initialized with a value that is never read, it is being reassigned later on with a different value. The initialization is redundant and can be removed. 
Cleans up clang scan-build warning: fs/cifs/cifssmb.c:465:7: warning: Value stored to 'mnt_sign_enabled' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 6371b9eebdad82..9ed21752f2df8a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -462,7 +462,7 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) { bool srv_sign_required = server->sec_mode & server->vals->signing_required; bool srv_sign_enabled = server->sec_mode & server->vals->signing_enabled; - bool mnt_sign_enabled = global_secflags & CIFSSEC_MAY_SIGN; + bool mnt_sign_enabled; /* * Is signing required by mnt options? If not then check From c2d16631997deff467335794dc97d38a4c5133c1 Mon Sep 17 00:00:00 2001 From: Yu Zhe Date: Thu, 30 Jun 2022 17:30:27 +0800 Subject: [PATCH 0430/1250] cifs: remove unnecessary type castings remove unnecessary void* type castings. 
Signed-off-by: Yu Zhe Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- fs/cifs/inode.c | 4 ++-- fs/cifs/netmisc.c | 2 +- fs/cifs/smb2misc.c | 2 +- fs/cifs/smb2pdu.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 386bb523c69ea7..47a161f86662ec 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2644,7 +2644,7 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) int cifs_match_super(struct super_block *sb, void *data) { - struct cifs_mnt_data *mnt_data = (struct cifs_mnt_data *)data; + struct cifs_mnt_data *mnt_data = data; struct smb3_fs_context *ctx; struct cifs_sb_info *cifs_sb; struct TCP_Server_Info *tcp_srv; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 81da81e185538a..3ad303dd5e5aa4 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1223,7 +1223,7 @@ static const struct inode_operations cifs_ipc_inode_ops = { static int cifs_find_inode(struct inode *inode, void *opaque) { - struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + struct cifs_fattr *fattr = opaque; /* don't match inode with different uniqueid */ if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) @@ -1247,7 +1247,7 @@ cifs_find_inode(struct inode *inode, void *opaque) static int cifs_init_inode(struct inode *inode, void *opaque) { - struct cifs_fattr *fattr = (struct cifs_fattr *) opaque; + struct cifs_fattr *fattr = opaque; CIFS_I(inode)->uniqueid = fattr->cf_uniqueid; CIFS_I(inode)->createtime = fattr->cf_createtime; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 235aa1b395ebcc..28caae7aed1bb0 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -911,7 +911,7 @@ map_and_check_smb_error(struct mid_q_entry *mid, bool logErr) unsigned int smbCalcSize(void *buf, struct TCP_Server_Info *server) { - struct smb_hdr *ptr = (struct smb_hdr *)buf; + struct smb_hdr *ptr = buf; return (sizeof(struct smb_hdr) + (2 * ptr->WordCount) + 2 /* size of the bcc field */ + get_bcc(ptr)); 
} diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 17813c3d0c6e0a..db0f27fd373b6f 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -402,7 +402,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *srvr) { - struct smb2_pdu *pdu = (struct smb2_pdu *)buf; + struct smb2_pdu *pdu = buf; struct smb2_hdr *shdr = &pdu->hdr; int offset; /* the offset from the beginning of SMB to data area */ int data_length; /* the length of the variable length data area */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index c705de32e22579..295ee8b8805538 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -354,7 +354,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, unsigned int *total_len) { - struct smb2_pdu *spdu = (struct smb2_pdu *)buf; + struct smb2_pdu *spdu = buf; /* lookup word count ie StructureSize from table */ __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)]; From 8dbb34711bfaf2a8889cfd559b9a397ba9a920ab Mon Sep 17 00:00:00 2001 From: Yu Zhe Date: Tue, 14 Jun 2022 01:54:49 -0700 Subject: [PATCH 0431/1250] cifs: remove unnecessary (void*) conversions. One more. remove unnecessary void* type castings. 
Signed-off-by: Yu Zhe Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 2cfbac8bb96505..97116c1710e2c3 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -36,7 +36,7 @@ cifs_dump_mem(char *label, void *data, int length) void cifs_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 - struct smb_hdr *smb = (struct smb_hdr *)buf; + struct smb_hdr *smb = buf; cifs_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d\n", smb->Command, smb->Status.CifsError, From 46a5b27901cc12123c0f2d9d2afc9e592c8d855b Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 15 Jul 2022 23:45:45 -0500 Subject: [PATCH 0432/1250] cifs: remove some camelCase and also some static build warnings Remove warnings for five global variables. For example: fs/cifs/cifsglob.h:1984:24: warning: symbol 'midCount' was not declared. Should it be static? Also change them from camelCase (e.g. 
"midCount" to "mid_count") Signed-off-by: Steve French --- fs/cifs/cifs_debug.c | 14 +++++++------- fs/cifs/cifsfs.c | 17 ++++++++++++----- fs/cifs/cifsglob.h | 10 +++++----- fs/cifs/connect.c | 2 +- fs/cifs/misc.c | 12 ++++++------ fs/cifs/smb2transport.c | 2 +- fs/cifs/transport.c | 4 ++-- 7 files changed, 34 insertions(+), 27 deletions(-) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 97116c1710e2c3..f5e63dfac2b1fb 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -514,8 +514,8 @@ static ssize_t cifs_stats_proc_write(struct file *file, #ifdef CONFIG_CIFS_STATS2 int i; - atomic_set(&totBufAllocCount, 0); - atomic_set(&totSmBufAllocCount, 0); + atomic_set(&total_buf_alloc_count, 0); + atomic_set(&total_small_buf_alloc_count, 0); #endif /* CONFIG_CIFS_STATS2 */ atomic_set(&tcpSesReconnectCount, 0); atomic_set(&tconInfoReconnectCount, 0); @@ -579,17 +579,17 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v) seq_printf(m, "Share (unique mount targets): %d\n", tconInfoAllocCount.counter); seq_printf(m, "SMB Request/Response Buffer: %d Pool size: %d\n", - bufAllocCount.counter, + buf_alloc_count.counter, cifs_min_rcv + tcpSesAllocCount.counter); seq_printf(m, "SMB Small Req/Resp Buffer: %d Pool size: %d\n", - smBufAllocCount.counter, cifs_min_small); + small_buf_alloc_count.counter, cifs_min_small); #ifdef CONFIG_CIFS_STATS2 seq_printf(m, "Total Large %d Small %d Allocations\n", - atomic_read(&totBufAllocCount), - atomic_read(&totSmBufAllocCount)); + atomic_read(&total_buf_alloc_count), + atomic_read(&total_small_buf_alloc_count)); #endif /* CONFIG_CIFS_STATS2 */ - seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&midCount)); + seq_printf(m, "Operations (MIDs): %d\n", atomic_read(&mid_count)); seq_printf(m, "\n%d session %d share reconnects\n", tcpSesReconnectCount.counter, tconInfoReconnectCount.counter); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8f2e003e059075..2732953f49e49f 100644 --- a/fs/cifs/cifsfs.c 
+++ b/fs/cifs/cifsfs.c @@ -68,6 +68,13 @@ bool enable_negotiate_signing; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; +atomic_t mid_count; +atomic_t buf_alloc_count; +atomic_t small_buf_alloc_count; +#ifdef CONFIG_CIFS_STATS2 +atomic_t total_buf_alloc_count; +atomic_t total_small_buf_alloc_count; +#endif/* STATS2 */ static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; module_param(CIFSMaxBufSize, uint, 0444); @@ -1579,11 +1586,11 @@ init_cifs(void) atomic_set(&tcpSesReconnectCount, 0); atomic_set(&tconInfoReconnectCount, 0); - atomic_set(&bufAllocCount, 0); - atomic_set(&smBufAllocCount, 0); + atomic_set(&buf_alloc_count, 0); + atomic_set(&small_buf_alloc_count, 0); #ifdef CONFIG_CIFS_STATS2 - atomic_set(&totBufAllocCount, 0); - atomic_set(&totSmBufAllocCount, 0); + atomic_set(&total_buf_alloc_count, 0); + atomic_set(&total_small_buf_alloc_count, 0); if (slow_rsp_threshold < 1) cifs_dbg(FYI, "slow_response_threshold msgs disabled\n"); else if (slow_rsp_threshold > 32767) @@ -1591,7 +1598,7 @@ init_cifs(void) "slow response threshold set higher than recommended (0 to 32767)\n"); #endif /* CONFIG_CIFS_STATS2 */ - atomic_set(&midCount, 0); + atomic_set(&mid_count, 0); GlobalCurrentXid = 0; GlobalTotalActiveXid = 0; GlobalMaxActiveXid = 0; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index a643c84ff1e930..c5eecc9522b328 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1974,14 +1974,13 @@ GLOBAL_EXTERN atomic_t tcpSesReconnectCount; GLOBAL_EXTERN atomic_t tconInfoReconnectCount; /* Various Debug counters */ -GLOBAL_EXTERN atomic_t bufAllocCount; /* current number allocated */ +extern atomic_t buf_alloc_count; /* current number allocated */ +extern atomic_t small_buf_alloc_count; #ifdef CONFIG_CIFS_STATS2 -GLOBAL_EXTERN atomic_t totBufAllocCount; /* total allocated over all time */ -GLOBAL_EXTERN atomic_t 
totSmBufAllocCount; +extern atomic_t total_buf_alloc_count; /* total allocated over all time */ +extern atomic_t total_small_buf_alloc_count; extern unsigned int slow_rsp_threshold; /* number of secs before logging */ #endif -GLOBAL_EXTERN atomic_t smBufAllocCount; -GLOBAL_EXTERN atomic_t midCount; /* Misc globals */ extern bool enable_oplocks; /* enable or disable oplocks */ @@ -1998,6 +1997,7 @@ extern unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ extern unsigned int cifs_min_small; /* min size of small buf pool */ extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ +extern atomic_t mid_count; void cifs_oplock_break(struct work_struct *work); void cifs_queue_oplock_break(struct cifsFileInfo *cfile); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 47a161f86662ec..fdd8452b8450d4 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1240,7 +1240,7 @@ cifs_demultiplex_thread(void *p) cifs_dbg(FYI, "Received oplock break\n"); } else { cifs_server_dbg(VFS, "No task to wake, unknown frame received! 
NumMids %d\n", - atomic_read(&midCount)); + atomic_read(&mid_count)); cifs_dump_mem("Received Data is: ", bufs[i], HEADER_SIZE(server)); smb2_add_credits_from_hdr(bufs[i], server); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 0e84e6fcf8ab4f..16168ebd1a62f7 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -172,9 +172,9 @@ cifs_buf_get(void) /* clear the first few header bytes */ /* for most paths, more is cleared in header_assemble */ memset(ret_buf, 0, buf_size + 3); - atomic_inc(&bufAllocCount); + atomic_inc(&buf_alloc_count); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totBufAllocCount); + atomic_inc(&total_buf_alloc_count); #endif /* CONFIG_CIFS_STATS2 */ return ret_buf; @@ -189,7 +189,7 @@ cifs_buf_release(void *buf_to_free) } mempool_free(buf_to_free, cifs_req_poolp); - atomic_dec(&bufAllocCount); + atomic_dec(&buf_alloc_count); return; } @@ -205,9 +205,9 @@ cifs_small_buf_get(void) ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); /* No need to clear memory here, cleared in header assemble */ /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ - atomic_inc(&smBufAllocCount); + atomic_inc(&small_buf_alloc_count); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totSmBufAllocCount); + atomic_inc(&total_small_buf_alloc_count); #endif /* CONFIG_CIFS_STATS2 */ return ret_buf; @@ -223,7 +223,7 @@ cifs_small_buf_release(void *buf_to_free) } mempool_free(buf_to_free, cifs_sm_req_poolp); - atomic_dec(&smBufAllocCount); + atomic_dec(&small_buf_alloc_count); return; } diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 55e79f6ee78d17..53ff6bc11939f8 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -750,7 +750,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *shdr, temp->callback = cifs_wake_up_task; temp->callback_data = current; - atomic_inc(&midCount); + atomic_inc(&mid_count); temp->mid_state = MID_REQUEST_ALLOCATED; trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId), le64_to_cpu(shdr->SessionId), diff --git 
a/fs/cifs/transport.c b/fs/cifs/transport.c index bfc9bd55870a06..dac8d6f9b309fc 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -68,7 +68,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) temp->callback = cifs_wake_up_task; temp->callback_data = current; - atomic_inc(&midCount); + atomic_inc(&mid_count); temp->mid_state = MID_REQUEST_ALLOCATED; return temp; } @@ -91,7 +91,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount) server->ops->handle_cancelled_mid(midEntry, server); midEntry->mid_state = MID_FREE; - atomic_dec(&midCount); + atomic_dec(&mid_count); if (midEntry->large_buf) cifs_buf_release(midEntry->resp_buf); else From 3a7fd55d0be72fa5b7f764410b26325e1d4b6c4e Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 15 Jul 2022 23:57:08 -0500 Subject: [PATCH 0433/1250] cifs: remove minor build warning The build warning: warning: symbol 'cifs_tcp_ses_lock' was not declared. Should it be static? can be distracting. Fix two of these. 
Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 2 ++ fs/cifs/cifsglob.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2732953f49e49f..f909d9e9faaa52 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -75,6 +75,8 @@ atomic_t small_buf_alloc_count; atomic_t total_buf_alloc_count; atomic_t total_small_buf_alloc_count; #endif/* STATS2 */ +struct list_head cifs_tcp_ses_list; +spinlock_t cifs_tcp_ses_lock; static const struct super_operations cifs_super_ops; unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; module_param(CIFSMaxBufSize, uint, 0444); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c5eecc9522b328..9b7f409bfc8c1f 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1941,7 +1941,7 @@ require use of the stronger protocol */ * sessions (and from that the tree connections) can be found * by iterating over cifs_tcp_ses_list */ -GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; +extern struct list_head cifs_tcp_ses_list; /* * This lock protects the cifs_tcp_ses_list, the list of smb sessions per @@ -1953,7 +1953,7 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; * tcon->open_file_lock and that before file->file_info_lock since the * structure order is cifs_socket-->cifs_ses-->cifs_tcon-->cifs_file */ -GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; +extern spinlock_t cifs_tcp_ses_lock; /* * Global transaction id (XID) information From c2e425f31a0f82ea237812afc70372d8ebda7283 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 18 Jul 2022 09:40:25 +0200 Subject: [PATCH 0434/1250] soc: document merges Signed-off-by: Arnd Bergmann --- arch/arm/arm-soc-for-next-contents.txt | 157 +++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 arch/arm/arm-soc-for-next-contents.txt diff --git a/arch/arm/arm-soc-for-next-contents.txt b/arch/arm/arm-soc-for-next-contents.txt new file mode 100644 index 00000000000000..f5d1de17c91f75 --- /dev/null +++ 
b/arch/arm/arm-soc-for-next-contents.txt @@ -0,0 +1,157 @@ +arm/soc + samsung/soc + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/samsung-soc-5.20 + omap/soc + git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap tags/omap-for-v5.20/soc-signed + zynq/soc-64 + https://github.com/Xilinx/linux-xlnx tags/zynqmp-soc-for-v5.20 + zynq/soc + https://github.com/Xilinx/linux-xlnx tags/zynq-soc-for-v5.20 + renesas/soc + git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-arm-soc-for-v5.20-tag1 + broadcom/soc + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/soc + broadcom/maintainers + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/maintainers + imx/soc + git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux tags/imx-soc-5.20 + davinci/dm3xxx-remove + git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio tags/davinci-boards-delete-v5.20 + broadcom/soc-2 + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/soc-part2 + +arm/dt + samsung/dt + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/samsung-dt-5.20 + samsung/dt64 + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/samsung-dt64-5.20 + renesas/dt + git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-arm-dt-for-v5.20-tag1 + renesas/dt-bindings +renesas/dt-fixes + git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-dt-bindings-for-v5.20-tag1 + socfpga/dt + git://git.kernel.org/pub/scm/linux/kernel/git/dinguyen/linux tags/socfpga_dts_updates_for_v5.20 + dt/cleanup-arm64 + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/dt64-cleanup-5.20 + dt/cleanup-arm + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/dt-cleanup-5.20 + amlogic/dt + git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux tags/amlogic-arm-dt-for-v5.20 + amlogic/dt64 + git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux 
tags/amlogic-arm64-dt-for-v5.20 + omap/dt + git://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap tags/omap-for-v5.20/dt-signed + rockchip/dt64 + git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip tags/v5.20-rockchip-dts64-1 + rockchip/dt + git://git.kernel.org/pub/scm/linux/kernel/git/mmind/linux-rockchip tags/v5.20-rockchip-dts32-1 + ux500/dt + git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-nomadik tags/ux500-dts-v5.20 + at91/dt + git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux tags/at91-dt-5.20 + stm32/dt + git://git.kernel.org/pub/scm/linux/kernel/git/atorgue/stm32 tags/stm32-dt-for-v5.20-1 + juno/dt + git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux tags/juno-updates-5.20 + dt/cleanup-arm64-2 + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/dt64-cleanup-5.20-2 + dt/cleanup-arm-2 + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/dt-cleanup-5.20-2 + samsung/dt-2 + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/samsung-dt-5.20-2 + samsung/dt64-2 + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux tags/samsung-dt64-5.20-2 + zynq/dt + https://github.com/Xilinx/linux-xlnx tags/zynq-dt-for-v5.20 + zynq/dt64 + https://github.com/Xilinx/linux-xlnx tags/zynqmp-dt-for-v5.20 + renesas/dt-2 + git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-arm-dt-for-v5.20-tag2 + renesas/dt-bindings-2 + git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-dt-bindings-for-v5.20-tag2 + mediatek/dt + git://git.kernel.org/pub/scm/linux/kernel/git/matthias.bgg/linux tags/v5.19-next-dts32 + mediatek/dt64 + git://git.kernel.org/pub/scm/linux/kernel/git/matthias.bgg/linux tags/v5.19-next-dts64 + sunxi/dt + git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux tags/sunxi-dt-for-5.20-1 + tegra/dt-bindings + git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux tags/tegra-for-5.20-dt-bindings + tegra/dt + 
git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux tags/tegra-for-5.20-arm-dt + tegra/dt64 + git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux tags/tegra-for-5.20-arm64-dt + k3/dt + git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux tags/ti-k3-dt-for-v5.20 + keystone/dt + git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux tags/ti-keystone-dt-for-v5.20 + imx/dt-bindings + git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux tags/imx-bindings-5.20 + imx/dt + git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux tags/imx-dt-5.20 + imx/dt64 + git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux tags/imx-dt64-5.20 + sunxi/dt-2 + git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux tags/sunxi-dt-for-5.20-2 + at91/dt-2 + git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux tags/at91-dt-5.20-2 + qcom/dt + git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-dts-for-5.20 + qcom/dt64 + git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-arm64-for-5.20 + broadcom/dt + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/devicetree + broadcom/dt64 + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/devicetree-arm64 + broadcom/dt-bindings + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/devicetree-part2 + broadcom/dt64-2 + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/devicetree-arm64-part2 + +arm/drivers + renesas/drivers + git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-drivers-for-v5.20-tag1 + amlogic/drivers + git://git.kernel.org/pub/scm/linux/kernel/git/amlogic/linux tags/amlogic-drivers-for-v5.20 + drivers/memory + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-mem-ctrl tags/memory-controller-drv-5.20 + firmware/scmi + git://git.kernel.org/pub/scm/linux/kernel/git/sudeep.holla/linux tags/scmi-updates-5.20 + sunxi/drivers + git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux tags/sunxi-drivers-for-5.20-1 
+ mediatek/drivers + git://git.kernel.org/pub/scm/linux/kernel/git/matthias.bgg/linux tags/v5.19-next-soc + tegra/firmware + git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux tags/tegra-for-5.20-firmware + tegra/memory + git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux tags/tegra-for-5.20-memory + ti/soc-drivers + git://git.kernel.org/pub/scm/linux/kernel/git/ti/linux tags/ti-driver-soc-for-v5.20 + imx/drivers + git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux tags/imx-drivers-5.20 + broadcom/drivers + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/drivers + qcom/drivers + git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-drivers-for-5.20 + +arm/defconfig + renesas/defconfig + git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-arm-defconfig-for-v5.20-tag1 + sunxi/defconfig + git://git.kernel.org/pub/scm/linux/kernel/git/sunxi/linux tags/sunxi-config64-for-5.20-1 + tegra/defconfig + git://git.kernel.org/pub/scm/linux/kernel/git/tegra/linux tags/tegra-for-5.20-arm64-defconfig + imx/defconfig + git://git.kernel.org/pub/scm/linux/kernel/git/shawnguo/linux tags/imx-defconfig-5.20 + broadcom/defconfig + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/defconfig + broadcom/defconfig-64 + https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/defconfig-arm64 + qcom/defconfig + git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-arm64-defconfig-for-5.20 + +arm/late + +arm/fixes + From 4648f81306d54242bd623c5c8947330c849415f8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 16 Jul 2022 21:35:31 -0700 Subject: [PATCH 0435/1250] fs/lock: Don't allocate file_lock in flock_make_lock(). Two functions, flock syscall and locks_remove_flock(), call flock_make_lock(). It allocates struct file_lock from slab cache if its argument fl is NULL. When we call flock syscall, we pass NULL to allocate memory for struct file_lock. 
However, we always free it at the end by locks_free_lock(). We need not allocate it and instead should use a local variable as locks_remove_flock() does. Also, the validation for flock_translate_cmd() is not necessary for locks_remove_flock(). So we move the part to flock syscall and make flock_make_lock() return nothing. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Chuck Lever Signed-off-by: Jeff Layton --- fs/locks.c | 46 +++++++++++++++------------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index ca28e0e50e569f..b134eaefd7d610 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -425,21 +425,9 @@ static inline int flock_translate_cmd(int cmd) { } /* Fill in a file_lock structure with an appropriate FLOCK lock. */ -static struct file_lock * -flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) +static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) { - int type = flock_translate_cmd(cmd); - - if (type < 0) - return ERR_PTR(type); - - if (fl == NULL) { - fl = locks_alloc_lock(); - if (fl == NULL) - return ERR_PTR(-ENOMEM); - } else { - locks_init_lock(fl); - } + locks_init_lock(fl); fl->fl_file = filp; fl->fl_owner = filp; @@ -447,8 +435,6 @@ flock_make_lock(struct file *filp, unsigned int cmd, struct file_lock *fl) fl->fl_flags = FL_FLOCK; fl->fl_type = type; fl->fl_end = OFFSET_MAX; - - return fl; } static int assign_type(struct file_lock *fl, long type) @@ -2097,10 +2083,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { + int can_sleep, error, unlock, type; struct fd f = fdget(fd); - struct file_lock *lock; - int can_sleep, unlock; - int error; + struct file_lock fl; error = -EBADF; if (!f.file) @@ -2127,28 +2112,27 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) goto out_putf; } - lock = flock_make_lock(f.file, cmd, NULL); - if (IS_ERR(lock)) { - error = PTR_ERR(lock); + type = 
flock_translate_cmd(cmd); + if (type < 0) { + error = type; goto out_putf; } + flock_make_lock(f.file, &fl, type); + if (can_sleep) - lock->fl_flags |= FL_SLEEP; + fl.fl_flags |= FL_SLEEP; - error = security_file_lock(f.file, lock->fl_type); + error = security_file_lock(f.file, fl.fl_type); if (error) - goto out_free; + goto out_putf; if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, (can_sleep) ? F_SETLKW : F_SETLK, - lock); + &fl); else - error = locks_lock_file_wait(f.file, lock); - - out_free: - locks_free_lock(lock); + error = locks_lock_file_wait(f.file, &fl); out_putf: fdput(f); @@ -2614,7 +2598,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) if (list_empty(&flctx->flc_flock)) return; - flock_make_lock(filp, LOCK_UN, &fl); + flock_make_lock(filp, &fl, F_UNLCK); fl.fl_flags |= FL_CLOSE; if (filp->f_op->flock) From 0064b3d9f96f3dc466e44a6fc716910cea56dbbf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sat, 16 Jul 2022 21:35:32 -0700 Subject: [PATCH 0436/1250] fs/lock: Rearrange ops in flock syscall. The previous patch added flock_translate_cmd() in flock syscall. The test and the other one for LOCK_MAND do not depend on struct fd and are cheaper, so we can put them at the top and defer fdget() after that. Also, we can remove the unlock variable and use type instead. While at it, we fix this checkpatch error. CHECK: spaces preferred around that '|' (ctx:VxV) #45: FILE: fs/locks.c:2099: + if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) ^ Finally, we can move the can_sleep part just before we use it. 
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jeff Layton --- fs/locks.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index b134eaefd7d610..c266cfdc3291f9 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2083,20 +2083,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { - int can_sleep, error, unlock, type; - struct fd f = fdget(fd); + int can_sleep, error, type; struct file_lock fl; - - error = -EBADF; - if (!f.file) - goto out; - - can_sleep = !(cmd & LOCK_NB); - cmd &= ~LOCK_NB; - unlock = (cmd == LOCK_UN); - - if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) - goto out_putf; + struct fd f; /* * LOCK_MAND locks were broken for a long time in that they never @@ -2108,35 +2097,41 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) */ if (cmd & LOCK_MAND) { pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); - error = 0; - goto out_putf; + return 0; } - type = flock_translate_cmd(cmd); - if (type < 0) { - error = type; + type = flock_translate_cmd(cmd & ~LOCK_NB); + if (type < 0) + return type; + + error = -EBADF; + f = fdget(fd); + if (!f.file) + return error; + + if (type != F_UNLCK && !(f.file->f_mode & (FMODE_READ | FMODE_WRITE))) goto out_putf; - } flock_make_lock(f.file, &fl, type); - if (can_sleep) - fl.fl_flags |= FL_SLEEP; - error = security_file_lock(f.file, fl.fl_type); if (error) goto out_putf; + can_sleep = !(cmd & LOCK_NB); + if (can_sleep) + fl.fl_flags |= FL_SLEEP; + if (f.file->f_op->flock) error = f.file->f_op->flock(f.file, - (can_sleep) ? F_SETLKW : F_SETLK, + (can_sleep) ? 
F_SETLKW : F_SETLK, &fl); else error = locks_lock_file_wait(f.file, &fl); out_putf: fdput(f); - out: + return error; } From db10b31e12e7d77e420d5086825717f00b7b6623 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 25 May 2022 16:27:25 +0200 Subject: [PATCH 0437/1250] btrfs: fix typos in comments Codespell has found a few typos. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/ordered-data.c | 4 ++-- fs/btrfs/raid56.c | 2 +- fs/btrfs/space-info.c | 2 +- fs/btrfs/subpage.c | 2 +- fs/btrfs/super.c | 2 +- fs/btrfs/sysfs.c | 2 +- fs/btrfs/tree-log.c | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9c21e214d29e41..ad31cc5cdd50cd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2734,7 +2734,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); /* - * Take the number of bytes to be checksummmed and figure out how many leaves + * Take the number of bytes to be checksummed and figure out how many leaves * it would require to store the csums for that many bytes. */ static inline u64 btrfs_csum_bytes_to_leaves( diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index de440ebf5648b0..018510188a0d13 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1872,7 +1872,7 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, fail: /* * If our caller provided us an anonymous device, then it's his - * responsability to free it in case we fail. So we have to set our + * responsibility to free it in case we fail. So we have to set our * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. 
*/ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d50448bf8eedd9..77a00155413346 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4228,7 +4228,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, /* * If we are in a rename context, we don't need to update anything in the * log. That will be done later during the rename by btrfs_log_new_name(). - * Besides that, doing it here would only cause extra unncessary btree + * Besides that, doing it here would only cause extra unnecessary btree * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1957b14b329a7a..dc88d2b3721fdf 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -275,7 +275,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, /* * Mark all ordered extents io inside the specified range finished. * - * @page: The invovled page for the opeartion. + * @page: The involved page for the operation. * For uncompressed buffered IO, the page status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. @@ -285,7 +285,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, * extent are finished. * * This function is called for endio, thus the range must have ordered - * extent(s) coveri it. + * extent(s) covering it. 
*/ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index a5b623ee6facdd..e03a38af12cdb2 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -132,7 +132,7 @@ struct btrfs_raid_bio { /* Number of data stripes (no p/q) */ u8 nr_data; - /* Numer of all stripes (including P/Q) */ + /* Number of all stripes (including P/Q) */ u8 real_stripes; /* How many pages there are for each stripe */ diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2dd8754cb990dd..2cf8da1116eb28 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1280,7 +1280,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still - * because we may have only satisified the priority tickets and still + * because we may have only satisfied the priority tickets and still * left non priority tickets on the list. We would then have * to_reclaim but ->bytes == 0. */ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index a105b291444f3c..0146fee730a093 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -123,7 +123,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* - * We have cases like a dummy extent buffer page, which is not mappped + * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. */ if (page->mapping) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6627dd7875ee0e..24b86061c5df30 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -72,7 +72,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data); #define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) /* - * Characters to print to indicate error conditions or uncommon filesystem sate. 
+ * Characters to print to indicate error conditions or uncommon filesystem state. * RO is not an error. */ static const char fs_state_chars[] = { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 92a1fa8e3da6ff..963d6321981454 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -296,7 +296,7 @@ BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); /* * Features which depend on feature bits and may differ between each fs. * - * /sys/fs/btrfs/features - all available features implemeted by this version + * /sys/fs/btrfs/features - all available features implemented by this version * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or * can be changed on a mounted filesystem. */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 370388fadf960a..1201f083d4dbc5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2287,7 +2287,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_key location; /* - * Currenly we only log dir index keys. Even if we replay a log created + * Currently we only log dir index keys. Even if we replay a log created * by an older kernel that logged both dir index and dir item keys, all * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
From f003be483c020a7649cef3f21ffd0f3b29abdd66 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Tue, 17 May 2022 20:45:32 +0200 Subject: [PATCH 0438/1250] btrfs: zoned: fix comment description for sb_write_pointer logic Fix the comment to represent the actual logic used for sb_write_pointer - Empty[0] && In use[1] should be an invalid state instead of returning zone 0 wp - Empty[0] && Full[1] should be returning zone 0 wp instead of zone 1 wp - In use[0] && Empty[1] should be returning zone 0 wp instead of being an invalid state - In use[0] && Full[1] should be returning zone 0 wp instead of returning zone 1 wp - Full[0] && Empty[1] should be returning zone 1 wp instead of returning zone 0 wp - Full[0] && In use[1] should be returning zone 1 wp instead of returning zone 0 wp Reviewed-by: Johannes Thumshirn Signed-off-by: Pankaj Raghav Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d99026df6f6795..79a2d48a525165 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -94,9 +94,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, * Possible states of log buffer zones * * Empty[0] In use[0] Full[0] - * Empty[1] * x 0 - * In use[1] 0 x 0 - * Full[1] 1 1 C + * Empty[1] * 0 1 + * In use[1] x x 1 + * Full[1] 0 0 C * * Log position: * *: Special case, no superblock is written From fe4b2e41bfe7a38b5c6f927afeca0c08926f03d1 Mon Sep 17 00:00:00 2001 From: Fanjun Kong Date: Thu, 26 May 2022 22:35:40 +0800 Subject: [PATCH 0439/1250] btrfs: use PAGE_ALIGNED instead of IS_ALIGNED The kernel already provides the PAGE_ALIGNED macro. Let's use it instead of IS_ALIGNED and passing PAGE_SIZE directly.
Reviewed-by: Muchun Song Reviewed-by: Nikolay Borisov Signed-off-by: Fanjun Kong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f03ab5dbda7ae2..c4c1e65a31ff90 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6206,7 +6206,7 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return -EINVAL; } if (fs_info->nodesize >= PAGE_SIZE && - !IS_ALIGNED(start, PAGE_SIZE)) { + !PAGE_ALIGNED(start)) { btrfs_err(fs_info, "tree block is not page aligned, start %llu nodesize %u", start, fs_info->nodesize); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 77a00155413346..332b8f1bf609da 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -560,8 +560,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, * will unlock the full page. */ if (fs_info->sectorsize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(end + 1, PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(end + 1)) return 0; } @@ -678,8 +678,8 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) * Thus we must also check against @actual_end, not just @end. */ if (blocksize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(round_up(actual_end, blocksize))) goto cleanup_and_bail_uncompressed; } From d3d3dd0de658306d3204b908db6eda16c1d347cb Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 22 May 2022 13:47:47 +0200 Subject: [PATCH 0440/1250] btrfs: quit early if the fs has no RAID56 support for raid56 related checks The following functions do special handling for RAID56 chunks: - btrfs_is_parity_mirror() Check if the range is in RAID56 chunks. 
- btrfs_full_stripe_len() Either return sectorsize for non-RAID56 profiles or full stripe length for RAID56 chunks. But if a filesystem has no RAID56 chunks, it will not have the RAID56 incompat flag, and we can skip the chunk tree lookup completely. Reviewed-by: Nikolay Borisov Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c20049d1fecf3..a2bb0928dc066f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5768,6 +5768,9 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct map_lookup *map; unsigned long len = fs_info->sectorsize; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return len; + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { @@ -5785,6 +5788,9 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret = 0; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return 0; + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { From 38ffa20dd8dc42ad7d93e2f9b9124ed23aeaf7ad Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 22 May 2022 13:47:47 +0200 Subject: [PATCH 0441/1250] btrfs: introduce a data checksum checking helper Although we have several pieces of data csum verification code, we have never had a function that just verifies the checksum for one sector. Function check_data_csum() does extra work for error reporting, thus it requires a lot of extra things like file offset, bio_offset etc. Function btrfs_verify_data_csum() is even worse, it will utilize page checked flag, which means it can not be utilized for direct IO pages. Here we introduce a new helper, btrfs_check_sector_csum(), which really only accepts a sector in a page and an expected checksum pointer.
We use this function to implement check_data_csum(), and export it for incoming patch. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo [hch: keep passing the csum array as an arguments, as the callers want to print it, rename per request] Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 13 ++++--------- fs/btrfs/ctree.h | 2 ++ fs/btrfs/inode.c | 38 ++++++++++++++++++++++++++++---------- 3 files changed, 34 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index f4564f32f6d93b..6ab82e142f1f86 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -147,12 +147,10 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, u64 disk_start) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); const u32 csum_size = fs_info->csum_size; const u32 sectorsize = fs_info->sectorsize; struct page *page; unsigned int i; - char *kaddr; u8 csum[BTRFS_CSUM_SIZE]; struct compressed_bio *cb = bio->bi_private; u8 *cb_sum = cb->sums; @@ -161,8 +159,6 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) return 0; - shash->tfm = fs_info->csum_shash; - for (i = 0; i < cb->nr_pages; i++) { u32 pg_offset; u32 bytes_left = PAGE_SIZE; @@ -175,12 +171,11 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, /* Hash through the page sector by sector */ for (pg_offset = 0; pg_offset < bytes_left; pg_offset += sectorsize) { - kaddr = kmap_atomic(page); - crypto_shash_digest(shash, kaddr + pg_offset, - sectorsize, csum); - kunmap_atomic(kaddr); + int ret; - if (memcmp(&csum, cb_sum, csum_size) != 0) { + ret = btrfs_check_sector_csum(fs_info, page, pg_offset, + csum, cb_sum); + if (ret) { btrfs_print_data_csum_error(inode, disk_start, csum, cb_sum, cb->mirror_num); if (btrfs_bio(bio)->device) 
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ad31cc5cdd50cd..6e65778040ed12 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3253,6 +3253,8 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 332b8f1bf609da..193931b3c20ad7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3328,6 +3328,29 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, finish_ordered_fn, uptodate); } +/* + * Verify the checksum for a single sector without any extra action that depend + * on the type of I/O. + */ +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected) +{ + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + char *kaddr; + + ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); + + shash->tfm = fs_info->csum_shash; + + kaddr = kmap_local_page(page) + pgoff; + crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + kunmap_local(kaddr); + + if (memcmp(csum, csum_expected, fs_info->csum_size)) + return -EIO; + return 0; +} + /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode @@ -3338,14 +3361,15 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, * @start: logical offset in the file * * The length of such check is always one sector size. + * + * When csum mismatch is detected, we will also report the error and fill the + * corrupted range with zero. 
(Thus it needs the extra parameters) */ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff, u64 start) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; u32 len = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; unsigned int offset_sectors; @@ -3357,16 +3381,10 @@ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, offset_sectors = bio_offset >> fs_info->sectorsize_bits; csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; - kaddr = kmap_atomic(page); - shash->tfm = fs_info->csum_shash; - - crypto_shash_digest(shash, kaddr + pgoff, len, csum); - kunmap_atomic(kaddr); - - if (memcmp(csum, csum_expected, csum_size)) + if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) goto zeroit; - return 0; + zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, bbio->mirror_num); From 4f7a23bdb657c1fe496ce4b4f189317416097631 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 22 May 2022 13:47:49 +0200 Subject: [PATCH 0442/1250] btrfs: remove duplicated parameters from submit_data_read_repair() The function submit_data_read_repair() is only called for buffered data read path, thus those members can be calculated using bvec directly: - start start = page_offset(bvec->bv_page) + bvec->bv_offset; - end end = start + bvec->bv_len - 1; - page page = bvec->bv_page; - pgoff pgoff = bvec->bv_offset; Thus we can safely replace those 4 parameters with just one bio_vec. Also remove the unused return value. 
Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo [hch: also remove the return value] Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c4c1e65a31ff90..0fbe28b3de6b26 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2727,18 +2727,17 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -static blk_status_t submit_data_read_repair(struct inode *inode, - struct bio *failed_bio, - u32 bio_offset, struct page *page, - unsigned int pgoff, - u64 start, u64 end, - int failed_mirror, - unsigned int error_bitmap) +static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio, + u32 bio_offset, const struct bio_vec *bvec, + int failed_mirror, unsigned int error_bitmap) { + const unsigned int pgoff = bvec->bv_offset; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct page *page = bvec->bv_page; + const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; + const u64 end = start + bvec->bv_len - 1; const u32 sectorsize = fs_info->sectorsize; const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int error = 0; int i; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2785,11 +2784,9 @@ static blk_status_t submit_data_read_repair(struct inode *inode, continue; } /* - * Repair failed, just record the error but still continue. - * Or the remaining sectors will not be properly unlocked. + * Continue on failed repair, otherwise the remaining sectors + * will not be properly unlocked. 
*/ - if (!error) - error = ret; next: end_page_read(page, uptodate, start + offset, sectorsize); if (uptodate) @@ -2802,7 +2799,6 @@ static blk_status_t submit_data_read_repair(struct inode *inode, start + offset + sectorsize - 1, &cached); } - return errno_to_blk_status(error); } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -3095,10 +3091,8 @@ static void end_bio_extent_readpage(struct bio *bio) * submit_data_read_repair() will handle all the good * and bad sectors, we just continue to the next bvec. */ - submit_data_read_repair(inode, bio, bio_offset, page, - start - page_offset(page), - start, end, mirror, - error_bitmap); + submit_data_read_repair(inode, bio, bio_offset, bvec, + mirror, error_bitmap); ASSERT(bio_offset + len > bio_offset); bio_offset += len; From 448d7e84337f71eb4bd8da9a024e6b8c5fb7cf3b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 May 2022 13:47:50 +0200 Subject: [PATCH 0443/1250] btrfs: factor out a helper to end a single sector buffer I/O Add a helper to end I/O on a single sector, which will come in handy with the new read repair code. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0fbe28b3de6b26..b50c1f1e21350e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2727,6 +2727,20 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } +static void end_sector_io(struct page *page, u64 offset, bool uptodate) +{ + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct extent_state *cached = NULL; + + end_page_read(page, uptodate, offset, sectorsize); + if (uptodate) + set_extent_uptodate(&inode->io_tree, offset, + offset + sectorsize - 1, &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(&inode->io_tree, offset, + offset + sectorsize - 1, &cached); +} + static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio, u32 bio_offset, const struct bio_vec *bvec, int failed_mirror, unsigned int error_bitmap) @@ -2757,7 +2771,6 @@ static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio, /* Iterate through all the sectors in the range */ for (i = 0; i < nr_bits; i++) { const unsigned int offset = i * sectorsize; - struct extent_state *cached = NULL; bool uptodate = false; int ret; @@ -2788,16 +2801,7 @@ static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio, * will not be properly unlocked. 
*/ next: - end_page_read(page, uptodate, start + offset, sectorsize); - if (uptodate) - set_extent_uptodate(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached); + end_sector_io(page, start + offset, uptodate); } } From 17208f7b70060d18b8c76ef271e12a7c543e64a3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 May 2022 13:47:51 +0200 Subject: [PATCH 0444/1250] btrfs: refactor end_bio_extent_readpage code flow Untangle the goto and move the code it jumps to so it goes in the order of the most likely states first. Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 86 +++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b50c1f1e21350e..62425d1494a90d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3017,7 +3017,6 @@ static void end_bio_extent_readpage(struct bio *bio) */ u32 bio_offset = 0; int mirror; - int ret; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -3028,6 +3027,7 @@ static void end_bio_extent_readpage(struct bio *bio) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; unsigned int error_bitmap = (unsigned int)-1; + bool repair = false; u64 start; u64 end; u32 len; @@ -3065,55 +3065,23 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); - ret = error_bitmap; + if (error_bitmap) + uptodate = false; } else { - ret = btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror); + if (btrfs_validate_metadata_buffer(bbio, + page, start, end, mirror)) + uptodate = false; } 
- if (ret) - uptodate = false; - else - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, tree, start, - page, - btrfs_ino(BTRFS_I(inode)), 0); } - if (likely(uptodate)) - goto readpage_ok; - - if (is_data_inode(inode)) { - /* - * If we failed to submit the IO at all we'll have a - * mirror_num == 0, in which case we need to just mark - * the page with an error and unlock it and carry on. - */ - if (mirror == 0) - goto readpage_ok; - - /* - * submit_data_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. - */ - submit_data_read_repair(inode, bio, bio_offset, bvec, - mirror, error_bitmap); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - continue; - } else { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); - } -readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, tree, start, page, + btrfs_ino(BTRFS_I(inode)), 0); + /* * Zero out the remaining part if this range straddles * i_size. @@ -3130,14 +3098,40 @@ static void end_bio_extent_readpage(struct bio *bio) zero_user_segment(page, zero_start, offset_in_page(end) + 1); } + } else if (is_data_inode(inode)) { + /* + * Only try to repair bios that actually made it to a + * device. If the bio failed to be submitted mirror + * is 0 and we need to fail it without retrying. + */ + if (mirror > 0) + repair = true; + } else { + struct extent_buffer *eb; + + eb = find_extent_buffer_readpage(fs_info, page, start); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); } + + if (repair) { + /* + * submit_data_read_repair() will handle all the good + * and bad sectors, we just continue to the next bvec. 
+ */ + submit_data_read_repair(inode, bio, bio_offset, bvec, + mirror, error_bitmap); + } else { + /* Update page status and unlock */ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); + } + ASSERT(bio_offset + len > bio_offset); bio_offset += len; - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); From 434c2c59ed6ac59e111fa09db49c863053253a99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 May 2022 13:47:52 +0200 Subject: [PATCH 0445/1250] btrfs: factor out a btrfs_csum_ptr helper Add a helper to find the csum for a byte offset into the csum buffer. Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 ++++++++ fs/btrfs/inode.c | 13 +++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6e65778040ed12..613f46bab3e22f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2733,6 +2733,14 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); +static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, + u64 offset) +{ + u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; + + return csums + offset_in_sectors * fs_info->csum_size; +} + /* * Take the number of bytes to be checksummed and figure out how many leaves * it would require to store the csums for that many bytes. 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 193931b3c20ad7..631239f76bc25a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3371,15 +3371,12 @@ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u32 len = fs_info->sectorsize; - const u32 csum_size = fs_info->csum_size; - unsigned int offset_sectors; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(pgoff + len <= PAGE_SIZE); - offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) goto zeroit; @@ -8020,12 +8017,8 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, if (ret) goto err; } else { - u64 csum_offset; - - csum_offset = file_offset - dip->file_offset; - csum_offset >>= fs_info->sectorsize_bits; - csum_offset *= fs_info->csum_size; - btrfs_bio(bio)->csum = dip->csums + csum_offset; + btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, + file_offset - dip->file_offset); } map: ret = btrfs_map_bio(fs_info, bio, 0); From b01f15be991c3f384b17fd1e59bc167a07469a1e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 22 May 2022 13:47:53 +0200 Subject: [PATCH 0446/1250] btrfs: add a helper to iterate through a btrfs_bio with sector sized chunks Add a helper that works similar to __bio_for_each_segment, but instead of iterating over PAGE_SIZE chunks it iterates over each sector. 
Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo [hch: split from a larger patch, and iterate over the offset instead of the offset bits] Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba [ add parameter comments ] Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 6721002000ee0f..1e86c48268edde 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -391,6 +391,22 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) } } +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. + * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iters - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + struct btrfs_io_stripe { struct btrfs_device *dev; u64 physical; From 462e7e1a9b32c612f5cf1593e1c5d4a1ee6f2383 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 May 2022 13:47:54 +0200 Subject: [PATCH 0447/1250] btrfs: use btrfs_bio_for_each_sector in btrfs_check_read_dio_bio Use the new btrfs_bio_for_each_sector iterator to simplify btrfs_check_read_dio_bio. 
Reviewed-by: Qu Wenruo Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 56 +++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 631239f76bc25a..76e493e2d9b28a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7899,47 +7899,35 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - struct bio_vec bvec; - struct bvec_iter iter; - u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; + + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; + + if (uptodate && + (!csum || !check_data_csum(inode, bbio, offset, bv.bv_page, + bv.bv_offset, start))) { + clean_io_failure(fs_info, failure_tree, io_tree, start, + bv.bv_page, btrfs_ino(BTRFS_I(inode)), + bv.bv_offset); + } else { + int ret; - __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec.bv_offset; - for (i = 0; i < nr_sectors; i++) { - u64 start = bbio->file_offset + bio_offset; - - ASSERT(pgoff < PAGE_SIZE); - if (uptodate && - (!csum || !check_data_csum(inode, bbio, - bio_offset, bvec.bv_page, - pgoff, start))) { - clean_io_failure(fs_info, failure_tree, io_tree, - start, bvec.bv_page, - btrfs_ino(BTRFS_I(inode)), - pgoff); - } else { - int ret; - - ret = btrfs_repair_one_sector(inode, &bbio->bio, - bio_offset, bvec.bv_page, pgoff, - 
start, bbio->mirror_num, - submit_dio_repair_bio); - if (ret) - err = errno_to_blk_status(ret); - } - ASSERT(bio_offset + sectorsize > bio_offset); - bio_offset += sectorsize; - pgoff += sectorsize; + ret = btrfs_repair_one_sector(inode, &bbio->bio, offset, + bv.bv_page, bv.bv_offset, start, + bbio->mirror_num, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } } + return err; } From 9047d5aba2ed3ff65da67c0a7c8606d787ad8081 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 3 May 2022 11:36:36 +0300 Subject: [PATCH 0448/1250] btrfs: introduce btrfs_try_lock_balance This function contains the factored out locking sequence of btrfs_ioctl_balance. Having this piece of code separate helps to simplify btrfs_ioctl_balance, which has become too complicated. This will be used in the next patch to streamline the logic in btrfs_ioctl_balance. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0f79af919bc4ea..e40ce7a6d21e3d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4355,6 +4355,72 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } +/** + * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as + * required. + * + * @fs_info: the filesystem + * @excl_acquired: ptr to boolean value which is set to false in case balance + * is being resumed + * + * Return 0 on success in which case both fs_info::balance is acquired as well + * as exclusive ops are blocked. In case of failure return an error code. + */ +static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired) +{ + int ret; + + /* + * Exclusive operation is locked.
Three possibilities: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ + while (1) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + *excl_acquired = true; + mutex_lock(&fs_info->balance_mutex); + return 0; + } + + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* This is either (2) or (3) */ + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (2) */ + ret = -EINPROGRESS; + goto out_failure; + + } else { + mutex_unlock(&fs_info->balance_mutex); + /* + * Lock released to allow other waiters to + * continue, we'll reexamine the status again. + */ + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (3) */ + *excl_acquired = false; + return 0; + } + } + } else { + /* This is (1) */ + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_failure; + } + + mutex_unlock(&fs_info->balance_mutex); + } + +out_failure: + mutex_unlock(&fs_info->balance_mutex); + *excl_acquired = false; + return ret; +} + static long btrfs_ioctl_balance(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; From f7752b086c89364717a881236722fc46dbc2b087 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 5 May 2022 10:08:25 +0300 Subject: [PATCH 0449/1250] btrfs: use btrfs_try_lock_balance in btrfs_ioctl_balance This eliminates 2 labels and makes the code generally more streamlined. Also rename the 'out_bargs' label to 'out_unlock' since bargs is going to be freed under the 'out' label. This also fixes a memory leak since bargs wasn't correctly freed in one of the condition which are now moved in btrfs_try_lock_balance. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 51 +++++------------------------------------------- 1 file changed, 5 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e40ce7a6d21e3d..679ce4c5c341ab 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4427,7 +4427,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; - bool need_unlock; /* for mut. excl. ops lock */ + bool need_unlock = true; int ret; if (!capable(CAP_SYS_ADMIN)) @@ -4444,53 +4444,12 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) goto out; } -again: - if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { - mutex_lock(&fs_info->balance_mutex); - need_unlock = true; - goto locked; - } - - /* - * mut. excl. ops lock is locked. Three possibilities: - * (1) some other op is running - * (2) balance is running - * (3) balance is paused -- special case (think resume) - */ - mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - /* this is either (2) or (3) */ - if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - mutex_unlock(&fs_info->balance_mutex); - /* - * Lock released to allow other waiters to continue, - * we'll reexamine the status again. 
- */ - mutex_lock(&fs_info->balance_mutex); - - if (fs_info->balance_ctl && - !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - /* this is (3) */ - need_unlock = false; - goto locked; - } - - mutex_unlock(&fs_info->balance_mutex); - goto again; - } else { - /* this is (2) */ - mutex_unlock(&fs_info->balance_mutex); - ret = -EINPROGRESS; - goto out; - } - } else { - /* this is (1) */ - mutex_unlock(&fs_info->balance_mutex); - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + ret = btrfs_try_lock_balance(fs_info, &need_unlock); + if (ret) goto out; - } -locked: + lockdep_assert_held(&fs_info->balance_mutex); + if (bargs->flags & BTRFS_BALANCE_RESUME) { if (!fs_info->balance_ctl) { ret = -ENOTCONN; From 0b343788d0e1199483189c559db486ca1c9a3848 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 27 May 2022 15:28:17 +0800 Subject: [PATCH 0450/1250] btrfs: use integrated bitmaps for btrfs_raid_bio::dbitmap and finish_pbitmap Previously we used "unsigned long *" for those two bitmaps. But since we only support fixed stripe length (64KiB, already checked in tree-checker), "unsigned long *" is really a waste of memory, while we can just use "unsigned long". This saves us 8 bytes in total for btrfs_raid_bio. To be extra safe, add an ASSERT() making sure the calculated @stripe_nsectors is never larger than BITS_PER_LONG. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e03a38af12cdb2..90f6ae49fd7b98 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -164,6 +164,13 @@ struct btrfs_raid_bio { atomic_t stripes_pending; atomic_t error; + + /* Bitmap to record which horizontal stripe has data */ + unsigned long dbitmap; + + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ + unsigned long finish_pbitmap; + /* * these are two arrays of pointers.
We allocate the * rbio big enough to hold them both and setup their @@ -184,14 +191,8 @@ struct btrfs_raid_bio { */ struct sector_ptr *stripe_sectors; - /* Bitmap to record which horizontal stripe has data */ - unsigned long *dbitmap; - /* allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; - - /* Allocated with stripe_nsectors-many bits for finish_*() calls */ - unsigned long *finish_pbitmap; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); @@ -1038,14 +1039,17 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * Our current stripe len should be fixed to 64k thus stripe_nsectors + * (at most 16) should be no larger than BITS_PER_LONG. + */ + ASSERT(stripe_nsectors <= BITS_PER_LONG); rbio = kzalloc(sizeof(*rbio) + sizeof(*rbio->stripe_pages) * num_pages + sizeof(*rbio->bio_sectors) * num_sectors + sizeof(*rbio->stripe_sectors) * num_sectors + - sizeof(*rbio->finish_pointers) * real_stripes + - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) + - sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors), + sizeof(*rbio->finish_pointers) * real_stripes, GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1081,8 +1085,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, CONSUME_ALLOC(rbio->bio_sectors, num_sectors); CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); CONSUME_ALLOC(rbio->finish_pointers, real_stripes); - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors)); - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors)); #undef CONSUME_ALLOC if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) @@ -1939,7 +1941,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * which we have data when doing parity scrub. 
*/ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sectornr, rbio->dbitmap)) + !test_bit(sectornr, &rbio->dbitmap)) continue; /* @@ -2374,7 +2376,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, } ASSERT(i < rbio->real_stripes); - bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); /* * We have already increased bio_counter when getting bioc, record it @@ -2412,7 +2414,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) int stripe; int sectornr; - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { for (stripe = 0; stripe < rbio->real_stripes; stripe++) { struct page *page; int index = (stripe * rbio->stripe_nsectors + sectornr) * @@ -2437,7 +2439,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; - unsigned long *pbitmap = rbio->finish_pbitmap; + unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; int sectornr; @@ -2460,7 +2462,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors); + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } /* @@ -2497,7 +2499,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; void *parity; @@ -2525,7 +2527,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, memcpy(parity, 
pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ - bitmap_clear(rbio->dbitmap, sectornr, 1); + bitmap_clear(&rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) @@ -2547,7 +2549,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ - for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); @@ -2714,7 +2716,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) { + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; /* * We want to find all the sectors missing from the From 8548fcd727ad1fb13e6abb2c843145a0204d84db Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 27 May 2022 15:28:18 +0800 Subject: [PATCH 0451/1250] btrfs: use integrated bitmaps for scrub_parity::dbitmap and ebitmap Previously we used "unsigned long *" for those two bitmaps. But since we only support fixed stripe length (64KiB, already checked in tree-checker), "unsigned long *" is really a waste of memory, while we can just use "unsigned long". This saves us 8 bytes in total for scrub_parity. To be extra safe, add an ASSERT() making sure the calculated @nsectors is never larger than BITS_PER_LONG.
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e7b0323e6efd8c..db700e6ec5a932 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -135,15 +135,13 @@ struct scrub_parity { struct work_struct work; /* Mark the parity blocks which have data */ - unsigned long *dbitmap; + unsigned long dbitmap; /* * Mark the parity blocks which have data, but errors happen when * read data or check data */ - unsigned long *ebitmap; - - unsigned long bitmap[]; + unsigned long ebitmap; }; struct scrub_ctx { @@ -2406,13 +2404,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); } static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); } static void scrub_block_complete(struct scrub_block *sblock) @@ -2763,7 +2761,7 @@ static void scrub_free_parity(struct scrub_parity *sparity) struct scrub_sector *curr, *next; int nbits; - nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); + nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); if (nbits) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors += nbits; @@ -2795,8 +2793,8 @@ static void scrub_parity_bio_endio(struct bio *bio) struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; if (bio->bi_status) - bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, - sparity->nsectors); + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, + &sparity->dbitmap, sparity->nsectors); bio_put(bio); @@ -2814,8 +2812,8 @@ static void 
scrub_parity_check_and_repair(struct scrub_parity *sparity) u64 length; int ret; - if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, - sparity->nsectors)) + if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, + &sparity->ebitmap, sparity->nsectors)) goto out; length = sparity->logic_end - sparity->logic_start; @@ -2833,7 +2831,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, sparity->scrub_dev, - sparity->dbitmap, + &sparity->dbitmap, sparity->nsectors); if (!rbio) goto rbio_out; @@ -2847,7 +2845,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) bioc_out: btrfs_bio_counter_dec(fs_info); btrfs_put_bioc(bioc); - bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2856,11 +2854,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) scrub_free_parity(sparity); } -static inline int scrub_calc_parity_bitmap_len(int nsectors) -{ - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); -} - static void scrub_parity_get(struct scrub_parity *sparity) { refcount_inc(&sparity->refs); @@ -3131,7 +3124,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int ret; struct scrub_parity *sparity; int nsectors; - int bitmap_len; path = btrfs_alloc_path(); if (!path) { @@ -3145,9 +3137,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, ASSERT(map->stripe_len <= U32_MAX); nsectors = map->stripe_len >> fs_info->sectorsize_bits; - bitmap_len = scrub_calc_parity_bitmap_len(nsectors); - sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, - GFP_NOFS); + ASSERT(nsectors <= BITS_PER_LONG); + sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS); if (!sparity) { spin_lock(&sctx->stat_lock); 
sctx->stat.malloc_errors++; @@ -3165,8 +3156,6 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->logic_end = logic_end; refcount_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->sectors_list); - sparity->dbitmap = sparity->bitmap; - sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; ret = 0; for (cur_logical = logic_start; cur_logical < logic_end; From 9918fad10a147ce4809d4066258923c5f3e6e3c1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 27 May 2022 15:28:19 +0800 Subject: [PATCH 0452/1250] btrfs: only write the sectors in the vertical stripe which has data stripes If we have only 8K partial write at the beginning of a full RAID56 stripe, we will write the following contents: 0 8K 32K 64K Disk 1 (data): |XX| | | Disk 2 (data): | | | Disk 3 (parity): |XXXXXXXXXXXXXXX|XXXXXXXXXXXXXXX| |X| means the sector will be written back to disk. Note that, although we won't write any sectors from disk 2, but we will write the full 64KiB of parity to disk. This behavior is fine for now, but not for the future (especially for RAID56J, as we waste quite some space to journal the unused parity stripes). So here we will also utilize the btrfs_raid_bio::dbitmap, anytime we queue a higher level bio into an rbio, we will update rbio::dbitmap to indicate which vertical stripes we need to writeback. And at finish_rmw(), we also check dbitmap to see if we need to write any sector in the vertical stripe. 
So after the patch, above example will only lead to the following writeback pattern: 0 8K 32K 64K Disk 1 (data): |XX| | | Disk 2 (data): | | | Disk 3 (parity): |XX| | | Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 53 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 90f6ae49fd7b98..454ceee6bab532 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -392,6 +392,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + /* Also inherit the bitmaps from @victim. */ + bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, + dest->stripe_nsectors); dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -933,6 +936,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) if (rbio->generic_bio_cnt) btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); + /* + * Clear the data bitmap, as the rbio may be cached for later usage. + * do this before before unlock_stripe() so there will be no new bio + * for this bio. + */ + bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -1284,6 +1293,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else BUG(); + /* We should have at least one data sector. */ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + /* at this point we either have a full stripe, * or we've read the full stripe from the drive. * recalculate the parity and write the new results. @@ -1358,6 +1370,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { struct sector_ptr *sector; + /* This vertical stripe has no data, skip it. 
*/ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (stripe < rbio->nr_data) { sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (!sector) @@ -1384,6 +1400,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { struct sector_ptr *sector; + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (stripe < rbio->nr_data) { sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (!sector) @@ -1835,6 +1855,33 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) run_plug(plug); } +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) +{ + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; + const u64 full_stripe_start = rbio->bioc->raid_map[0]; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; + + ASSERT(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * rbio->stripe_len); + + bio_list_add(&rbio->bio_list, orig_bio); + rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; + + /* Update the dbitmap. */ + for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; + cur_logical += sectorsize) { + int bit = ((u32)(cur_logical - full_stripe_start) >> + fs_info->sectorsize_bits) % rbio->stripe_nsectors; + + set_bit(bit, &rbio->dbitmap); + } +} + /* * our main entry point for writes from the rest of the FS. 
*/ @@ -1851,9 +1898,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri btrfs_put_bioc(bioc); return PTR_ERR(rbio); } - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; + rbio_add_bio(rbio, bio); btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; @@ -2258,8 +2304,7 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, } rbio->operation = BTRFS_RBIO_READ_REBUILD; - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio_add_bio(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { From 5306c83bdeae6e2d5181bf98f8b94b09255ae9d6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 1 Jun 2022 13:47:54 +0200 Subject: [PATCH 0453/1250] btrfs: remove redundant calls to flush_dcache_page Both memzero_page and memcpy_to_page already call flush_dcache_page so we can remove the calls from btrfs code. 
Reviewed-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 -- fs/btrfs/extent_io.c | 7 +------ fs/btrfs/inode.c | 6 ++---- fs/btrfs/reflink.c | 5 +---- 4 files changed, 4 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6ab82e142f1f86..2536754656b665 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -760,7 +760,6 @@ static noinline int add_ra_bio_pages(struct inode *inode, int zeros; zeros = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, zeros); - flush_dcache_page(page); } } @@ -1476,7 +1475,6 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, ASSERT(copy_start - decompressed < buf_len); memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + copy_start - decompressed, copy_len); - flush_dcache_page(bvec.bv_page); cur_offset += copy_len; bio_advance(orig_bio, copy_len); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 62425d1494a90d..69b6b4ba009e47 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3641,7 +3641,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (zero_offset) { iosize = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, iosize); - flush_dcache_page(page); } } begin_page_read(fs_info, page); @@ -3656,7 +3655,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, @@ -3740,7 +3738,6 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct extent_state *cached = NULL; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -4158,10 +4155,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, return 0; } - 
if (page->index == end_index) { + if (page->index == end_index) memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); - flush_dcache_page(page); - } ret = set_page_extent_mapped(page); if (ret < 0) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 76e493e2d9b28a..f96e332bfe963d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4873,7 +4873,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, else memzero_page(page, (block_start - page_offset(page)) + offset, len); - flush_dcache_page(page); } btrfs_page_clear_checked(fs_info, page, block_start, block_end + 1 - block_start); @@ -8598,10 +8597,9 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) else zero_start = PAGE_SIZE; - if (zero_start != PAGE_SIZE) { + if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); - flush_dcache_page(page); - } + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index a3549d587464af..e30f53bd4e5581 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -110,7 +110,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (comp_type == BTRFS_COMPRESS_NONE) { memcpy_to_page(page, offset_in_page(file_offset), data_start, datal); - flush_dcache_page(page); } else { ret = btrfs_decompress(comp_type, data_start, page, offset_in_page(file_offset), @@ -132,10 +131,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * * So what's in the range [500, 4095] corresponds to zeroes. 
*/ - if (datal < block_size) { + if (datal < block_size) memzero_page(page, datal, block_size - datal); - flush_dcache_page(page); - } btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); btrfs_page_clear_checked(fs_info, page, file_offset, block_size); From 34f0ce5fb5b2626ab4a7748c26bff94093b5883a Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 1 Jun 2022 13:54:28 +0800 Subject: [PATCH 0454/1250] btrfs: update stripe_sectors::uptodate in steal_rbio [BUG] With added debugging, it turns out the following write sequence would cause extra read which is unnecessary: # xfs_io -f -s -c "pwrite -b 32k 0 32k" -c "pwrite -b 32k 32k 32k" \ -c "pwrite -b 32k 64k 32k" -c "pwrite -b 32k 96k 32k" \ $mnt/file The debug message looks like this (btrfs header skipped): partial rmw, full stripe=389152768 opf=0x0 devid=3 type=1 offset=32768 physical=323059712 len=32768 partial rmw, full stripe=389152768 opf=0x0 devid=1 type=2 offset=0 physical=67174400 len=65536 full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=0 physical=323026944 len=32768 full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=0 physical=323026944 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=1 type=1 offset=32768 physical=22052864 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=2 type=2 offset=0 physical=277872640 len=65536 full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=0 physical=22020096 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=0 physical=277872640 len=32768 partial rmw, full stripe=389152768 opf=0x0 devid=3 type=1 offset=0 physical=323026944 len=32768 partial rmw, full stripe=389152768 opf=0x0 devid=1 type=2 offset=0 physical=67174400 len=65536 ^^^^ Still partial read, even 389152768 is already cached by the first. write. 
full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=32768 physical=323059712 len=32768 full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=32768 physical=323059712 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=1 type=1 offset=0 physical=22020096 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=2 type=2 offset=0 physical=277872640 len=65536 ^^^^ Still partial read for 298844160. full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=32768 physical=22052864 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=32768 physical=277905408 len=32768 This means every 32K write, even if it is in the same full stripe, still triggers a read for previously cached data. This would cause extra RAID56 IO, making the btrfs raid56 cache useless. [CAUSE] Commit d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible") tries to make steal_rbio() subpage compatible, but during that conversion, there is one thing missing. We no longer rely on PageUptodate(rbio->stripe_pages[i]), but rbio->stripe_sectors[i].uptodate to determine if a sector is uptodate. This means, previously if we switch the pointer, everything is done, as the PageUptodate flag is still bound to that page. But now we have to manually mark the involved sectors uptodate, or later raid56_rmw_stripe() will find the stolen sector is not uptodate, and assemble the read bio for it, wasting IO. [FIX] We can easily fix the bug, by also updating the rbio->stripe_sectors[].uptodate in steal_rbio().
With this fixed, now the same write pattern no longer leads to the same unnecessary read: partial rmw, full stripe=389152768 opf=0x0 devid=3 type=1 offset=32768 physical=323059712 len=32768 partial rmw, full stripe=389152768 opf=0x0 devid=1 type=2 offset=0 physical=67174400 len=65536 full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=0 physical=323026944 len=32768 full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=0 physical=323026944 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=1 type=1 offset=32768 physical=22052864 len=32768 partial rmw, full stripe=298844160 opf=0x0 devid=2 type=2 offset=0 physical=277872640 len=65536 full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=0 physical=22020096 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=0 physical=277872640 len=32768 ^^^ No more partial read, directly into the write path. full stripe rmw, full stripe=389152768 opf=0x1 devid=3 type=1 offset=32768 physical=323059712 len=32768 full stripe rmw, full stripe=389152768 opf=0x1 devid=2 type=-1 offset=32768 physical=323059712 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=1 type=1 offset=32768 physical=22052864 len=32768 full stripe rmw, full stripe=298844160 opf=0x1 devid=3 type=-1 offset=32768 physical=277905408 len=32768 Fixes: d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible") Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 454ceee6bab532..c48b7a0992f641 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -348,6 +348,24 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) } } +static void steal_rbio_page(struct btrfs_raid_bio *src, + struct btrfs_raid_bio *dest, int page_nr) +{ + const u32 sectorsize = 
src->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + if (dest->stripe_pages[page_nr]) + __free_page(dest->stripe_pages[page_nr]); + dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; + src->stripe_pages[page_nr] = NULL; + + /* Also update the sector->uptodate bits. */ + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; i++) + dest->stripe_sectors[i].uptodate = true; +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -359,7 +377,6 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; struct page *s; - struct page *d; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; @@ -369,12 +386,7 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) if (!s || !full_page_sectors_uptodate(src, i)) continue; - d = dest->stripe_pages[i]; - if (d) - __free_page(d); - - dest->stripe_pages[i] = s; - src->stripe_pages[i] = NULL; + steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); index_stripe_sectors(src); From 06268502c648e1b4b7424a9777da57c5f1811e5d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 1 Jun 2022 17:46:59 +0800 Subject: [PATCH 0455/1250] btrfs: add trace event for submitted RAID56 bio Add tracepoint for better insight to how the RAID56 data are submitted. 
The output looks like this: (trace event header and UUID skipped) raid56_read_partial: full_stripe=389152768 devid=3 type=DATA1 offset=32768 opf=0x0 physical=323059712 len=32768 raid56_read_partial: full_stripe=389152768 devid=1 type=DATA2 offset=0 opf=0x0 physical=67174400 len=65536 raid56_write_stripe: full_stripe=389152768 devid=3 type=DATA1 offset=0 opf=0x1 physical=323026944 len=32768 raid56_write_stripe: full_stripe=389152768 devid=2 type=PQ1 offset=0 opf=0x1 physical=323026944 len=32768 The above debug output is from a 32K data write into an empty RAID56 data chunk. Some explanation on the event output: full_stripe: the logical bytenr of the full stripe devid: btrfs devid type: raid stripe type. DATA1: the first data stripe DATA2: the second data stripe PQ1: the P stripe PQ2: the Q stripe offset: the offset inside the stripe. opf: the bio op type physical: the physical offset the bio is for len: the length of the bio The first two lines are from partial RMW read, which is reading the remaining data stripes from disks. The last two lines are for full stripe RMW write, which is writing the involved two 16K stripes (one for DATA1 stripe, one for P stripe). The stripe for DATA2 doesn't need to be written. There are 5 types of trace events: - raid56_read_partial Read remaining data for regular read/write path. - raid56_write_stripe Write the modified stripes for regular read/write path. - raid56_scrub_read_recover Read remaining data for scrub recovery path. - raid56_scrub_write_stripe Write the modified stripes for scrub path. - raid56_scrub_read Read remaining data for scrub path. Also, since the trace events are included at super.c, we have to export needed structure definitions to 'raid56.h' and include the header in super.c, or we're unable to access those members. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ reformat comments ] Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 190 +++++++++++------------------------ fs/btrfs/raid56.h | 148 ++++++++++++++++++++++++++- fs/btrfs/super.c | 1 + include/trace/events/btrfs.h | 94 +++++++++++++++++ 4 files changed, 300 insertions(+), 133 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c48b7a0992f641..baba435692d2c4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -63,138 +63,6 @@ struct sector_ptr { unsigned int uptodate:8; }; -enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE, - BTRFS_RBIO_READ_REBUILD, - BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, -}; - -struct btrfs_raid_bio { - struct btrfs_io_context *bioc; - - /* while we're doing rmw on a stripe - * we put it into a hash table so we can - * lock the stripe and merge more rbios - * into it. - */ - struct list_head hash_list; - - /* - * LRU list for the stripe cache - */ - struct list_head stripe_cache; - - /* - * for scheduling work in the helper threads - */ - struct work_struct work; - - /* - * bio list and bio_list_lock are used - * to add more bios into the stripe - * in hopes of avoiding the full rmw - */ - struct bio_list bio_list; - spinlock_t bio_list_lock; - - /* also protected by the bio_list_lock, the - * plug list is used by the plugging code - * to collect partial bios while plugged. 
The - * stripe locking code also uses it to hand off - * the stripe lock to the next pending IO - */ - struct list_head plug_list; - - /* - * flags that tell us if it is safe to - * merge with this bio - */ - unsigned long flags; - - /* - * set if we're doing a parity rebuild - * for a read from higher up, which is handled - * differently from a parity rebuild as part of - * rmw - */ - enum btrfs_rbio_ops operation; - - /* Size of each individual stripe on disk */ - u32 stripe_len; - - /* How many pages there are for the full stripe including P/Q */ - u16 nr_pages; - - /* How many sectors there are for the full stripe including P/Q */ - u16 nr_sectors; - - /* Number of data stripes (no p/q) */ - u8 nr_data; - - /* Number of all stripes (including P/Q) */ - u8 real_stripes; - - /* How many pages there are for each stripe */ - u8 stripe_npages; - - /* How many sectors there are for each stripe */ - u8 stripe_nsectors; - - /* First bad stripe, -1 means no corruption */ - s8 faila; - - /* Second bad stripe (for RAID6 use) */ - s8 failb; - - /* Stripe number that we're scrubbing */ - u8 scrubp; - - /* - * size of all the bios in the bio_list. This - * helps us decide if the rbio maps to a full - * stripe or not - */ - int bio_list_bytes; - - int generic_bio_cnt; - - refcount_t refs; - - atomic_t stripes_pending; - - atomic_t error; - - /* Bitmap to record which horizontal stripe has data */ - unsigned long dbitmap; - - /* Allocated with stripe_nsectors-many bits for finish_*() calls */ - unsigned long finish_pbitmap; - - /* - * these are two arrays of pointers. 
We allocate the - * rbio big enough to hold them both and setup their - * locations when the rbio is allocated - */ - - /* pointers to pages that we allocated for - * reading/writing stripes directly from the disk (including P/Q) - */ - struct page **stripe_pages; - - /* Pointers to the sectors in the bio_list, for faster lookup */ - struct sector_ptr *bio_sectors; - - /* - * For subpage support, we need to map each sector to above - * stripe_pages. - */ - struct sector_ptr *stripe_sectors; - - /* allocated with real_stripes-many pointers for finish_*() calls */ - void **finish_pointers; -}; - static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_work(struct work_struct *work); @@ -1275,6 +1143,34 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) spin_unlock_irq(&rbio->bio_list_lock); } +static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + struct raid56_bio_trace_info *trace_info) +{ + const struct btrfs_io_context *bioc = rbio->bioc; + int i; + + ASSERT(bioc); + + /* We rely on bio->bi_bdev to find the stripe number. */ + if (!bio->bi_bdev) + goto not_found; + + for (i = 0; i < bioc->num_stripes; i++) { + if (bio->bi_bdev != bioc->stripes[i].dev->bdev) + continue; + trace_info->stripe_nr = i; + trace_info->devid = bioc->stripes[i].dev->devid; + trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + bioc->stripes[i].physical; + return; + } + +not_found: + trace_info->devid = -1; + trace_info->offset = -1; + trace_info->stripe_nr = -1; +} + /* * this is called from one of two situations. 
We either * have a full stripe from the higher layers, or we've read all @@ -1440,6 +1336,12 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) while ((bio = bio_list_pop(&bio_list))) { bio->bi_end_io = raid_write_end_io; + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -1701,6 +1603,12 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_read_partial_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_read_partial(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -2274,6 +2182,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_recover_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + } submit_bio(bio); } @@ -2643,6 +2557,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(&bio_list))) { bio->bi_end_io = raid_write_end_io; + if (trace_raid56_scrub_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -2822,6 +2742,12 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + 
trace_raid56_scrub_read(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index aaad08aefd7d08..3badde24dcbf06 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,6 +7,152 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H +#include +#include "volumes.h" + +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, + BTRFS_RBIO_REBUILD_MISSING, +}; + +struct btrfs_raid_bio { + struct btrfs_io_context *bioc; + + /* + * While we're doing RMW on a stripe we put it into a hash table so we + * can lock the stripe and merge more rbios into it. + */ + struct list_head hash_list; + + /* LRU list for the stripe cache */ + struct list_head stripe_cache; + + /* For scheduling work in the helper threads */ + struct work_struct work; + + /* + * bio_list and bio_list_lock are used to add more bios into the stripe + * in hopes of avoiding the full RMW + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* + * Also protected by the bio_list_lock, the plug list is used by the + * plugging code to collect partial bios while plugged. The stripe + * locking code also uses it to hand off the stripe lock to the next + * pending IO. + */ + struct list_head plug_list; + + /* Flags that tell us if it is safe to merge with this bio. */ + unsigned long flags; + + /* + * Set if we're doing a parity rebuild for a read from higher up, which + * is handled differently from a parity rebuild as part of RMW. 
+ */ + enum btrfs_rbio_ops operation; + + /* Size of each individual stripe on disk */ + u32 stripe_len; + + /* How many pages there are for the full stripe including P/Q */ + u16 nr_pages; + + /* How many sectors there are for the full stripe including P/Q */ + u16 nr_sectors; + + /* Number of data stripes (no p/q) */ + u8 nr_data; + + /* Numer of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ + u8 stripe_npages; + + /* How many sectors there are for each stripe */ + u8 stripe_nsectors; + + /* First bad stripe, -1 means no corruption */ + s8 faila; + + /* Second bad stripe (for RAID6 use) */ + s8 failb; + + /* Stripe number that we're scrubbing */ + u8 scrubp; + + /* + * Size of all the bios in the bio_list. This helps us decide if the + * rbio maps to a full stripe or not. + */ + int bio_list_bytes; + + int generic_bio_cnt; + + refcount_t refs; + + atomic_t stripes_pending; + + atomic_t error; + + /* Bitmap to record which horizontal stripe has data */ + unsigned long dbitmap; + + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ + unsigned long finish_pbitmap; + + /* + * These are two arrays of pointers. We allocate the rbio big enough + * to hold them both and setup their locations when the rbio is + * allocated. + */ + + /* + * Pointers to pages that we allocated for reading/writing stripes + * directly from the disk (including P/Q). + */ + struct page **stripe_pages; + + /* Pointers to the sectors in the bio_list, for faster lookup */ + struct sector_ptr *bio_sectors; + + /* + * For subpage support, we need to map each sector to above + * stripe_pages. + */ + struct sector_ptr *stripe_sectors; + + /* Allocated with real_stripes-many pointers for finish_*() calls */ + void **finish_pointers; +}; + +/* + * For trace event usage only. Records useful debug info for each bio submitted + * by RAID56 to each physical device. 
+ * + * No matter signed or not, (-1) is always the one indicating we can not grab + * the proper stripe number. + */ +struct raid56_bio_trace_info { + u64 devid; + + /* The offset inside the stripe. (<= STRIPE_LEN) */ + u32 offset; + + /* + * Stripe number. + * 0 is the first data stripe, and nr_data for P stripe, + * nr_data + 1 for Q stripe. + * >= real_stripes for + */ + u8 stripe_nr; +}; + static inline int nr_parity_stripes(const struct map_lookup *map) { if (map->type & BTRFS_BLOCK_GROUP_RAID5) @@ -21,13 +167,13 @@ static inline int nr_data_stripes(const struct map_lookup *map) { return map->num_stripes - nr_parity_stripes(map); } + #define RAID5_P_STRIPE ((u64)-2) #define RAID6_Q_STRIPE ((u64)-1) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) -struct btrfs_raid_bio; struct btrfs_device; int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 24b86061c5df30..8539ee2dc79f4a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,7 @@ #include "block-group.h" #include "discard.h" #include "qgroup.h" +#include "raid56.h" #define CREATE_TRACE_POINTS #include diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 9ae94ef3e270be..29fa8ea2cc0f6c 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -30,6 +30,8 @@ struct btrfs_qgroup; struct extent_io_tree; struct prelim_ref; struct btrfs_space_info; +struct btrfs_raid_bio; +struct raid56_bio_trace_info; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -2258,6 +2260,98 @@ DEFINE_EVENT(btrfs__space_info_update, update_bytes_pinned, TP_ARGS(fs_info, sinfo, old, diff) ); +DECLARE_EVENT_CLASS(btrfs_raid56_bio, + + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info), + + TP_STRUCT__entry_btrfs( + __field( u64, full_stripe ) + __field( u64, 
physical ) + __field( u64, devid ) + __field( u32, offset ) + __field( u32, len ) + __field( u8, opf ) + __field( u8, total_stripes ) + __field( u8, real_stripes ) + __field( u8, nr_data ) + __field( u8, stripe_nr ) + ), + + TP_fast_assign_btrfs(rbio->bioc->fs_info, + __entry->full_stripe = rbio->bioc->raid_map[0]; + __entry->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + __entry->len = bio->bi_iter.bi_size; + __entry->opf = bio_op(bio); + __entry->devid = trace_info->devid; + __entry->offset = trace_info->offset; + __entry->stripe_nr = trace_info->stripe_nr; + __entry->total_stripes = rbio->bioc->num_stripes; + __entry->real_stripes = rbio->real_stripes; + __entry->nr_data = rbio->nr_data; + ), + /* + * For type output, we need to output things like "DATA1" + * (the first data stripe), "DATA2" (the second data stripe), + * "PQ1" (P stripe),"PQ2" (Q stripe), "REPLACE0" (replace target device). + */ + TP_printk_btrfs( +"full_stripe=%llu devid=%lld type=%s%d offset=%d opf=0x%x physical=%llu len=%u", + __entry->full_stripe, __entry->devid, + (__entry->stripe_nr < __entry->nr_data) ? "DATA" : + ((__entry->stripe_nr < __entry->real_stripes) ? "PQ" : + "REPLACE"), + (__entry->stripe_nr < __entry->nr_data) ? + (__entry->stripe_nr + 1) : + ((__entry->stripe_nr < __entry->real_stripes) ? 
+ (__entry->stripe_nr - __entry->nr_data + 1) : 0), + __entry->offset, __entry->opf, __entry->physical, __entry->len) +); + +DEFINE_EVENT(btrfs_raid56_bio, raid56_read_partial, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + +DEFINE_EVENT(btrfs_raid56_bio, raid56_write_stripe, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + + +DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_write_stripe, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + +DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + +DEFINE_EVENT(btrfs_raid56_bio, raid56_scrub_read_recover, + TP_PROTO(const struct btrfs_raid_bio *rbio, + const struct bio *bio, + const struct raid56_bio_trace_info *trace_info), + + TP_ARGS(rbio, bio, trace_info) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ From 1e7c0077d6f43a031274b7edf6dac921b5cbb370 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:32 +0100 Subject: [PATCH 0456/1250] btrfs: balance btree dirty pages and delayed items after a rename A rename operation modifies a subvolume's btree, to remove the old dir item, add the new dir item, remove an inode ref and add a new inode ref. It can also create the delayed inode for the inodes involved in the operation, and it creates two delayed dir index items, one to delete the old name and another one to add the new name. 
However we are neither balancing the btree dirty pages nor the delayed items after a rename, which can result in accumulation of too many btree dirty pages and delayed items, especially if a task is doing a series of rename operations (for example it can happen for package installations/upgrades through the zypper tool). So just call btrfs_btree_balance_dirty() after a rename, just like we do for every other system call that results in modifying a btree and adding delayed items. Reviewed-by: Anand Jain Reviewed-by: Nikolay Borisov Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f96e332bfe963d..419bae723890cb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9557,15 +9557,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int ret; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) - return btrfs_rename_exchange(old_dir, old_dentry, new_dir, - new_dentry); + ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + else + ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + new_dentry, flags); - return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, - new_dentry, flags); + btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); + + return ret; } struct btrfs_delalloc_work { From 1f779f7bd1b8eedd58a1dd55d7a24b98b2f69fbc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:33 +0100 Subject: [PATCH 0457/1250] btrfs: free the path earlier when creating a new inode When creating an inode, through btrfs_create_new_inode(), we release the path we allocated before once we don't need it anymore.
But we keep it allocated until we return from that function, which is wasteful because after we release the path we do several things that can allocate yet another path: inheriting properties, setting the xattrs used by ACLs and security modules, adding an orphan item (O_TMPFILE case) or adding a dir item (for the non-O_TMPFILE case). So instead of only releasing the path once we don't need it anymore, free it. This way we avoid having two paths allocated until we return from btrfs_create_new_inode(). Reviewed-by: Nikolay Borisov Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 419bae723890cb..7329a03292ebe8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6380,7 +6380,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(path); + /* + * We don't need the path anymore, plus inheriting properties, adding + * ACLs, security xattrs, orphan item or adding the link, will result in + * allocating yet another path. So just free our path. + */ + btrfs_free_path(path); + path = NULL; if (args->subvol) { struct inode *parent; @@ -6437,8 +6443,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - ret = 0; - goto out; + return 0; discard: /* From 5e81c7f670654a52f0737737b243110bc2aa605e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:34 +0100 Subject: [PATCH 0458/1250] btrfs: balance btree dirty pages and delayed items after clone and dedupe When reflinking extents (clone and deduplication), we need to touch the btree of the destination inode's subvolume, as well as potentially create a delayed inode for the destination inode (if it was not created before).
However we are neither balancing the btree dirty pages nor the delayed items after such operations, so if we have a task that is doing a long series of clone or deduplication operations, it can result in accumulation of too many btree dirty pages and delayed items. So just call btrfs_btree_balance_dirty() after clone and deduplication, just like we do for every other system call that results in modifying a btree and adding delayed items. Reviewed-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reflink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index e30f53bd4e5581..8a6cabdb8f9328 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -5,6 +5,7 @@ #include "compression.h" #include "ctree.h" #include "delalloc-space.h" +#include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" @@ -655,7 +656,8 @@ static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { - const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + const u64 bs = fs_info->sb->s_blocksize; int ret; /* @@ -666,6 +668,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + btrfs_btree_balance_dirty(fs_info); + return ret; } @@ -775,6 +779,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); + btrfs_btree_balance_dirty(fs_info); + return ret; } From 72789b575171daff25d29c64473105f508d731bb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:35
+0100 Subject: [PATCH 0459/1250] btrfs: add assertions when deleting batches of delayed items There are a few impossible cases that btrfs_batch_delete_items() tries to deal with: 1) Getting a path pointing to a NULL leaf; 2) The leaf slot is pointing beyond the last item in the leaf; 3) We can't find a single item to delete. The first case is impossible because the given path was returned by a successful call to btrfs_search_slot(). Replace the BUG_ON() with an ASSERT for this. The second case is impossible because we are always called when a delayed item matches an item in the given leaf. So add an ASSERT() for that and if that condition is not satisfied, trigger a warning and return an error. The third case is impossible exactly because of the same reason as the second case. The given delayed item matches one item in the leaf, so we know that our batch always has at least one item. Add an ASSERT to check that, trigger a warning if that expectation fails and return an error. Reviewed-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 748bf6b0d8600f..1dc4ebba04f549 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -797,20 +797,23 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_item *item) { struct btrfs_delayed_item *curr, *next; - struct extent_buffer *leaf; + struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; struct list_head head; int nitems, i, last_item; int ret = 0; - BUG_ON(!path->nodes[0]); - - leaf = path->nodes[0]; + ASSERT(leaf != NULL); i = path->slots[0]; last_item = btrfs_header_nritems(leaf) - 1; - if (i > last_item) - return -ENOENT; /* FIXME: Is errno suitable? 
*/ + /* + * Our caller always gives us a path pointing to an existing item, so + * this can not happen. + */ + ASSERT(i <= last_item); + if (WARN_ON(i > last_item)) + return -ENOENT; next = item; INIT_LIST_HEAD(&head); @@ -837,8 +840,13 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, i); } - if (!nitems) - return 0; + /* + * Our caller always gives us a path pointing to an existing item, so + * this can not happen. + */ + ASSERT(nitems >= 1); + if (nitems < 1) + return -ENOENT; ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); if (ret) From 70a97f2fa66aeafdd80d13ee9ca6e73bdeab1809 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:36 +0100 Subject: [PATCH 0460/1250] btrfs: deal with deletion errors when deleting delayed items Currently, btrfs_delete_delayed_items() ignores any errors returned from btrfs_batch_delete_items(). This looks fishy but it's not a problem at the moment because: 1) Two of the errors returned from btrfs_batch_delete_items() are for impossible cases, cases where a delayed item does not match any item in the leaf the path points to - btrfs_delete_delayed_items() always calls btrfs_batch_delete_items() with a path that points to a leaf that contains an item matching a delayed item; 2) btrfs_batch_delete_items() may return an error from btrfs_del_items(), in which case it does not release the delayed items of the batch. At the moment this is harmless because btrfs_del_items() actually is always able to delete items, even if it returns an error - when it returns an error it's because it ended up with a leaf mostly empty (less than 1/3 full) and failed to migrate items from that leaf into its neighbour leaves - this is not critical, as all the items were deleted, we just left the tree a bit unbalanced, but it's still a valid tree and causes no harm, and future operations on the tree will eventually balance it. 
So even if we get an error from btrfs_del_items(), the delayed items will not be released but the next time we run delayed items we will find out, at btrfs_delete_delayed_items(), that they are not present in the tree anymore and then release them. This is all a bit subtle, and it's certainly prone to be a disaster in case btrfs_del_items() changes one day and may return errors before being able to delete all the requested items, in which case we could leave the filesystem in an inconsistent state as we would commit a transaction despite a failure from deleting items from the tree. So make btrfs_delete_delayed_items() check for any errors from the call to btrfs_batch_delete_items(). Reviewed-by: Anand Jain Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1dc4ebba04f549..c8deab7fe25358 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -896,7 +896,9 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, goto delete_fail; } - btrfs_batch_delete_items(trans, root, path, curr); + ret = btrfs_batch_delete_items(trans, root, path, curr); + if (ret) + goto delete_fail; btrfs_release_path(path); mutex_unlock(&node->mutex); goto do_again; From db2466a174fb7a4c90decd0a8c0b63ca2a6d9229 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:37 +0100 Subject: [PATCH 0461/1250] btrfs: refactor the delayed item deletion entry point The delayed item deletion entry point, btrfs_delete_delayed_items(), is a bit convoluted for a few reasons: 1) It's really a loop disguised with labels and goto statements; 2) There's a 'delete_fail' label which isn't only for error cases, we can jump to that label even if no error happened, if we simply don't have more delayed items to delete; 3) Unnecessarily keeps track of 
the current and previous items for no good reason, as after getting the next item and releasing the current one, it just jumps to the 'again' label just to look again for the first delayed item; 4) When a delayed item is not in the tree (because it was already deleted before), it releases the item while holding a path locked, which is not necessary and adds more contention to the tree, especially taking into account that the path came from a deletion search, meaning we have write locks for nodes at levels 2, 1 and 0. And releasing the item is not computationally trivial (rb tree deletion, a kfree() and some trivial things). So refactor it to use a while loop and add some comments to make it more obvious why we can have delayed items without a matching item in the tree as well as why we do not keep the delayed node locked all the time when running all its deletion items. This is also a preparation for some upcoming work involving delayed items. Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 71 ++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c8deab7fe25358..ff986c78c087c4 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -867,45 +867,52 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_node *node) { - struct btrfs_delayed_item *curr, *prev; int ret = 0; -do_again: - mutex_lock(&node->mutex); - curr = __btrfs_first_delayed_deletion_item(node); - if (!curr) - goto delete_fail; + while (ret == 0) { + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_deletion_item(node); + if (!item) { + mutex_unlock(&node->mutex); + break; + } + + ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1); + if (ret > 0) { + /* + * There's no
matching item in the leaf. This means we + * have already deleted this item in a past run of the + * delayed items. We ignore errors when running delayed + * items from an async context, through a work queue job + * running btrfs_async_run_delayed_root(), and don't + * release delayed items that failed to complete. This + * is because we will retry later, and at transaction + * commit time we always run delayed items and will + * then deal with errors if they fail to run again. + * + * So just release delayed items for which we can't find + * an item in the tree, and move to the next item. + */ + btrfs_release_path(path); + btrfs_release_delayed_item(item); + ret = 0; + } else if (ret == 0) { + ret = btrfs_batch_delete_items(trans, root, path, item); + btrfs_release_path(path); + } - ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); - if (ret < 0) - goto delete_fail; - else if (ret > 0) { /* - * can't find the item which the node points to, so this node - * is invalid, just drop it. + * We unlock and relock on each iteration, this is to prevent + * blocking other tasks for too long while we are being run from + * the async context (work queue job). Those tasks are typically + * running system calls like creat/mkdir/rename/unlink/etc which + * need to add delayed items to this delayed node. 
*/ - prev = curr; - curr = __btrfs_next_delayed_item(prev); - btrfs_release_delayed_item(prev); - ret = 0; - btrfs_release_path(path); - if (curr) { - mutex_unlock(&node->mutex); - goto do_again; - } else - goto delete_fail; + mutex_unlock(&node->mutex); } - ret = btrfs_batch_delete_items(trans, root, path, curr); - if (ret) - goto delete_fail; - btrfs_release_path(path); - mutex_unlock(&node->mutex); - goto do_again; - -delete_fail: - btrfs_release_path(path); - mutex_unlock(&node->mutex); return ret; } From 2a954ee8e96a06ffde5ea56fdce6ef245f9ebbd9 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:38 +0100 Subject: [PATCH 0462/1250] btrfs: improve batch deletion of delayed dir index items Currently we group delayed dir index items for deletion in a single batch (single btree operation) as long as they all exist in the same leaf and as long as their keys are sequential in the key space. For example if we have a leaf that has dir index items with offsets: 2, 3, 4, 6, 7, 10 And we have delayed dir index items for deleting all these indexes, and no delayed items for any other index keys in between, then we end up deleting in 3 batches: 1) First batch for indexes 2, 3 and 4; 2) Second batch for indexes 6 and 7; 3) Third batch for index 10. This is a waste because we can delete all the index keys in a single batch. What matters is that each consecutive delayed index key matches each consecutive dir index key in a leaf. So update the logic at btrfs_batch_delete_items() to check only for a key match between delayed dir index items and dir index items in a leaf. Also avoid the useless first iteration on comparing the key of the first slot to delete with the key of the first delayed item, as it's silly since they always match, as the delayed item's key was used for the btree search that gave us the path we have. This is more efficient and reduces runtime of running delayed items, as well as lock contention on the subvolume's tree. 
For example, the following test script: $ cat test.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT NUM_FILES=1000 mkdir $MNT/testdir for ((i = 1; i <= $NUM_FILES; i++)); do echo -n > $MNT/testdir/file_$i done # Now delete every other file, to create gaps in the dir index keys. for ((i = 1; i <= $NUM_FILES; i += 2)); do rm -f $MNT/testdir/file_$i done # Sync to force any delayed items to be flushed to the tree. sync start=$(date +%s%N) rm -fr $MNT/testdir end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo -e "\nrm -fr took $dur milliseconds" umount $MNT Running that test script while having the following bpftrace script running in another shell: $ cat bpf-measure.sh #!/usr/bin/bpftrace /* Add 'noinline' to btrfs_delete_delayed_items()'s definition. */ k:btrfs_delete_delayed_items { @start_delete_delayed_items[tid] = nsecs; } k:btrfs_del_items /@start_delete_delayed_items[tid]/ { @delete_batches = count(); } kr:btrfs_delete_delayed_items /@start_delete_delayed_items[tid]/ { $dur = (nsecs - @start_delete_delayed_items[tid]) / 1000; @btrfs_delete_delayed_items_total_time = sum($dur); delete(@start_delete_delayed_items[tid]); } Before this change: @btrfs_delete_delayed_items_total_time: 9563 @delete_batches: 1001 After this change: @btrfs_delete_delayed_items_total_time: 7328 @delete_batches: 509 Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 60 +++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index ff986c78c087c4..f5db8c5461592e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -798,68 +798,58 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, { struct btrfs_delayed_item *curr, *next; struct extent_buffer *leaf = path->nodes[0]; - struct btrfs_key key; - struct list_head head; - int nitems, 
i, last_item; - int ret = 0; + LIST_HEAD(batch_list); + int nitems, slot, last_slot; + int ret; ASSERT(leaf != NULL); - i = path->slots[0]; - last_item = btrfs_header_nritems(leaf) - 1; + slot = path->slots[0]; + last_slot = btrfs_header_nritems(leaf) - 1; /* * Our caller always gives us a path pointing to an existing item, so * this can not happen. */ - ASSERT(i <= last_item); - if (WARN_ON(i > last_item)) + ASSERT(slot <= last_slot); + if (WARN_ON(slot > last_slot)) return -ENOENT; - next = item; - INIT_LIST_HEAD(&head); - btrfs_item_key_to_cpu(leaf, &key, i); - nitems = 0; + nitems = 1; + curr = item; + list_add_tail(&curr->tree_list, &batch_list); + /* - * count the number of the dir index items that we can delete in batch + * Keep checking if the next delayed item matches the next item in the + * leaf - if so, we can add it to the batch of items to delete from the + * leaf. */ - while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { - list_add_tail(&next->tree_list, &head); - nitems++; + while (slot < last_slot) { + struct btrfs_key key; - curr = next; next = __btrfs_next_delayed_item(curr); if (!next) break; - if (!btrfs_is_continuous_delayed_item(curr, next)) + slot++; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_comp_cpu_keys(&next->key, &key) != 0) break; - - i++; - if (i > last_item) - break; - btrfs_item_key_to_cpu(leaf, &key, i); + nitems++; + curr = next; + list_add_tail(&curr->tree_list, &batch_list); } - /* - * Our caller always gives us a path pointing to an existing item, so - * this can not happen. 
- */ - ASSERT(nitems >= 1); - if (nitems < 1) - return -ENOENT; - ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); if (ret) - goto out; + return ret; - list_for_each_entry_safe(curr, next, &head, tree_list) { + list_for_each_entry_safe(curr, next, &batch_list, tree_list) { btrfs_delayed_item_release_metadata(root, curr); list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } -out: - return ret; + return 0; } static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, From 3e59072462bea70a1d75216b47cf4cce4bae17d4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:39 +0100 Subject: [PATCH 0463/1250] btrfs: assert that delayed item is a dir index item when adding it All delayed items are for dir index items, we don't support any other item types at the moment. So simplify __btrfs_add_delayed_item() and add an assertion for checking the item's key type. This also allows the next change to be simpler and avoid checking key types. In case we add support for different item types in the future, then we'll hit the assertion during development and be able to adjust any code that is assuming delayed items are always associated to dir index items. Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f5db8c5461592e..80679c33e400a3 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -438,10 +438,12 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, ins->delayed_node = delayed_node; ins->ins_or_del = action; - if (ins->key.type == BTRFS_DIR_INDEX_KEY && - action == BTRFS_DELAYED_INSERTION_ITEM && + /* Delayed items are always for dir index items.
*/ + ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY); + + if (action == BTRFS_DELAYED_INSERTION_ITEM && ins->key.offset >= delayed_node->index_cnt) - delayed_node->index_cnt = ins->key.offset + 1; + delayed_node->index_cnt = ins->key.offset + 1; delayed_node->count++; atomic_inc(&delayed_node->root->fs_info->delayed_root->items); From 38dd0ce56f1bfd5a74d23cfb04d3cd49644bd2f7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:40 +0100 Subject: [PATCH 0464/1250] btrfs: improve batch insertion of delayed dir index items Currently we group delayed dir index items for insertion as a single batch (a single btree operation) as long as their keys are sequential in the key space. For example we have delayed index items for the following index keys: 10, 11, 12, 15, 16, 20, 21 We end up building three batches: 1) First one for index keys 10, 11 and 12; 2) Second one for index keys 15 and 16; 3) Third one for index keys 20 and 21. However, since the dir index numbers come from a monotonically increasing counter and are never reused, we could group all these items into a single batch. The existence of holes in the sequence happens only when we had delayed dir index items for insertion that got deleted before they were flushed to the subvolume's tree. The delayed items are stored in a rbtree based on their key order, so we can just group items into a batch as long as they all fit in a leaf, and ignore if there's a gap (key offset, index number) between two consecutive items. This is more efficient and reduces the amount of time spent when running delayed items if there are gaps between dir index items. For example running the following test script: $ cat test.sh #!/bin/bash DEV=/dev/sdj MNT=/mnt/sdj mkfs.btrfs -f $DEV mount $DEV $MNT NUM_FILES=100 mkdir $MNT/testdir for ((i = 1; i <= $NUM_FILES; i++)); do echo -n > $MNT/testdir/file_$i done # Now delete every other file, to create gaps in the dir index keys. 
for ((i = 1; i <= $NUM_FILES; i += 2)); do rm -f $MNT/testdir/file_$i done start=$(date +%s%N) sync end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo -e "\nsync took $dur milliseconds" umount $MNT While having the following bpftrace script running in another shell: $ cat bpf-delayed-items-inserts.sh #!/usr/bin/bpftrace /* Must add 'noinline' to btrfs_insert_delayed_items(). */ k:btrfs_insert_delayed_items { @start_insert_delayed_items[tid] = nsecs; } k:btrfs_insert_empty_items /@start_insert_delayed_items[tid]/ { @insert_batches = count(); } kr:btrfs_insert_delayed_items /@start_insert_delayed_items[tid]/ { $dur = (nsecs - @start_insert_delayed_items[tid]) / 1000; @btrfs_insert_delayed_items_total_time = sum($dur); delete(@start_insert_delayed_items[tid]); } Before this change: @btrfs_insert_delayed_items_total_time: 576 @insert_batches: 51 After this change: @btrfs_insert_delayed_items_total_time: 174 @insert_batches: 2 Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 80679c33e400a3..d9be90ac7c3a38 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,18 +52,6 @@ static inline void btrfs_init_delayed_node( INIT_LIST_HEAD(&delayed_node->p_list); } -static inline int btrfs_is_continuous_delayed_item( - struct btrfs_delayed_item *item1, - struct btrfs_delayed_item *item2) -{ - if (item1->key.type == BTRFS_DIR_INDEX_KEY && - item1->key.objectid == item2->key.objectid && - item1->key.type == item2->key.type && - item1->key.offset + 1 == item2->key.offset) - return 1; - return 0; -} - static struct btrfs_delayed_node *btrfs_get_delayed_node( struct btrfs_inode *btrfs_inode) { @@ -674,8 +662,14 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, } /* - * Insert a single 
delayed item or a batch of delayed items that have consecutive - * keys if they exist. + * Insert a single delayed item or a batch of delayed items, as many as possible + * that fit in a leaf. The delayed items (dir index keys) are sorted by their key + * in the rbtree, and if there's a gap between two consecutive dir index items, + * then it means at some point we had delayed dir indexes to add but they got + * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them + * into the subvolume tree. Dir index keys also have their offsets coming from a + * monotonically increasing counter, so we can't get new keys with an offset that + * fits within a gap between delayed dir index items. */ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -701,7 +695,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, int next_size; next = __btrfs_next_delayed_item(curr); - if (!next || !btrfs_is_continuous_delayed_item(curr, next)) + if (!next) break; next_size = next->data_len + sizeof(struct btrfs_item); From 79fbc93904b6f0e642ea4f000693dedefc192767 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:41 +0100 Subject: [PATCH 0465/1250] btrfs: do not BUG_ON() on failure to reserve metadata for delayed item At btrfs_insert_delayed_dir_index(), we don't expect the metadata reservation for the delayed dir index item insertion to fail, because the caller is supposed to have reserved 1 unit of metadata space for that. All callers are able to deal with an error in case that happens, so there is no need for something so drastic as a BUG_ON() in case of failure. Instead just emit a warning, so that's easily noticed during development (fstests in particular), and return the error to the caller. 
Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index d9be90ac7c3a38..6019c35de8de00 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1386,10 +1386,13 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item); /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible + * Space was reserved for a dir index item insertion when we started the + * transaction, so getting a failure here should be impossible. */ - BUG_ON(ret); + if (WARN_ON(ret)) { + btrfs_release_delayed_item(delayed_item); + goto release_node; + } mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); From 7812727fc4ad0bed6ecf044b234c445622852460 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 31 May 2022 16:06:42 +0100 Subject: [PATCH 0466/1250] btrfs: set delayed item type when initializing it Currently we set the type of a delayed item only after successfully inserting it into its respective rbtree. This is fine, as the type is not used anywhere before that point, but for the next patch in the series, there will be the need to check the type of a delayed item before inserting it into a rbtree. So set the type of a delayed item immediately after allocating it. This also makes the trivial wrappers for adding insertion and deletion useless, so it removes them as well. 
Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6019c35de8de00..189b8801c62dba 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -386,8 +386,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( } static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, - struct btrfs_delayed_item *ins, - int action) + struct btrfs_delayed_item *ins) { struct rb_node **p, *node; struct rb_node *parent_node = NULL; @@ -396,9 +395,9 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, int cmp; bool leftmost = true; - if (action == BTRFS_DELAYED_INSERTION_ITEM) + if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; - else if (action == BTRFS_DELAYED_DELETION_ITEM) + else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) root = &delayed_node->del_root; else BUG(); @@ -424,12 +423,11 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, rb_link_node(node, parent_node, p); rb_insert_color_cached(node, root, leftmost); ins->delayed_node = delayed_node; - ins->ins_or_del = action; /* Delayed items are always for dir index items. 
*/ ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY); - if (action == BTRFS_DELAYED_INSERTION_ITEM && + if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM && ins->key.offset >= delayed_node->index_cnt) delayed_node->index_cnt = ins->key.offset + 1; @@ -438,20 +436,6 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, return 0; } -static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_INSERTION_ITEM); -} - -static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_DELETION_ITEM); -} - static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); @@ -1375,6 +1359,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, delayed_item->key.objectid = btrfs_ino(dir); delayed_item->key.type = BTRFS_DIR_INDEX_KEY; delayed_item->key.offset = index; + delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM; dir_item = (struct btrfs_dir_item *)delayed_item->data; dir_item->location = *disk_key; @@ -1395,7 +1380,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, } mutex_lock(&delayed_node->mutex); - ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", @@ -1457,6 +1442,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } item->key = item_key; + item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM; ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); /* @@ -1471,7 +1457,7 @@ int btrfs_delete_delayed_dir_index(struct 
btrfs_trans_handle *trans, } mutex_lock(&node->mutex); - ret = __btrfs_add_delayed_deletion_item(node, item); + ret = __btrfs_add_delayed_item(node, item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", From c64b1d00ec649f3d42799b4408c429e76af32548 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 22 Jun 2022 10:37:45 +0100 Subject: [PATCH 0467/1250] btrfs: reduce amount of reserved metadata for delayed item insertion Whenever we want to create a new dir index item (when creating an inode, create a hard link, rename a file) we reserve 1 unit of metadata space for it in a transaction (that's 256K for a node/leaf size of 16K), and then create a delayed insertion item for it to be added later to the subvolume's tree. That unit of metadata is kept until the delayed item is inserted into the subvolume tree, which may take a while to happen (in the worst case, it's done only when the transaction commits). If we have multiple dir index items to insert for the same directory, say N index items, and they all fit in a single leaf of metadata, then we are holding N units of reserved metadata space when all we need is 1 unit. This change addresses that, whenever a new delayed dir index item is added, we release the unit of metadata the caller has reserved when it started the transaction if adding that new dir index item does not result in touching one more metadata leaf, otherwise the reservation is kept by transferring it from the transaction block reserve to the delayed items block reserve, just like before. Given that with a leaf size of 16K we can have a few hundred dir index items in a single leaf (the exact value depends on file name lengths), this reduces pressure on metadata reservation by releasing unnecessary space much sooner. 
The following fs_mark test showed some improvement when creating many files in parallel on machine running a non debug kernel (debian's default kernel config) with 12 cores: $ cat test.sh #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/nvme0n1 MOUNT_OPTIONS="-o ssd" FILES=100000 THREADS=$(nproc --all) echo "performance" | \ tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f $DEV mount $MOUNT_OPTIONS $DEV $MNT OPTS="-S 0 -L 10 -n $FILES -s 0 -t $THREADS -k" for ((i = 1; i <= $THREADS; i++)); do OPTS="$OPTS -d $MNT/d$i" done fs_mark $OPTS umount $MNT Before: FSUse% Count Size Files/sec App Overhead 2 1200000 0 225991.3 5465891 4 2400000 0 345728.1 5512106 4 3600000 0 346959.5 5557653 8 4800000 0 329643.0 5587548 8 6000000 0 312657.4 5606717 8 7200000 0 281707.5 5727985 12 8400000 0 88309.8 5020422 12 9600000 0 85835.9 5207496 16 10800000 0 81039.2 5404964 16 12000000 0 58548.6 5842468 After: FSUse% Count Size Files/sec App Overhead 2 1200000 0 230604.5 5778375 4 2400000 0 348908.3 5508072 4 3600000 0 357028.7 5484337 6 4800000 0 342898.3 5565703 6 6000000 0 314670.8 5751555 8 7200000 0 282548.2 5778177 12 8400000 0 90844.9 5306819 12 9600000 0 86963.1 5304689 16 10800000 0 89113.2 5455248 16 12000000 0 86693.5 5518933 The "after" results are after applying this patch and all the other patches in the same patchset, which is comprised of the following changes: btrfs: balance btree dirty pages and delayed items after a rename btrfs: free the path earlier when creating a new inode btrfs: balance btree dirty pages and delayed items after clone and dedupe btrfs: add assertions when deleting batches of delayed items btrfs: deal with deletion errors when deleting delayed items btrfs: refactor the delayed item deletion entry point btrfs: improve batch deletion of delayed dir index items btrfs: assert that delayed item is a dir index item when adding it btrfs: improve batch insertion of delayed dir index items btrfs: do not BUG_ON() on failure to reserve metadata 
for delayed item btrfs: set delayed item type when initializing it btrfs: reduce amount of reserved metadata for delayed item insertion Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 157 +++++++++++++++++++++++++++++++++++---- fs/btrfs/delayed-inode.h | 11 +++ 2 files changed, 154 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 189b8801c62dba..3f85182e4b8734 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -547,7 +547,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, num_bytes, 1); - item->bytes_reserved = num_bytes; + /* + * For insertions we track reserved metadata space by accounting + * for the number of leaves that will be used, based on the delayed + * node's index_items_size field. + */ + if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) + item->bytes_reserved = num_bytes; } return ret; @@ -573,6 +579,21 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); } +static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node, + unsigned int num_leaves) +{ + struct btrfs_fs_info *fs_info = node->root->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves); + + /* There are no space reservations during log replay, bail out. 
*/ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id, + bytes, 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL); +} + static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -660,15 +681,27 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *first_item) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_delayed_node *node = first_item->delayed_node; LIST_HEAD(item_list); struct btrfs_delayed_item *curr; struct btrfs_delayed_item *next; - const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_item_batch batch; int total_size; char *ins_data = NULL; int ret; + lockdep_assert_held(&node->mutex); + + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(first_item->bytes_reserved == 0); + list_add_tail(&first_item->tree_list, &item_list); batch.total_data_size = first_item->data_len; batch.nr = 1; @@ -682,6 +715,8 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, if (!next) break; + ASSERT(next->bytes_reserved == 0); + next_size = next->data_len + sizeof(struct btrfs_item); if (total_size + next_size > max_size) break; @@ -738,9 +773,31 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, */ btrfs_release_path(path); + ASSERT(node->index_item_leaves > 0); + + if (next) { + /* + * We inserted one batch of items into a leaf a there are more + * items to flush in a future batch, now release one unit of + * metadata space from the delayed block reserve, corresponding + * the leaf we just flushed to. 
+ */ + btrfs_delayed_item_release_leaves(node, 1); + node->index_item_leaves--; + } else { + /* + * There are no more items to insert. We can have a number of + * reserved leaves > 1 here - this happens when many dir index + * items are added and then removed before they are flushed (file + * names with a very short life, never span a transaction). So + * release all remaining leaves. + */ + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + list_for_each_entry_safe(curr, next, &item_list, tree_list) { list_del(&curr->tree_list); - btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); } out: @@ -1341,9 +1398,13 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_disk_key *disk_key, u8 type, u64 index) { + struct btrfs_fs_info *fs_info = trans->fs_info; + const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *delayed_item; struct btrfs_dir_item *dir_item; + bool reserve_leaf_space; + u32 data_len; int ret; delayed_node = btrfs_get_or_create_delayed_node(dir); @@ -1369,17 +1430,51 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item); - /* - * Space was reserved for a dir index item insertion when we started the - * transaction, so getting a failure here should be impossible. 
- */ - if (WARN_ON(ret)) { - btrfs_release_delayed_item(delayed_item); - goto release_node; - } + data_len = delayed_item->data_len + sizeof(struct btrfs_item); mutex_lock(&delayed_node->mutex); + + if (delayed_node->index_item_leaves == 0 || + delayed_node->curr_index_batch_size + data_len > leaf_data_size) { + delayed_node->curr_index_batch_size = data_len; + reserve_leaf_space = true; + } else { + delayed_node->curr_index_batch_size += data_len; + reserve_leaf_space = false; + } + + if (reserve_leaf_space) { + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, + delayed_item); + /* + * Space was reserved for a dir index item insertion when we + * started the transaction, so getting a failure here should be + * impossible. + */ + if (WARN_ON(ret)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_item(delayed_item); + goto release_node; + } + + delayed_node->index_item_leaves++; + } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). 
+ */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; + } + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); if (unlikely(ret)) { btrfs_err(trans->fs_info, @@ -1408,8 +1503,37 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, return 1; } - btrfs_delayed_item_release_metadata(node->root, item); + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(item->bytes_reserved == 0); + ASSERT(node->index_item_leaves > 0); + + /* + * If there's only one leaf reserved, we can decrement this item from the + * current batch, otherwise we can not because we don't know which leaf + * it belongs to. With the current limit on delayed items, we rarely + * accumulate enough dir index items to fill more than one leaf (even + * when using a leaf size of 4K). + */ + if (node->index_item_leaves == 1) { + const u32 data_len = item->data_len + sizeof(struct btrfs_item); + + ASSERT(node->curr_index_batch_size >= data_len); + node->curr_index_batch_size -= data_len; + } + btrfs_release_delayed_item(item); + + /* If we now have no more dir index items, we can release all leaves. 
*/ + if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) { + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + mutex_unlock(&node->mutex); return 0; } @@ -1825,12 +1949,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) mutex_lock(&delayed_node->mutex); curr_item = __btrfs_first_delayed_insertion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); } + if (delayed_node->index_item_leaves > 0) { + btrfs_delayed_item_release_leaves(delayed_node, + delayed_node->index_item_leaves); + delayed_node->index_item_leaves = 0; + } + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); while (curr_item) { btrfs_delayed_item_release_metadata(root, curr_item); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index b2412160c5bc90..9795dc295a183c 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -58,6 +58,17 @@ struct btrfs_delayed_node { u64 index_cnt; unsigned long flags; int count; + /* + * The size of the next batch of dir index items to insert (if this + * node is from a directory inode). Protected by @mutex. + */ + u32 curr_index_batch_size; + /* + * Number of leaves reserved for inserting dir index items (if this + * node belongs to a directory inode). This may be larger then the + * actual number of leaves we end up using. Protected by @mutex. + */ + u32 index_item_leaves; }; struct btrfs_delayed_item { From 67b17d8b75b6122426b26073d765e3c2f432dcd7 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Tue, 8 Feb 2022 11:31:20 -0800 Subject: [PATCH 0468/1250] btrfs: store chunk size in space-info struct The chunk size is stored in the btrfs_space_info structure. It is initialized at the start and is then used. A new API is added to update the current chunk size. 
This API is used to be able to expose the chunk_size as a sysfs setting. Signed-off-by: Stefan Roesch Reviewed-by: David Sterba [ rename and merge helpers, switch atomic type to u64, style fixes ] Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 32 ++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 4 ++++ fs/btrfs/volumes.c | 28 +++++++++------------------- 3 files changed, 45 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2cf8da1116eb28..62d25112310d96 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -187,6 +187,37 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) */ #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) +/* + * Calculate chunk size depending on volume type (regular or zoned). + */ +static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) +{ + if (btrfs_is_zoned(fs_info)) + return fs_info->zone_size; + + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + return SZ_1G; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + return SZ_32M; + + /* Handle BTRFS_BLOCK_GROUP_METADATA */ + if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) + return SZ_1G; + + return SZ_256M; +} + +/* + * Update default chunk size. 
+ */ +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size) +{ + WRITE_ONCE(space_info->chunk_size, chunk_size); +} + static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -208,6 +239,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; + btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); if (btrfs_is_zoned(info)) space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c096695598c129..e7de24a529cfb5 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -25,6 +25,8 @@ struct btrfs_space_info { u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the allocator. */ + /* Chunk size in bytes */ + u64 chunk_size; /* * Once a block group drops below this threshold (percents) we'll @@ -123,6 +125,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, u64 bytes_zone_unusable, struct btrfs_space_info **space_info); +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a2bb0928dc066f..b8e40d55012f0b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5071,26 +5071,16 @@ static void init_alloc_chunk_ctl_policy_regular( struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { - u64 type = ctl->type; + struct btrfs_space_info *space_info; - if (type & BTRFS_BLOCK_GROUP_DATA) { - ctl->max_stripe_size = SZ_1G; - ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - } else if (type & 
BTRFS_BLOCK_GROUP_METADATA) { - /* For larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) - ctl->max_stripe_size = SZ_1G; - else - ctl->max_stripe_size = SZ_256M; - ctl->max_chunk_size = ctl->max_stripe_size; - } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - ctl->max_stripe_size = SZ_32M; - ctl->max_chunk_size = 2 * ctl->max_stripe_size; - ctl->devs_max = min_t(int, ctl->devs_max, - BTRFS_MAX_DEVS_SYS_CHUNK); - } else { - BUG(); - } + space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); + ASSERT(space_info); + + ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); + ctl->max_stripe_size = ctl->max_chunk_size; + + if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) + ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), From 5b244492ba489f0a449741c4c8ebe428f31d9c84 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Tue, 8 Feb 2022 11:31:21 -0800 Subject: [PATCH 0469/1250] btrfs: sysfs: export chunk size in space infos Add new sysfs knob /sys/fs/btrfs/<UUID>/allocation/<type>/chunk_size. This allows querying and setting the chunk size. Constraints: - can be changed by root only - system chunk size can't be set - maximum chunk size is 10% of the filesystem size - final value is rounded down to a multiple of 256M - cannot be set on zoned filesystem Note that rounding and the 10% clamp will result in a different value on filesystems smaller than 10G, typically 768M.
Signed-off-by: Stefan Roesch Reviewed-by: David Sterba [ Changes to original submission: - document setting constraints - drop read-only requirement - drop unnecessary error messages - fix return values of _store callback - use memparse for the value - fix rounding down to 256M ] Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 963d6321981454..43368db059680e 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,6 +21,7 @@ #include "space-info.h" #include "block-group.h" #include "qgroup.h" +#include "misc.h" /* * Structure name Path @@ -92,6 +93,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); +static struct kobject *get_btrfs_kobj(struct kobject *kobj); static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) { @@ -709,6 +711,66 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) +static ssize_t btrfs_chunk_size_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + + return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size)); +} + +/* + * Store new chunk size in space info. Can be called on a read-only filesystem. + * + * If the new chunk size value is larger than 10% of free space it is reduced + * to match that limit. Alignment must be to 256M and the system chunk size + * cannot be set. 
+ */ +static ssize_t btrfs_chunk_size_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + char *retptr; + u64 val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!fs_info->fs_devices) + return -EINVAL; + + if (btrfs_is_zoned(fs_info)) + return -EINVAL; + + /* System block type must not be changed. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return -EPERM; + + val = memparse(buf, &retptr); + /* There could be trailing '\n', also catch any typos after the value */ + retptr = skip_spaces(retptr); + if (*retptr != 0 || val == 0) + return -EINVAL; + + val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); + + /* Limit stripe size to 10% of available space. */ + val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + + /* Must be multiple of 256M. */ + val &= ~((u64)SZ_256M - 1); + + /* Must be at least 256M. 
*/ + if (val < SZ_256M) + return -EINVAL; + + btrfs_update_space_info_chunk_size(space_info, val); + + return len; +} + SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); @@ -719,6 +781,7 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, @@ -773,6 +836,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, chunk_size), NULL, }; ATTRIBUTE_GROUPS(space_info); @@ -1140,6 +1204,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) return to_fs_devs(kobj)->fs_info; } +static struct kobject *get_btrfs_kobj(struct kobject *kobj) +{ + while (kobj) { + if (kobj->ktype == &btrfs_ktype) + return kobj; + kobj = kobj->parent; + } + return NULL; +} + #define NUM_FEATURE_BITS 64 #define BTRFS_FEATURE_NAME_MAX 13 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; From 9197f100fc27b0a924eb7e37df1af6e72f8a3cb7 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Tue, 8 Feb 2022 11:31:22 -0800 Subject: [PATCH 0470/1250] btrfs: sysfs: add force_chunk_alloc trigger to force allocation Adds a write-only trigger to force a new chunk allocation for a given block group type. It is at /sys/fs/btrfs/<UUID>/allocation/<type>/force_chunk_alloc. Note: this is now only for debugging and testing and is enabled with the CONFIG_BTRFS_DEBUG configuration option. The transaction is started from sysfs context and can be problematic in some cases.
Signed-off-by: Stefan Roesch Reviewed-by: David Sterba [ Changes from the original submission: - update changelog - drop unnecessary error messages - switch value to bool and use kstrtobool - move BTRFS_ATTR_W definition - add comment for using transaction ] Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 43368db059680e..a536091c3f7608 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -62,6 +62,10 @@ struct raid_kobject { .store = _store, \ } +#define BTRFS_ATTR_W(_prefix, _name, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + #define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) @@ -771,6 +775,52 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, return len; } +#ifdef CONFIG_BTRFS_DEBUG +/* + * Request chunk allocation with current chunk size. + */ +static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + struct btrfs_trans_handle *trans; + bool val; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (!val) + return -EINVAL; + + /* + * This is unsafe to be called from sysfs context and may cause + * unexpected problems. 
+ */ + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_force_chunk_alloc(trans, space_info->flags); + btrfs_end_transaction(trans); + + if (ret == 1) + return len; + + return -ENOSPC; +} +BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store); + +#endif + SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); @@ -837,6 +887,9 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), BTRFS_ATTR_PTR(space_info, chunk_size), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(space_info, force_chunk_alloc), +#endif NULL, }; ATTRIBUTE_GROUPS(space_info); From 617095a18acd59039eee2b5d89a74a129442941d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Mar 2022 10:25:37 -0700 Subject: [PATCH 0471/1250] btrfs: send: remove unused send_ctx::{total,cmd}_send_size We collect these statistics but have never exposed them in any way. I also didn't find any patches that ever attempted to make use of them. 
Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c7dea639a56f9d..cbf894fa79df7a 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -82,8 +82,6 @@ struct send_ctx { char *send_buf; u32 send_size; u32 send_max_size; - u64 total_send_size; - u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ /* Protocol version compatibility requested */ u32 proto; @@ -734,8 +732,6 @@ static int send_cmd(struct send_ctx *sctx) ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); - sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; return ret; From 8299ead165b2a613626efe80432aef362e723b0d Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Mar 2022 10:25:38 -0700 Subject: [PATCH 0472/1250] btrfs: send: explicitly number commands and attributes Commit e77fbf990316 ("btrfs: send: prepare for v2 protocol") added __BTRFS_SEND_C_MAX_V* macros equal to the maximum command number for the version plus 1, but as written this creates gaps in the number space. The maximum command number is currently 22, and __BTRFS_SEND_C_MAX_V1 is accordingly 23. But then __BTRFS_SEND_C_MAX_V2 is 24, suggesting that v2 has a command numbered 23, and __BTRFS_SEND_C_MAX is 25, suggesting that 23 and 24 are valid commands. Instead, let's explicitly number all of the commands, attributes, and sentinel MAX constants.
Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 4 +- fs/btrfs/send.h | 120 ++++++++++++++++++++++++------------------------ 2 files changed, 61 insertions(+), 63 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index cbf894fa79df7a..237753860758cb 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -333,8 +333,8 @@ __maybe_unused static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) { switch (sctx->proto) { - case 1: return cmd < __BTRFS_SEND_C_MAX_V1; - case 2: return cmd < __BTRFS_SEND_C_MAX_V2; + case 1: return cmd <= BTRFS_SEND_C_MAX_V1; + case 2: return cmd <= BTRFS_SEND_C_MAX_V2; default: return false; } } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 08602fdd600a6f..c47a2984aa5b17 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -46,84 +46,82 @@ struct btrfs_tlv_header { /* commands */ enum btrfs_send_cmd { - BTRFS_SEND_C_UNSPEC, + BTRFS_SEND_C_UNSPEC = 0, /* Version 1 */ - BTRFS_SEND_C_SUBVOL, - BTRFS_SEND_C_SNAPSHOT, + BTRFS_SEND_C_SUBVOL = 1, + BTRFS_SEND_C_SNAPSHOT = 2, - BTRFS_SEND_C_MKFILE, - BTRFS_SEND_C_MKDIR, - BTRFS_SEND_C_MKNOD, - BTRFS_SEND_C_MKFIFO, - BTRFS_SEND_C_MKSOCK, - BTRFS_SEND_C_SYMLINK, + BTRFS_SEND_C_MKFILE = 3, + BTRFS_SEND_C_MKDIR = 4, + BTRFS_SEND_C_MKNOD = 5, + BTRFS_SEND_C_MKFIFO = 6, + BTRFS_SEND_C_MKSOCK = 7, + BTRFS_SEND_C_SYMLINK = 8, - BTRFS_SEND_C_RENAME, - BTRFS_SEND_C_LINK, - BTRFS_SEND_C_UNLINK, - BTRFS_SEND_C_RMDIR, + BTRFS_SEND_C_RENAME = 9, + BTRFS_SEND_C_LINK = 10, + BTRFS_SEND_C_UNLINK = 11, + BTRFS_SEND_C_RMDIR = 12, - BTRFS_SEND_C_SET_XATTR, - BTRFS_SEND_C_REMOVE_XATTR, + BTRFS_SEND_C_SET_XATTR = 13, + BTRFS_SEND_C_REMOVE_XATTR = 14, - BTRFS_SEND_C_WRITE, - BTRFS_SEND_C_CLONE, + BTRFS_SEND_C_WRITE = 15, + BTRFS_SEND_C_CLONE = 16, - BTRFS_SEND_C_TRUNCATE, - BTRFS_SEND_C_CHMOD, - BTRFS_SEND_C_CHOWN, - BTRFS_SEND_C_UTIMES, + BTRFS_SEND_C_TRUNCATE = 17, + BTRFS_SEND_C_CHMOD = 18, + BTRFS_SEND_C_CHOWN = 19, + BTRFS_SEND_C_UTIMES = 
20, - BTRFS_SEND_C_END, - BTRFS_SEND_C_UPDATE_EXTENT, - __BTRFS_SEND_C_MAX_V1, + BTRFS_SEND_C_END = 21, + BTRFS_SEND_C_UPDATE_EXTENT = 22, + BTRFS_SEND_C_MAX_V1 = 22, /* Version 2 */ - __BTRFS_SEND_C_MAX_V2, + BTRFS_SEND_C_MAX_V2 = 22, /* End */ - __BTRFS_SEND_C_MAX, + BTRFS_SEND_C_MAX = 22, }; -#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) /* attributes in send stream */ enum { - BTRFS_SEND_A_UNSPEC, - - BTRFS_SEND_A_UUID, - BTRFS_SEND_A_CTRANSID, - - BTRFS_SEND_A_INO, - BTRFS_SEND_A_SIZE, - BTRFS_SEND_A_MODE, - BTRFS_SEND_A_UID, - BTRFS_SEND_A_GID, - BTRFS_SEND_A_RDEV, - BTRFS_SEND_A_CTIME, - BTRFS_SEND_A_MTIME, - BTRFS_SEND_A_ATIME, - BTRFS_SEND_A_OTIME, - - BTRFS_SEND_A_XATTR_NAME, - BTRFS_SEND_A_XATTR_DATA, - - BTRFS_SEND_A_PATH, - BTRFS_SEND_A_PATH_TO, - BTRFS_SEND_A_PATH_LINK, - - BTRFS_SEND_A_FILE_OFFSET, - BTRFS_SEND_A_DATA, - - BTRFS_SEND_A_CLONE_UUID, - BTRFS_SEND_A_CLONE_CTRANSID, - BTRFS_SEND_A_CLONE_PATH, - BTRFS_SEND_A_CLONE_OFFSET, - BTRFS_SEND_A_CLONE_LEN, - - __BTRFS_SEND_A_MAX, + BTRFS_SEND_A_UNSPEC = 0, + + BTRFS_SEND_A_UUID = 1, + BTRFS_SEND_A_CTRANSID = 2, + + BTRFS_SEND_A_INO = 3, + BTRFS_SEND_A_SIZE = 4, + BTRFS_SEND_A_MODE = 5, + BTRFS_SEND_A_UID = 6, + BTRFS_SEND_A_GID = 7, + BTRFS_SEND_A_RDEV = 8, + BTRFS_SEND_A_CTIME = 9, + BTRFS_SEND_A_MTIME = 10, + BTRFS_SEND_A_ATIME = 11, + BTRFS_SEND_A_OTIME = 12, + + BTRFS_SEND_A_XATTR_NAME = 13, + BTRFS_SEND_A_XATTR_DATA = 14, + + BTRFS_SEND_A_PATH = 15, + BTRFS_SEND_A_PATH_TO = 16, + BTRFS_SEND_A_PATH_LINK = 17, + + BTRFS_SEND_A_FILE_OFFSET = 18, + BTRFS_SEND_A_DATA = 19, + + BTRFS_SEND_A_CLONE_UUID = 20, + BTRFS_SEND_A_CLONE_CTRANSID = 21, + BTRFS_SEND_A_CLONE_PATH = 22, + BTRFS_SEND_A_CLONE_OFFSET = 23, + BTRFS_SEND_A_CLONE_LEN = 24, + + BTRFS_SEND_A_MAX = 24, }; -#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) #ifdef __KERNEL__ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); From c2a0824ed857b1c571684a6d6722d167acb9651f Mon Sep 17 00:00:00 2001 From: Omar 
Sandoval Date: Thu, 17 Mar 2022 10:25:39 -0700 Subject: [PATCH 0473/1250] btrfs: send: add stream v2 definitions This adds the definitions of the new commands for send stream version 2 and their respective attributes: fallocate, FS_IOC_SETFLAGS (a.k.a. chattr), and encoded writes. It also documents two changes to the send stream format in v2: the receiver shouldn't assume a maximum command size, and the DATA attribute is encoded differently to allow for writes larger than 64k. These will be implemented in subsequent changes, and then the ioctl will accept the new version and flag. Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- fs/btrfs/send.h | 40 ++++++++++++++++++++++++++++++++++---- include/uapi/linux/btrfs.h | 7 +++++++ 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 237753860758cb..6ec31736c5228f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7552,7 +7552,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->clone_roots_cnt = arg->clone_sources_count; - sctx->send_max_size = BTRFS_SEND_BUF_SIZE; + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); if (!sctx->send_buf) { ret = -ENOMEM; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index c47a2984aa5b17..858ce8132614bf 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -12,7 +12,11 @@ #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" #define BTRFS_SEND_STREAM_VERSION 1 -#define BTRFS_SEND_BUF_SIZE SZ_64K +/* + * In send stream v1, no command is larger than 64K. In send stream v2, no limit + * should be assumed. 
+ */ +#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K enum btrfs_tlv_type { BTRFS_TLV_U8, @@ -80,16 +84,20 @@ enum btrfs_send_cmd { BTRFS_SEND_C_MAX_V1 = 22, /* Version 2 */ - BTRFS_SEND_C_MAX_V2 = 22, + BTRFS_SEND_C_FALLOCATE = 23, + BTRFS_SEND_C_SETFLAGS = 24, + BTRFS_SEND_C_ENCODED_WRITE = 25, + BTRFS_SEND_C_MAX_V2 = 25, /* End */ - BTRFS_SEND_C_MAX = 22, + BTRFS_SEND_C_MAX = 25, }; /* attributes in send stream */ enum { BTRFS_SEND_A_UNSPEC = 0, + /* Version 1 */ BTRFS_SEND_A_UUID = 1, BTRFS_SEND_A_CTRANSID = 2, @@ -112,6 +120,11 @@ enum { BTRFS_SEND_A_PATH_LINK = 17, BTRFS_SEND_A_FILE_OFFSET = 18, + /* + * As of send stream v2, this attribute is special: it must be the last + * attribute in a command, its header contains only the type, and its + * length is implicitly the remaining length of the command. + */ BTRFS_SEND_A_DATA = 19, BTRFS_SEND_A_CLONE_UUID = 20, @@ -120,7 +133,26 @@ enum { BTRFS_SEND_A_CLONE_OFFSET = 23, BTRFS_SEND_A_CLONE_LEN = 24, - BTRFS_SEND_A_MAX = 24, + BTRFS_SEND_A_MAX_V1 = 24, + + /* Version 2 */ + BTRFS_SEND_A_FALLOCATE_MODE = 25, + + BTRFS_SEND_A_SETFLAGS_FLAGS = 26, + + BTRFS_SEND_A_UNENCODED_FILE_LEN = 27, + BTRFS_SEND_A_UNENCODED_LEN = 28, + BTRFS_SEND_A_UNENCODED_OFFSET = 29, + /* + * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from + * BTRFS_SEND_C_ENCODED_WRITE. + */ + BTRFS_SEND_A_COMPRESSION = 30, + BTRFS_SEND_A_ENCRYPTION = 31, + BTRFS_SEND_A_MAX_V2 = 31, + + /* End */ + BTRFS_SEND_A_MAX = 31, }; #ifdef __KERNEL__ diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index d956b2993970f4..b6f26a434b1097 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -777,6 +777,13 @@ struct btrfs_ioctl_received_subvol_args { */ #define BTRFS_SEND_FLAG_VERSION 0x8 +/* + * Send compressed data using the ENCODED_WRITE command instead of decompressing + * the data and sending it with the WRITE command. This requires protocol + * version >= 2. 
+ */ +#define BTRFS_SEND_FLAG_COMPRESSED 0x10 + #define BTRFS_SEND_FLAG_MASK \ (BTRFS_SEND_FLAG_NO_FILE_DATA | \ BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \ From 5dab683e6f567b75348bc81828acee327a88c80c Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Mar 2022 10:25:40 -0700 Subject: [PATCH 0474/1250] btrfs: send: write larger chunks when using stream v2 The length field of the send stream TLV header is 16 bits. This means that the maximum amount of data that can be sent for one write is 64K minus one. However, encoded writes must be able to send the maximum compressed extent (128K) in one command, or more. To support this, send stream version 2 encodes the DATA attribute differently: it has no length field, and the length is implicitly up to the end of the containing command (which has a 32-bit length field). Although this is necessary for encoded writes, normal writes can benefit from it, too. Also add a check to enforce that the DATA attribute is last. It is only strictly necessary for v2, but we might as well make v1 consistent with it. For v2, let's bump up the send buffer to the maximum compressed extent size plus 16K for the other metadata (144K total). Since this will most likely be vmalloc'd (and always will be after the next commit), we round it up to the next page since we might as well use the rest of the page on systems with >16K pages. Reviewed-by: Nikolay Borisov Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6ec31736c5228f..02df94815ae997 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -82,6 +82,11 @@ struct send_ctx { char *send_buf; u32 send_size; u32 send_max_size; + /* + * Whether BTRFS_SEND_A_DATA attribute was already added to current + * command (since protocol v2, data must be the last attribute).
+ */ + bool put_data; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ /* Protocol version compatibility requested */ u32 proto; @@ -596,6 +601,9 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) int total_len = sizeof(*hdr) + len; int left = sctx->send_max_size - sctx->send_size; + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + if (unlikely(left < total_len)) return -EOVERFLOW; @@ -733,6 +741,7 @@ static int send_cmd(struct send_ctx *sctx) &sctx->send_off); sctx->send_size = 0; + sctx->put_data = false; return ret; } @@ -4860,14 +4869,28 @@ static inline u64 max_send_read_size(const struct send_ctx *sctx) static int put_data_header(struct send_ctx *sctx, u32 len) { - struct btrfs_tlv_header *hdr; + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + sctx->put_data = true; + if (sctx->proto >= 2) { + /* + * Since v2, the data attribute header doesn't include a length, + * it is implicitly to the end of the command. + */ + if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) + return -EOVERFLOW; + put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); + sctx->send_size += sizeof(__le16); + } else { + struct btrfs_tlv_header *hdr; - if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) - return -EOVERFLOW; - hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); - put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); - put_unaligned_le16(len, &hdr->tlv_len); - sctx->send_size += sizeof(*hdr); + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + } return 0; } @@ -7552,7 +7575,11 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->clone_roots_cnt = arg->clone_sources_count; - 
sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; + if (sctx->proto >= 2) + sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + else + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); if (!sctx->send_buf) { ret = -ENOMEM; From dcba4e315adce2a41fef66bc50a234fa4f8137df Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Mon, 4 Apr 2022 10:29:07 -0700 Subject: [PATCH 0475/1250] btrfs: send: get send buffer pages for protocol v2 For encoded writes in send v2, we will get the encoded data with btrfs_encoded_read_regular_fill_pages(), which expects a list of raw pages. To avoid extra buffers and copies, we should read directly into the send buffer. Therefore, we need the raw pages for the send buffer. We currently allocate the send buffer with kvmalloc(), which may return a kmalloc'd buffer or a vmalloc'd buffer. For vmalloc, we can get the pages with vmalloc_to_page(). For kmalloc, we could use virt_to_page(). However, the buffer size we use (144K) is not a power of two, which in theory is not guaranteed to return a page-aligned buffer, and in practice would waste a lot of memory due to rounding up to the next power of two. 144K is large enough that it usually gets allocated with vmalloc(), anyways. So, for send v2, replace kvmalloc() with vmalloc() and save the pages in an array. Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 02df94815ae997..57052fe4033c4f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -87,6 +87,7 @@ struct send_ctx { * command (since protocol v2, data must be the last attribute). 
*/ bool put_data; + struct page **send_buf_pages; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ /* Protocol version compatibility requested */ u32 proto; @@ -7575,12 +7576,31 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->clone_roots_cnt = arg->clone_sources_count; - if (sctx->proto >= 2) + if (sctx->proto >= 2) { + u32 send_buf_num_pages; + sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); - else + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT; + sctx->send_buf_pages = kcalloc(send_buf_num_pages, + sizeof(*sctx->send_buf_pages), + GFP_KERNEL); + if (!sctx->send_buf_pages) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < send_buf_num_pages; i++) { + sctx->send_buf_pages[i] = + vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT)); + } + } else { sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; - - sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + } if (!sctx->send_buf) { ret = -ENOMEM; goto out; @@ -7773,6 +7793,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) fput(sctx->send_filp); kvfree(sctx->clone_roots); + kfree(sctx->send_buf_pages); kvfree(sctx->send_buf); name_cache_free(sctx); From 40d2c6f318fab8b55cc2755f587d27a20061d4b2 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Mar 2022 10:25:42 -0700 Subject: [PATCH 0476/1250] btrfs: send: send compressed extents with encoded writes Now that all of the pieces are in place, we can use the ENCODED_WRITE command to send compressed extents when appropriate. 
Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ++ fs/btrfs/inode.c | 13 +-- fs/btrfs/send.c | 232 +++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 226 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 613f46bab3e22f..9a50da857f6e1a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3363,6 +3363,12 @@ int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7329a03292ebe8..1ac43ae3869dd7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10196,9 +10196,8 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) } } -static int btrfs_encoded_io_compression_from_extent( - struct btrfs_fs_info *fs_info, - int compress_type) +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type) { switch (compress_type) { case BTRFS_COMPRESS_NONE: @@ -10403,11 +10402,9 @@ static void btrfs_encoded_read_endio(struct bio *bio) bio_put(bio); } -static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, - u64 disk_bytenr, - u64 disk_io_size, - struct page **pages) +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct 
btrfs_encoded_read_private priv = { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 57052fe4033c4f..bc00393c12339d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -625,6 +625,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, @@ -5161,17 +5162,214 @@ static int send_hole(struct send_ctx *sctx, u64 end) return ret; } -static int send_extent_data(struct send_ctx *sctx, - const u64 offset, - const u64 len) +static int send_encoded_inline_extent(struct send_ctx *sctx, + struct btrfs_path *path, u64 offset, + u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 ram_bytes; + size_t inline_size; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + ram_bytes - offset, len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset); + ret = 
btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + + ret = put_data_header(sctx, inline_size); + if (ret < 0) + goto out; + read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, + btrfs_file_extent_inline_start(ei), inline_size); + sctx->send_size += inline_size; + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, + u64 offset, u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 disk_bytenr, disk_num_bytes; + u32 data_offset; + struct btrfs_cmd_header *hdr; + u32 crc; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset, + len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, + btrfs_file_extent_ram_bytes(leaf, ei)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, + offset - key.offset + 
btrfs_file_extent_offset(leaf, ei)); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0); + + ret = put_data_header(sctx, disk_num_bytes); + if (ret < 0) + goto out; + + /* + * We want to do I/O directly into the send buffer, so get the next page + * boundary in the send buffer. This means that there may be a gap + * between the beginning of the command and the file data. + */ + data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + if (data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes) { + ret = -EOVERFLOW; + goto out; + } + + /* + * Note that send_buf is a mapping of send_buf_pages, so this is really + * reading into send_buf. + */ + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + disk_bytenr, disk_num_bytes, + sctx->send_buf_pages + + (data_offset >> PAGE_SHIFT)); + if (ret) + goto out; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); + hdr->crc = 0; + crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + hdr->crc = cpu_to_le32(crc); + + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); + if (!ret) { + ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset, + disk_num_bytes, &sctx->send_off); + } + sctx->send_size = 0; + sctx->put_data = false; + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, + const u64 offset, const u64 len) { const u64 end = offset + len; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & 
BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { + bool is_inline = (btrfs_file_extent_type(leaf, ei) == + BTRFS_FILE_EXTENT_INLINE); + + /* + * Send the compressed extent unless the compressed data is + * larger than the decompressed data. This can happen if we're + * not sending the entire extent, either because it has been + * partially overwritten/truncated or because this is a part of + * the extent that we couldn't clone in clone_range(). + */ + if (is_inline && + btrfs_file_extent_inline_item_len(leaf, + path->slots[0]) <= len) { + return send_encoded_inline_extent(sctx, path, offset, + len); + } else if (!is_inline && + btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) { + return send_encoded_extent(sctx, path, offset, len); + } + } + if (sctx->cur_inode == NULL) { struct btrfs_root *root = sctx->send_root; @@ -5309,12 +5507,9 @@ static int send_capabilities(struct send_ctx *sctx) return ret; } -static int clone_range(struct send_ctx *sctx, - struct clone_root *clone_root, - const u64 disk_byte, - u64 data_offset, - u64 offset, - u64 len) +static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, + struct clone_root *clone_root, const u64 disk_byte, + u64 data_offset, u64 offset, u64 len) { struct btrfs_path *path; struct btrfs_key key; @@ -5338,7 +5533,7 @@ static int clone_range(struct send_ctx *sctx, */ if (clone_root->offset == 0 && len == sctx->send_root->fs_info->sectorsize) - return send_extent_data(sctx, offset, len); + return send_extent_data(sctx, dst_path, offset, len); path = alloc_path_for_send(); if (!path) @@ -5435,7 +5630,8 @@ static int clone_range(struct send_ctx *sctx, if (hole_len > len) hole_len = len; - ret = send_extent_data(sctx, offset, hole_len); + ret = send_extent_data(sctx, dst_path, 
offset, + hole_len); if (ret < 0) goto out; @@ -5508,14 +5704,16 @@ static int clone_range(struct send_ctx *sctx, if (ret < 0) goto out; } - ret = send_extent_data(sctx, offset + slen, + ret = send_extent_data(sctx, dst_path, + offset + slen, clone_len - slen); } else { ret = send_clone(sctx, offset, clone_len, clone_root); } } else { - ret = send_extent_data(sctx, offset, clone_len); + ret = send_extent_data(sctx, dst_path, offset, + clone_len); } if (ret < 0) @@ -5547,7 +5745,7 @@ static int clone_range(struct send_ctx *sctx, } if (len > 0) - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; out: @@ -5578,10 +5776,10 @@ static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); - ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, end - offset); + ret = clone_range(sctx, path, clone_root, disk_byte, + data_offset, offset, end - offset); } else { - ret = send_extent_data(sctx, offset, end - offset); + ret = send_extent_data(sctx, path, offset, end - offset); } sctx->cur_inode_next_write_offset = end; return ret; From a11b9ce136e2aabe47a5216c2b1269f0db43d0f8 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 17 Mar 2022 10:25:43 -0700 Subject: [PATCH 0477/1250] btrfs: send: enable support for stream v2 and compressed writes Now that the new support is implemented, allow the ioctl to accept v2 and the compressed flag, and update the version in sysfs. 
Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 7 +++++-- fs/btrfs/send.h | 2 +- include/uapi/linux/btrfs.h | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bc00393c12339d..6d01dc26d4080b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -701,8 +701,7 @@ static int send_header(struct send_ctx *sctx) struct btrfs_stream_header hdr; strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); - hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); - + hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); } @@ -7755,6 +7754,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } else { sctx->proto = 1; } + if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) { + ret = -EINVAL; + goto out; + } sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp) { diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 858ce8132614bf..b0dc07567d09ac 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -10,7 +10,7 @@ #include "ctree.h" #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" -#define BTRFS_SEND_STREAM_VERSION 1 +#define BTRFS_SEND_STREAM_VERSION 2 /* * In send stream v1, no command is larger than 64K. 
In send stream v2, no limit diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b6f26a434b1097..f54dc91e402576 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -788,7 +788,8 @@ struct btrfs_ioctl_received_subvol_args { (BTRFS_SEND_FLAG_NO_FILE_DATA | \ BTRFS_SEND_FLAG_OMIT_STREAM_HEADER | \ BTRFS_SEND_FLAG_OMIT_END_CMD | \ - BTRFS_SEND_FLAG_VERSION) + BTRFS_SEND_FLAG_VERSION | \ + BTRFS_SEND_FLAG_COMPRESSED) struct btrfs_ioctl_send_args { __s64 send_fd; /* in */ From 513329d17458e07b350e26bc89a32fcb1a954a5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:33 +0200 Subject: [PATCH 0478/1250] btrfs: move more work into btrfs_end_bioc Assign ->mirror_num and ->bi_status in btrfs_end_bioc instead of duplicating the logic in the callers. Also remove the bio argument as it always must be bioc->orig_bio and the now pointless bioc_error that did nothing but assign bi_sector to the same value just sampled in the caller. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 72 ++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 50 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b8e40d55012f0b..9935b5d955bee0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6616,19 +6616,29 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); } -static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) +static inline void btrfs_end_bioc(struct btrfs_io_context *bioc) { - bio->bi_private = bioc->private; - bio->bi_end_io = bioc->end_io; - bio_endio(bio); + struct bio *orig_bio = bioc->orig_bio; + btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num; + orig_bio->bi_private = bioc->private; + orig_bio->bi_end_io = bioc->end_io; + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. 
+ */ + if (atomic_read(&bioc->error) > bioc->max_errors) + orig_bio->bi_status = BLK_STS_IOERR; + else + orig_bio->bi_status = BLK_STS_OK; + bio_endio(orig_bio); btrfs_put_bioc(bioc); } static void btrfs_end_bio(struct bio *bio) { struct btrfs_io_context *bioc = bio->bi_private; - int is_orig_bio = 0; if (bio->bi_status) { atomic_inc(&bioc->error); @@ -6649,35 +6659,12 @@ static void btrfs_end_bio(struct bio *bio) } } - if (bio == bioc->orig_bio) - is_orig_bio = 1; + if (bio != bioc->orig_bio) + bio_put(bio); btrfs_bio_counter_dec(bioc->fs_info); - - if (atomic_dec_and_test(&bioc->stripes_pending)) { - if (!is_orig_bio) { - bio_put(bio); - bio = bioc->orig_bio; - } - - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - /* only send an error to the higher layers if it is - * beyond the tolerance of the btrfs bio - */ - if (atomic_read(&bioc->error) > bioc->max_errors) { - bio->bi_status = BLK_STS_IOERR; - } else { - /* - * this bio is actually up to date, we didn't - * go over the max number of errors - */ - bio->bi_status = BLK_STS_OK; - } - - btrfs_end_bioc(bioc, bio); - } else if (!is_orig_bio) { - bio_put(bio); - } + if (atomic_dec_and_test(&bioc->stripes_pending)) + btrfs_end_bioc(bioc); } static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, @@ -6715,23 +6702,6 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, submit_bio(bio); } -static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) -{ - atomic_inc(&bioc->error); - if (atomic_dec_and_test(&bioc->stripes_pending)) { - /* Should be the original bio. 
*/ - WARN_ON(bio != bioc->orig_bio); - - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - bio->bi_iter.bi_sector = logical >> 9; - if (atomic_read(&bioc->error) > bioc->max_errors) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_status = BLK_STS_OK; - btrfs_end_bioc(bioc, bio); - } -} - blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) { @@ -6790,7 +6760,9 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, &dev->dev_state) || (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bioc_error(bioc, first_bio, logical); + atomic_inc(&bioc->error); + if (atomic_dec_and_test(&bioc->stripes_pending)) + btrfs_end_bioc(bioc); continue; } From e3809f7498b0e9dd9abc6250b67b46dd53b9cd84 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:34 +0200 Subject: [PATCH 0479/1250] btrfs: simplify code flow in btrfs_submit_dio_bio There is no exit block and cleanup and the function is reasonably short so we can use inline return and not the goto. This makes the function more straightforward. Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1ac43ae3869dd7..3a2a2d4906db8a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7983,39 +7983,33 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; - /* Check btrfs_submit_bio_hook() for rules about async submit.
*/ - if (async_submit) - async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); - if (!write) { ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) - goto err; + return ret; } if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; - if (write && async_submit) { - ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset, - btrfs_submit_bio_start_direct_io); - goto err; - } else if (write) { + if (write) { + /* Check btrfs_submit_data_bio() for async submit rules */ + if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers)) + return btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_submit_bio_start_direct_io); /* * If we aren't doing async submit, calculate the csum of the * bio now. */ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); if (ret) - goto err; + return ret; } else { btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, file_offset - dip->file_offset); } map: - ret = btrfs_map_bio(fs_info, bio, 0); -err: - return ret; + return btrfs_map_bio(fs_info, bio, 0); } static void btrfs_submit_direct(const struct iomap_iter *iter, From 4c8da8b2ee8440d4ba0c150f1bf622264a5dd4f9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:35 +0200 Subject: [PATCH 0480/1250] btrfs: split btrfs_submit_data_bio to read and write parts Split btrfs_submit_data_bio into one helper for reads and one for writes. 
Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 +- fs/btrfs/extent_io.c | 13 +++-- fs/btrfs/inode.c | 130 ++++++++++++++++++++----------------------- 3 files changed, 71 insertions(+), 77 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9a50da857f6e1a..6d4e71f52910ec 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3259,8 +3259,9 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ -void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 69b6b4ba009e47..47407e6bb91e50 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -182,17 +182,20 @@ static void submit_one_bio(struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type) { struct extent_io_tree *tree = bio->bi_private; + struct inode *inode = tree->private_data; bio->bi_private = NULL; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - if (is_data_inode(tree->private_data)) - btrfs_submit_data_bio(tree->private_data, bio, mirror_num, - compress_type); + if (!is_data_inode(inode)) + btrfs_submit_metadata_bio(inode, bio, mirror_num); + else if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_submit_data_write_bio(inode, bio, mirror_num); else - btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num); + 
btrfs_submit_data_read_bio(inode, bio, mirror_num, compress_type); + /* * Above submission hooks will handle the error by ending the bio, * which will do the cleanup properly. So here we should not return @@ -2786,7 +2789,7 @@ static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio, ret = btrfs_repair_one_sector(inode, failed_bio, bio_offset + offset, page, pgoff + offset, start + offset, - failed_mirror, btrfs_submit_data_bio); + failed_mirror, btrfs_submit_data_read_bio); if (!ret) { /* * We have submitted the read repair, the page release diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3a2a2d4906db8a..316c73d7d8cabb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2580,90 +2580,80 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, return errno_to_blk_status(ret); } -/* - * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read. - * - * Rules about async/sync submit, - * a) read: sync submit - * - * b) write without checksum: sync submit - * - * c) write with checksum: - * c-1) if bio is issued by fsync: sync submit - * (sync_writers != 0) - * - * c-2) if root is reloc root: sync submit - * (only in case of buffered IO) - * - * c-3) otherwise: async submit - */ -void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type) +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - blk_status_t ret = 0; - int skip_sum; - int async = !atomic_read(&BTRFS_I(inode)->sync_writers); - - skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - - if (btrfs_is_free_space_inode(BTRFS_I(inode))) - metadata = 
BTRFS_WQ_ENDIO_FREE_SPACE; + struct btrfs_inode *bi = BTRFS_I(inode); + blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct page *page = bio_first_bvec_all(bio)->bv_page; - loff_t file_offset = page_offset(page); - - ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); + ret = extract_ordered_extent(bi, bio, + page_offset(bio_first_bvec_all(bio)->bv_page)); if (ret) goto out; } - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); - if (ret) - goto out; - - if (compress_type != BTRFS_COMPRESS_NONE) { - /* - * btrfs_submit_compressed_read will handle completing - * the bio if there were any errors, so just return - * here. - */ - btrfs_submit_compressed_read(inode, bio, mirror_num); - return; - } else { - /* - * Lookup bio sums does extra checks around whether we - * need to csum or not, which is why we ignore skip_sum - * here. - */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + /* + * Rules for async/sync submit: + * a) write without checksum: sync submit + * b) write with checksum: + * b-1) if bio is issued by fsync: sync submit + * (sync_writers != 0) + * b-2) if root is reloc root: sync submit + * (only in case of buffered IO) + * b-3) otherwise: async submit + */ + if (!(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) { + if (atomic_read(&bi->sync_writers)) { + ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); if (ret) goto out; + } else if (btrfs_is_data_reloc_root(bi->root)) { + ; /* Csum items have already been cloned */ + } else { + ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + btrfs_submit_bio_start); + goto out; } - goto mapit; - } else if (async && !skip_sum) { - /* csum items have already been cloned */ - if (btrfs_is_data_reloc_root(root)) - goto mapit; - /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, - 0, btrfs_submit_bio_start); + } + ret = btrfs_map_bio(fs_info, 
bio, mirror_num); +out: + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + } +} + +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + blk_status_t ret; + + ret = btrfs_bio_wq_end_io(fs_info, bio, + btrfs_is_free_space_inode(BTRFS_I(inode)) ? + BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA); + if (ret) goto out; - } else if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); - if (ret) - goto out; + + if (compress_type != BTRFS_COMPRESS_NONE) { + /* + * btrfs_submit_compressed_read will handle completing the bio + * if there were any errors, so just return here. + */ + btrfs_submit_compressed_read(inode, bio, mirror_num); + return; } -mapit: + /* + * Lookup bio sums does extra checks around whether we need to csum or + * not, which is why we ignore skip_sum here. + */ + ret = btrfs_lookup_bio_sums(inode, bio, NULL); + if (ret) + goto out; ret = btrfs_map_bio(fs_info, bio, mirror_num); - out: if (ret) { bio->bi_status = ret; @@ -7993,7 +7983,7 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, goto map; if (write) { - /* Check btrfs_submit_data_bio() for async submit rules */ + /* Check btrfs_submit_data_write_bio() for async submit rules */ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers)) return btrfs_wq_submit_bio(inode, bio, 0, file_offset, btrfs_submit_bio_start_direct_io); From 56f5436b589ed0d4aa95fe44eea3812b770ed41d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:36 +0200 Subject: [PATCH 0481/1250] btrfs: defer I/O completion based on the btrfs_raid_bio Instead of attaching an extra allocation and an indirect call to each low-level bio issued by the RAID code, add a work_struct to struct btrfs_raid_bio and only defer the per-rbio completion action.
The per-bio actions for all the I/Os are trivial and can be safely done from interrupt context. As a nice side effect this also allows sharing the boilerplate code for the per-bio completions. Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 12 ++---- fs/btrfs/disk-io.h | 1 - fs/btrfs/raid56.c | 102 ++++++++++++++++++--------------------------- fs/btrfs/raid56.h | 2 + 5 files changed, 47 insertions(+), 72 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6d4e71f52910ec..1d5b38f3aa5fda 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -852,7 +852,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *flush_workers; struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; - struct btrfs_workqueue *endio_raid56_workers; + struct workqueue_struct *endio_raid56_workers; struct workqueue_struct *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 018510188a0d13..1c9c6c2980dd25 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -754,14 +754,10 @@ static void end_workqueue_bio(struct bio *bio) wq = fs_info->endio_meta_write_workers; else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) wq = fs_info->endio_freespace_worker; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; else wq = fs_info->endio_write_workers; } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else if (end_io_wq->metadata) + if (end_io_wq->metadata) wq = fs_info->endio_meta_workers; else wq = fs_info->endio_workers; @@ -2281,7 +2277,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); -
btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + if (fs_info->endio_raid56_workers) + destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); @@ -2490,8 +2487,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, max_active, 2); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, - max_active, 4); + alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 4ee8c42c9f7835..809ef065f1666e 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -21,7 +21,6 @@ enum btrfs_wq_endio_type { BTRFS_WQ_ENDIO_DATA, BTRFS_WQ_ENDIO_METADATA, BTRFS_WQ_ENDIO_FREE_SPACE, - BTRFS_WQ_ENDIO_RAID56, }; static inline u64 btrfs_sb_offset(int mirror) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index baba435692d2c4..00cd9e8db7ae0b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1488,15 +1488,7 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. 
- * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid_rmw_end_io(struct bio *bio) +static void raid56_bio_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -1507,23 +1499,34 @@ static void raid_rmw_end_io(struct bio *bio) bio_put(bio); - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + if (atomic_dec_and_test(&rbio->stripes_pending)) + queue_work(rbio->bioc->fs_info->endio_raid56_workers, + &rbio->end_io_work); +} - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; +/* + * End io handler for the read phase of the RMW cycle. All the bios here are + * physical stripe bios we've read from the disk so we can recalculate the + * parity of the stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_rmw_end_io_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); + + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + rbio_orig_end_io(rbio, BLK_STS_IOERR); + return; + } /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write but if there + * are any failed stripes we'll reconstruct from parity first. */ validate_rbio_for_rmw(rbio); - return; - -cleanup: - - rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -1598,10 +1601,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * touch it after that. 
*/ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_rmw_end_io; - - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + bio->bi_end_io = raid56_bio_end_io; if (trace_raid56_read_partial_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; @@ -2076,25 +2078,13 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* - * This is called only for stripes we've read from disk to - * reconstruct the parity. + * This is called only for stripes we've read from disk to reconstruct the + * parity. */ -static void raid_recover_end_io(struct bio *bio) +static void raid_recover_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - /* - * we only read stripe pages off the disk, set them - * up to date if there were no errors - */ - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); if (atomic_read(&rbio->error) > rbio->bioc->max_errors) rbio_orig_end_io(rbio, BLK_STS_IOERR); @@ -2177,10 +2167,9 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * touch it after that. 
*/ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_recover_end_io; - - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + bio->bi_end_io = raid56_bio_end_io; if (trace_raid56_scrub_read_recover_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; @@ -2650,24 +2639,14 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid56_parity_scrub_end_io(struct bio *bio) +static void raid56_parity_scrub_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write, but if there + * are any failed stripes we'll reconstruct from parity first */ validate_rbio_for_parity_scrub(rbio); } @@ -2737,10 +2716,9 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * touch it after that. 
*/ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_parity_scrub_end_io; - - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + bio->bi_end_io = raid56_bio_end_io; if (trace_raid56_scrub_read_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 3badde24dcbf06..3b22657ca857e3 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -100,6 +100,8 @@ struct btrfs_raid_bio { atomic_t error; + struct work_struct end_io_work; + /* Bitmap to record which horizontal stripe has data */ unsigned long dbitmap; From 1608f8046a81ac22ad8581f33e17ad95a5d25a5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:37 +0200 Subject: [PATCH 0482/1250] btrfs: don't double-defer bio completions for compressed reads The bio completion handler of the bio used for the compressed data is already run in a workqueue using btrfs_bio_wq_end_io, so don't schedule the completion of the original bio to the same workqueue again but just execute it directly. Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 316c73d7d8cabb..3067c966d8b634 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2631,12 +2631,6 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; - ret = btrfs_bio_wq_end_io(fs_info, bio, - btrfs_is_free_space_inode(BTRFS_I(inode)) ? 
- BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA); - if (ret) - goto out; - if (compress_type != BTRFS_COMPRESS_NONE) { /* * btrfs_submit_compressed_read will handle completing the bio @@ -2646,6 +2640,12 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, return; } + ret = btrfs_bio_wq_end_io(fs_info, bio, + btrfs_is_free_space_inode(BTRFS_I(inode)) ? + BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA); + if (ret) + goto out; + /* * Lookup bio sums does extra checks around whether we need to csum or * not, which is why we ignore skip_sum here. From eb0c74e35371d4a66010c55e331104b090b7f733 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:38 +0200 Subject: [PATCH 0483/1250] btrfs: don't use btrfs_bio_wq_end_io for compressed writes Compressed write bio completion is the only user of btrfs_bio_wq_end_io for writes, and the use of btrfs_bio_wq_end_io is a little suboptimal here as we only really need user context for the final completion of a compressed_bio structure, and not every single bio completion. Add a work_struct to struct compressed_bio instead and use that to call finish_compressed_bio_write. This allows removing all handling of write bios in the btrfs_bio_wq_end_io infrastructure.
Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/compression.c | 44 +++++++++++++++++++++--------------------- fs/btrfs/compression.h | 7 +++++-- fs/btrfs/ctree.h | 2 +- fs/btrfs/disk-io.c | 30 ++++++++++++---------------- fs/btrfs/super.c | 2 -- 5 files changed, 40 insertions(+), 45 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2536754656b665..2ea5cf5ae21001 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -398,6 +398,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) kfree(cb); } +static void btrfs_finish_compressed_write_work(struct work_struct *work) +{ + struct compressed_bio *cb = + container_of(work, struct compressed_bio, write_end_work); + + finish_compressed_bio_write(cb); +} + /* * Do the cleanup once all the compressed pages hit the disk. This will clear * writeback on the file pages and free the compressed pages. @@ -409,29 +417,15 @@ static void end_compressed_bio_write(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; - if (!dec_and_test_compressed_bio(cb, bio)) - goto out; - - btrfs_record_physical_zoned(cb->inode, cb->start, bio); + if (dec_and_test_compressed_bio(cb, bio)) { + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - finish_compressed_bio_write(cb); -out: + btrfs_record_physical_zoned(cb->inode, cb->start, bio); + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); + } bio_put(bio); } -static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, - struct bio *bio, int mirror_num) -{ - blk_status_t ret; - - ASSERT(bio->bi_iter.bi_size); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - return ret; - ret = btrfs_map_bio(fs_info, bio, mirror_num); - return ret; -} - /* * Allocate a compressed_bio, which will be used to read/write on-disk * (aka, compressed) * data. 
@@ -528,7 +522,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->compressed_pages = compressed_pages; cb->compressed_len = compressed_len; cb->writeback = writeback; - cb->orig_bio = NULL; + INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); cb->nr_pages = nr_pages; if (blkcg_css) @@ -598,7 +592,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, goto finish_cb; } - ret = submit_compressed_bio(fs_info, bio, 0); + ASSERT(bio->bi_iter.bi_size); + ret = btrfs_map_bio(fs_info, bio, 0); if (ret) goto finish_cb; bio = NULL; @@ -935,7 +930,12 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, fs_info->sectorsize); sums += fs_info->csum_size * nr_sectors; - ret = submit_compressed_bio(fs_info, comp_bio, mirror_num); + ASSERT(comp_bio->bi_iter.bi_size); + ret = btrfs_bio_wq_end_io(fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); + if (ret) + goto finish_cb; + ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) goto finish_cb; comp_bio = NULL; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 2707404389a5d0..5fca7603e928a5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -61,8 +61,11 @@ struct compressed_bio { blk_status_t status; int mirror_num; - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; + union { + /* For reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + struct work_struct write_end_work; + }; /* * the start of a variable length array of checksums only diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1d5b38f3aa5fda..e689dba076b067 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -854,7 +854,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_meta_workers; struct workqueue_struct *endio_raid56_workers; struct workqueue_struct *rmw_workers; - struct btrfs_workqueue *endio_meta_write_workers; + struct workqueue_struct 
*compressed_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1c9c6c2980dd25..ea32627139b04f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -749,19 +749,10 @@ static void end_workqueue_bio(struct bio *bio) fs_info = end_io_wq->info; end_io_wq->status = bio->bi_status; - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - wq = fs_info->endio_meta_write_workers; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - wq = fs_info->endio_freespace_worker; - else - wq = fs_info->endio_write_workers; - } else { - if (end_io_wq->metadata) - wq = fs_info->endio_meta_workers; - else - wq = fs_info->endio_workers; - } + if (end_io_wq->metadata) + wq = fs_info->endio_meta_workers; + else + wq = fs_info->endio_workers; btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); btrfs_queue_work(wq, &end_io_wq->work); @@ -772,6 +763,9 @@ blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, { struct btrfs_end_io_wq *end_io_wq; + if (WARN_ON_ONCE(btrfs_op(bio) != BTRFS_MAP_WRITE)) + return BLK_STS_IOERR; + end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) return BLK_STS_RESOURCE; @@ -2281,6 +2275,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); + if (fs_info->compressed_write_workers) + destroy_workqueue(fs_info->compressed_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -2295,7 +2291,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) * queues can do metadata I/O operations. 
*/ btrfs_destroy_workqueue(fs_info->endio_meta_workers); - btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2483,15 +2478,14 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->endio_meta_workers = btrfs_alloc_workqueue(fs_info, "endio-meta", flags, max_active, 4); - fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, - max_active, 2); fs_info->endio_raid56_workers = alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); + fs_info->compressed_write_workers = + alloc_workqueue("btrfs-compressed-write", flags, max_active); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); @@ -2506,7 +2500,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) if (!(fs_info->workers && fs_info->hipri_workers && fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->endio_meta_write_workers && + fs_info->compressed_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8539ee2dc79f4a..e3800f0f993fee 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1934,8 +1934,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, - new_pool_size); 
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); From 253140c88ec15c9c2336cb25988ac1464d13f2df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:39 +0200 Subject: [PATCH 0484/1250] btrfs: centralize setting REQ_META Set REQ_META in btrfs_submit_metadata_bio instead of the various callers. We'll start relying on this flag inside of btrfs in a bit, and this ensures it is always set correctly. Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent_io.c | 8 ++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ea32627139b04f..a085cd95ef12ef 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -914,6 +914,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; + bio->bi_opf |= REQ_META; + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { /* * called for a read, do the setup so that checksum validation diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 47407e6bb91e50..d7059a48cefcab 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4567,7 +4567,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + unsigned int write_flags = wbc_to_write_flags(wbc); bool no_dirty_ebs = false; int ret; @@ -4612,7 +4612,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, { u64 disk_bytenr = eb->start; int i, num_pages; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + unsigned int write_flags = wbc_to_write_flags(wbc); int ret = 
0; prepare_eb_write(eb); @@ -6630,7 +6630,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, + ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, eb->start, eb->len, eb->start - page_offset(page), end_bio_extent_readpage, mirror_num, 0, @@ -6737,7 +6737,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + err = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, page_offset(page), PAGE_SIZE, 0, end_bio_extent_readpage, mirror_num, 0, false); From aaa89ac5ab3928cb1a339c4a3cb68a524bc88208 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:40 +0200 Subject: [PATCH 0485/1250] btrfs: remove btrfs_end_io_wq All read bios that go through btrfs_map_bio need to be completed in user context. And read I/Os are the most common and timing critical in almost any file system workload. Embed a work_struct into struct btrfs_bio and use it to complete all read bios submitted through btrfs_map_bio, using the REQ_META flag to decide which workqueue they are placed on. This removes the need for a separate 128 byte allocation (typically rounded up to 192 bytes by slab) for all reads with a size increase of 24 bytes for struct btrfs_bio. Future patches will reorganize struct btrfs_bio to make use of this extra space for writes as well.
(All sizes are based on a typical 64-bit non-debug build) Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 -- fs/btrfs/ctree.h | 4 +- fs/btrfs/disk-io.c | 120 +++-------------------------------------- fs/btrfs/disk-io.h | 10 ---- fs/btrfs/inode.c | 24 +-------- fs/btrfs/super.c | 11 +--- fs/btrfs/volumes.c | 33 ++++++++++-- fs/btrfs/volumes.h | 3 ++ 8 files changed, 42 insertions(+), 167 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 2ea5cf5ae21001..63d542961b78a1 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -931,10 +931,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, sums += fs_info->csum_size * nr_sectors; ASSERT(comp_bio->bi_iter.bi_size); - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, - BTRFS_WQ_ENDIO_DATA); - if (ret) - goto finish_cb; ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); if (ret) goto finish_cb; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e689dba076b067..22a287cbc3e7c4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -850,8 +850,8 @@ struct btrfs_fs_info { struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; - struct btrfs_workqueue *endio_workers; - struct btrfs_workqueue *endio_meta_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; struct workqueue_struct *endio_raid56_workers; struct workqueue_struct *rmw_workers; struct workqueue_struct *compressed_write_workers; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a085cd95ef12ef..ed1d92b370db09 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -51,7 +51,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct
btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -64,40 +63,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); -/* - * btrfs_end_io_wq structs are used to do processing in task context when an IO - * is complete. This is used during reads to verify checksums, and it is used - * by writes to insert metadata for new file extents after IO is complete. - */ -struct btrfs_end_io_wq { - struct bio *bio; - bio_end_io_t *end_io; - void *private; - struct btrfs_fs_info *info; - blk_status_t status; - enum btrfs_wq_endio_type metadata; - struct btrfs_work work; -}; - -static struct kmem_cache *btrfs_end_io_wq_cache; - -int __init btrfs_end_io_wq_init(void) -{ - btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", - sizeof(struct btrfs_end_io_wq), - 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_end_io_wq_cache) - return -ENOMEM; - return 0; -} - -void __cold btrfs_end_io_wq_exit(void) -{ - kmem_cache_destroy(btrfs_end_io_wq_cache); -} - static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) { if (fs_info->csum_shash) @@ -740,48 +705,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, return ret; } -static void end_workqueue_bio(struct bio *bio) -{ - struct btrfs_end_io_wq *end_io_wq = bio->bi_private; - struct btrfs_fs_info *fs_info; - struct btrfs_workqueue *wq; - - fs_info = end_io_wq->info; - end_io_wq->status = bio->bi_status; - - if (end_io_wq->metadata) - wq = fs_info->endio_meta_workers; - else - wq = fs_info->endio_workers; - - btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); - btrfs_queue_work(wq, &end_io_wq->work); -} - -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata) -{ - struct btrfs_end_io_wq *end_io_wq; - - if (WARN_ON_ONCE(btrfs_op(bio) != BTRFS_MAP_WRITE)) - return BLK_STS_IOERR; - 
- end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); - if (!end_io_wq) - return BLK_STS_RESOURCE; - - end_io_wq->private = bio->bi_private; - end_io_wq->end_io = bio->bi_end_io; - end_io_wq->info = info; - end_io_wq->status = 0; - end_io_wq->bio = bio; - end_io_wq->metadata = metadata; - - bio->bi_private = end_io_wq; - bio->bi_end_io = end_workqueue_bio; - return 0; -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; @@ -917,14 +840,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ bio->bi_opf |= REQ_META; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - /* - * called for a read, do the setup so that checksum validation - * can happen in the async kernel threads - */ - ret = btrfs_bio_wq_end_io(fs_info, bio, - BTRFS_WQ_ENDIO_METADATA); - if (!ret) - ret = btrfs_map_bio(fs_info, bio, mirror_num); + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else if (!should_async_write(fs_info, BTRFS_I(inode))) { ret = btree_csum_one_bio(bio); if (!ret) @@ -1947,25 +1863,6 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, return root; } -/* - * called by the kthread helper functions to finally call the bio end_io - * functions. 
This is where read checksum verification actually happens - */ -static void end_workqueue_fn(struct btrfs_work *work) -{ - struct bio *bio; - struct btrfs_end_io_wq *end_io_wq; - - end_io_wq = container_of(work, struct btrfs_end_io_wq, work); - bio = end_io_wq->bio; - - bio->bi_status = end_io_wq->status; - bio->bi_private = end_io_wq->private; - bio->bi_end_io = end_io_wq->end_io; - bio_endio(bio); - kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); -} - static int cleaner_kthread(void *arg) { struct btrfs_fs_info *fs_info = arg; @@ -2272,7 +2169,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->delalloc_workers); btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); - btrfs_destroy_workqueue(fs_info->endio_workers); + if (fs_info->endio_workers) + destroy_workqueue(fs_info->endio_workers); if (fs_info->endio_raid56_workers) destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) @@ -2292,7 +2190,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) * the queues used for metadata I/O, since tasks from those other work * queues can do metadata I/O operations. 
*/ - btrfs_destroy_workqueue(fs_info->endio_meta_workers); + if (fs_info->endio_meta_workers) + destroy_workqueue(fs_info->endio_meta_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2471,15 +2370,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->fixup_workers = btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); - /* - * endios are largely parallel and should have a very - * low idle thresh - */ fs_info->endio_workers = - btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); + alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta", flags, - max_active, 4); + alloc_workqueue("btrfs-endio-meta", flags, max_active); fs_info->endio_raid56_workers = alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 809ef065f1666e..05e779a41a9979 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -17,12 +17,6 @@ */ #define BTRFS_BDEV_BLOCKSIZE (4096) -enum btrfs_wq_endio_type { - BTRFS_WQ_ENDIO_DATA, - BTRFS_WQ_ENDIO_METADATA, - BTRFS_WQ_ENDIO_FREE_SPACE, -}; - static inline u64 btrfs_sb_offset(int mirror) { u64 start = SZ_16K; @@ -120,8 +114,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata); blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start); @@ -144,8 +136,6 @@ int btree_lock_page_hook(struct page *page, void *data, int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 
*objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); -int __init btrfs_end_io_wq_init(void); -void __cold btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_set_buffer_lockdep_class(u64 objectid, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3067c966d8b634..9cce0a3228f831 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2640,12 +2640,6 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, return; } - ret = btrfs_bio_wq_end_io(fs_info, bio, - btrfs_is_free_space_inode(BTRFS_I(inode)) ? - BTRFS_WQ_ENDIO_FREE_SPACE : BTRFS_WQ_ENDIO_DATA); - if (ret) - goto out; - /* * Lookup bio sums does extra checks around whether we need to csum or * not, which is why we ignore skip_sum here. @@ -7879,9 +7873,6 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, BUG_ON(bio_op(bio) == REQ_OP_WRITE); - if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA)) - return; - refcount_inc(&dip->refs); if (btrfs_map_bio(fs_info, bio, mirror_num)) refcount_dec(&dip->refs); @@ -7970,19 +7961,12 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; - bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; - if (!write) { - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - return ret; - } - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; - if (write) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check btrfs_submit_data_write_bio() for async submit rules */ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers)) return btrfs_wq_submit_bio(inode, bio, 0, file_offset, @@ -10314,12 +10298,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, return ret; } - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) { - btrfs_bio_free_csum(bbio); - return ret; - } - 
atomic_inc(&priv->pending); ret = btrfs_map_bio(fs_info, bio, mirror_num); if (ret) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index e3800f0f993fee..719dda57dc7a0a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1932,8 +1932,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); @@ -2702,13 +2700,9 @@ static int __init init_btrfs_fs(void) if (err) goto free_delayed_ref; - err = btrfs_end_io_wq_init(); - if (err) - goto free_prelim_ref; - err = btrfs_interface_init(); if (err) - goto free_end_io_wq; + goto free_prelim_ref; btrfs_print_mod_info(); @@ -2724,8 +2718,6 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); -free_end_io_wq: - btrfs_end_io_wq_exit(); free_prelim_ref: btrfs_prelim_ref_exit(); free_delayed_ref: @@ -2763,7 +2755,6 @@ static void __exit exit_btrfs_fs(void) extent_state_cache_exit(); extent_io_exit(); btrfs_interface_exit(); - btrfs_end_io_wq_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9935b5d955bee0..04e7e79cab47c0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6616,11 +6616,27 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); } -static inline void btrfs_end_bioc(struct btrfs_io_context *bioc) +static struct workqueue_struct 
*btrfs_end_io_wq(struct btrfs_io_context *bioc) +{ + if (bioc->orig_bio->bi_opf & REQ_META) + return bioc->fs_info->endio_meta_workers; + return bioc->fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = + container_of(work, struct btrfs_bio, end_io_work); + + bio_endio(&bbio->bio); +} + +static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async) { struct bio *orig_bio = bioc->orig_bio; + struct btrfs_bio *bbio = btrfs_bio(orig_bio); - btrfs_bio(orig_bio)->mirror_num = bioc->mirror_num; + bbio->mirror_num = bioc->mirror_num; orig_bio->bi_private = bioc->private; orig_bio->bi_end_io = bioc->end_io; @@ -6632,7 +6648,14 @@ static inline void btrfs_end_bioc(struct btrfs_io_context *bioc) orig_bio->bi_status = BLK_STS_IOERR; else orig_bio->bi_status = BLK_STS_OK; - bio_endio(orig_bio); + + if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work); + } else { + bio_endio(orig_bio); + } + btrfs_put_bioc(bioc); } @@ -6664,7 +6687,7 @@ static void btrfs_end_bio(struct bio *bio) btrfs_bio_counter_dec(bioc->fs_info); if (atomic_dec_and_test(&bioc->stripes_pending)) - btrfs_end_bioc(bioc); + btrfs_end_bioc(bioc, true); } static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, @@ -6762,7 +6785,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { atomic_inc(&bioc->error); if (atomic_dec_and_test(&bioc->stripes_pending)) - btrfs_end_bioc(bioc); + btrfs_end_bioc(bioc, false); continue; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 1e86c48268edde..7973d11e5f5d89 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -371,6 +371,9 @@ struct btrfs_bio { u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; struct bvec_iter iter; + /* For read end I/O handling */ + struct work_struct 
end_io_work; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. From 121bda5166c81afc4652a6ba6b9596b6a96f05c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:41 +0200 Subject: [PATCH 0486/1250] btrfs: factor stripe submission logic out of btrfs_map_bio Move all per-stripe handling into submit_stripe_bio and use a label to cleanup instead of duplicating the logic. Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 74 ++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 04e7e79cab47c0..1e06b7ee6a8151 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6690,10 +6690,30 @@ static void btrfs_end_bio(struct bio *bio) btrfs_end_bioc(bioc, true); } -static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, - u64 physical, struct btrfs_device *dev) +static void submit_stripe_bio(struct btrfs_io_context *bioc, + struct bio *orig_bio, int dev_nr, bool clone) { struct btrfs_fs_info *fs_info = bioc->fs_info; + struct btrfs_device *dev = bioc->stripes[dev_nr].dev; + u64 physical = bioc->stripes[dev_nr].physical; + struct bio *bio; + + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(orig_bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + atomic_inc(&bioc->error); + if (atomic_dec_and_test(&bioc->stripes_pending)) + btrfs_end_bioc(bioc, false); + return; + } + + if (clone) { + bio = btrfs_bio_clone(dev->bdev, orig_bio); + } else { + bio = orig_bio; + bio_set_dev(bio, dev->bdev); + } bio->bi_private = bioc; btrfs_bio(bio)->device = dev; @@ -6728,32 +6748,25 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio 
*bio, int mirror_num) { - struct btrfs_device *dev; - struct bio *first_bio = bio; u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = 0; - u64 map_length; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; int ret; int dev_nr; int total_devs; struct btrfs_io_context *bioc = NULL; - length = bio->bi_iter.bi_size; - map_length = length; - btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bioc, mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); - } + if (ret) + goto out_dec; total_devs = bioc->num_stripes; - bioc->orig_bio = first_bio; - bioc->private = first_bio->bi_private; - bioc->end_io = first_bio->bi_end_io; - atomic_set(&bioc->stripes_pending, bioc->num_stripes); + bioc->orig_bio = bio; + bioc->private = bio->bi_private; + bioc->end_io = bio->bi_end_io; + atomic_set(&bioc->stripes_pending, total_devs); if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { @@ -6765,9 +6778,7 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, ret = raid56_parity_recover(bio, bioc, map_length, mirror_num, 1); } - - btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); + goto out_dec; } if (map_length < length) { @@ -6778,28 +6789,13 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - dev = bioc->stripes[dev_nr].dev; - if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, - &dev->dev_state) || - (btrfs_op(first_bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - atomic_inc(&bioc->error); - if (atomic_dec_and_test(&bioc->stripes_pending)) - btrfs_end_bioc(bioc, false); - continue; - } - - if (dev_nr < total_devs - 1) { - bio = btrfs_bio_clone(dev->bdev, first_bio); - } else { - bio = first_bio; - bio_set_dev(bio, dev->bdev); - } + const 
bool should_clone = (dev_nr < total_devs - 1); - submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); + submit_stripe_bio(bioc, bio, dev_nr, should_clone); } +out_dec: btrfs_bio_counter_dec(fs_info); - return BLK_STS_OK; + return errno_to_blk_status(ret); } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, From 29658afe58b02f0a36634b4e020533473c00910a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 May 2022 09:36:42 +0200 Subject: [PATCH 0487/1250] btrfs: do not allocate a btrfs_bio for low-level bios The bios submitted from btrfs_map_bio don't really interact with the rest of btrfs and the only btrfs_bio member actually used in the low-level bios is the pointer to the btrfs_io_context used for endio handler. Use a union in struct btrfs_io_stripe that allows the endio handler to find the btrfs_io_context and remove the spurious ->device assignment so that a plain fs_bio_set bio can be used for the low-level bios allocated inside btrfs_map_bio. 
Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 13 ------------- fs/btrfs/extent_io.h | 1 - fs/btrfs/volumes.c | 19 +++++++++---------- fs/btrfs/volumes.h | 7 ++++++- 4 files changed, 15 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d7059a48cefcab..5efe25bbee5e0d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3203,19 +3203,6 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) return bio; } -struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio) -{ - struct btrfs_bio *bbio; - struct bio *new; - - /* Bio allocation backed by a bioset does not fail */ - new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(new); - btrfs_bio_init(bbio); - bbio->iter = bio->bi_iter; - return new; -} - struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 23d4103c883160..72966cf21961ef 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -247,7 +247,6 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1e06b7ee6a8151..7513e45c0c4273 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6661,23 +6661,21 @@ static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async) static void btrfs_end_bio(struct bio *bio) { - struct btrfs_io_context *bioc = bio->bi_private; + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = 
stripe->bioc; if (bio->bi_status) { atomic_inc(&bioc->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - struct btrfs_device *dev = btrfs_bio(bio)->device; - - ASSERT(dev->bdev); if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_FLUSH_ERRS); } } @@ -6709,14 +6707,15 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, } if (clone) { - bio = btrfs_bio_clone(dev->bdev, orig_bio); + bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set); } else { bio = orig_bio; bio_set_dev(bio, dev->bdev); + btrfs_bio(bio)->device = dev; } - bio->bi_private = bioc; - btrfs_bio(bio)->device = dev; + bioc->stripes[dev_nr].bioc = bioc; + bio->bi_private = &bioc->stripes[dev_nr]; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; /* diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 7973d11e5f5d89..a3c3a0d716bdca 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -412,7 +412,12 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) struct btrfs_io_stripe { struct btrfs_device *dev; - u64 physical; + union { + /* Block mapping */ + u64 physical; + /* For the endio handler */ + struct btrfs_io_context *bioc; + }; u64 length; /* only used for discard mappings */ }; From 240ad774e0a5410a6191524d67bdae55c2bb1677 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 31 May 2022 16:53:33 +0200 Subject: [PATCH 0488/1250] btrfs: replace kmap() with kmap_local_page() in inode.c The use of kmap() is being deprecated in favor of kmap_local_page() where it is feasible. 
With kmap_local_page(), the mapping is per thread, CPU local and not globally visible. Therefore, use kmap_local_page() / kunmap_local() in inode.c wherever the mappings are per thread and not globally visible. Tested on QEMU + KVM 32 bits VM with 4GB of RAM and HIGHMEM64G enabled. Suggested-by: Ira Weiny Reviewed-by: Christoph Hellwig Signed-off-by: Fabio M. De Francesco Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9cce0a3228f831..92dca0e0d20731 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10760,15 +10760,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = -ENOMEM; goto out_pages; } - kaddr = kmap(pages[i]); + kaddr = kmap_local_page(pages[i]); if (copy_from_iter(kaddr, bytes, from) != bytes) { - kunmap(pages[i]); + kunmap_local(kaddr); ret = -EFAULT; goto out_pages; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap(pages[i]); + kunmap_local(kaddr); } for (;;) { From 5ca381b8581e9017c4befa9644b02ffb9af18b03 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 31 May 2022 16:53:34 +0200 Subject: [PATCH 0489/1250] btrfs: replace kmap() with kmap_local_page() in lzo.c The use of kmap() is being deprecated in favor of kmap_local_page() where it is feasible. With kmap_local_page(), the mapping is per thread, CPU local and not globally visible. Therefore, use kmap_local_page() / kunmap_local() in lzo.c wherever the mappings are per thread and not globally visible. Tested on QEMU + KVM 32 bits VM with 4GB of RAM and HIGHMEM64G enabled. Suggested-by: Ira Weiny Reviewed-by: Christoph Hellwig Signed-off-by: Fabio M. 
De Francesco Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/lzo.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 430ad36b8b0802..89bc5f825e0a6a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -155,7 +155,7 @@ static int copy_compressed_data_to_page(char *compressed_data, out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -167,7 +167,7 @@ static int copy_compressed_data_to_page(char *compressed_data, u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, orig_out + compressed_size - *cur_out); - kunmap(cur_page); + kunmap_local(kaddr); if ((*cur_out / PAGE_SIZE) >= max_nr_page) return -E2BIG; @@ -180,7 +180,7 @@ static int copy_compressed_data_to_page(char *compressed_data, return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -202,7 +202,7 @@ static int copy_compressed_data_to_page(char *compressed_data, *cur_out += sector_bytes_left; out: - kunmap(cur_page); + kunmap_local(kaddr); return 0; } @@ -248,12 +248,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap(page_in); + data_in = kmap_local_page(page_in); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, workspace->mem); - kunmap(page_in); + kunmap_local(data_in); if (ret < 0) { pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; @@ -310,7 +310,6 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < 
orig_in + len) { - char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@ -318,11 +317,8 @@ static void copy_compressed_segment(struct compressed_bio *cb, ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; - kaddr = kmap(cur_page); - memcpy(dest + *cur_in - orig_in, - kaddr + offset_in_page(*cur_in), - copy_len); - kunmap(cur_page); + memcpy_from_page(dest + *cur_in - orig_in, cur_page, + offset_in_page(*cur_in), copy_len); *cur_in += copy_len; } @@ -342,9 +338,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap(cb->compressed_pages[0]); + kaddr = kmap_local_page(cb->compressed_pages[0]); len_in = read_compress_length(kaddr); - kunmap(cb->compressed_pages[0]); + kunmap_local(kaddr); cur_in += LZO_LEN; /* @@ -378,9 +374,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; ASSERT(cur_page); - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); - kunmap(cur_page); + kunmap_local(kaddr); cur_in += LZO_LEN; if (seg_len > WORKSPACE_CBUF_LENGTH) { From a469d3fcf29f573a996adadaa4e2dbed4bcb2b70 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 27 Jul 2020 20:59:20 +0200 Subject: [PATCH 0490/1250] btrfs: remove redundant check in up check_setget_bounds There are two separate checks in the bounds checker, the first one being a special case of the second. As this function is performance critical due to checking access to any eb member, reducing the size can slightly improve performance. On a release build on x86_64 the helper is completely inlined so the function call overhead is also gone. There was a report of 5% performance drop on metadata heavy workload, that disappeared after disabling asserts. 
The most significant part of that is the bounds checker. https://lore.kernel.org/linux-btrfs/20200724164147.39925-1-josef@toxicpanda.com/ After the analysis, the optimized code removes the worst overhead which is the function call and the performance was restored. https://lore.kernel.org/linux-btrfs/20200730110943.GE3703@twin.jikos.cz/ 1. baseline, asserts on, setget check on run time: 46s run time with perf: 48s 2. asserts on, comment out setget check run time: 44s run time with perf: 47s So this confirms the 5% difference 3. asserts on, optimized setget check run time: 44s run time with perf: 47s The optimizations are reducing the number of ifs to 1 and inlining the hot path. Low-level stuff, gets the performance back. Patch below. 4. asserts off, no setget check run time: 44s run time with perf: 45s This verifies that asserts other than the setget check have negligible impact on performance and it's not harmful to keep them on. Analysis where the performance is lost: * check_setget_bounds is a short function, but it's still a function call, changing the flow of instructions and given how many times it's called the overhead adds up * there are two conditions, one to check if the range is completely outside (member_offset > eb->len) or partially inside (member_offset + size > eb->len) Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/struct-funcs.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index f429256f56dba1..12455b2b41de89 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -12,15 +12,10 @@ static bool check_setget_bounds(const struct extent_buffer *eb, { const unsigned long member_offset = (unsigned long)ptr + off; - if (member_offset > eb->len) { + if (unlikely(member_offset + size > eb->len)) { btrfs_warn(eb->fs_info, - "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d", - (unsigned long)ptr, eb->start, 
member_offset, size); - return false; - } - if (member_offset + size > eb->len) { - btrfs_warn(eb->fs_info, - "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d", + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), (unsigned long)ptr, eb->start, member_offset, size); return false; } From d890dea60e14ca4b61827ed66e4d02538c115ee0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jun 2022 09:11:01 +0200 Subject: [PATCH 0491/1250] btrfs: don't use bio->bi_private to pass the inode to submit_one_bio submit_one_bio is only used for page cache I/O, so the inode can be trivially derived from the first page in the bio. Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5efe25bbee5e0d..361b18d136c0c3 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -181,10 +181,7 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, static void submit_one_bio(struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type) { - struct extent_io_tree *tree = bio->bi_private; - struct inode *inode = tree->private_data; - - bio->bi_private = NULL; + struct inode *inode = bio_first_page_all(bio)->mapping->host; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); @@ -3362,7 +3359,6 @@ static int alloc_new_bio(struct btrfs_inode *inode, bio_ctrl->bio = bio; bio_ctrl->compress_type = compress_type; bio->bi_end_io = end_io_func; - bio->bi_private = &inode->io_tree; bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) From f4e379009c87d78c10e0d24401fb35024cf2a8e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jun 2022 09:11:02 +0200 Subject: [PATCH 0492/1250] btrfs: merge 
end_write_bio and flush_write_bio Merge end_write_bio and flush_write_bio into a single submit_write_bio helper, that either submits the bio or ends it if a negative errno was passed in. This consolidates a lot of duplicated checks in the callers. Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 94 ++++++++++++++------------------------------ 1 file changed, 29 insertions(+), 65 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 361b18d136c0c3..4fb66a30dc14a7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -201,39 +201,26 @@ static void submit_one_bio(struct bio *bio, int mirror_num, */ } -/* Cleanup unsubmitted bios */ -static void end_write_bio(struct extent_page_data *epd, int ret) -{ - struct bio *bio = epd->bio_ctrl.bio; - - if (bio) { - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - epd->bio_ctrl.bio = NULL; - } -} - /* - * Submit bio from extent page data via submit_one_bio - * - * Return 0 if everything is OK. - * Return <0 for error. + * Submit or fail the current bio in an extent_page_data structure. */ -static void flush_write_bio(struct extent_page_data *epd) +static void submit_write_bio(struct extent_page_data *epd, int ret) { struct bio *bio = epd->bio_ctrl.bio; - if (bio) { + if (!bio) + return; + + if (ret) { + ASSERT(ret < 0); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + } else { submit_one_bio(bio, 0, 0); - /* - * Clean up of epd->bio is handled by its endio function. - * And endio is either triggered by successful bio execution - * or the error handler of submit bio hook. - * So at this point, no matter what happened, we don't need - * to clean up epd->bio. 
- */ - epd->bio_ctrl.bio = NULL; } + + /* The bio is owned by the bi_end_io handler now */ + epd->bio_ctrl.bio = NULL; } int __init extent_state_cache_init(void) @@ -4251,7 +4238,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - flush_write_bio(epd); + submit_write_bio(epd, 0); flush = 1; btrfs_tree_lock(eb); } @@ -4261,7 +4248,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!epd->sync_io) return 0; if (!flush) { - flush_write_bio(epd); + submit_write_bio(epd, 0); flush = 1; } while (1) { @@ -4308,7 +4295,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - flush_write_bio(epd); + submit_write_bio(epd, 0); flush = 1; } lock_page(p); @@ -4724,7 +4711,7 @@ static int submit_eb_subpage(struct page *page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - end_write_bio(epd, ret); + submit_write_bio(epd, ret); return ret; } @@ -4903,10 +4890,6 @@ int btree_write_cache_pages(struct address_space *mapping, index = 0; goto retry; } - if (ret < 0) { - end_write_bio(&epd, ret); - goto out; - } /* * If something went wrong, don't allow any metadata write bio to be * submitted. @@ -4933,21 +4916,17 @@ int btree_write_cache_pages(struct address_space *mapping, * Now such dirty tree block will not be cleaned by any dirty * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. - */ - if (!BTRFS_FS_ERROR(fs_info)) { - flush_write_bio(&epd); - } else { - ret = -EROFS; - end_write_bio(&epd, ret); - } -out: - btrfs_zoned_meta_io_unlock(fs_info); - /* + * * We can get ret > 0 from submit_extent_page() indicating how many ebs * were submitted. Reset it to 0 to avoid false alerts for the caller. 
*/ if (ret > 0) ret = 0; + if (!ret && BTRFS_FS_ERROR(fs_info)) + ret = -EROFS; + submit_write_bio(&epd, ret); + + btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -5049,7 +5028,7 @@ static int extent_write_cache_pages(struct address_space *mapping, * tmpfs file mapping */ if (!trylock_page(page)) { - flush_write_bio(epd); + submit_write_bio(epd, 0); lock_page(page); } @@ -5060,7 +5039,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) - flush_write_bio(epd); + submit_write_bio(epd, 0); wait_on_page_writeback(page); } @@ -5100,7 +5079,7 @@ static int extent_write_cache_pages(struct address_space *mapping, * page in our current bio, and thus deadlock, so flush the * write bio here. */ - flush_write_bio(epd); + submit_write_bio(epd, 0); goto retry; } @@ -5121,13 +5100,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) }; ret = __extent_writepage(page, wbc, &epd); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - - flush_write_bio(&epd); + submit_write_bio(&epd, ret); return ret; } @@ -5188,10 +5161,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - if (!found_error) - flush_write_bio(&epd); - else - end_write_bio(&epd, ret); + submit_write_bio(&epd, found_error ? 
ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -5216,13 +5186,7 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); - ASSERT(ret <= 0); - if (ret < 0) { - btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); - end_write_bio(&epd, ret); - return ret; - } - flush_write_bio(&epd); + submit_write_bio(&epd, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } From 40d3c82624c95a1fbd4405435bf3cd2e95f804fb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jun 2022 09:11:03 +0200 Subject: [PATCH 0493/1250] btrfs: pass the btrfs_bio_ctrl to submit_one_bio submit_one_bio always works on the bio and compression flags from a btrfs_bio_ctrl structure. Pass it explicitly and clean up the calling conventions by handling a NULL bio in submit_one_bio, and using the btrfs_bio_ctrl to pass the mirror number as well. Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 85 ++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4fb66a30dc14a7..3395474d51a3f7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -144,6 +144,7 @@ struct tree_entry { */ struct btrfs_bio_ctrl { struct bio *bio; + int mirror_num; enum btrfs_compression_type compress_type; u32 len_to_stripe_boundary; u32 len_to_oe_boundary; @@ -178,10 +179,18 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, return ret; } -static void submit_one_bio(struct bio *bio, int mirror_num, - enum btrfs_compression_type compress_type) +static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { - struct inode *inode = bio_first_page_all(bio)->mapping->host; + struct bio *bio; + struct inode *inode; + int mirror_num; + + if (!bio_ctrl->bio) + return; + + bio = bio_ctrl->bio; + 
inode = bio_first_page_all(bio)->mapping->host; + mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); @@ -191,14 +200,11 @@ static void submit_one_bio(struct bio *bio, int mirror_num, else if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_submit_data_write_bio(inode, bio, mirror_num); else - btrfs_submit_data_read_bio(inode, bio, mirror_num, compress_type); + btrfs_submit_data_read_bio(inode, bio, mirror_num, + bio_ctrl->compress_type); - /* - * Above submission hooks will handle the error by ending the bio, - * which will do the cleanup properly. So here we should not return - * any error, or the caller of submit_extent_page() will do cleanup - * again, causing problems. - */ + /* The bio is owned by the bi_end_io handler now */ + bio_ctrl->bio = NULL; } /* @@ -215,12 +221,11 @@ static void submit_write_bio(struct extent_page_data *epd, int ret) ASSERT(ret < 0); bio->bi_status = errno_to_blk_status(ret); bio_endio(bio); + /* The bio is owned by the bi_end_io handler now */ + epd->bio_ctrl.bio = NULL; } else { - submit_one_bio(bio, 0, 0); + submit_one_bio(&epd->bio_ctrl); } - - /* The bio is owned by the bi_end_io handler now */ - epd->bio_ctrl.bio = NULL; } int __init extent_state_cache_init(void) @@ -3410,7 +3415,6 @@ static int submit_extent_page(unsigned int opf, struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, bio_end_io_t end_io_func, - int mirror_num, enum btrfs_compression_type compress_type, bool force_bio_submit) { @@ -3422,10 +3426,8 @@ static int submit_extent_page(unsigned int opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); - if (force_bio_submit && bio_ctrl->bio) { - submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); - bio_ctrl->bio = NULL; - } + if (force_bio_submit) + submit_one_bio(bio_ctrl); while (cur < pg_offset + size) { u32 offset = cur - pg_offset; @@ -3465,8 +3467,7 @@ static int 
submit_extent_page(unsigned int opf, if (added < size - offset) { /* The bio should contain some page(s) */ ASSERT(bio_ctrl->bio->bi_iter.bi_size); - submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); - bio_ctrl->bio = NULL; + submit_one_bio(bio_ctrl); } cur += added; } @@ -3743,10 +3744,8 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, bio_ctrl, page, disk_bytenr, iosize, - pg_offset, - end_bio_extent_readpage, 0, - this_bio_flag, - force_bio_submit); + pg_offset, end_bio_extent_readpage, + this_bio_flag, force_bio_submit); if (ret) { /* * We have to unlock the remaining range, or the page @@ -3779,8 +3778,7 @@ int btrfs_read_folio(struct file *file, struct folio *folio) * If btrfs_do_readpage() failed we will want to submit the assembled * bio to do the cleanup. */ - if (bio_ctrl.bio) - submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); + submit_one_bio(&bio_ctrl); return ret; } @@ -4063,7 +4061,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, disk_bytenr, iosize, cur - page_offset(page), end_bio_extent_writepage, - 0, 0, false); + 0, false); if (ret) { has_error = true; if (!saved_ret) @@ -4556,7 +4554,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, &epd->bio_ctrl, page, eb->start, eb->len, eb->start - page_offset(page), - end_bio_subpage_eb_writepage, 0, 0, false); + end_bio_subpage_eb_writepage, 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4597,7 +4595,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, &epd->bio_ctrl, p, disk_bytenr, PAGE_SIZE, 0, end_bio_extent_buffer_writepage, - 0, 0, false); + 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -5209,9 +5207,7 @@ void extent_readahead(struct readahead_control *rac) if 
(em_cached) free_extent_map(em_cached); - - if (bio_ctrl.bio) - submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); + submit_one_bio(&bio_ctrl); } /* @@ -6545,7 +6541,9 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; int ret = 0; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -6580,8 +6578,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, eb->start, eb->len, eb->start - page_offset(page), - end_bio_extent_readpage, mirror_num, 0, - true); + end_bio_extent_readpage, 0, true); if (ret) { /* * In the endio function, if we hit something wrong we will @@ -6590,10 +6587,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - if (bio_ctrl.bio) { - submit_one_bio(bio_ctrl.bio, mirror_num, 0); - bio_ctrl.bio = NULL; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -6613,7 +6607,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -6687,7 +6683,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) err = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, page_offset(page), PAGE_SIZE, 0, end_bio_extent_readpage, - mirror_num, 0, false); + 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6704,10 +6700,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } - if (bio_ctrl.bio) { 
- submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type); - bio_ctrl.bio = NULL; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; From c4e618475aad149139d4857be7f15fe7690f527c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 7 Jun 2022 19:50:59 +0800 Subject: [PATCH 0494/1250] btrfs: make btrfs_super_block::log_root_transid deprecated When using "btrfs inspect-internal dump-super" to inspect an fs with dirty log, it always shows the log_root_transid as 0: log_root 30474240 log_root_transid 0 <<< log_root_level 0 It turns out that, btrfs_super_block::log_root_transid is never really utilized (even no read for it). This can date back to the introduction of btrfs into upstream kernel. In fact, when reading log tree root, we always use btrfs_super_block::generation + 1 as the expected generation. So here we're completely safe to mark this member deprecated. In theory we can easily reuse this member for other purposes, but to be extra safe, here we follow the leafsize way, by adding "__unused_" for log_root_transid. And we can safely remove the accessors, since there is no such callers from the very beginning. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 22a287cbc3e7c4..6ae9a95794644d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -248,8 +248,12 @@ struct btrfs_super_block { __le64 chunk_root; __le64 log_root; - /* this will help find the new super based on the log root */ - __le64 log_root_transid; + /* + * This member has never been utilized since the very beginning, thus + * it's always 0 regardless of kernel version. We always use + * generation + 1 to read log tree root. So here we mark it deprecated. 
+ */ + __le64 __unused_log_root_transid; __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; @@ -2475,8 +2479,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, - log_root_transid, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, From 8cfb0a99fff6ce1ccda384d7c3c74ead5692d96b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 7 Jun 2022 19:48:24 +0800 Subject: [PATCH 0495/1250] btrfs: reject log replay if there is unsupported RO compat flag [BUG] If we have a btrfs image with dirty log, along with an unsupported RO compatible flag: log_root 30474240 ... compat_flags 0x0 compat_ro_flags 0x40000003 ( FREE_SPACE_TREE | FREE_SPACE_TREE_VALID | unknown flag: 0x40000000 ) Then even if we can only mount it RO, we will still cause metadata update for log replay: BTRFS info (device dm-1): flagging fs with big metadata feature BTRFS info (device dm-1): using free space tree BTRFS info (device dm-1): has skinny extents BTRFS info (device dm-1): start tree-log replay This is definitely against RO compat flag requirement. [CAUSE] RO compat flag only forces us to do RO mount, but we will still do log replay for plain RO mount. Thus this results in us doing log replay and updating metadata. This can be very problematic for new RO compat flag, for example older kernel can not understand v2 cache, and if we allow metadata updates on RO mount, we can invalidate/corrupt the v2 cache. 
[FIX] Just reject the mount unless rescue=nologreplay is provided: BTRFS error (device dm-1): cannot replay dirty log with unsupport optional features (0x40000000), try rescue=nologreplay instead We don't want to set rescue=nologreplay directly, as this would make the end user read the old data, and cause confusion. Since such a case is really rare, we're mostly fine to just reject the mount with an error message, which also includes the proper workaround. CC: stable@vger.kernel.org #4.9+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ed1d92b370db09..32b88a2277340c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3556,6 +3556,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device err = -EINVAL; goto fail_alloc; } + /* + * We have unsupported RO compat features, although RO mounted, we + * should not cause any metadata write, including log replay. + * Or we could screw up whatever the new feature requires. + */ + if (unlikely(features && btrfs_super_log_root(disk_super) && + !btrfs_test_opt(fs_info, NOLOGREPLAY))) { + btrfs_err(fs_info, +"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", + features); + err = -EINVAL; + goto fail_alloc; + } + if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; From 3fd0e380f6b6bb247c078ffae146d3f96b23ee5b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jun 2022 08:57:42 +0200 Subject: [PATCH 0496/1250] btrfs: stop looking at btrfs_bio->iter in index_one_bio All the bios that index_one_bio operates on are the bios submitted by the upper layer. These are never resubmitted to an actual device by the raid56 code, and thus the iter never changes from the initial state. 
Thus we can always just use bi_iter directly as it will be the same as the saved copy. Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 00cd9e8db7ae0b..3c58869779375b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1106,9 +1106,6 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->raid_map[0]; - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_bio(bio)->iter; - bio_for_each_segment(bvec, bio, iter) { u32 bvec_offset; From c5de48d916ed2c36629dcf409bf18320cacdc94c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 3 Jun 2022 08:57:25 +0200 Subject: [PATCH 0497/1250] btrfs: split discard handling out of btrfs_map_block Mapping block for discard doesn't really share any code with the regular block mapping case. Split it out into an entirely separate helper that just returns an array of btrfs_discard_stripe structures and the number of stripes. This removes the need for the length field in the btrfs_io_stripe structure, so remove that.
Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 72 ++++++++++++++++-------------------------- fs/btrfs/volumes.c | 67 ++++++++++++++++----------------------- fs/btrfs/volumes.h | 10 +++++- 3 files changed, 64 insertions(+), 85 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a3afc15430cead..91d2c20c7c8a6f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1269,7 +1269,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } -static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) +static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) { struct btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; @@ -1316,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; - struct btrfs_io_context *bioc = NULL; /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are discarding. + * Avoid races with device replace and make sure the devices in the + * stripes don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { - struct btrfs_io_stripe *stripe; + struct btrfs_discard_stripe *stripes; + unsigned int num_stripes; int i; num_bytes = end - cur; - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, - &num_bytes, &bioc, 0); - /* - * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or - * -EOPNOTSUPP. For any such error, @num_bytes is not updated, - * thus we can't continue anyway. 
- */ - if (ret < 0) - goto out; + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); + if (IS_ERR(stripes)) { + ret = PTR_ERR(stripes); + if (ret == -EOPNOTSUPP) + ret = 0; + break; + } - stripe = bioc->stripes; - for (i = 0; i < bioc->num_stripes; i++, stripe++) { + for (i = 0; i < num_stripes; i++) { + struct btrfs_discard_stripe *stripe = stripes + i; u64 bytes; - struct btrfs_device *device = stripe->dev; - if (!device->bdev) { + if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &stripe->dev->dev_state)) continue; ret = do_discard_extent(stripe, &bytes); - if (!ret) { - discarded_bytes += bytes; - } else if (ret != -EOPNOTSUPP) { + if (ret) { /* - * Logic errors or -ENOMEM, or -EIO, but - * unlikely to happen. - * - * And since there are two loops, explicitly - * go to out to avoid confusion. + * Keep going if discard is not supported by the + * device. */ - btrfs_put_bioc(bioc); - goto out; + if (ret != -EOPNOTSUPP) + break; + ret = 0; + } else { + discarded_bytes += bytes; } - - /* - * Just in case we get back EOPNOTSUPP for some reason, - * just ignore the return value so we don't screw up - * people calling discard_extent. - */ - ret = 0; } - btrfs_put_bioc(bioc); + kfree(stripes); + if (ret) + break; cur += num_bytes; } -out: btrfs_bio_counter_dec(fs_info); - if (actual_bytes) *actual_bytes = discarded_bytes; - - - if (ret == -EOPNOTSUPP) - ret = 0; return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7513e45c0c4273..12a6150ee19d29 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5913,18 +5913,17 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc) kfree(bioc); } -/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ /* * Please note that, discard won't be sent to target device of device * replace. 
*/ -static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 *length_ret, - struct btrfs_io_context **bioc_ret) +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes) { struct extent_map *em; struct map_lookup *map; - struct btrfs_io_context *bioc; + struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; u64 stripe_nr; @@ -5933,29 +5932,26 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, u64 stripe_cnt; u64 stripe_len; u64 stripe_offset; - u64 num_stripes; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; - int ret = 0; + int ret; int i; - /* Discard always returns a bioc. */ - ASSERT(bioc_ret); - em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) - return PTR_ERR(em); + return ERR_CAST(em); map = em->map_lookup; + /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; - goto out; - } + goto out_free_map; +} offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); @@ -5981,7 +5977,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - num_stripes = 1; + *num_stripes = 1; stripe_index = 0; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { @@ -5991,7 +5987,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, sub_stripes = map->sub_stripes; factor = map->num_stripes / sub_stripes; - num_stripes = min_t(u64, map->num_stripes, + *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= sub_stripes; @@ -6001,31 +5997,30 @@ static int __btrfs_map_block_for_discard(struct 
btrfs_fs_info *fs_info, last_stripe *= sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = map->num_stripes; + *num_stripes = map->num_stripes; } else { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); } - bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); - if (!bioc) { + stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); + if (!stripes) { ret = -ENOMEM; - goto out; + goto out_free_map; } - for (i = 0; i < num_stripes; i++) { - bioc->stripes[i].physical = + for (i = 0; i < *num_stripes; i++) { + stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bioc->stripes[i].dev = map->stripes[stripe_index].dev; + stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - bioc->stripes[i].length = stripes_per_dev * - map->stripe_len; + stripes[i].length = stripes_per_dev * map->stripe_len; if (i / sub_stripes < remaining_stripes) - bioc->stripes[i].length += map->stripe_len; + stripes[i].length += map->stripe_len; /* * Special for the first stripe and @@ -6036,17 +6031,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * off end_off */ if (i < sub_stripes) - bioc->stripes[i].length -= stripe_offset; + stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) - bioc->stripes[i].length -= stripe_end_offset; + stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { - bioc->stripes[i].length = length; + stripes[i].length = length; } stripe_index++; @@ -6056,12 +6051,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } } - *bioc_ret = bioc; - bioc->map_type = map->type; - bioc->num_stripes = num_stripes; -out: free_extent_map(em); - return ret; + return stripes; +out_free_map: + free_extent_map(em); + return ERR_PTR(ret); 
} /* @@ -6204,7 +6198,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + i; new->physical = old->physical; - new->length = old->length; new->dev = dev_replace->tgtdev; bioc->tgtdev_map[i] = index_where_to_add; index_where_to_add++; @@ -6245,8 +6238,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + num_stripes; tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bioc->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; bioc->tgtdev_map[index_srcdev] = num_stripes; @@ -6600,10 +6591,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, int mirror_num) { - if (op == BTRFS_MAP_DISCARD) - return __btrfs_map_block_for_discard(fs_info, logical, - length, bioc_ret); - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, mirror_num, 0); } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index a3c3a0d716bdca..588367c76c4630 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -418,7 +418,12 @@ struct btrfs_io_stripe { /* For the endio handler */ struct btrfs_io_context *bioc; }; - u64 length; /* only used for discard mappings */ +}; + +struct btrfs_discard_stripe { + struct btrfs_device *dev; + u64 physical; + u64 length; }; /* @@ -557,6 +562,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); From 4448529b3427ea0189e486f46402e399dba79bcd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 6 Jun 2022 18:36:35 +0200 Subject: 
[PATCH 0498/1250] btrfs: sysfs: advertise zoned support among features We've hidden the zoned support in sysfs under debug config for the first releases but now the stability is reasonable, though not all features have been implemented. Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a536091c3f7608..db3736de14a5f3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -289,9 +289,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); -#ifdef CONFIG_BTRFS_DEBUG -/* Remove once support for zoned allocation is feature complete */ +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +#endif +#ifdef CONFIG_BTRFS_DEBUG /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); #endif @@ -320,8 +321,10 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), +#endif +#ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(extent_tree_v2), #endif #ifdef CONFIG_FS_VERITY From 1a2b6428ae6c64c6176b80d99db5d5b1b626af25 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 9 Jun 2022 09:28:04 -0700 Subject: [PATCH 0499/1250] btrfs: add tracepoints for ordered extents When debugging a reference counting issue with ordered extents, I've found we're lacking a lot of tracepoint coverage in the ordered extent code. Close these gaps by adding tracepoints after every refcount_inc() in the ordered extent code. 
Reviewed-by: Boris Burkov Reviewed-by: Qu Wenruo Reviewed-by: Anand Jain Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 19 +++++++++-- include/trace/events/btrfs.h | 64 ++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index dc88d2b3721fdf..41b3bc44c92b23 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -401,6 +401,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); cond_wake_up(&entry->wait); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_mark_finished(inode, entry); spin_unlock_irqrestore(&tree->lock, flags); btrfs_init_work(&entry->work, finish_func, NULL, NULL); btrfs_queue_work(wq, &entry->work); @@ -473,6 +474,7 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, if (finished && cached && entry) { *cached = entry; refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } spin_unlock_irqrestore(&tree->lock, flags); return finished; @@ -807,8 +809,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) entry = NULL; - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup(inode, entry); + } out: spin_unlock_irqrestore(&tree->lock, flags); return entry; @@ -848,8 +852,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( break; } out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_range(inode, entry); + } spin_unlock_irq(&tree->lock); return entry; } @@ -878,6 +884,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, ASSERT(list_empty(&ordered->log_list)); list_add_tail(&ordered->log_list, 
list); refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } spin_unlock_irq(&tree->lock); } @@ -901,6 +908,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first(inode, entry); out: spin_unlock_irq(&tree->lock); return entry; @@ -975,8 +983,11 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( /* No ordered extent in the range */ entry = NULL; out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first_range(inode, entry); + } + spin_unlock_irq(&tree->lock); return entry; } @@ -1055,6 +1066,8 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret = 0; + trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + spin_lock_irq(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 29fa8ea2cc0f6c..73df80d462dc83 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -598,6 +598,70 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, TP_ARGS(inode, ordered) ); +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup, + + TP_PROTO(const struct btrfs_inode *inode, + const struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_range, + + TP_PROTO(const struct btrfs_inode *inode, + const struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first_range, + + TP_PROTO(const struct btrfs_inode *inode, + const struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, 
btrfs_ordered_extent_lookup_for_logging, + + TP_PROTO(const struct btrfs_inode *inode, + const struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_lookup_first, + + TP_PROTO(const struct btrfs_inode *inode, + const struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_split, + + TP_PROTO(const struct btrfs_inode *inode, + const struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_dec_test_pending, + + TP_PROTO(const struct btrfs_inode *inode, + const struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + +DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_mark_finished, + + TP_PROTO(const struct btrfs_inode *inode, + const struct btrfs_ordered_extent *ordered), + + TP_ARGS(inode, ordered) +); + DECLARE_EVENT_CLASS(btrfs__writepage, TP_PROTO(const struct page *page, const struct inode *inode, From 39e49fbc4ae420ec7242ef090067b0ceb435b3d5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 13 Jun 2022 15:09:48 -0400 Subject: [PATCH 0500/1250] btrfs: tree-log: make the return value for log syncing consistent Currently we will return 1 or -EAGAIN if we decide we need to commit the transaction rather than sync the log. In practice this doesn't really matter, we interpret any !0 and !BTRFS_NO_LOG_SYNC as needing to commit the transaction. However this makes it hard to figure out what the correct thing to do is. Fix this up by defining BTRFS_LOG_FORCE_COMMIT and using this in all the places where we want to force the transaction to be committed. 
CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 18 +++++++++--------- fs/btrfs/tree-log.h | 3 +++ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9dfde1af8a64a1..89c6d7ff19874c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2308,7 +2308,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } /* we've logged all the items and now have a consistent diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1201f083d4dbc5..d898ba13285fb9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -171,7 +171,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, int index = (root->log_transid + 1) % 2; if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -194,7 +194,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, * writing. 
*/ if (zoned && !created) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -3121,7 +3121,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; mutex_unlock(&root->log_mutex); goto out; } @@ -3222,7 +3222,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -3261,7 +3261,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_wake_log_root; } @@ -5848,7 +5848,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, inode_only == LOG_INODE_ALL && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } @@ -6562,12 +6562,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, bool log_dentries = false; if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } if (btrfs_root_refs(&root->root_item) == 0) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } @@ -6665,7 +6665,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } if (ret) diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 1620f8170629e4..57ab5f3b8dc77e 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -12,6 +12,9 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +/* We can't use the tree log for whatever reason, force a transaction commit */ +#define 
BTRFS_LOG_FORCE_COMMIT (1) + struct btrfs_log_ctx { int log_ret; int log_transid; From c62ae419019288e33e1605ef14c6ba448f1dd369 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 2 Jun 2022 15:51:18 +0800 Subject: [PATCH 0501/1250] btrfs: raid56: avoid double for loop inside finish_rmw() We can easily calculate the stripe number and sector number inside the stripe. Thus there is not much need for a double for loop. For the only case we want to skip the whole stripe, we can manually increase @total_sector_nr. This is not a recommended behavior, thus every time the iterator gets modified there will be a comment along with an ASSERT() for it. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 97 +++++++++++++++++++++++++++-------------------- 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3c58869779375b..c63845c036dff1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1182,7 +1182,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; + /* The total sector number inside the full stripe. */ + int total_sector_nr; int stripe; + /* Sector number inside a stripe. */ int sectornr; bool has_qstripe; struct bio_list bio_list; @@ -1267,63 +1270,73 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } /* - * time to start writing. Make bios for everything from the - * higher layers (the bio_list in our rbio) and our p/q. Ignore - * everything else. + * Start writing. Make bios for everything from the higher layers (the + * bio_list in our rbio) and our P/Q. Ignore everything else. 
*/ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; - /* This vertical stripe has no data, skip it. */ - if (!test_bit(sectornr, &rbio->dbitmap)) - continue; + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) - continue; - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, - REQ_OP_WRITE); - if (ret) - goto cleanup; + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, rbio->stripe_len, + REQ_OP_WRITE); + if (ret) + goto cleanup; } if (likely(!bioc->num_tgtdevs)) goto write_data; - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bioc->tgtdev_map[stripe]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - /* This vertical stripe has no data, skip it. */ - if (!test_bit(sectornr, &rbio->dbitmap)) - continue; + if (!bioc->tgtdev_map[stripe]) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. 
+ */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; + continue; + } - if (stripe < rbio->nr_data) { - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (!sector) - continue; - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - rbio->bioc->tgtdev_map[stripe], - sectornr, rbio->stripe_len, - REQ_OP_WRITE); - if (ret) - goto cleanup; + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, + rbio->bioc->tgtdev_map[stripe], + sectornr, rbio->stripe_len, + REQ_OP_WRITE); + if (ret) + goto cleanup; } write_data: From 095d914af209434318075db3a8f5b47b4b62f204 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 2 Jun 2022 15:51:19 +0800 Subject: [PATCH 0502/1250] btrfs: raid56: avoid double for loop inside __raid56_parity_recover() The double for loop can be easily converted to single for loop as we're really iterating the sectors in their bytenr order. The only exception is the full stripe skip, however that can also easily be done inside the loop. Add an ASSERT() along with a comment for that specific case. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c63845c036dff1..ae4556b9806048 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2115,8 +2115,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2132,29 +2131,29 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * stripe cache, it is possible that some or all of these * pages are going to be uptodate. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + struct sector_ptr *sector; + if (rbio->faila == stripe || rbio->failb == stripe) { atomic_inc(&rbio->error); + /* Skip the current stripe. */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; continue; } + /* The RMW code may have already read this page in. 
*/ + sector = rbio_stripe_sector(rbio, stripe, sectornr); + if (sector->uptodate) + continue; - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* - * the rmw code may have already read this - * page in - */ - sector = rbio_stripe_sector(rbio, stripe, sectornr); - if (sector->uptodate) - continue; - - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret < 0) - goto cleanup; - } + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, rbio->stripe_len, + REQ_OP_READ); + if (ret < 0) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); From 238c88456c7834f219751399402e54fb0ec3f786 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Jun 2022 08:34:34 +0800 Subject: [PATCH 0503/1250] btrfs: raid56: avoid double for loop inside alloc_rbio_essential_pages() The double loop is just checking if the page for the vertical stripe is allocated. We can easily convert it to single loop and get rid of @stripe variable. 
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index ae4556b9806048..41cdeff63a6b78 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2380,23 +2380,22 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int stripe; - int sectornr; - - for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - struct page *page; - int index = (stripe * rbio->stripe_nsectors + sectornr) * - sectorsize >> PAGE_SHIFT; + int total_sector_nr; - if (rbio->stripe_pages[index]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct page *page; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[index] = page; - } + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (rbio->stripe_pages[index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; } index_stripe_sectors(rbio); return 0; From d4a3bc9b94a1a67ddf71c62176a63c72641d3908 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Jun 2022 08:34:35 +0800 Subject: [PATCH 0504/1250] btrfs: raid56: avoid double for loop inside raid56_rmw_stripe() This function doesn't even utilize full stripe skip, so just iterating over all the data sectors is definitely enough.
Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 59 ++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 41cdeff63a6b78..7ddcac96e844df 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1547,9 +1547,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; struct bio_list bio_list; + const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -1561,38 +1561,35 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + /* Build a list of bios to read all the missing data sectors. */ + for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - /* - * We want to find all the sectors missing from the - * rbio and read them from the disk. If * sector_in_rbio() - * finds a page in the bio list we don't need to read - * it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a page + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate page. 
- * If so, be happy and use it. - */ - if (sector->uptodate) - continue; + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate page. If so, + * use it. + */ + if (sector->uptodate) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret) - goto cleanup; - } + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, rbio->stripe_len, + REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); From 76b8d914e8e77ebd319afab480c6c8685831abd2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 8 Jun 2022 08:34:36 +0800 Subject: [PATCH 0505/1250] btrfs: raid56: avoid double for loop inside raid56_parity_scrub_stripe() Originally it's iterating all the sectors which have a dbitmap bit set for the vertical stripe. It can be easily converted to sector bytenr iteration with a test_bit() call. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 62 +++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 7ddcac96e844df..f002334d244a77 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2661,8 +2661,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int sectornr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2672,37 +2671,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) goto cleanup; atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { - struct sector_ptr *sector; - /* - * We want to find all the sectors missing from the - * rbio and read
them from the disk. If * sector_in_rbio() - * finds a sector in the bio list we don't need to read - * it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + /* Build a list of bios to read all the missing parts. */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int stripe = total_sector_nr / rbio->stripe_nsectors; + struct sector_ptr *sector; - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate sector. - * If so, be happy and use it. - */ - if (sector->uptodate) - continue; + /* No data in the vertical stripe, no need to read. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); - if (ret) - goto cleanup; - } + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a sector + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; + + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate sector. If so, + * use it. + */ + if (sector->uptodate) + continue; + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, rbio->stripe_len, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); From e783db6c25cd8e8459986681bec1d7db2cb11f19 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 16:49:48 +0200 Subject: [PATCH 0506/1250] btrfs: open code rbtree search in split_state Preparatory work to remove tree_insert from extent_io.c, the rbtree search loop is a known and simple pattern so it can be open coded.
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3395474d51a3f7..4b3004874a964d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -607,7 +607,8 @@ static int insert_state(struct extent_io_tree *tree, static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct extent_state *prealloc, u64 split) { - struct rb_node *node; + struct rb_node *parent = NULL; + struct rb_node **node; if (tree->private_data && is_data_inode(tree->private_data)) btrfs_split_delalloc_extent(tree->private_data, orig, split); @@ -617,12 +618,27 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, - &prealloc->rb_node, NULL, NULL); - if (node) { - free_extent_state(prealloc); - return -EEXIST; + parent = &orig->rb_node; + node = &parent; + while (*node) { + struct tree_entry *entry; + + parent = *node; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (prealloc->end < entry->start) { + node = &(*node)->rb_left; + } else if (prealloc->end > entry->end) { + node = &(*node)->rb_right; + } else { + free_extent_state(prealloc); + return -EEXIST; + } } + + rb_link_node(&prealloc->rb_node, parent, node); + rb_insert_color(&prealloc->rb_node, &tree->state); + return 0; } From 02a40f1caa64530c9a4aebc6f38912a24c8d53fb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 17:14:17 +0200 Subject: [PATCH 0507/1250] btrfs: open code rbtree search in insert_state The rbtree search is a known pattern and can be open coded, allowing to remove the tree_insert and further cleanups. 
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 80 ++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4b3004874a964d..5e0d5a6ae6e6f9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -368,42 +368,6 @@ void free_extent_state(struct extent_state *state) } } -static struct rb_node *tree_insert(struct rb_root *root, - struct rb_node *search_start, - u64 offset, - struct rb_node *node, - struct rb_node ***p_in, - struct rb_node **parent_in) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - if (p_in && parent_in) { - p = *p_in; - parent = *parent_in; - goto do_insert; - } - - p = search_start ? &search_start : &root->rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (offset < entry->start) - p = &(*p)->rb_left; - else if (offset > entry->end) - p = &(*p)->rb_right; - else - return parent; - } - -do_insert: - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - /** * Search @tree for an entry that contains @offset. Such entry would have * entry->start <= offset && entry->end >= offset. 
@@ -561,11 +525,12 @@ static void set_state_bits(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, u64 start, u64 end, - struct rb_node ***p, - struct rb_node **parent, + struct rb_node ***node_in, + struct rb_node **parent_in, u32 *bits, struct extent_changeset *changeset) { - struct rb_node *node; + struct rb_node **node; + struct rb_node *parent; if (end < start) { btrfs_err(tree->fs_info, @@ -577,15 +542,36 @@ static int insert_state(struct extent_io_tree *tree, set_state_bits(tree, state, bits, changeset); - node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - found->start, found->end, start, end); - return -EEXIST; + /* Caller provides the exact tree location */ + if (node_in && parent_in) { + node = *node_in; + parent = *parent_in; + goto insert_new; } + + node = &tree->state.rb_node; + while (*node) { + struct tree_entry *entry; + + parent = *node; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (end < entry->start) { + node = &(*node)->rb_left; + } else if (end > entry->end) { + node = &(*node)->rb_right; + } else { + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", + entry->start, entry->end, start, end); + return -EEXIST; + } + } + +insert_new: + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); return 0; } From f7c415e47111a53ac180c281911941001adf5832 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 17:18:24 +0200 Subject: [PATCH 0508/1250] btrfs: lift start and end parameters to callers of insert_state Let callers of insert_state set up the extent state to allow further simplifications of the parameters.
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5e0d5a6ae6e6f9..5b67e899f05a9d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -524,21 +524,14 @@ static void set_state_bits(struct extent_io_tree *tree, * probably isn't what you want to call (see set/clear_extent_bit). */ static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, u64 start, u64 end, + struct extent_state *state, struct rb_node ***node_in, struct rb_node **parent_in, u32 *bits, struct extent_changeset *changeset) { struct rb_node **node; struct rb_node *parent; - - if (end < start) { - btrfs_err(tree->fs_info, - "insert state: end < start %llu %llu", end, start); - WARN_ON(1); - } - state->start = start; - state->end = end; + const u64 end = state->end; set_state_bits(tree, state, bits, changeset); @@ -563,7 +556,7 @@ static int insert_state(struct extent_io_tree *tree, } else { btrfs_err(tree->fs_info, "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, start, end); + entry->start, entry->end, state->start, end); return -EEXIST; } } @@ -1027,8 +1020,9 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, if (!node) { prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, changeset); + prealloc->start = start; + prealloc->end = end; + err = insert_state(tree, prealloc, &p, &parent, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1144,8 +1138,9 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, * Avoid to free 'prealloc' if it can be merged with * the later extent. 
*/ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, changeset); + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, NULL, NULL, &bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1268,8 +1263,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, err = -ENOMEM; goto out; } - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, NULL); + prealloc->start = start; + prealloc->end = end; + err = insert_state(tree, prealloc, &p, &parent, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1366,8 +1362,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, NULL); + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, NULL, NULL, &bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); From 8d22613ea91d386770b9f5b90be908d7c8b0b4dc Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 17:54:54 +0200 Subject: [PATCH 0509/1250] btrfs: pass bits by value not by pointer for extent_state helpers The bits are passed to all extent state helpers for no apparent reason, the value only read and never updated so remove the indirection and pass it directly. Also unify the type to u32 where needed. 
Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/extent_io.c | 46 +++++++++++++++++++++----------------------- fs/btrfs/inode.c | 24 +++++++++++------------ 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6ae9a95794644d..e5f19b49efa804 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3318,9 +3318,9 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, struct inode *dir); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits); + u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, unsigned *bits); + struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5b67e899f05a9d..20ba7c272e2db8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -510,7 +510,7 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, u32 *bits, + struct extent_state *state, u32 bits, struct extent_changeset *changeset); /* @@ -527,7 +527,7 @@ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, struct rb_node ***node_in, struct rb_node **parent_in, - u32 *bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { struct rb_node **node; struct rb_node *parent; @@ -639,11 +639,11 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - u32 *bits, int wake, + u32 bits, int wake, struct extent_changeset *changeset) { struct extent_state *next; - u32 bits_to_clear = 
*bits & ~EXTENT_CTLBITS; + u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { @@ -805,8 +805,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (err) goto out; if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake, - changeset); + state = clear_state_bit(tree, state, bits, wake, changeset); goto next; } goto search_again; @@ -827,13 +826,13 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, &bits, wake, changeset); + clear_state_bit(tree, prealloc, bits, wake, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, &bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, changeset); next: if (last_end == (u64)-1) goto out; @@ -924,9 +923,9 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - u32 *bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { - u32 bits_to_set = *bits & ~EXTENT_CTLBITS; + u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; if (tree->private_data && is_data_inode(tree->private_data)) @@ -1022,7 +1021,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, BUG_ON(!prealloc); prealloc->start = start; prealloc->end = end; - err = insert_state(tree, prealloc, &p, &parent, &bits, changeset); + err = insert_state(tree, prealloc, &p, &parent, bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1048,7 +1047,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, goto out; } - set_state_bits(tree, state, &bits, changeset); + set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -1104,7 +1103,7 @@ int 
set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits, changeset); + set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -1140,7 +1139,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, NULL, NULL, &bits, changeset); + err = insert_state(tree, prealloc, NULL, NULL, bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1168,7 +1167,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits, changeset); + set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1265,7 +1264,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } prealloc->start = start; prealloc->end = end; - err = insert_state(tree, prealloc, &p, &parent, &bits, NULL); + err = insert_state(tree, prealloc, &p, &parent, bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1284,9 +1283,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits, NULL); + set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, NULL); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1325,10 +1324,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits, NULL); + 
set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, - NULL); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1364,7 +1362,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, NULL, NULL, &bits, NULL); + err = insert_state(tree, prealloc, NULL, NULL, bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1389,9 +1387,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits, NULL); + set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); + clear_state_bit(tree, prealloc, clear_bits, 0, NULL); prealloc = NULL; goto out; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 92dca0e0d20731..74d93f15847829 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2274,18 +2274,18 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * list of inodes that have pending delalloc work to be done. 
*/ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits) + u32 bits) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); @@ -2303,7 +2303,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; - if (*bits & EXTENT_DEFRAG) + if (bits & EXTENT_DEFRAG) BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) @@ -2312,7 +2312,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, } if (!(state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - state->start; @@ -2325,14 +2325,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * accounting happens. 
*/ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, - struct extent_state *state, unsigned *bits) + struct extent_state *state, u32 bits) { struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); - if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); @@ -2343,7 +2343,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); @@ -2356,7 +2356,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * don't need to call delalloc_release_metadata if there is an * error. 
*/ - if (*bits & EXTENT_CLEAR_META_RESV && + if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, false); @@ -2366,7 +2366,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && - (*bits & EXTENT_CLEAR_DATA_RESV)) + (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, @@ -2381,11 +2381,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, } if ((state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; - if (*bits & EXTENT_ADD_INODE_BYTES) + if (bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } From d154effb1408d874a5b5c68824266bb334ce88e1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 18:11:31 +0200 Subject: [PATCH 0510/1250] btrfs: add fast path for extent_state insertion In two cases the exact location where to insert the extent state is known at the call time so we don't need to pass it to insert_state that takes the fast path. Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 20ba7c272e2db8..937c8eafe86183 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -569,6 +569,20 @@ static int insert_state(struct extent_io_tree *tree, return 0; } +/* + * Insert state to @tree to the location given by @node and @parent. 
+ */ +static void insert_state_fast(struct extent_io_tree *tree, + struct extent_state *state, struct rb_node **node, + struct rb_node *parent, unsigned bits, + struct extent_changeset *changeset) +{ + set_state_bits(tree, state, bits, changeset); + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); +} + /* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 'split' indicates an @@ -1021,10 +1035,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, BUG_ON(!prealloc); prealloc->start = start; prealloc->end = end; - err = insert_state(tree, prealloc, &p, &parent, bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - + insert_state_fast(tree, prealloc, p, parent, bits, changeset); cache_state(prealloc, cached_state); prealloc = NULL; goto out; @@ -1264,9 +1275,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, } prealloc->start = start; prealloc->end = end; - err = insert_state(tree, prealloc, &p, &parent, bits, NULL); - if (err) - extent_io_tree_panic(tree, err); + insert_state_fast(tree, prealloc, p, parent, bits, NULL); cache_state(prealloc, cached_state); prealloc = NULL; goto out; From 1e8c4082c574bcd08a4b3138c7eb11e3b35cd79c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 18:15:31 +0200 Subject: [PATCH 0511/1250] btrfs: remove node and parent parameters from insert_state There's no caller left that would pass valid pointers to insert_state so we can drop them. 
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 937c8eafe86183..29e6ec7dfc2c13 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -525,8 +525,6 @@ static void set_state_bits(struct extent_io_tree *tree, */ static int insert_state(struct extent_io_tree *tree, struct extent_state *state, - struct rb_node ***node_in, - struct rb_node **parent_in, u32 bits, struct extent_changeset *changeset) { struct rb_node **node; @@ -535,13 +533,6 @@ static int insert_state(struct extent_io_tree *tree, set_state_bits(tree, state, bits, changeset); - /* Caller provides the exact tree location */ - if (node_in && parent_in) { - node = *node_in; - parent = *parent_in; - goto insert_new; - } - node = &tree->state.rb_node; while (*node) { struct tree_entry *entry; @@ -561,7 +552,6 @@ static int insert_state(struct extent_io_tree *tree, } } -insert_new: rb_link_node(&state->rb_node, parent, node); rb_insert_color(&state->rb_node, &tree->state); @@ -1150,7 +1140,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, NULL, NULL, bits, changeset); + err = insert_state(tree, prealloc, bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1371,7 +1361,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, NULL, NULL, bits, NULL); + err = insert_state(tree, prealloc, bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); From 671b42d1d4e6425093d9ba3f1b098151c99f8e4e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 18:35:24 +0200 Subject: [PATCH 0512/1250] btrfs: open code inexact rbtree search in tree_search The call chain from tree_search 
tree_search_for_insert __etree_search can be open coded and allow further simplifications, here we need a tree search with fallback to the next node in case it's not found. This is represented as __etree_search parameters next_ret=valid, prev_ret=NULL. Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 29e6ec7dfc2c13..ee84474fcf7ee0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -453,10 +453,35 @@ tree_search_for_insert(struct extent_io_tree *tree, return ret; } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) { - return tree_search_for_insert(tree, offset, NULL, NULL); + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct rb_node *prev = NULL; + struct tree_entry *entry; + + while (*node) { + prev = *node; + entry = rb_entry(prev, struct tree_entry, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return *node; + } + + /* Search neighbors until we find the first one past the end */ + while (prev && offset > entry->end) { + prev = rb_next(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); + } + + return prev; } /* From e0599108108f50baa7d0dbee4b9a0f43121ff101 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 18:49:39 +0200 Subject: [PATCH 0513/1250] btrfs: make tree search for insert more generic and use it for tree_search With a slight extension of tree_search_for_insert (fill the return node and parent return parameters) we can avoid calling __etree_search from tree_search, that could be removed eventually in followup patches. 
Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ee84474fcf7ee0..a80b7e7e23f418 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -443,20 +443,6 @@ tree_search_for_insert(struct extent_io_tree *tree, u64 offset, struct rb_node ***p_ret, struct rb_node **parent_ret) -{ - struct rb_node *next= NULL; - struct rb_node *ret; - - ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); - if (!ret) - return next; - return ret; -} - -/* - * Inexact rb-tree search, return the next entry if @offset is not found - */ -static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) { struct rb_root *root = &tree->state; struct rb_node **node = &root->rb_node; @@ -475,6 +461,11 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offse return *node; } + if (p_ret) + *p_ret = node; + if (parent_ret) + *parent_ret = prev; + /* Search neighbors until we find the first one past the end */ while (prev && offset > entry->end) { prev = rb_next(prev); @@ -484,6 +475,14 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offse return prev; } +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + /* * utility function to look for merge candidates inside a given range. 
* Any extents with matching state are merged together into a single From 18bbef96c2d3cf6e664619638200ae6ee780203f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 25 Jun 2020 19:03:41 +0200 Subject: [PATCH 0514/1250] btrfs: unify tree search helper returning prev and next nodes Simplify helper to return only next and prev pointers, we don't need all the node/parent/prev/next pointers of __etree_search as there are now other specialized helpers. Rename parameters so they follow the naming. Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 122 ++++++++++++++++++++++--------------------- 1 file changed, 62 insertions(+), 60 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a80b7e7e23f418..1935cb7a305dac 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -374,81 +374,87 @@ void free_extent_state(struct extent_state *state) * * @tree: the tree to search * @offset: offset that should fall within an entry in @tree - * @next_ret: pointer to the first entry whose range ends after @offset - * @prev_ret: pointer to the first entry whose range begins before @offset - * @p_ret: pointer where new node should be anchored (used when inserting an + * @node_ret: pointer where new node should be anchored (used when inserting an * entry in the tree) * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * - * This function returns a pointer to the entry that contains @offset byte - * address. If no such entry exists, then NULL is returned and the other - * pointer arguments to the function are filled, otherwise the found entry is - * returned and other pointers are left untouched. + * Return a pointer to the entry that contains @offset byte address and don't change + * @node_ret and @parent_ret. + * + * If no such entry exists, return pointer to entry that ends before @offset + * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. 
*/ -static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **next_ret, - struct rb_node **prev_ret, - struct rb_node ***p_ret, - struct rb_node **parent_ret) +static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***node_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node **n = &root->rb_node; + struct rb_node **node = &root->rb_node; struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; struct tree_entry *entry; - struct tree_entry *prev_entry = NULL; - while (*n) { - prev = *n; + while (*node) { + prev = *node; entry = rb_entry(prev, struct tree_entry, rb_node); - prev_entry = entry; if (offset < entry->start) - n = &(*n)->rb_left; + node = &(*node)->rb_left; else if (offset > entry->end) - n = &(*n)->rb_right; + node = &(*node)->rb_right; else - return *n; + return *node; } - if (p_ret) - *p_ret = n; + if (node_ret) + *node_ret = node; if (parent_ret) *parent_ret = prev; - if (next_ret) { - orig_prev = prev; - while (prev && offset > prev_entry->end) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - prev = orig_prev; + /* Search neighbors until we find the first one past the end */ + while (prev && offset > entry->end) { + prev = rb_next(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); } - if (prev_ret) { - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - } - return NULL; + return prev; } -static inline struct rb_node * -tree_search_for_insert(struct extent_io_tree *tree, - u64 offset, - struct rb_node ***p_ret, - struct rb_node **parent_ret) +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct rb_node *tree_search(struct 
extent_io_tree *tree, u64 offset) +{ + return tree_search_for_insert(tree, offset, NULL, NULL); +} + +/** + * Search offset in the tree or fill neighbor rbtree node pointers. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * + * Return a pointer to the entry that contains @offset byte address. If no + * such entry exists, then return NULL and fill @prev_ret and @next_ret. + * Otherwise return the found entry and other pointers are left untouched. + */ +static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree, + u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) { struct rb_root *root = &tree->state; struct rb_node **node = &root->rb_node; struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; struct tree_entry *entry; + ASSERT(prev_ret); + ASSERT(next_ret); + while (*node) { prev = *node; entry = rb_entry(prev, struct tree_entry, rb_node); @@ -461,26 +467,22 @@ tree_search_for_insert(struct extent_io_tree *tree, return *node; } - if (p_ret) - *p_ret = node; - if (parent_ret) - *parent_ret = prev; - - /* Search neighbors until we find the first one past the end */ + orig_prev = prev; while (prev && offset > entry->end) { prev = rb_next(prev); entry = rb_entry(prev, struct tree_entry, rb_node); } + *next_ret = prev; + prev = orig_prev; - return prev; -} + entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < entry->start) { + prev = rb_prev(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; -/* - * Inexact rb-tree search, return the next entry if @offset is not found - */ -static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) -{ - return tree_search_for_insert(tree, offset, NULL, NULL); + return NULL; } /* @@ -1686,7 +1688,7 @@ void 
find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, /* Find first extent with bits cleared */ while (1) { - node = __etree_search(tree, start, &next, &prev, NULL, NULL); + node = tree_search_prev_next(tree, start, &prev, &next); if (!node && !next && !prev) { /* * Tree is completely empty, send full range and let From 9ab4425f431500b6d5ec67a5e578ea6cdcfe9efe Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 May 2022 16:34:28 +0800 Subject: [PATCH 0515/1250] btrfs: remove parameter dev_extent_len from scrub_stripe() For scrub_stripe() we can easily calculate the dev extent length as we have the full info of the chunk. Thus there is no need to pass @dev_extent_len from the caller, and we introduce a helper, btrfs_calc_stripe_length(), to do the calculation from extent_map structure. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 15 ++++++++------- fs/btrfs/volumes.c | 12 ++++++------ fs/btrfs/volumes.h | 1 + 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index db700e6ec5a932..a0c45e92bd6cb7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3418,20 +3418,22 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct extent_map *em, struct btrfs_device *scrub_dev, - int stripe_index, u64 dev_extent_len) + int stripe_index) { struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root; struct btrfs_root *csum_root; struct blk_plug plug; + struct map_lookup *map = em->map_lookup; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; u64 physical = map->stripes[stripe_index].physical; - const u64 physical_end = physical + dev_extent_len; + const u64 dev_stripe_len = 
btrfs_calc_stripe_length(em); + const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; /* The logical increment after finishing one stripe */ @@ -3558,8 +3560,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, physical += map->stripe_len; spin_lock(&sctx->stat_lock); if (stop_loop) - sctx->stat.last_physical = map->stripes[stripe_index].physical + - dev_extent_len; + sctx->stat.last_physical = + map->stripes[stripe_index].physical + dev_stripe_len; else sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); @@ -3628,8 +3630,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, bg, map, scrub_dev, i, - dev_extent_len); + ret = scrub_stripe(sctx, bg, em, scrub_dev, i); if (ret) goto out; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 12a6150ee19d29..e12b139586e09b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6939,11 +6939,12 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) +u64 btrfs_calc_stripe_length(const struct extent_map *em) { - const int data_stripes = calc_data_stripes(type, num_stripes); + const struct map_lookup *map = em->map_lookup; + const int data_stripes = calc_data_stripes(map->type, map->num_stripes); - return div_u64(chunk_len, data_stripes); + return div_u64(em->len, data_stripes); } #if BITS_PER_LONG == 32 @@ -7082,8 +7083,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->type = type; map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; - em->orig_block_len = calc_stripe_length(type, em->len, - map->num_stripes); + em->orig_block_len = btrfs_calc_stripe_length(em); for (i = 0; i < num_stripes; i++) { 
map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -7984,7 +7984,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } map = em->map_lookup; - stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); + stripe_len = btrfs_calc_stripe_length(em); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 588367c76c4630..f19916a69beafa 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -633,6 +633,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); +u64 btrfs_calc_stripe_length(const struct extent_map *em); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); From d4cf52815fdf6650d988cd9f584e4b9c97d997b2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 May 2022 16:34:29 +0800 Subject: [PATCH 0516/1250] btrfs: use btrfs_chunk_max_errors() to replace tolerance calculation In __btrfs_map_block() we have an assignment to @max_errors using nr_parity_stripes(). Although it works for RAID56 it's confusing. Replace it with btrfs_chunk_max_errors(). Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e12b139586e09b..75a59423a1bfef 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6466,7 +6466,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* RAID[56] write or recovery. 
Return all stripes */ num_stripes = map->num_stripes; - max_errors = nr_parity_stripes(map); + max_errors = btrfs_chunk_max_errors(map); *length = map->stripe_len; stripe_index = 0; From 23ff075a86a4923bc8648687a1a8c84eab4bddc0 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 May 2022 16:34:30 +0800 Subject: [PATCH 0517/1250] btrfs: use btrfs_raid_array to calculate number of parity stripes Use the raid table instead of hard coded values and rename the helper as it is exported. This could make later extension on RAID56 based profiles easier. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 10 ++-------- fs/btrfs/raid56.h | 12 +----------- fs/btrfs/volumes.c | 7 +++++++ fs/btrfs/volumes.h | 1 + 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f002334d244a77..0f0368e63e5af1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -922,7 +922,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - int nr_data = 0; void *p; ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); @@ -976,14 +975,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, CONSUME_ALLOC(rbio->finish_pointers, real_stripes); #undef CONSUME_ALLOC - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - nr_data = real_stripes - 1; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - nr_data = real_stripes - 2; - else - BUG(); + ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); + rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); - rbio->nr_data = nr_data; return rbio; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 3b22657ca857e3..c73bceb2b46162 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -155,19 +155,9 @@ 
struct raid56_bio_trace_info { u8 stripe_nr; }; -static inline int nr_parity_stripes(const struct map_lookup *map) -{ - if (map->type & BTRFS_BLOCK_GROUP_RAID5) - return 1; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - return 2; - else - return 0; -} - static inline int nr_data_stripes(const struct map_lookup *map) { - return map->num_stripes - nr_parity_stripes(map); + return map->num_stripes - btrfs_nr_parity_stripes(map->type); } #define RAID5_P_STRIPE ((u64)-2) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 75a59423a1bfef..e40c0d59c4a07e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -182,6 +182,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags) return btrfs_raid_array[index].raid_name; } +int btrfs_nr_parity_stripes(u64 type) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); + + return btrfs_raid_array[index].nparity; +} + /* * Fill @buf with textual description of @bg_flags, no more than @size_buf * bytes including terminating null byte. diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index f19916a69beafa..b61508723d5d22 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -634,6 +634,7 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); u64 btrfs_calc_stripe_length(const struct extent_map *em); +int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); From 09922392880c279c93312c8088f74d333fda4f0e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 13 May 2022 16:34:31 +0800 Subject: [PATCH 0518/1250] btrfs: use ncopies from btrfs_raid_array in btrfs_num_copies() For all non-RAID56 profiles, we can use btrfs_raid_array[].ncopies directly, only for RAID5 and RAID6 we need some extra handling as there's no table value for that. 
For RAID10 there's a change from sub_stripes to ncopies. The values are the same but semantically we want to use number of copies, as this is what btrfs_num_copies does. Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e40c0d59c4a07e..80636fbf28b7a3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5717,7 +5717,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct extent_map *em; struct map_lookup *map; - int ret; + enum btrfs_raid_types index; + int ret = 1; em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) @@ -5730,10 +5731,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) - ret = map->num_stripes; - else if (map->type & BTRFS_BLOCK_GROUP_RAID10) - ret = map->sub_stripes; + index = btrfs_bg_flags_to_raid_index(map->type); + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) + ret = btrfs_raid_array[index].ncopies; else if (map->type & BTRFS_BLOCK_GROUP_RAID5) ret = 2; else if (map->type & BTRFS_BLOCK_GROUP_RAID6) @@ -5745,8 +5747,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * stripe under reconstruction. */ ret = map->num_stripes; - else - ret = 1; free_extent_map(em); down_read(&fs_info->dev_replace.rwsem); From 81067af66b1200a185fe0d4d95fcebc0224cc35f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 6 Jun 2022 18:52:24 +0200 Subject: [PATCH 0519/1250] btrfs: call inode_to_path directly and drop indirection The functions for iterating inode reference take a function parameter but there's only one value, inode_to_path(). 
Remove the indirection and call the function. As paths_from_inode would become just an alias for iterate_irefs(), merge the two into one function. Signed-off-by: David Sterba --- fs/btrfs/backref.c | 50 +++++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ebc392ea1d74c9..df3352f8be24b2 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2054,12 +2054,11 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, return ret; } -typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx); +static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, + struct extent_buffer *eb, void *ctx); static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) + struct btrfs_path *path, void *ctx) { int ret = 0; int slot; @@ -2103,7 +2102,7 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, "following ref at offset %u for inode %llu in tree %llu", cur, found_key.objectid, fs_root->root_key.objectid); - ret = iterate(parent, name_len, + ret = inode_to_path(parent, name_len, (unsigned long)(iref + 1), eb, ctx); if (ret) break; @@ -2119,8 +2118,7 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, } static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, - iterate_irefs_t *iterate, void *ctx) + struct btrfs_path *path, void *ctx) { int ret; int slot; @@ -2162,7 +2160,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, extref = (struct btrfs_inode_extref *)(ptr + cur_offset); parent = btrfs_inode_extref_parent(eb, extref); name_len = btrfs_inode_extref_name_len(eb, extref); - ret = iterate(parent, name_len, + ret = inode_to_path(parent, name_len, (unsigned long)&extref->name, eb, ctx); if (ret) break; @@ 
-2180,26 +2178,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, iterate_irefs_t *iterate, - void *ctx) -{ - int ret; - int found_refs = 0; - - ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx); - if (!ret) - ++found_refs; - else if (ret != -ENOENT) - return ret; - - ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx); - if (ret == -ENOENT && found_refs) - return 0; - - return ret; -} - /* * returns 0 if the path could be dumped (probably truncated) * returns <0 in case of an error @@ -2248,8 +2226,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, */ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) { - return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path, - inode_to_path, ipath); + int ret; + int found_refs = 0; + + ret = iterate_inode_refs(inum, ipath->fs_root, ipath->btrfs_path, ipath); + if (!ret) + ++found_refs; + else if (ret != -ENOENT) + return ret; + + ret = iterate_inode_extrefs(inum, ipath->fs_root, ipath->btrfs_path, ipath); + if (ret == -ENOENT && found_refs) + return 0; + + return ret; } struct btrfs_data_container *init_data_container(u32 total_bytes) From 37d4fef07af1fd20c4fb79a7a7aa9e0eb5fc6957 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 6 Jun 2022 19:06:17 +0200 Subject: [PATCH 0520/1250] btrfs: simplify parameters of backref iterators The inode reference iterator interface takes parameters that are derived from the context parameter, but as it's a void* type the values are passed individually. Change the ctx type to inode_fs_path as it's the only thing we pass and drop any parameters that are derived from that. 
Signed-off-by: David Sterba --- fs/btrfs/backref.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index df3352f8be24b2..e62f142fd3e570 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2055,10 +2055,9 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, } static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx); + struct extent_buffer *eb, struct inode_fs_paths *ipath); -static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, void *ctx) +static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath) { int ret = 0; int slot; @@ -2067,6 +2066,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, u32 name_len; u64 parent = 0; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_ref *iref; struct btrfs_key found_key; @@ -2103,7 +2104,7 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, cur, found_key.objectid, fs_root->root_key.objectid); ret = inode_to_path(parent, name_len, - (unsigned long)(iref + 1), eb, ctx); + (unsigned long)(iref + 1), eb, ipath); if (ret) break; len = sizeof(*iref) + name_len; @@ -2117,14 +2118,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root, return ret; } -static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, - struct btrfs_path *path, void *ctx) +static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath) { int ret; int slot; u64 offset = 0; u64 parent; int found = 0; + struct btrfs_root *fs_root = ipath->fs_root; + struct btrfs_path *path = ipath->btrfs_path; struct extent_buffer *eb; struct btrfs_inode_extref *extref; u32 item_size; @@ -2161,7 +2163,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, parent 
= btrfs_inode_extref_parent(eb, extref); name_len = btrfs_inode_extref_name_len(eb, extref); ret = inode_to_path(parent, name_len, - (unsigned long)&extref->name, eb, ctx); + (unsigned long)&extref->name, eb, ipath); if (ret) break; @@ -2183,9 +2185,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root, * returns <0 in case of an error */ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, - struct extent_buffer *eb, void *ctx) + struct extent_buffer *eb, struct inode_fs_paths *ipath) { - struct inode_fs_paths *ipath = ctx; char *fspath; char *fspath_min; int i = ipath->fspath->elem_cnt; @@ -2229,13 +2230,13 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath) int ret; int found_refs = 0; - ret = iterate_inode_refs(inum, ipath->fs_root, ipath->btrfs_path, ipath); + ret = iterate_inode_refs(inum, ipath); if (!ret) ++found_refs; else if (ret != -ENOENT) return ret; - ret = iterate_inode_extrefs(inum, ipath->fs_root, ipath->btrfs_path, ipath); + ret = iterate_inode_extrefs(inum, ipath); if (ret == -ENOENT && found_refs) return 0; From c70b00b31d7074204df716dfed01def5fd00ce09 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 6 Jun 2022 19:32:59 +0200 Subject: [PATCH 0521/1250] btrfs: sink iterator parameter to btrfs_ioctl_logical_to_ino There's only one function we pass to iterate_inodes_from_logical as iterator, so we can drop the indirection and call it directly, after moving the function to backref.c Signed-off-by: David Sterba --- fs/btrfs/backref.c | 25 ++++++++++++++++++++++--- fs/btrfs/backref.h | 3 +-- fs/btrfs/ioctl.c | 22 +--------------------- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e62f142fd3e570..d385357e19b613 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2028,10 +2028,29 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, return ret; } +static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) 
+{ + struct btrfs_data_container *inodes = ctx; + const size_t c = 3 * sizeof(u64); + + if (inodes->bytes_left >= c) { + inodes->bytes_left -= c; + inodes->val[inodes->elem_cnt] = inum; + inodes->val[inodes->elem_cnt + 1] = offset; + inodes->val[inodes->elem_cnt + 2] = root; + inodes->elem_cnt += 3; + } else { + inodes->bytes_missing += c - inodes->bytes_left; + inodes->bytes_left = 0; + inodes->elem_missed += 3; + } + + return 0; +} + int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) + void *ctx, bool ignore_offset) { int ret; u64 extent_item_pos; @@ -2049,7 +2068,7 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, extent_item_pos = logical - found_key.objectid; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - iterate, ctx, ignore_offset); + build_ino_list, ctx, ignore_offset); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ba454032dbe227..2759de7d324c87 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -35,8 +35,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, bool ignore_offset); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - iterate_extent_inodes_t *iterate, void *ctx, + struct btrfs_path *path, void *ctx, bool ignore_offset); int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 679ce4c5c341ab..7e1b4b0fbd6c6e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4243,26 +4243,6 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) -{ - struct btrfs_data_container *inodes = ctx; - const size_t c = 3 * sizeof(u64); - - if (inodes->bytes_left >= c) { - inodes->bytes_left -= c; - 
inodes->val[inodes->elem_cnt] = inum; - inodes->val[inodes->elem_cnt + 1] = offset; - inodes->val[inodes->elem_cnt + 2] = root; - inodes->elem_cnt += 3; - } else { - inodes->bytes_missing += c - inodes->bytes_left; - inodes->bytes_left = 0; - inodes->elem_missed += 3; - } - - return 0; -} - static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, void __user *arg, int version) { @@ -4312,7 +4292,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes, ignore_offset); + inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) From 51adc557d2272899e85d30fc85e5d83e3bde8a43 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 6 Jun 2022 19:07:02 +0200 Subject: [PATCH 0522/1250] btrfs: remove unused typedefs get_extent_t and btrfs_work_func_t Signed-off-by: David Sterba --- fs/btrfs/async-thread.h | 1 - fs/btrfs/extent_io.h | 4 ---- 2 files changed, 5 deletions(-) diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 07960529b36010..6e2596ddae1002 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -13,7 +13,6 @@ struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); -typedef void (*btrfs_work_func_t)(struct work_struct *arg); struct btrfs_work { btrfs_func_t func; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 72966cf21961ef..c0f1fb63eeae79 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -142,10 +142,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) struct extent_map_tree; -typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); - int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); From 5f7fb790c28d76254aaf8f2face2a55a9682b814 Mon Sep 17 
00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jun 2022 15:06:34 +0800 Subject: [PATCH 0523/1250] btrfs: use named constant for reserved device space There's a reserved space on each device of size 1MiB that can be used by bootloaders or to avoid accidental overwrite. Use a symbolic constant with the explaining comment instead of hard coding the value and multiple comments. Note: since btrfs-progs v4.1, mkfs.btrfs will reserve the first 1MiB for the primary super block (at offset 64KiB), until then the range could have been used by mistake. Kernel has been always respecting the 1MiB range for writes. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 +++++++ fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/super.c | 12 ++++-------- fs/btrfs/volumes.c | 7 +------ 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e5f19b49efa804..6d9f81d4e99c09 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -229,6 +229,13 @@ struct btrfs_root_backup { #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 +/* + * The reserved space at the beginning of each device. + * It covers the primary super block and leaves space for potential use by other + * tools like bootloaders or to lower potential damage of accidental overwrite. 
+ */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) + /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 91d2c20c7c8a6f..a1696e3ffb1e38 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5976,7 +5976,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { - u64 start = SZ_1M, len = 0, end = 0; + u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; *trimmed = 0; @@ -6020,8 +6020,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) break; } - /* Ensure we skip the reserved area in the first 1M */ - start = max_t(u64, start, SZ_1M); + /* Ensure we skip the reserved space on each device. */ + start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); /* * If find_first_clear_extent_bit find a range that spans the diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 719dda57dc7a0a..41652dcd16f436 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2272,17 +2272,13 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* - * In order to avoid overwriting the superblock on the drive, - * btrfs starts at an offset of at least 1MB when doing chunk - * allocation. - * - * This ensures we have at least min_stripe_size free space - * after excluding 1MB. + * Ensure we have at least min_stripe_size on top of the + * reserved space on the device. 
*/ - if (avail_space <= SZ_1M + min_stripe_size) + if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size) continue; - avail_space -= SZ_1M; + avail_space -= BTRFS_DEVICE_RANGE_RESERVED; devices_info[i].dev = device; devices_info[i].max_avail = avail_space; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 80636fbf28b7a3..e6af960a69614f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1403,12 +1403,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: - /* - * We don't want to overwrite the superblock on the drive nor - * any area used by the boot loader (grub for example), so we - * make sure to start at an offset of at least 1MB. - */ - return max_t(u64, start, SZ_1M); + return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular From 1206eda77dcca9c3f131e438609b3a51c8d252a4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 13 Jun 2022 15:06:35 +0800 Subject: [PATCH 0524/1250] btrfs: warn about dev extents that are inside the reserved range Btrfs on-disk format has reserved the first 1MiB for the primary super block (at 64KiB offset) and bootloaders may also use this space. This behavior is only introduced since v4.1 btrfs-progs release, although kernel can ensure we never touch the reserved range of super blocks, it's better to inform the end users, and a balance will resolve the problem. 
Signed-off-by: Qu Wenruo [ update changelog and message ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e6af960a69614f..076040310f6fbb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7996,6 +7996,16 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } + /* + * Very old mkfs.btrfs (before v4.1) will not respect the reserved + * space. Although kernel can handle it without problem, better to warn + * the users. + */ + if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) + btrfs_warn(fs_info, + "devid %llu physical %llu len %llu inside the reserved space", + devid, physical_offset, physical_len); + for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->devid == devid && map->stripes[i].physical == physical_offset) { From db5ce77937c96ff106c39af1ba81f99817ed89ea Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 17 Jun 2022 15:53:34 +0300 Subject: [PATCH 0525/1250] btrfs: batch up release of reserved metadata for delayed items used for deletion With Filipe's recent rework of the delayed inode code one aspect which isn't batched is the release of the reserved metadata of delayed inode's delete items. 
With this patch on top of Filipe's rework and running the same test as provided in the description of a patch titled "btrfs: improve batch deletion of delayed dir index items" I observe the following change of the number of calls to btrfs_block_rsv_release: Before this change: - block_rsv_release: 1004 - btrfs_delete_delayed_items_total_time: 14602 - delete_batches: 505 After: - block_rsv_release: 510 - btrfs_delete_delayed_items_total_time: 13643 - delete_batches: 507 Reviewed-by: Filipe Manana Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 3f85182e4b8734..823aa05b3e38d1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -833,11 +833,13 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; struct extent_buffer *leaf = path->nodes[0]; LIST_HEAD(batch_list); int nitems, slot, last_slot; int ret; + u64 total_reserved_size = item->bytes_reserved; ASSERT(leaf != NULL); @@ -874,14 +876,27 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, nitems++; curr = next; list_add_tail(&curr->tree_list, &batch_list); + total_reserved_size += curr->bytes_reserved; } ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); if (ret) return ret; + /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */ + if (total_reserved_size > 0) { + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we + * don't need to release/reserve qgroup space. 
+ */ + trace_btrfs_space_reservation(fs_info, "delayed_item", + item->key.objectid, total_reserved_size, + 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, + total_reserved_size, NULL); + } + list_for_each_entry_safe(curr, next, &batch_list, tree_list) { - btrfs_delayed_item_release_metadata(root, curr); list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } From 4c54be8d6053190e650df301860b6a36cddd7bf4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 19 Jun 2022 08:07:05 +0200 Subject: [PATCH 0526/1250] btrfs: remove the finish_func argument to btrfs_mark_ordered_io_finished finish_func is always set to finish_ordered_fn, so remove it and also the now pointless and somewhat confusingly named __endio_write_update_ordered wrapper. Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 40 +++++++++------------------------------- fs/btrfs/ordered-data.c | 17 +++++++++++------ fs/btrfs/ordered-data.h | 5 +++-- 3 files changed, 23 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 74d93f15847829..d535ed39c39181 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -114,7 +114,6 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -125,10 +124,6 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate); - /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * @@ -223,7 +218,7 @@ static 
inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, /* * Here we just clear all Ordered bits for every page in the - * range, then __endio_write_update_ordered() will handle + * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, @@ -244,7 +239,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, offset = page_offset(locked_page) + PAGE_SIZE; } - return __endio_write_update_ordered(inode, offset, bytes, false); + return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); @@ -3086,7 +3081,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * an ordered extent if the range of bytes in the file it covers are * fully written. */ -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; @@ -3295,21 +3290,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) return ret; } -static void finish_ordered_fn(struct btrfs_work *work) -{ - struct btrfs_ordered_extent *ordered_extent; - ordered_extent = container_of(work, struct btrfs_ordered_extent, work); - btrfs_finish_ordered_io(ordered_extent); -} - void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) { trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, - finish_ordered_fn, uptodate); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); } /* @@ -7827,8 +7814,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos += submitted; length -= 
submitted; if (write) - __endio_write_update_ordered(BTRFS_I(inode), pos, - length, false); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); @@ -7850,10 +7837,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->file_offset, - dip->bytes, - !dip->bio.bi_status); + btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + dip->file_offset, dip->bytes, + !dip->bio.bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->file_offset, @@ -7916,14 +7902,6 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, return err; } -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate) -{ - btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, - finish_ordered_fn, uptodate); -} - static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 41b3bc44c92b23..1952ac85222c0b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -272,6 +272,14 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, spin_unlock_irq(&tree->lock); } +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); +} + /* * Mark all ordered extents io inside the specified range finished. * @@ -281,16 +289,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the * endio function twice. 
- * @finish_func: The function to be executed when all the IO of an ordered - * extent are finished. * * This function is called for endio, thus the range must have ordered * extent(s) covering it. */ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate) + struct page *page, u64 file_offset, + u64 num_bytes, bool uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -403,7 +408,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, refcount_inc(&entry->refs); trace_btrfs_ordered_extent_mark_finished(inode, entry); spin_unlock_irqrestore(&tree->lock, flags); - btrfs_init_work(&entry->work, finish_func, NULL, NULL); + btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &entry->work); spin_lock_irqsave(&tree->lock, flags); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ecad67a2c74575..87792f85e2c4ae 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -180,13 +180,14 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); + void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate); + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); From 01de9aab3c24e02f0713dd2183775df3e171c5ea Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 9 Jun 2022 13:18:44 +0800 Subject: [PATCH 0527/1250] btrfs: raid56: don't trust any cached 
sector in __raid56_parity_recover() [BUG] There is a small workload which will always fail with recent kernel: (A simplified version from btrfs/125 test case) mkfs.btrfs -f -m raid5 -d raid5 -b 1G $dev1 $dev2 $dev3 mount $dev1 $mnt xfs_io -f -c "pwrite -S 0xee 0 1M" $mnt/file1 sync umount $mnt btrfs dev scan -u $dev3 mount -o degraded $dev1 $mnt xfs_io -f -c "pwrite -S 0xff 0 128M" $mnt/file2 umount $mnt btrfs dev scan mount $dev1 $mnt btrfs balance start --full-balance $mnt umount $mnt The failure is always failed to read some tree blocks: BTRFS info (device dm-4): relocating block group 217710592 flags data|raid5 BTRFS error (device dm-4): parent transid verify failed on 38993920 wanted 9 found 7 BTRFS error (device dm-4): parent transid verify failed on 38993920 wanted 9 found 7 ... [CAUSE] With the recently added debug output, we can see all RAID56 operations related to full stripe 38928384: 56.1183: raid56_read_partial: full_stripe=38928384 devid=2 type=DATA1 offset=0 opf=0x0 physical=9502720 len=65536 56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=16384 opf=0x0 physical=9519104 len=16384 56.1185: raid56_read_partial: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x0 physical=9551872 len=16384 56.1187: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=0 opf=0x1 physical=9502720 len=16384 56.1188: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=32768 opf=0x1 physical=9535488 len=16384 56.1188: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=0 opf=0x1 physical=30474240 len=16384 56.1189: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=32768 opf=0x1 physical=30507008 len=16384 56.1218: raid56_write_stripe: full_stripe=38928384 devid=3 type=DATA2 offset=49152 opf=0x1 physical=9551872 len=16384 56.1219: raid56_write_stripe: full_stripe=38928384 devid=1 type=PQ1 offset=49152 opf=0x1 physical=30523392 len=16384 56.2721: raid56_parity_recover: full 
stripe=38928384 eb=39010304 mirror=2 56.2723: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2 56.2724: raid56_parity_recover: full stripe=38928384 eb=39010304 mirror=2 Before we enter raid56_parity_recover(), we have triggered some metadata write for the full stripe 38928384, this leads to us to read all the sectors from disk. Furthermore, btrfs raid56 write will cache its calculated P/Q sectors to avoid unnecessary read. This means, for that full stripe, after any partial write, we will have stale data, along with P/Q calculated using that stale data. Thankfully due to patch "btrfs: only write the sectors in the vertical stripe which has data stripes" we haven't submitted all the corrupted P/Q to disk. When we really need to recover certain range, aka in raid56_parity_recover(), we will use the cached rbio, along with its cached sectors (the full stripe is all cached). This explains why we have no event raid56_scrub_read_recover() triggered. Since we have the cached P/Q which is calculated using the stale data, the recovered one will just be stale. In our particular test case, it will always return the same incorrect metadata, thus causing the same error message "parent transid verify failed on 39010304 wanted 9 found 7" again and again. [BTRFS DESTRUCTIVE RMW PROBLEM] Test case btrfs/125 (and above workload) always has its trouble with the destructive read-modify-write (RMW) cycle: 0 32K 64K Data1: | Good | Good | Data2: | Bad | Bad | Parity: | Good | Good | In above case, if we trigger any write into Data1, we will use the bad data in Data2 to re-generate parity, killing the only chance to recovery Data2, thus Data2 is lost forever. This destructive RMW cycle is not specific to btrfs RAID56, but there are some btrfs specific behaviors making the case even worse: - Btrfs will cache sectors for unrelated vertical stripes. 
In the above example, if we're only writing into 0~32K range, btrfs will still read data range (32K ~ 64K) of Data1, and (64K~128K) of Data2. This behavior is to cache sectors for later update. Incidentally commit d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible") has a bug which makes RAID56 never trust the cached sectors, thus slightly improving the situation for recovery. Unfortunately, follow up fix "btrfs: update stripe_sectors::uptodate in steal_rbio" will revert the behavior back to the old one. - Btrfs raid56 partial write will update all P/Q sectors and cache them This means, even if data at (64K ~ 96K) of Data2 is free space, and only (96K ~ 128K) of Data2 is really stale data. And we write into that (96K ~ 128K), we will update all the parity sectors for the full stripe. This unnecessary behavior will completely kill the chance of recovery. Thankfully, an unrelated optimization "btrfs: only write the sectors in the vertical stripe which has data stripes" will prevent submitting the write bio for untouched vertical sectors. That optimization will keep the on-disk P/Q untouched for a chance for later recovery. [FIX] Although we have no good way to completely fix the destructive RMW (unless we go full scrub for each partial write), we can still limit the damage. With patch "btrfs: only write the sectors in the vertical stripe which has data stripes" now we won't really submit the P/Q of unrelated vertical stripes, so the on-disk P/Q should still be fine. Now all we really need to do is just drop all the cached sectors when doing recovery. By this, we have a chance to read the original P/Q from disk, and have a chance to recover the stale data, while still keeping the cache to speed up regular write path. In fact, just dropping all the cache for recovery path is good enough to allow the test case btrfs/125 along with the small script to pass reliably. The lack of metadata write after the degraded mount, and forced metadata COW is saving us this time.
So this patch will fix the behavior by not trusting any cache in __raid56_parity_recover(), to solve the problem while still keeping the cache useful. But please note that this test pass DOES NOT mean we have solved the destructive RMW problem, we just do damage control a little better. Related patches: - btrfs: only write the sectors in the vertical stripe - d4e28d9b5f04 ("btrfs: raid56: make steal_rbio() subpage compatible") - btrfs: update stripe_sectors::uptodate in steal_rbio Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0f0368e63e5af1..c6411c849fea04 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2118,9 +2118,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) atomic_set(&rbio->error, 0); /* - * read everything that hasn't failed. Thanks to the - * stripe cache, it is possible that some or all of these - * pages are going to be uptodate. + * Read everything that hasn't failed. However this time we will + * not trust any cached sector. + * As we may read out some stale data but higher layer is not reading + * that stale part. + * + * So here we always re-read everything in recovery path. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2135,11 +2138,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) total_sector_nr += rbio->stripe_nsectors - 1; continue; } - /* The RMW code may have already read this page in.
*/ sector = rbio_stripe_sector(rbio, stripe, sectornr); - if (sector->uptodate) - continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, rbio->stripe_len, REQ_OP_READ); From 5cb1c4452d66decdba59c57dc496073329e09838 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2022 08:26:27 +0200 Subject: [PATCH 0528/1250] btrfs: increase direct io read size limit to 256 sectors Btrfs currently limits direct I/O reads to a single sector, which goes back to commit c329861da406 ("Btrfs: don't allocate a separate csums array for direct reads") from Josef. That commit changes the direct I/O code to ".. use the private part of the io_tree for our csums.", but ten years later that isn't how checksums for direct reads work, instead they use a csums allocation on a per-btrfs_dio_private basis (which has its own performance problem for small I/O, but that will be addressed later). There is no fundamental limit in btrfs itself to limit the I/O size except for the size of the checksum array that scales linearly with the number of sectors in an I/O. Pick a somewhat arbitrary limit of 256 sectors, which matches what the buffered reads typically see as the upper limit as the limit for direct I/O as well. This significantly improves direct read performance.
For example a fio run doing 1 MiB aio reads with a queue depth of 1 roughly triples the throughput: Baseline: READ: bw=65.3MiB/s (68.5MB/s), 65.3MiB/s-65.3MiB/s (68.5MB/s-68.5MB/s), io=19.1GiB (20.6GB), run=300013-300013msec With this patch: READ: bw=196MiB/s (206MB/s), 196MiB/s-196MiB/s (206MB/s-206MB/s), io=57.5GiB (61.7GB), run=300006-300006msec Reviewed-by: Qu Wenruo Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++++- fs/btrfs/volumes.h | 7 +++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d535ed39c39181..03771f09a9a215 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7578,8 +7578,12 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, const u64 data_alloc_len = length; bool unlock_extents = false; + /* + * Cap the size of reads to that usually seen in buffered I/O as we need + * to allocate a contiguous array for the checksums. + */ if (!write) - len = min_t(u64, len, fs_info->sectorsize); + len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); lockstart = start; lockend = start + len - 1; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b61508723d5d22..9537d82bb7a201 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -354,6 +354,13 @@ struct btrfs_fs_devices { - 2 * sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) +/* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + /* * Additional info to pass along bio.
* From cd99534eb7d73ab58d9ee2ffe809c9f816f3bad9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Jun 2022 15:25:32 +0200 Subject: [PATCH 0529/1250] btrfs: send: drop __KERNEL__ ifdef from send.h We don't need this ifdef as the header file is not shared, the protocol definition used by userspace should be from libbtrfs or libbtrfsutil. Signed-off-by: David Sterba --- fs/btrfs/send.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index b0dc07567d09ac..45562190b473e2 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -155,8 +155,6 @@ enum { BTRFS_SEND_A_MAX = 31, }; -#ifdef __KERNEL__ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); -#endif #endif From 71aee61a92337ed062c86be0f5d70f122abb2e16 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Jun 2022 15:28:41 +0200 Subject: [PATCH 0530/1250] btrfs: send: simplify includes We don't need the whole ctree.h in send.h, none of the data types defined there are used. 
Signed-off-by: David Sterba --- fs/btrfs/send.c | 1 + fs/btrfs/send.h | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6d01dc26d4080b..8f88df368c31d8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -17,6 +17,7 @@ #include #include "send.h" +#include "ctree.h" #include "backref.h" #include "locking.h" #include "disk-io.h" diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 45562190b473e2..7f615ddc8d9c5a 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -7,7 +7,7 @@ #ifndef BTRFS_SEND_H #define BTRFS_SEND_H -#include "ctree.h" +#include #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" #define BTRFS_SEND_STREAM_VERSION 2 @@ -18,6 +18,9 @@ */ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K +struct inode; +struct btrfs_ioctl_send_args; + enum btrfs_tlv_type { BTRFS_TLV_U8, BTRFS_TLV_U16, From b9088e092dad872ef7691eadffe73728eb3b9f36 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Jun 2022 15:40:46 +0200 Subject: [PATCH 0531/1250] btrfs: send: remove old TODO regarding ERESTARTSYS The whole send operation is restartable and handling properly a buffer write may not be easy. We can't know what caused that and if a short delay and retry will fix it or how many retries should be performed in case it's a temporary condition. The error value is returned to the ioctl caller so in case it's transient problem, the user would be notified about the reason. Remove the TODO note as there's no plan to handle ERESTARTSYS. 
Signed-off-by: David Sterba --- fs/btrfs/send.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8f88df368c31d8..b2a895563f6d85 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -582,15 +582,10 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) while (pos < len) { ret = kernel_write(filp, buf + pos, len - pos, off); - /* TODO handle that correctly */ - /*if (ret == -ERESTARTSYS) { - continue; - }*/ if (ret < 0) return ret; - if (ret == 0) { + if (ret == 0) return -EIO; - } pos += ret; } From c8a2d5c8b1808d0918eb63cc1ca120dd8ebba3e9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 2 Jun 2022 18:03:08 +0200 Subject: [PATCH 0532/1250] btrfs: send: use boolean types for current inode status The new, new_gen and deleted indicate a status, use boolean type instead of int. Signed-off-by: David Sterba --- fs/btrfs/send.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b2a895563f6d85..d31cd39edff4e4 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -118,14 +118,14 @@ struct send_ctx { */ u64 cur_ino; u64 cur_inode_gen; - int cur_inode_new; - int cur_inode_new_gen; - int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + bool cur_inode_new; + bool cur_inode_new_gen; + bool cur_inode_deleted; bool ignore_cur_inode; u64 send_progress; @@ -6482,7 +6482,7 @@ static int changed_inode(struct send_ctx *sctx, close_current_inode(sctx); sctx->cur_ino = key->objectid; - sctx->cur_inode_new_gen = 0; + sctx->cur_inode_new_gen = false; sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; @@ -6523,7 +6523,7 @@ static int changed_inode(struct send_ctx *sctx, */ if (left_gen != right_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) - 
sctx->cur_inode_new_gen = 1; + sctx->cur_inode_new_gen = true; } /* @@ -6555,8 +6555,8 @@ static int changed_inode(struct send_ctx *sctx, if (result == BTRFS_COMPARE_TREE_NEW) { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6567,8 +6567,8 @@ static int changed_inode(struct send_ctx *sctx, ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6586,8 +6586,8 @@ static int changed_inode(struct send_ctx *sctx, * First, process the inode as if it was deleted. */ sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6601,8 +6601,8 @@ static int changed_inode(struct send_ctx *sctx, * Now process the inode as if it was new. 
*/ sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6634,9 +6634,9 @@ static int changed_inode(struct send_ctx *sctx, goto out; } else { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_new_gen = 0; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = false; + sctx->cur_inode_new_gen = false; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( From f1cd209f952fad86019a8c99ea4dc3ae0578c39e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Jun 2022 09:49:44 +0200 Subject: [PATCH 0533/1250] btrfs: remove extent writepage address space operation Same as in commit 21b4ee7029c9 ("xfs: drop ->writepage completely"): we can remove the callback as it's only used in one place - single page writeback from memory reclaim and is not called for cgroup writeback at all. We only allow such writeback from kswapd, not from direct memory reclaim, and so it is rarely used. When it comes from kswapd, it is effectively random dirty page shoot-down, which is horrible for IO patterns. We can rely on background writeback to clean all dirty pages in an efficient way and not let it be interrupted by kswapd. 
Suggested-by: Johannes Weiner Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 14 -------------- fs/btrfs/extent_io.h | 1 - fs/btrfs/inode.c | 28 +--------------------------- fs/btrfs/subpage.c | 2 +- 4 files changed, 2 insertions(+), 43 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1935cb7a305dac..70fc7a65092422 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5110,20 +5110,6 @@ static int extent_write_cache_pages(struct address_space *mapping, return ret; } -int extent_write_full_page(struct page *page, struct writeback_control *wbc) -{ - int ret; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - - ret = __extent_writepage(page, wbc, &epd); - submit_write_bio(&epd, ret); - return ret; -} - /* * Submit the pages in the range to bio for call sites which delalloc range has * already been ran (aka, ordered extent inserted) and all pages are still diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c0f1fb63eeae79..a76c6ef74cd3c5 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -146,7 +146,6 @@ int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 03771f09a9a215..a00052bb194de2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8133,31 +8133,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } -static int btrfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct 
inode *inode = page->mapping->host; - int ret; - - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - /* - * If we are under memory pressure we will call this directly from the - * VM, we need to make sure we have the inode referenced for the ordered - * extent. If not just return like we didn't do anything. - */ - if (!igrab(inode)) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; - } - ret = extent_write_full_page(page, wbc); - btrfs_add_delayed_iput(inode); - return ret; -} - static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -8461,7 +8436,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepage() function could + * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ @@ -11379,7 +11354,6 @@ static const struct file_operations btrfs_dir_file_operations = { */ static const struct address_space_operations btrfs_aops = { .read_folio = btrfs_read_folio, - .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 0146fee730a093..6fc2b77ae5c345 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -731,7 +731,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, * It should not have any subpage::writers count. * Can be unlocked by unlock_page(). * This is the most common locked page for __extent_writepage() called - * inside extent_write_cache_pages() or extent_write_full_page(). 
+ * inside extent_write_cache_pages(). * Rarer cases include the @locked_page from extent_write_locked_range(). * * - Page locked by lock_delalloc_pages() From f878c0845b8d8dbc3d3b412567a889d0019d770d Mon Sep 17 00:00:00 2001 From: Ioannis Angelakopoulos Date: Tue, 14 Jun 2022 15:22:32 -0700 Subject: [PATCH 0534/1250] btrfs: collect commit stats, count, duration Track several stats about transaction commit, to be later exported via sysfs: - number of commits so far - duration of the last commit in ns - maximum commit duration seen so far in ns - total duration for all commits so far in ns The update of the commit stats occurs after the commit thread has gone through all the logic that checks if there is another thread committing at the same time. This means that we only account for actual commit work in the commit stats we report and not the time the thread spends waiting until it is ready to do the commit work. Reviewed-by: Nikolay Borisov Signed-off-by: Ioannis Angelakopoulos Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 15 +++++++++++++++ fs/btrfs/transaction.c | 22 ++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6d9f81d4e99c09..64c65d0f7d0676 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -667,6 +667,18 @@ enum btrfs_exclusive_operation { BTRFS_EXCLOP_SWAP_ACTIVATE, }; +/* Store data about transaction commits, exported via sysfs. 
*/ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -1075,6 +1087,9 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 875b801ab3d7c0..c07bead4f0e49f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "misc.h" #include "ctree.h" #include "disk-io.h" @@ -2098,12 +2099,23 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } +static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) +{ + fs_info->commit_stats.commit_count++; + fs_info->commit_stats.last_commit_dur = interval; + fs_info->commit_stats.max_commit_dur = + max_t(u64, fs_info->commit_stats.max_commit_dur, interval); + fs_info->commit_stats.total_commit_dur += interval; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; + ktime_t start_time; + ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); @@ -2228,6 +2240,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } } + /* + * Get the time spent on the work done by the commit thread and not + * the time spent waiting on a previous commit + */ + start_time = 
ktime_get_ns(); + extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); @@ -2469,6 +2487,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trace_btrfs_transaction_commit(fs_info); + interval = ktime_get_ns() - start_time; + btrfs_scrub_continue(fs_info); if (current->journal_info == trans) @@ -2476,6 +2496,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); + update_commit_stats(fs_info, interval); + return ret; unlock_reloc: From be4a2f7f1fe424e770d766f60aa3a87ccea9a7a0 Mon Sep 17 00:00:00 2001 From: Ioannis Angelakopoulos Date: Tue, 14 Jun 2022 15:22:34 -0700 Subject: [PATCH 0535/1250] btrfs: sysfs: export commit stats Export commit stats in file /sys/fs/btrfs/UUID/commit_stats with example output like: commits 123 last_commit_ms 11 max_commit_ms 150 total_commit_ms 2000 The values are in one file so reading them at a single time will give a more consistent view. The stats are internally tracked in nanoseconds so the cumulative values should not suffer from rounding errors. Writing 0 to the file 'commit_stats' will reset max_commit_ms. Initial values are set at first mount of the filesystem. 
Reviewed-by: Nikolay Borisov Signed-off-by: Ioannis Angelakopoulos [ update changelog ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index db3736de14a5f3..c6307b111c2c75 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -991,6 +991,48 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); +static ssize_t btrfs_commit_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, + "commits %llu\n" + "last_commit_ms %llu\n" + "max_commit_ms %llu\n" + "total_commit_ms %llu\n", + fs_info->commit_stats.commit_count, + div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); +} + +static ssize_t btrfs_commit_stats_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long val; + int ret; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return ret; + if (val) + return -EINVAL; + + WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0); + + return len; +} +BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); + static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1230,6 +1272,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), + BTRFS_ATTR_PTR(, commit_stats), NULL, }; @@ -2236,4 +2279,3 @@ void __cold btrfs_exit_sysfs(void) #endif 
kset_unregister(btrfs_kset); } - From f80cbcb8fa18928d69f431cb9d4b6a2feab7706f Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 21 Jun 2022 15:40:59 +0900 Subject: [PATCH 0536/1250] btrfs: ensure pages are unlocked on cow_file_range() failure There is a hung_task report on zoned btrfs like below. https://github.com/naota/linux/issues/59 [726.328648] INFO: task rocksdb:high0:11085 blocked for more than 241 seconds. [726.329839] Not tainted 5.16.0-rc1+ #1 [726.330484] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [726.331603] task:rocksdb:high0 state:D stack: 0 pid:11085 ppid: 11082 flags:0x00000000 [726.331608] Call Trace: [726.331611] [726.331614] __schedule+0x2e5/0x9d0 [726.331622] schedule+0x58/0xd0 [726.331626] io_schedule+0x3f/0x70 [726.331629] __folio_lock+0x125/0x200 [726.331634] ? find_get_entries+0x1bc/0x240 [726.331638] ? filemap_invalidate_unlock_two+0x40/0x40 [726.331642] truncate_inode_pages_range+0x5b2/0x770 [726.331649] truncate_inode_pages_final+0x44/0x50 [726.331653] btrfs_evict_inode+0x67/0x480 [726.331658] evict+0xd0/0x180 [726.331661] iput+0x13f/0x200 [726.331664] do_unlinkat+0x1c0/0x2b0 [726.331668] __x64_sys_unlink+0x23/0x30 [726.331670] do_syscall_64+0x3b/0xc0 [726.331674] entry_SYSCALL_64_after_hwframe+0x44/0xae [726.331677] RIP: 0033:0x7fb9490a171b [726.331681] RSP: 002b:00007fb943ffac68 EFLAGS: 00000246 ORIG_RAX: 0000000000000057 [726.331684] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fb9490a171b [726.331686] RDX: 00007fb943ffb040 RSI: 000055a6bbe6ec20 RDI: 00007fb94400d300 [726.331687] RBP: 00007fb943ffad00 R08: 0000000000000000 R09: 0000000000000000 [726.331688] R10: 0000000000000031 R11: 0000000000000246 R12: 00007fb943ffb000 [726.331690] R13: 00007fb943ffb040 R14: 0000000000000000 R15: 00007fb943ffd260 [726.331693] While we debug the issue, we found running fstests generic/551 on 5GB non-zoned null_blk device in the emulated zoned mode also had a similar hung issue. 
Also, we can reproduce the same symptom with an error injected cow_file_range() setup. The hang occurs when cow_file_range() fails in the middle of allocation. cow_file_range() called from do_allocation_zoned() can split the given region ([start, end]) for allocation depending on current block group usages. When btrfs can allocate bytes for one part of the split regions but fails for the other region (e.g. because of -ENOSPC), we return the error leaving the pages in the succeeded regions locked. Technically, this occurs only when @unlock == 0. Otherwise, we unlock the pages in an allocated region after creating an ordered extent. Considering the callers of cow_file_range(unlock=0) won't write out the pages, we can unlock the pages on error exit from cow_file_range(). So, we can ensure all the pages except @locked_page are unlocked in the error case. In summary, cow_file_range now behaves like this: - page_started == 1 (return value) - All the pages are unlocked. IO is started. - unlock == 1 - All the pages except @locked_page are unlocked in any case - unlock == 0 - On success, all the pages are locked for writing them out - On failure, all the pages except @locked_page are unlocked Fixes: 42c011000963 ("btrfs: zoned: introduce dedicated data write path for zoned filesystems") CC: stable@vger.kernel.org # 5.12+ Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/inode.c | 72 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a00052bb194de2..5a58042a404b9a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1128,6 +1128,28 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * *page_started is set to one if we unlock locked_page and do everything * required to start IO on it. It may be clean and already done with * IO when we return.
+ * + * When unlock == 1, we unlock the pages in successfully allocated regions. + * When unlock == 0, we leave them locked for writing them out. + * + * However, we unlock all the pages except @locked_page in case of failure. + * + * In summary, page locking state will be as follow: + * + * - page_started == 1 (return value) + * - All the pages are unlocked. IO is started. + * - Note that this can happen only on success + * - unlock == 1 + * - All the pages except @locked_page are unlocked in any case + * - unlock == 0 + * - On success, all the pages are locked for writing out them + * - On failure, all the pages except @locked_page are unlocked + * + * When a failure happens in the second or later iteration of the + * while-loop, the ordered extents created in previous iterations are kept + * intact. So, the caller must clean them up by calling + * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for + * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, @@ -1137,6 +1159,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; + u64 orig_start = start; u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; @@ -1324,18 +1347,44 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + /* + * Now, we have three regions to clean up: + * + * |-------(1)----|---(2)---|-------------(3)----------| + * `- orig_start `- start `- start + cur_alloc_size `- end + * + * We process each region below. 
+ */ + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + /* - * If we reserved an extent for our delalloc range (or a subrange) and - * failed to create the respective ordered extent, then it means that - * when we reserved the extent we decremented the extent's size from - * the data space_info's bytes_may_use counter and incremented the - * space_info's bytes_reserved counter by the same amount. We must make - * sure extent_clear_unlock_delalloc() does not try to decrement again - * the data space_info's bytes_may_use counter, therefore we do not pass - * it the flag EXTENT_CLEAR_DATA_RESV. + * For the range (1). We have already instantiated the ordered extents + * for this region. They are cleaned up by + * btrfs_cleanup_ordered_extents() in e.g, + * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are + * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | + * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup + * function. + * + * However, in case of unlock == 0, we still need to unlock the pages + * (except @locked_page) to ensure all the pages are unlocked. + */ + if (!unlock && orig_start < start) + extent_clear_unlock_delalloc(inode, orig_start, start - 1, + locked_page, 0, page_ops); + + /* + * For the range (2). If we reserved an extent for our delalloc range + * (or a subrange) and failed to create the respective ordered extent, + * then it means that when we reserved the extent we decremented the + * extent's size from the data space_info's bytes_may_use counter and + * incremented the space_info's bytes_reserved counter by the same + * amount. We must make sure extent_clear_unlock_delalloc() does not try + * to decrement again the data space_info's bytes_may_use counter, + * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. 
*/ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, @@ -1347,6 +1396,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (start >= end) goto out; } + + /* + * For the range (3). We never touched the region. In addition to the + * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data + * space_info's bytes_may_use counter, reserved in + * btrfs_check_data_free_space(). + */ extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); From 49f8be645573afe88bfee0ba69e8d7bce4ec4f37 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 21 Jun 2022 15:41:00 +0900 Subject: [PATCH 0537/1250] btrfs: extend btrfs_cleanup_ordered_extents for NULL locked_page btrfs_cleanup_ordered_extents() assumes locked_page to be non-NULL, so it is not usable for submit_uncompressed_range() which can have NULL locked_page. Add support for the locked_page == NULL case. Also, rewrite the redundant "page_offset(locked_page)" calls.
Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/inode.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5a58042a404b9a..11ff5bb4015352 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -190,11 +190,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = page_offset(locked_page); - u64 page_end = page_start + PAGE_SIZE - 1; - + u64 page_start, page_end; struct page *page; + if (locked_page) { + page_start = page_offset(locked_page); + page_end = page_start + PAGE_SIZE - 1; + } + while (index <= end_index) { /* * For locked page, we will call end_extent_writepage() on it @@ -207,7 +210,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. 
*/ - if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + if (locked_page && index == (page_start >> PAGE_SHIFT)) { index++; continue; } @@ -226,17 +229,20 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, put_page(page); } - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) - return; - /* - * In case this page belongs to the delalloc range being instantiated - * then skip it, since the first page of a range is going to be - * properly cleaned up by the caller of run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + if (locked_page) { + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_start + PAGE_SIZE) + return; + /* + * In case this page belongs to the delalloc range being + * instantiated then skip it, since the first page of a range is + * going to be properly cleaned up by the caller of + * run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; + } } return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); From 44df0136127e420c2d13e36e40d98499b73be583 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 21 Jun 2022 15:41:01 +0900 Subject: [PATCH 0538/1250] btrfs: fix error handling of fallback uncompress write When cow_file_range() fails in the middle of the allocation loop, it unlocks the pages but leaves the ordered extents intact. Thus, we need to call btrfs_cleanup_ordered_extents() to finish the created ordered extents. Also, we need to call end_extent_writepage() if locked_page is available because btrfs_cleanup_ordered_extents() never processes the region on the locked_page. 
Furthermore, we need to set the mapping as error if locked_page is unavailable before unlocking the pages, so that the errno is properly propagated to the user space. CC: stable@vger.kernel.org # 5.18+ Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/inode.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 11ff5bb4015352..90b1c1df09432e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -928,8 +928,18 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, goto out; } if (ret < 0) { - if (locked_page) + btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + if (locked_page) { + const u64 page_start = page_offset(locked_page); + const u64 page_end = page_start + PAGE_SIZE - 1; + + btrfs_page_set_error(inode->root->fs_info, locked_page, + page_start, PAGE_SIZE); + set_page_writeback(locked_page); + end_page_writeback(locked_page); + end_extent_writepage(locked_page, ret, page_start, page_end); unlock_page(locked_page); + } goto out; } @@ -1378,9 +1388,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * However, in case of unlock == 0, we still need to unlock the pages * (except @locked_page) to ensure all the pages are unlocked. */ - if (!unlock && orig_start < start) + if (!unlock && orig_start < start) { + if (!locked_page) + mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, locked_page, 0, page_ops); + } /* * For the range (2). If we reserved an extent for our delalloc range From 86a20197ad882cf6b4c4a4f7be66c1cd9e3d572c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 21 Jun 2022 15:41:02 +0900 Subject: [PATCH 0539/1250] btrfs: replace unnecessary goto with direct return at cow_file_range() The 'goto out' in cow_file_range() in the exit block are not necessary and jump back. 
Replace them with return, while still keeping 'goto out' in the main code. Reviewed-by: Filipe Manana Signed-off-by: Naohiro Aota Reviewed-by: David Sterba [ keep goto in the main code, update changelog ] Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 90b1c1df09432e..fae0e8457edd40 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1413,7 +1413,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, page_ops); start += cur_alloc_size; if (start >= end) - goto out; + return ret; } /* @@ -1425,7 +1425,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); - goto out; + return ret; } /* From 060e1c6f383e454add726398a42bdac18dd1ebbd Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sun, 19 Jun 2022 21:47:56 +0800 Subject: [PATCH 0540/1250] btrfs: output mirror number for bad metadata When handling a real world transid mismatch image, it's hard to know which copy is corrupted, as the error messages just look like this: BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0 BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0 BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0 BTRFS warning (device dm-3): checksum verify failed on 30408704 wanted 0xcdcdcdcd found 0x3c0adc8e level 0 We don't even know if the retry is caused by btrfs or the VFS retry. To make things a little easier to read, add mirror number for all related tree block read errors. 
So the above messages would look like this: BTRFS warning (device dm-3): checksum verify failed on logical 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0 BTRFS warning (device dm-3): checksum verify failed on logical 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0 BTRFS warning (device dm-3): checksum verify failed on logical 30408704 mirror 1 wanted 0xcdcdcdcd found 0x3c0adc8e level 0 BTRFS warning (device dm-3): checksum verify failed on logical 30408704 mirror 2 wanted 0xcdcdcdcd found 0x3c0adc8e level 0 Signed-off-by: Qu Wenruo [ update messages, add "logical" ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 32b88a2277340c..4f514919b03b74 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -221,8 +221,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, goto out; } btrfs_err_rl(eb->fs_info, - "parent transid verify failed on %llu wanted %llu found %llu", - eb->start, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); ret = 1; clear_extent_buffer_uptodate(eb); @@ -552,21 +552,23 @@ static int validate_extent_buffer(struct extent_buffer *eb) found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", - eb->start, found_start); + btrfs_err_rl(fs_info, + "bad tree block start, mirror %u want %llu have %llu", + eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } if (check_tree_block_fsid(eb)) { - btrfs_err_rl(fs_info, "bad fsid on block %llu", - eb->start); + btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", + eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - 
btrfs_err(fs_info, "bad tree block level %d on %llu", - (int)btrfs_header_level(eb), eb->start); + btrfs_err(fs_info, + "bad tree block level, mirror %u level %d on logical %llu", + eb->read_mirror, btrfs_header_level(eb), eb->start); ret = -EIO; goto out; } @@ -577,8 +579,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, - "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", - eb->start, +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", + eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); @@ -603,8 +605,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) set_extent_buffer_uptodate(eb); else btrfs_err(fs_info, - "block=%llu read time tree block corruption detected", - eb->start); + "read time tree block corruption detected on logical %llu mirror %u", + eb->start, eb->read_mirror); out: return ret; } From bd733292548e014295dd18ad3d6e0dac73df7e4c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 17 May 2022 16:50:30 +0200 Subject: [PATCH 0541/1250] btrfs: send: add OTIME as utimes attribute for proto 2+ by default When send v1 was introduced the otime (inode creation time) was not available, however the attribute in btrfs send protocol exists. Though it would be possible to add it for v1 too as the attribute would be ignored by v1 receive, let's not change the layout of v1 and only add that to v2+. The otime cannot be changed and is only informative. 
Signed-off-by: David Sterba --- fs/btrfs/send.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d31cd39edff4e4..c2c429005e733c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2580,7 +2580,8 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); - /* TODO Add otime support when the otime patches get into upstream */ + if (sctx->proto >= 2) + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime); ret = send_cmd(sctx); From f4ea8c9d48259fa63d5e70d748040f543dfcaee1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 18 May 2022 18:02:55 +0200 Subject: [PATCH 0542/1250] btrfs: send: add new command FILEATTR for file attributes There are file attributes inherited from previous ext2 SETFLAGS/GETFLAGS and later from XFLAGS interfaces, now commonly found under the 'fileattr' API. This corresponds to the individual inode bits and that's part of the on-disk format, so this is suitable for the protocol. The other interfaces contain a lot of cruft or bits that btrfs does not support yet. Currently the value is u64 and matches btrfs_inode_item. Not all the bits can be set by ioctls (like NODATASUM or READONLY), but we can send them over the protocol and leave it up to the receiving side what and how to apply. As some of the flags, eg. IMMUTABLE, can prevent any further changes, the receiving side needs to understand that and apply the changes in the right order, or possibly with some intermediate steps. This should be easier, future proof and simpler on the protocol layer than implementing in kernel. 
Signed-off-by: David Sterba --- fs/btrfs/send.c | 104 +++++++++++++++++++++++++++++++++++------------- fs/btrfs/send.h | 10 ++++- 2 files changed, 85 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c2c429005e733c..3ed80da71dad29 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -844,7 +844,7 @@ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) */ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, - u64 *gid, u64 *rdev) + u64 *gid, u64 *rdev, u64 *fileattr) { int ret; struct btrfs_inode_item *ii; @@ -874,6 +874,12 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, *gid = btrfs_inode_gid(path->nodes[0], ii); if (rdev) *rdev = btrfs_inode_rdev(path->nodes[0], ii); + /* + * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's + * otherwise logically split to 32/32 parts. + */ + if (fileattr) + *fileattr = btrfs_inode_flags(path->nodes[0], ii); return ret; } @@ -881,7 +887,7 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, static int get_inode_info(struct btrfs_root *root, u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) + u64 *rdev, u64 *fileattr) { struct btrfs_path *path; int ret; @@ -890,7 +896,7 @@ static int get_inode_info(struct btrfs_root *root, if (!path) return -ENOMEM; ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, - rdev); + rdev, fileattr); btrfs_free_path(path); return ret; } @@ -1636,7 +1642,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) u64 right_gen; ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; left_ret = ret; @@ -1645,7 +1651,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) right_ret = -ENOENT; } else { ret = 
get_inode_info(sctx->parent_root, ino, NULL, &right_gen, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; right_ret = ret; @@ -1808,7 +1814,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, if (dir_gen) { ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0) goto out; } @@ -1880,7 +1886,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, */ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1908,7 +1914,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (other_inode > sctx->send_progress || is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, who_mode, NULL, NULL, NULL); + who_gen, who_mode, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -1947,7 +1953,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, if (dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1970,7 +1976,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, } ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret < 0) goto out; @@ -2501,6 +2507,39 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) return ret; } +static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + if (sctx->proto < 2) + return 0; + + btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); + + p = fs_path_alloc(); + if (!p) + 
return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; @@ -2615,7 +2654,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) if (ino != sctx->cur_ino) { ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, - NULL, NULL, &rdev); + NULL, NULL, &rdev, NULL); if (ret < 0) goto out; } else { @@ -3318,7 +3357,7 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) * The parent inode might have been deleted in the send snapshot */ ret = get_inode_info(sctx->send_root, cur->dir, NULL, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); if (ret == -ENOENT) { ret = 0; continue; @@ -3493,11 +3532,11 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, } ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, - &left_gen, NULL, NULL, NULL, NULL); + &left_gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, - &right_gen, NULL, NULL, NULL, NULL); + &right_gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ -3628,7 +3667,7 @@ static int is_ancestor(struct btrfs_root *root, } ret = get_inode_info(root, parent, NULL, &parent_gen, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = check_ino_in_path(root, ino1, ino1_gen, @@ -3720,7 +3759,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, ret = get_inode_info(sctx->parent_root, ino, NULL, &parent_ino_gen, NULL, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; if (ino_gen 
== parent_ino_gen) { @@ -4326,8 +4365,7 @@ static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, if (!p) return -ENOMEM; - ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); + ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -4415,7 +4453,7 @@ static int __find_iref(int num, u64 dir, int index, * else matches. */ ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret) return ret; if (dir_gen != ctx->dir_gen) @@ -4459,7 +4497,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index, struct send_ctx *sctx = ctx; ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret) return ret; @@ -4482,7 +4520,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index, struct send_ctx *sctx = ctx; ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret) return ret; @@ -5031,7 +5069,7 @@ static int send_clone(struct send_ctx *sctx, if (clone_root->root == sctx->send_root) { ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, - &gen, NULL, NULL, NULL, NULL); + &gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); @@ -5540,7 +5578,8 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, * accept clones from these extents. 
*/ ret = __get_inode_info(clone_root->root, path, clone_root->ino, - &clone_src_i_size, NULL, NULL, NULL, NULL, NULL); + &clone_src_i_size, NULL, NULL, NULL, NULL, NULL, + NULL); btrfs_release_path(path); if (ret < 0) goto out; @@ -6235,11 +6274,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 left_mode; u64 left_uid; u64 left_gid; + u64 left_fileattr; u64 right_mode; u64 right_uid; u64 right_gid; + u64 right_fileattr; int need_chmod = 0; int need_chown = 0; + bool need_fileattr = false; int need_truncate = 1; int pending_move = 0; int refs_processed = 0; @@ -6273,7 +6315,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, - &left_mode, &left_uid, &left_gid, NULL); + &left_mode, &left_uid, &left_gid, NULL, &left_fileattr); if (ret < 0) goto out; @@ -6288,7 +6330,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &old_size, NULL, &right_mode, &right_uid, - &right_gid, NULL); + &right_gid, NULL, &right_fileattr); if (ret < 0) goto out; @@ -6296,6 +6338,8 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) need_chmod = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr) + need_fileattr = true; if ((old_size == sctx->cur_inode_size) || (sctx->cur_inode_size > old_size && sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) @@ -6339,6 +6383,12 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; } + if (need_fileattr) { + ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_fileattr); + if (ret < 0) + goto out; + } ret = send_capabilities(sctx); if (ret < 0) @@ -6750,12 +6800,12 @@ static int dir_changed(struct send_ctx *sctx, u64 dir) int ret; ret = get_inode_info(sctx->send_root, 
dir, NULL, &new_gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret) return ret; ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret) return ret; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 7f615ddc8d9c5a..4bb4e6a638cb4a 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -88,7 +88,7 @@ enum btrfs_send_cmd { /* Version 2 */ BTRFS_SEND_C_FALLOCATE = 23, - BTRFS_SEND_C_SETFLAGS = 24, + BTRFS_SEND_C_FILEATTR = 24, BTRFS_SEND_C_ENCODED_WRITE = 25, BTRFS_SEND_C_MAX_V2 = 25, @@ -141,7 +141,13 @@ enum { /* Version 2 */ BTRFS_SEND_A_FALLOCATE_MODE = 25, - BTRFS_SEND_A_SETFLAGS_FLAGS = 26, + /* + * File attributes from the FS_*_FL namespace (i_flags, xflags), + * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored + * in btrfs_inode_item::flags (represented by btrfs_inode::flags and + * btrfs_inode::ro_flags). + */ + BTRFS_SEND_A_FILEATTR = 26, BTRFS_SEND_A_UNENCODED_FILE_LEN = 27, BTRFS_SEND_A_UNENCODED_LEN = 28, From 90411b6a836bdb0f29957b6bdd70da7604aaa1d4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 13 Jun 2022 18:31:17 -0400 Subject: [PATCH 0543/1250] btrfs: reset block group chunk force if we have to wait If you try to force a chunk allocation, but you race with another chunk allocation, you will end up waiting on the chunk allocation that just occurred and then allocate another chunk. If you have many threads all doing this at once you can way over-allocate chunks. Fix this by resetting force to NO_FORCE, that way if we think we need to allocate we can, otherwise we don't force another chunk allocation if one is already happening. 
Reviewed-by: Filipe Manana CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ede389f2602d5c..13358fbc162977 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -3761,6 +3761,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, * attempt. */ wait_for_alloc = true; + force = CHUNK_ALLOC_NO_FORCE; spin_unlock(&space_info->lock); mutex_lock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->chunk_mutex); From 73b85cb2da613b693dd29621814381288829e4a0 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 22 Jun 2022 20:45:18 +0200 Subject: [PATCH 0544/1250] btrfs: print checksum type and implementation at mount time Per user request, print the checksum type and implementation at mount time among the messages. The checksum is user configurable and the actual crypto implementation is useful to see for performance reasons. The same information is also available after mount in /sys/fs/FSID/checksum file. 
Example: [25.323662] BTRFS info (device vdb): using sha256 (sha256-generic) checksum algorithm Link: https://github.com/kdave/btrfs-progs/issues/483 Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4f514919b03b74..243bd7bd79cd75 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2425,6 +2425,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; + btrfs_info(fs_info, "using %s (%s) checksum algorithm", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(csum_shash)); return 0; } From 54de4818013b45d3fb585d108c8de1618e77acc9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 23 Jun 2022 10:55:47 +0300 Subject: [PATCH 0545/1250] btrfs: properly flag filesystem with BTRFS_FEATURE_INCOMPAT_BIG_METADATA Commit 6f93e834fa7c seemingly inadvertently moved the code responsible for flagging the filesystem as having BIG_METADATA to a place where setting the flag was essentially lost. This means that filesystems created with kernels containing this bug (starting with 5.15) can potentially be mounted by older (pre-3.4) kernels. In reality chances for this happening are low because there are other incompat flags introduced in the mean time. Still the correct behavior is to set INCOMPAT_BIG_METADATA flag and persist this in the superblock. 
Fixes: 6f93e834fa7c ("btrfs: fix upper limit for max_inline for page size 64K") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Qu Wenruo Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 243bd7bd79cd75..e12fd3abd689b3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3484,16 +3484,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - /* - * Flag our filesystem as having big metadata blocks if they are bigger - * than the page size. - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - btrfs_info(fs_info, - "flagging fs with big metadata feature"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } /* Set up fs_info before parsing mount options */ nodesize = btrfs_super_nodesize(disk_super); @@ -3534,6 +3524,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); + /* + * Flag our filesystem as having big metadata blocks if they are bigger + * than the page size. + */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + btrfs_info(fs_info, + "flagging fs with big metadata feature"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + /* * mixed block groups end up with duplicate but slightly offset * extent buffers for the same range. 
It leads to corruptions From da8d237a5ada1f481a1efbe68dad9e2008779bec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 23 Jun 2022 16:57:00 +0200 Subject: [PATCH 0546/1250] btrfs: use mask for all RAID1* profiles in btrfs_calc_avail_data_space There's a sequence of hard coded values for RAID1 profiles that are already stored in the raid_attr table that should be used instead. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/super.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 41652dcd16f436..4c7089b1681b32 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2243,12 +2243,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID1C3) - num_stripes = 3; - else if (type & BTRFS_BLOCK_GROUP_RAID1C4) - num_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK) + num_stripes = rattr->ncopies; else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; From 798959741990935f45b0a785e160c159ad3e045f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 23 Jun 2022 16:57:02 +0200 Subject: [PATCH 0547/1250] btrfs: merge calculations for simple striped profiles in btrfs_rmap_block Use the same expression for stripe_nr for RAID0 (map->sub_stripes is 1) and RAID10 (map->sub_stripes is 2), with equivalent results. 
Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 13358fbc162977..e930749770ac58 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1816,11 +1816,10 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { stripe_nr = stripe_nr * map->num_stripes + i; stripe_nr = div_u64(stripe_nr, map->sub_stripes); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = stripe_nr * map->num_stripes + i; } /* * The remaining case would be for RAID56, multiply by From 9660943126730d535e8ce2dd1f30aa0000126f5d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 21 Jun 2022 18:40:48 +0200 Subject: [PATCH 0548/1250] btrfs: clean up chained assignments The chained assignments may be convenient to write, but make readability a bit worse as it's too easy to overlook that there are several values set on the same line while this is rather an exception. Making it consistent everywhere avoids surprises. The pattern where inode times are initialized reuses the first value and the order is mtime, ctime. In other blocks the assignments are expanded so the order of variables is similar to the neighboring code. 
Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 3 +- fs/btrfs/file.c | 9 ++++-- fs/btrfs/free-space-cache.c | 3 +- fs/btrfs/inode.c | 41 ++++++++++++++++++---------- fs/btrfs/reflink.c | 6 ++-- fs/btrfs/tests/extent-buffer-tests.c | 3 +- fs/btrfs/transaction.c | 4 +-- fs/btrfs/volumes.c | 3 +- 8 files changed, 46 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a7dd6ba25e990f..f43196a893ca3c 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -587,7 +587,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, ASSERT(!IS_ERR(em)); map = em->map_lookup; - num_extents = cur_extent = 0; + num_extents = 0; + cur_extent = 0; for (i = 0; i < map->num_stripes; i++) { /* We have more device extent to copy */ if (srcdev != map->stripes[i].dev) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 89c6d7ff19874c..734baa729cd394 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2058,9 +2058,11 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, num_written = btrfs_encoded_write(iocb, from, encoded); num_sync = encoded->len; } else if (iocb->ki_flags & IOCB_DIRECT) { - num_written = num_sync = btrfs_direct_write(iocb, from); + num_written = btrfs_direct_write(iocb, from); + num_sync = num_written; } else { - num_written = num_sync = btrfs_buffered_write(iocb, from); + num_written = btrfs_buffered_write(iocb, from); + num_sync = num_written; } btrfs_set_inode_last_sub_trans(inode); @@ -3100,7 +3102,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index b1ae3ba2ca2c37..996da650ecdc33 
100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3536,7 +3536,8 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, * data, keep it dense. */ if (btrfs_test_opt(fs_info, SSD_SPREAD)) { - cont1_bytes = min_bytes = bytes + empty_size; + cont1_bytes = bytes + empty_size; + min_bytes = cont1_bytes; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; min_bytes = fs_info->sectorsize; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fae0e8457edd40..e4632c83453e29 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3129,8 +3129,10 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) - num_bytes = ram_bytes = oe->truncated_len; + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + num_bytes = oe->truncated_len; + ram_bytes = num_bytes; + } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -4317,8 +4319,9 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = - dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; + dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4480,7 +4483,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); - 
dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); if (ret) btrfs_abort_transaction(trans, ret); @@ -5121,9 +5125,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) */ if (newsize != oldsize) { inode_inc_iversion(inode); - if (!(mask & (ATTR_CTIME | ATTR_MTIME))) - inode->i_ctime = inode->i_mtime = - current_time(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } } if (newsize > oldsize) { @@ -7571,7 +7576,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - *map = em = em2; + *map = em2; + em = em2; } if (IS_ERR(em2)) { @@ -9208,8 +9214,10 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; + old_dir->i_mtime = ctime; + old_dir->i_ctime = ctime; + new_dir->i_mtime = ctime; + new_dir->i_ctime = ctime; old_inode->i_ctime = ctime; new_inode->i_ctime = ctime; @@ -9472,9 +9480,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - old_inode->i_ctime = current_time(old_dir); + old_dir->i_mtime = current_time(old_dir); + old_dir->i_ctime = old_dir->i_mtime; + new_dir->i_mtime = old_dir->i_mtime; + new_dir->i_ctime = old_dir->i_mtime; + old_inode->i_ctime = old_dir->i_mtime; if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -10629,7 +10639,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = 
-ENOBUFS; goto out_em; } - disk_io_size = count = em->block_len; + disk_io_size = em->block_len; + count = em->block_len; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - em->orig_start; ret = btrfs_encoded_io_compression_from_extent(fs_info, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 8a6cabdb8f9328..9acf47b11fe63e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -23,8 +23,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, int ret; inode_inc_iversion(inode); - if (!no_time_update) - inode->i_mtime = inode->i_ctime = current_time(inode); + if (!no_time_update) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 51a8b075c25983..b7d181a08eabd8 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -47,7 +47,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, nodesize); + path->nodes[0] = eb; if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c07bead4f0e49f..0bec10740ad392 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1832,8 +1832,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + dentry->d_name.len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = - current_time(parent_inode); + parent_inode->i_mtime = current_time(parent_inode); + parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, 
BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 076040310f6fbb..2d788a351c1f27 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7211,7 +7211,8 @@ static int read_one_dev(struct extent_buffer *leaf, u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - devid = args.devid = btrfs_device_id(leaf, dev_item); + devid = btrfs_device_id(leaf, dev_item); + args.devid = devid; read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), From 1e2e783f4e3c563ebc3ccacf965458ae4feaa29e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 23 Jun 2022 10:57:52 +0300 Subject: [PATCH 0549/1250] btrfs: don't print 'flagging with big metadata' anymore on mount Added in commit 727011e07cbd ("Btrfs: allow metadata blocks larger than the page size") in 2010 and it's been default for mkfs since 3.12 (2013). The message doesn't really convey any useful information to users. Remove it. Reviewed-by: Qu Wenruo Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e12fd3abd689b3..b3bfd905c6548f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3528,12 +3528,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * Flag our filesystem as having big metadata blocks if they are bigger * than the page size. 
*/ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - btrfs_info(fs_info, - "flagging fs with big metadata feature"); + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } /* * mixed block groups end up with duplicate but slightly offset From 2d75b5ec04942d6a171e98ef1931b7d3e568bdc4 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 23 Jun 2022 11:08:58 +0300 Subject: [PATCH 0550/1250] btrfs: don't print 'has skinny extents' anymore on mount Skinny extents have been a default mkfs feature since version 3.18 (introduced in btrfs-progs commit 6715de04d9a7 ("btrfs-progs: mkfs: make skinny-metadata default")). The message really doesn't bring any value to users, so simply remove it. Reviewed-by: Qu Wenruo Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b3bfd905c6548f..ae7fc4ed25244b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3521,9 +3521,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; - if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - btrfs_info(fs_info, "has skinny extents"); - /* * Flag our filesystem as having big metadata blocks if they are bigger * than the page size. From 457f03dc49acc2d8f10a5f5947221e4dc06787dc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 24 Jun 2022 11:01:22 +0300 Subject: [PATCH 0551/1250] btrfs: sysfs: remove MIXED_BACKREF feature file This feature has been the default for about 13 years. At this point it's safe to consider it an indispensable feature of BTRFS; as such, there's no need to advertise it in sysfs.
Remove the global sysfs feature file, the per-filesystem feature file has never been there. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c6307b111c2c75..73f99a9647d6e7 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -276,7 +276,6 @@ static umode_t btrfs_feature_visible(struct kobject *kobj, return mode; } -BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); @@ -308,7 +307,6 @@ BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); * can be changed on a mounted filesystem. */ static struct attribute *btrfs_supported_feature_attrs[] = { - BTRFS_FEAT_ATTR_PTR(mixed_backref), BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), From bd749ca0b994d0c2204f7762f93b80cf8739ca24 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 24 Jun 2022 11:01:23 +0300 Subject: [PATCH 0552/1250] btrfs: sysfs: remove BIG_METADATA feature files This flag has been merged in 3.10 and is effectively always-on. Its status depends on the host page size so there's another way to guarantee compatibility with old kernels. Due to a bug introduced in 6f93e834fa7c ("btrfs: fix upper limit for max_inline for page size 64K") the flag is not persisted among features in the superblock so it's not reliable. 
Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 73f99a9647d6e7..d5d0717fd09a35 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -280,7 +280,6 @@ BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); -BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); @@ -311,7 +310,6 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), BTRFS_FEAT_ATTR_PTR(compress_zstd), - BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), From 685c6269600eb0972a73882b5596cc5659580cb0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 6 Jul 2022 11:14:23 +0100 Subject: [PATCH 0553/1250] btrfs: remove the inode cache check at btrfs_is_free_space_inode() The inode cache feature was removed in kernel 5.11, and we no longer have any code that reads from or writes to inode caches. We may still mount a filesystem that has inode caches, but they are ignored. Remove the check for an inode cache from btrfs_is_free_space_inode(), since we no longer have code to trigger reads from an inode cache or writes to an inode cache. The check at send.c is still needed, because in case we find a filesystem with an inode cache, we must ignore it. Also leave the checks at tree-checker.c, as they are sanity checks. This eliminates a dead branch and reduces the amount of code since it's in an inline function. 
Before: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1620662 189240 29032 1838934 1c0f56 fs/btrfs/btrfs.ko After: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1620502 189240 29032 1838774 1c0eb6 fs/btrfs/btrfs.ko Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 33811e896623f0..b467264bd1bbd4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -305,8 +305,7 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) if (root == root->fs_info->tree_root && btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) return true; - if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID) - return true; + return false; } From 33f45a36b3b10ae58c453e1734160de896b59adb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:05 +0200 Subject: [PATCH 0554/1250] btrfs: raid56: use fixed stripe length everywhere The raid56 code assumes a fixed stripe length BTRFS_STRIPE_LEN but there are functions passing it as arguments, this is not necessary. The fixed value has been used for a long time and though the stripe length should be configurable by super block member stripesize, this hasn't been implemented and would require more changes so we don't need to keep this code around until then. Partially based on a patch from Qu Wenruo. 
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig [ update changelog ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 61 ++++++++++++++++++++-------------------------- fs/btrfs/raid56.h | 12 +++------ fs/btrfs/scrub.c | 9 +++---- fs/btrfs/volumes.c | 13 ++++------ 4 files changed, 39 insertions(+), 56 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c6411c849fea04..f4d3200a14dc57 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -474,9 +474,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio) int ret = 1; spin_lock_irqsave(&rbio->bio_list_lock, flags); - if (size != rbio->nr_data * rbio->stripe_len) + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; - BUG_ON(size > rbio->nr_data * rbio->stripe_len); + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); spin_unlock_irqrestore(&rbio->bio_list_lock, flags); return ret; @@ -913,18 +913,17 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, * this does not allocate any pages for rbio->pages. 
*/ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_io_context *bioc, - u32 stripe_len) + struct btrfs_io_context *bioc) { const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; const unsigned int num_pages = stripe_npages * real_stripes; - const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits; + const unsigned int stripe_nsectors = + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; void *p; - ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE)); /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); /* @@ -948,7 +947,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bioc = bioc; - rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; @@ -1020,7 +1018,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector, unsigned int stripe_nr, unsigned int sector_nr, - unsigned long bio_max_len, unsigned int opf) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; @@ -1065,7 +1062,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL), + bio = bio_alloc(stripe->dev->bdev, + max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), opf, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> 9; bio->bi_private = rbio; @@ -1287,8 +1285,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, - REQ_OP_WRITE); + sectornr, 
REQ_OP_WRITE); if (ret) goto cleanup; } @@ -1327,8 +1324,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->bioc->tgtdev_map[stripe], - sectornr, rbio->stripe_len, - REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -1373,7 +1369,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bioc->num_stripes; i++) { stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, rbio->stripe_len) && + if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; } @@ -1395,7 +1391,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->nr_data; i++) { u64 stripe_start = rbio->bioc->raid_map[i]; - if (in_range(logical, stripe_start, rbio->stripe_len)) + if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) return i; } return -1; @@ -1580,8 +1576,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) continue; ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, rbio->stripe_len, - REQ_OP_READ); + stripe, sectornr, REQ_OP_READ); if (ret) goto cleanup; } @@ -1790,7 +1785,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) ASSERT(orig_logical >= full_stripe_start && orig_logical + orig_len <= full_stripe_start + - rbio->nr_data * rbio->stripe_len); + rbio->nr_data * BTRFS_STRIPE_LEN); bio_list_add(&rbio->bio_list, orig_bio); rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; @@ -1808,7 +1803,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) /* * our main entry point for writes from the rest of the FS. 
*/ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len) +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; @@ -1816,7 +1811,7 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stri struct blk_plug_cb *cb; int ret; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { btrfs_put_bioc(bioc); return PTR_ERR(rbio); @@ -2140,8 +2135,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) } sector = rbio_stripe_sector(rbio, stripe, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, - REQ_OP_READ); + sectornr, REQ_OP_READ); if (ret < 0) goto cleanup; } @@ -2199,7 +2193,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * of the drive. */ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, int mirror_num, int generic_io) + int mirror_num, int generic_io) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; @@ -2210,7 +2204,7 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, btrfs_bio(bio)->mirror_num = mirror_num; } - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { if (generic_io) btrfs_put_bioc(bioc); @@ -2304,14 +2298,14 @@ static void read_rebuild_work(struct work_struct *work) struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ 
-2356,7 +2350,7 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, ASSERT(logical >= rbio->bioc->raid_map[0]); ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + - rbio->stripe_len * rbio->nr_data); + BTRFS_STRIPE_LEN * rbio->nr_data); stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); index = stripe_offset / sectorsize; rbio->bio_sectors[index].page = page; @@ -2512,7 +2506,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, - sectornr, rbio->stripe_len, REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2526,7 +2520,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, bioc->tgtdev_map[rbio->scrubp], - sectornr, rbio->stripe_len, REQ_OP_WRITE); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2693,7 +2687,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) continue; ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, - sectornr, rbio->stripe_len, REQ_OP_READ); + sectornr, REQ_OP_READ); if (ret) goto cleanup; } @@ -2758,13 +2752,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. 
*/ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length) +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(fs_info, bioc, length); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index c73bceb2b46162..1dce205b79bf96 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -56,9 +56,6 @@ struct btrfs_raid_bio { */ enum btrfs_rbio_ops operation; - /* Size of each individual stripe on disk */ - u32 stripe_len; - /* How many pages there are for the full stripe including P/Q */ u16 nr_pages; @@ -169,21 +166,20 @@ static inline int nr_data_stripes(const struct map_lookup *map) struct btrfs_device; int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u32 stripe_len, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len); + int mirror_num, int generic_io); +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, unsigned int pgoff, u64 logical); struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, - struct btrfs_io_context *bioc, u32 stripe_len, + struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length); +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a0c45e92bd6cb7..ad7958d18158f6 100644 --- 
a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1216,7 +1216,6 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, u64 *raid_map, - u64 mapped_length, int nstripes, int mirror, int *stripe_index, u64 *stripe_offset) @@ -1231,7 +1230,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, continue; if (logical >= raid_map[i] && - logical < raid_map[i] + mapped_length) + logical < raid_map[i] + BTRFS_STRIPE_LEN) break; } @@ -1335,7 +1334,6 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, scrub_stripe_index_and_offset(logical, bioc->map_type, bioc->raid_map, - mapped_length, bioc->num_stripes - bioc->num_tgtdevs, mirror_index, @@ -1387,7 +1385,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, mirror_num = sector->sblock->sectors[0]->mirror_num; ret = raid56_parity_recover(bio, sector->recover->bioc, - sector->recover->map_length, mirror_num, 0); if (ret) return ret; @@ -2195,7 +2192,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(bio, bioc, length); + rbio = raid56_alloc_missing_rbio(bio, bioc); if (!rbio) goto rbio_out; @@ -2829,7 +2826,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, sparity->scrub_dev, &sparity->dbitmap, sparity->nsectors); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2d788a351c1f27..36a5466266c416 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6461,6 +6461,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ASSERT(map->stripe_len == 
BTRFS_STRIPE_LEN); if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, @@ -6758,14 +6759,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { - /* In this case, map_length has been set to the length of - a single stripe; not the whole write */ - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(bio, bioc, map_length); - } else { - ret = raid56_parity_recover(bio, bioc, map_length, - mirror_num, 1); - } + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + ret = raid56_parity_write(bio, bioc); + else + ret = raid56_parity_recover(bio, bioc, mirror_num, 1); goto out_dec; } From eaf70712ec7f68c6ef758ed3a497eece99b525be Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 17 Jun 2022 12:04:06 +0200 Subject: [PATCH 0555/1250] btrfs: return proper mapped length for RAID56 profiles in __btrfs_map_block() For profiles other than RAID56, __btrfs_map_block() returns @map_length as min(stripe_end, logical + *length), which is also the same result from btrfs_get_io_geometry(). But for RAID56, __btrfs_map_block() returns @map_length as stripe_len. This strange behavior is going to hurt incoming bio split at btrfs_map_bio() time, as we will use @map_length as bio split size. Fix this behavior by returning @map_length by the same calculation as for other profiles. 
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Qu Wenruo Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 36a5466266c416..2eb72dda764c22 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6471,7 +6471,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, num_stripes = map->num_stripes; max_errors = btrfs_chunk_max_errors(map); - *length = map->stripe_len; + /* Return the length to the full stripe end */ + *length = min(logical + *length, + raid56_full_stripe_start + em->start + + data_stripes * stripe_len) - logical; stripe_index = 0; stripe_offset = 0; } else { From 911aa8598805c8528f9ea4176286b882babfa2a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:07 +0200 Subject: [PATCH 0556/1250] btrfs: do not return errors from btrfs_map_bio Always consume the bio and call the end_io handler on error instead of returning an error and letting the caller handle it. This matches what the block layer submission does and avoids any confusion on who needs to handle errors. As this requires touching all the callers, rename the function to btrfs_submit_bio, which describes the functionality much better. 
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/compression.c | 8 ++------ fs/btrfs/disk-io.c | 21 ++++++++++----------- fs/btrfs/inode.c | 25 ++++++++++--------------- fs/btrfs/volumes.c | 12 +++++++----- fs/btrfs/volumes.h | 3 +-- 5 files changed, 30 insertions(+), 39 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 63d542961b78a1..907fc8a4c092cb 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -593,9 +593,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, } ASSERT(bio->bi_iter.bi_size); - ret = btrfs_map_bio(fs_info, bio, 0); - if (ret) - goto finish_cb; + btrfs_submit_bio(fs_info, bio, 0); bio = NULL; } cond_resched(); @@ -931,9 +929,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, sums += fs_info->csum_size * nr_sectors; ASSERT(comp_bio->bi_iter.bi_size); - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); - if (ret) - goto finish_cb; + btrfs_submit_bio(fs_info, comp_bio, mirror_num); comp_bio = NULL; } } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ae7fc4ed25244b..5719712f2d4c4f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -731,7 +731,6 @@ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async; struct inode *inode; - blk_status_t ret; async = container_of(work, struct async_submit_bio, work); inode = async->inode; @@ -749,11 +748,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. 
*/ async->bio->bi_opf |= REQ_CGROUP_PUNT; - ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); - if (ret) { - async->bio->bi_status = ret; - bio_endio(async->bio); - } + btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -817,7 +812,7 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, { /* * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_map_bio + * submission context. Just jump into btrfs_submit_bio. */ return btree_csum_one_bio(bio); } @@ -842,11 +837,15 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ bio->bi_opf |= REQ_META; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else if (!should_async_write(fs_info, BTRFS_I(inode))) { + btrfs_submit_bio(fs_info, bio, mirror_num); + return; + } + if (!should_async_write(fs_info, BTRFS_I(inode))) { ret = btree_csum_one_bio(bio); - if (!ret) - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (!ret) { + btrfs_submit_bio(fs_info, bio, mirror_num); + return; + } } else { /* * kthread helpers are used to submit writes so that diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e4632c83453e29..fe7e8af21c2d44 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2687,7 +2687,8 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro goto out; } } - ret = btrfs_map_bio(fs_info, bio, mirror_num); + btrfs_submit_bio(fs_info, bio, mirror_num); + return; out: if (ret) { bio->bi_status = ret; @@ -2715,14 +2716,13 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * not, which is why we ignore skip_sum here. 
*/ ret = btrfs_lookup_bio_sums(inode, bio, NULL); - if (ret) - goto out; - ret = btrfs_map_bio(fs_info, bio, mirror_num); -out: if (ret) { bio->bi_status = ret; bio_endio(bio); + return; } + + btrfs_submit_bio(fs_info, bio, mirror_num); } /* @@ -7945,8 +7945,7 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, BUG_ON(bio_op(bio) == REQ_OP_WRITE); refcount_inc(&dip->refs); - if (btrfs_map_bio(fs_info, bio, mirror_num)) - refcount_dec(&dip->refs); + btrfs_submit_bio(fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, @@ -8046,7 +8045,8 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, file_offset - dip->file_offset); } map: - return btrfs_map_bio(fs_info, bio, 0); + btrfs_submit_bio(fs_info, bio, 0); + return BLK_STS_OK; } static void btrfs_submit_direct(const struct iomap_iter *iter, @@ -10330,7 +10330,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { struct btrfs_encoded_read_private *priv = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; @@ -10341,12 +10340,8 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, } atomic_inc(&priv->pending); - ret = btrfs_map_bio(fs_info, bio, mirror_num); - if (ret) { - atomic_dec(&priv->pending); - btrfs_bio_free_csum(bbio); - } - return ret; + btrfs_submit_bio(fs_info, bio, mirror_num); + return BLK_STS_OK; } static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2eb72dda764c22..6b2ad30e022161 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6726,8 +6726,8 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, } } btrfs_debug_in_rcu(fs_info, - "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + "%s: 
rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); @@ -6737,8 +6737,7 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, submit_bio(bio); } -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num) +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) { u64 logical = bio->bi_iter.bi_sector << 9; u64 length = bio->bi_iter.bi_size; @@ -6783,7 +6782,10 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } out_dec: btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); + if (ret) { + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + } } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9537d82bb7a201..5639961b3626f7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -580,8 +580,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, From 93a79ca87ef031b10cb82f8c2cb63478e29a653f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:08 +0200 Subject: [PATCH 0557/1250] btrfs: do not return errors from raid56_parity_write Always consume the bio and call the end_io handler on error instead of returning an error and letting the caller handle it. 
This matches what the block layer submission does and avoids any confusion on who needs to handle errors. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 23 +++++++++++++++-------- fs/btrfs/raid56.h | 2 +- fs/btrfs/volumes.c | 2 +- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f4d3200a14dc57..0408ef29bd02cd 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1803,18 +1803,19 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) /* * our main entry point for writes from the rest of the FS. */ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret; + int ret = 0; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + ret = PTR_ERR(rbio); + goto out; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); @@ -1829,8 +1830,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); - return ret; + goto out_dec_counter; + return; } cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); @@ -1841,13 +1842,19 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - ret = 0; } else { ret = __raid56_parity_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); + goto out_dec_counter; } - return ret; + + return; + +out_dec_counter: + btrfs_bio_counter_dec(fs_info); +out: + bio->bi_status = errno_to_blk_status(ret); + 
bio_endio(bio); } /* diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 1dce205b79bf96..3f223ae39462a6 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -167,7 +167,7 @@ struct btrfs_device; int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, unsigned int pgoff, u64 logical); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6b2ad30e022161..ed440b5a300ca0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6762,7 +6762,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { if (btrfs_op(bio) == BTRFS_MAP_WRITE) - ret = raid56_parity_write(bio, bioc); + raid56_parity_write(bio, bioc); else ret = raid56_parity_recover(bio, bioc, mirror_num, 1); goto out_dec; From 0650865f6d5349df21245d057ff360f332432af3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:09 +0200 Subject: [PATCH 0558/1250] btrfs: do not return errors from raid56_parity_recover Always consume the bio and call the end_io handler on error instead of returning an error and letting the caller handle it. This matches what the block layer submission does and avoids any confusion on who needs to handle errors. Also use the proper bool type for the generic_io argument. 
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 39 ++++++++++++++++----------------------- fs/btrfs/raid56.h | 4 ++-- fs/btrfs/scrub.c | 10 ++-------- fs/btrfs/volumes.c | 2 +- 4 files changed, 21 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0408ef29bd02cd..84d0e073b409ee 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -2199,12 +2199,11 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * so we assume the bio they send down corresponds to a failed part * of the drive. */ -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - int mirror_num, int generic_io) +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - int ret; if (generic_io) { ASSERT(bioc->mirror_num == mirror_num); @@ -2213,9 +2212,8 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - if (generic_io) - btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + goto out_end_bio; } rbio->operation = BTRFS_RBIO_READ_REBUILD; @@ -2227,10 +2225,9 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - if (generic_io) - btrfs_put_bioc(bioc); kfree(rbio); - return -EIO; + bio->bi_status = BLK_STS_IOERR; + goto out_end_bio; } if (generic_io) { @@ -2257,24 +2254,20 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->failb--; } - ret = 
lock_stripe_add(rbio); + if (lock_stripe_add(rbio)) + return; /* - * __raid56_parity_recover will end the bio with - * any errors it hits. We don't want to return - * its error value up the stack because our caller - * will end up calling bio_endio with any nonzero - * return + * This adds our rbio to the list of rbios that will be handled after + * the current lock owner is done. */ - if (ret == 0) - __raid56_parity_recover(rbio); - /* - * our rbio has been added to the list of - * rbios that will be handled after the - * currently lock owner is done - */ - return 0; + __raid56_parity_recover(rbio); + return; +out_end_bio: + if (generic_io) + btrfs_put_bioc(bioc); + bio_endio(bio); } static void rmw_work(struct work_struct *work) diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 3f223ae39462a6..6f48f9e4c86941 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -165,8 +165,8 @@ static inline int nr_data_stripes(const struct map_lookup *map) struct btrfs_device; -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - int mirror_num, int generic_io); +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io); void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ad7958d18158f6..3afe5fa50a631f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1376,18 +1376,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct scrub_sector *sector) { DECLARE_COMPLETION_ONSTACK(done); - int ret; - int mirror_num; bio->bi_iter.bi_sector = sector->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - - mirror_num = sector->sblock->sectors[0]->mirror_num; - ret = raid56_parity_recover(bio, sector->recover->bioc, - mirror_num, 0); - if (ret) - return ret; + raid56_parity_recover(bio, 
sector->recover->bioc, + sector->sblock->sectors[0]->mirror_num, false); wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ed440b5a300ca0..c9328cbd7fe91c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6764,7 +6764,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror if (btrfs_op(bio) == BTRFS_MAP_WRITE) raid56_parity_write(bio, bioc); else - ret = raid56_parity_recover(bio, bioc, mirror_num, 1); + raid56_parity_recover(bio, bioc, mirror_num, true); goto out_dec; } From 82e9d7aab29382bc315cfc3e51324ecc5ad0a23a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:10 +0200 Subject: [PATCH 0559/1250] btrfs: raid56: transfer the bio counter reference to the raid submission helpers Transfer the bio counter reference acquired by btrfs_submit_bio to raid56_parity_write and raid56_parity_recover together with the bio that the reference was acquired for instead of acquiring another reference in those helpers and dropping the original one in btrfs_submit_bio.
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 16 ++++++---------- fs/btrfs/volumes.c | 15 +++++++-------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 84d0e073b409ee..1afe32d5ab017a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1815,12 +1815,11 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) if (IS_ERR(rbio)) { btrfs_put_bioc(bioc); ret = PTR_ERR(rbio); - goto out; + goto out_dec_counter; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); - btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; /* @@ -1852,7 +1851,6 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) out_dec_counter: btrfs_bio_counter_dec(fs_info); -out: bio->bi_status = errno_to_blk_status(ret); bio_endio(bio); } @@ -2208,6 +2206,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, if (generic_io) { ASSERT(bioc->mirror_num == mirror_num); btrfs_bio(bio)->mirror_num = mirror_num; + } else { + btrfs_get_bioc(bioc); } rbio = alloc_rbio(fs_info, bioc); @@ -2230,12 +2230,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, goto out_end_bio; } - if (generic_io) { - btrfs_bio_counter_inc_noblocked(fs_info); + if (generic_io) rbio->generic_bio_cnt = 1; - } else { - btrfs_get_bioc(bioc); - } /* * Loop retry: @@ -2265,8 +2261,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, return; out_end_bio: - if (generic_io) - btrfs_put_bioc(bioc); + btrfs_bio_counter_dec(fs_info); + btrfs_put_bioc(bioc); bio_endio(bio); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c9328cbd7fe91c..bf4e140f6bfc7e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6750,8 +6750,12 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror 
btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bioc, mirror_num, 1); - if (ret) - goto out_dec; + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + return; + } total_devs = bioc->num_stripes; bioc->orig_bio = bio; @@ -6765,7 +6769,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror raid56_parity_write(bio, bioc); else raid56_parity_recover(bio, bioc, mirror_num, true); - goto out_dec; + return; } if (map_length < length) { @@ -6780,12 +6784,7 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror submit_stripe_bio(bioc, bio, dev_nr, should_clone); } -out_dec: btrfs_bio_counter_dec(fs_info); - if (ret) { - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - } } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, From 87a22382491a6d9f0d16213bcd21a14945494162 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:11 +0200 Subject: [PATCH 0560/1250] btrfs: simplify sync/async submission in btrfs_submit_data_write_bio btrfs_submit_data_write_bio special cases the reloc root because the checksums are preloaded, but only does so for the !sync case. The sync case can't happen for data relocation, but just handling it more generally significantly simplifies the logic. 
Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fe7e8af21c2d44..dade66ee220020 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2664,28 +2664,25 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro } /* - * Rules for async/sync submit: - * a) write without checksum: sync submit - * b) write with checksum: - * b-1) if bio is issued by fsync: sync submit - * (sync_writers != 0) - * b-2) if root is reloc root: sync submit - * (only in case of buffered IO) - * b-3) otherwise: async submit + * If we need to checksum, and the I/O is not issued by fsync and + * friends, that is ->sync_writers != 0, defer the submission to a + * workqueue to parallelize it. + * + * Csum items for reloc roots have already been cloned at this point, + * so they are handled as part of the no-checksum case. 
*/ if (!(bi->flags & BTRFS_INODE_NODATASUM) && - !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) { - if (atomic_read(&bi->sync_writers)) { - ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); - if (ret) - goto out; - } else if (btrfs_is_data_reloc_root(bi->root)) { - ; /* Csum items have already been cloned */ - } else { + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(bi->root)) { + if (!atomic_read(&bi->sync_writers)) { ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btrfs_submit_bio_start); goto out; } + + ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); + if (ret) + goto out; } btrfs_submit_bio(fs_info, bio, mirror_num); return; From ffcef43da3dee4f759d8b5f7aec1ab43fe7bd365 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:12 +0200 Subject: [PATCH 0561/1250] btrfs: handle allocation failure in btrfs_wq_submit_bio gracefully btrfs_wq_submit_bio is used for writeback under memory pressure. Instead of failing the I/O when we can't allocate the async_submit_bio, just punt back to the synchronous submission path. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 43 ++++++++++++++++++++++++------------------- fs/btrfs/disk-io.h | 6 +++--- fs/btrfs/inode.c | 17 +++++++++-------- 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5719712f2d4c4f..bcb6807ce19e86 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -759,16 +759,23 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) +/* + * Submit bio to an async queue. 
+ * + * Retrun: + * - true if the work has been succesfuly submitted + * - false in case of error + */ +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return BLK_STS_RESOURCE; + return false; async->inode = inode; async->bio = bio; @@ -786,7 +793,7 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, btrfs_queue_work(fs_info->hipri_workers, &async->work); else btrfs_queue_work(fs_info->workers, &async->work); - return 0; + return true; } static blk_status_t btree_csum_one_bio(struct bio *bio) @@ -840,25 +847,23 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ btrfs_submit_bio(fs_info, bio, mirror_num); return; } - if (!should_async_write(fs_info, BTRFS_I(inode))) { - ret = btree_csum_one_bio(bio); - if (!ret) { - btrfs_submit_bio(fs_info, bio, mirror_num); - return; - } - } else { - /* - * kthread helpers are used to submit writes so that - * checksumming can happen in parallel across all CPUs - */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - btree_submit_bio_start); - } + /* + * Kthread helpers are used to submit writes so that checksumming can + * happen in parallel across all CPUs. 
+ */ + if (should_async_write(fs_info, BTRFS_I(inode)) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + return; + + ret = btree_csum_one_bio(bio); if (ret) { bio->bi_status = ret; bio_endio(bio); + return; } + + btrfs_submit_bio(fs_info, bio, mirror_num); } #ifdef CONFIG_MIGRATION diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 05e779a41a9979..8993b428e09ceb 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -114,9 +114,9 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dade66ee220020..42616f51c62ed4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2674,11 +2674,10 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro if (!(bi->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(bi->root)) { - if (!atomic_read(&bi->sync_writers)) { - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - btrfs_submit_bio_start); - goto out; - } + if (!atomic_read(&bi->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + btrfs_submit_bio_start)) + return; ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); if (ret) @@ -8027,9 +8026,11 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check 
btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers)) - return btrfs_wq_submit_bio(inode, bio, 0, file_offset, - btrfs_submit_bio_start_direct_io); + if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && + btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_submit_bio_start_direct_io)) + return BLK_STS_OK; + /* * If we aren't doing async submit, calculate the csum of the * bio now. From ac3e008e90e9de17caca3739d27ce295da5a4995 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 17 Jun 2022 12:04:13 +0200 Subject: [PATCH 0562/1250] btrfs: do not return errors from btrfs_submit_dio_bio Always consume the bio and call the end_io handler on error instead of returning an error and letting the caller handle it. This matches what the block layer submission and the other btrfs bio submission handlers do and avoids any confusion on who needs to handle errors. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 42616f51c62ed4..21ef0d870a692d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8014,8 +8014,8 @@ static void btrfs_end_dio_bio(struct bio *bio) btrfs_dio_private_put(dip); } -static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, - struct inode *inode, u64 file_offset, int async_submit) +static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; @@ -8029,22 +8029,24 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && btrfs_wq_submit_bio(inode, bio, 0, file_offset, btrfs_submit_bio_start_direct_io)) - return 
BLK_STS_OK; + return; /* * If we aren't doing async submit, calculate the csum of the * bio now. */ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); - if (ret) - return ret; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + return; + } } else { btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, file_offset - dip->file_offset); } map: btrfs_submit_bio(fs_info, bio, 0); - return BLK_STS_OK; } static void btrfs_submit_direct(const struct iomap_iter *iter, @@ -8157,14 +8159,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - status = btrfs_submit_dio_bio(bio, inode, file_offset, - async_submit); - if (status) { - bio_put(bio); - if (submit_len > 0) - refcount_dec(&dip->refs); - goto out_err_em; - } + btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; From 888d030f5a9ad5304b151aa717ab9cb905985baa Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 23 Jun 2022 17:08:14 +0200 Subject: [PATCH 0563/1250] btrfs: switch btrfs_block_rsv::full to bool Use a simple bool type for the block reserve full status. The field is a short to save space, as it used to be an int, but there's no reason for that.
Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 15 ++++++--------- fs/btrfs/block-rsv.h | 2 +- fs/btrfs/delayed-ref.c | 4 ++-- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index b3ee49b0b1e830..26c43a6ef5d26a 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -118,7 +118,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, if (block_rsv->reserved >= block_rsv->size) { num_bytes = block_rsv->reserved - block_rsv->size; block_rsv->reserved = block_rsv->size; - block_rsv->full = 1; + block_rsv->full = true; } else { num_bytes = 0; } @@ -142,7 +142,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, bytes_to_add = min(num_bytes, bytes_to_add); dest->reserved += bytes_to_add; if (dest->reserved >= dest->size) - dest->full = 1; + dest->full = true; num_bytes -= bytes_to_add; } spin_unlock(&dest->lock); @@ -304,7 +304,7 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes) if (block_rsv->reserved >= num_bytes) { block_rsv->reserved -= num_bytes; if (block_rsv->reserved < block_rsv->size) - block_rsv->full = 0; + block_rsv->full = false; ret = 0; } spin_unlock(&block_rsv->lock); @@ -319,7 +319,7 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, if (update_size) block_rsv->size += num_bytes; else if (block_rsv->reserved >= block_rsv->size) - block_rsv->full = 1; + block_rsv->full = true; spin_unlock(&block_rsv->lock); } @@ -341,7 +341,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, } global_rsv->reserved -= num_bytes; if (global_rsv->reserved < global_rsv->size) - global_rsv->full = 0; + global_rsv->full = false; spin_unlock(&global_rsv->lock); btrfs_block_rsv_add_bytes(dest, num_bytes, true); @@ -408,10 +408,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) btrfs_try_granting_tickets(fs_info, sinfo); } - if 
(block_rsv->reserved == block_rsv->size) - block_rsv->full = 1; - else - block_rsv->full = 0; + block_rsv->full = (block_rsv->reserved == block_rsv->size); if (block_rsv->size >= sinfo->total_bytes) sinfo->force_alloc = CHUNK_ALLOC_FORCE; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 3b67ff08d4348a..99c491ef128ee7 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -25,7 +25,7 @@ struct btrfs_block_rsv { u64 reserved; struct btrfs_space_info *space_info; spinlock_t lock; - unsigned short full; + bool full; unsigned short type; unsigned short failfast; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 99f37fca2e9605..36a3debe94930e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -132,7 +132,7 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; - delayed_rsv->full = 0; + delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; } @@ -175,7 +175,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, if (num_bytes) delayed_refs_rsv->reserved += num_bytes; if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = 1; + delayed_refs_rsv->full = true; spin_unlock(&delayed_refs_rsv->lock); if (num_bytes) From 74bbfa6eccbd00afd62e787d6d4390b3d33cc781 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 23 Jun 2022 17:08:14 +0200 Subject: [PATCH 0564/1250] btrfs: switch btrfs_block_rsv::failfast to bool Use a simple bool type for the block reserve failfast status. The field is a short to save space, as it used to be an int, but there's no reason for that.
Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/block-rsv.h | 2 +- fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 99c491ef128ee7..0702d4087ff60d 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -26,8 +26,8 @@ struct btrfs_block_rsv { struct btrfs_space_info *space_info; spinlock_t lock; bool full; + bool failfast; unsigned short type; - unsigned short failfast; /* * Qgroup equivalent for @size @reserved diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 734baa729cd394..f406a662e94291 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2736,7 +2736,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, goto out; } rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; /* * 1 - update the inode diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 21ef0d870a692d..b04280a682316e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5432,7 +5432,7 @@ void btrfs_evict_inode(struct inode *inode) if (!rsv) goto no_delete; rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -8686,7 +8686,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) if (!rsv) return -ENOMEM; rsv->size = min_size; - rsv->failfast = 1; + rsv->failfast = true; /* * 1 for the truncate slack space From 515944fa56b13e77ba8a464331b713c72b9c204b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 23 Jun 2022 17:15:37 +0200 Subject: [PATCH 0565/1250] btrfs: use enum for btrfs_block_rsv::type The number of block group reserve types BTRFS_BLOCK_RSV_* is small and fits to u8 and there's enough left in case we want to add more. For type safety use the enum but make it 8 bits in the structure to save space. 
The structure size is now 48 on release build, making a slight improvement in structures where it's embedded, like btrfs_fs_info or btrfs_inode. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 6 +++--- fs/btrfs/block-rsv.h | 11 ++++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 26c43a6ef5d26a..06be0644dd3765 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -171,7 +171,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src, return 0; } -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type) { memset(rsv, 0, sizeof(*rsv)); spin_lock_init(&rsv->lock); @@ -180,7 +180,7 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type) void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, - unsigned short type) + enum btrfs_rsv_type type) { btrfs_init_block_rsv(rsv, type); rsv->space_info = btrfs_find_space_info(fs_info, @@ -188,7 +188,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, } struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type) + enum btrfs_rsv_type type) { struct btrfs_block_rsv *block_rsv; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 0702d4087ff60d..0c183709be0084 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -9,7 +9,7 @@ enum btrfs_reserve_flush_enum; /* * Types of block reserves */ -enum { +enum btrfs_rsv_type { BTRFS_BLOCK_RSV_GLOBAL, BTRFS_BLOCK_RSV_DELALLOC, BTRFS_BLOCK_RSV_TRANS, @@ -27,7 +27,8 @@ struct btrfs_block_rsv { spinlock_t lock; bool full; bool failfast; - unsigned short type; + /* Block reserve type, one of BTRFS_BLOCK_RSV_* */ + enum btrfs_rsv_type type:8; /* * Qgroup equivalent for @size @reserved @@ -49,13 +50,13 @@ struct 
btrfs_block_rsv { u64 qgroup_rsv_reserved; }; -void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); +void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type); void btrfs_init_root_block_rsv(struct btrfs_root *root); struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, - unsigned short type); + enum btrfs_rsv_type type); void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, - unsigned short type); + enum btrfs_rsv_type type); void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, From dc2cd920821803cb7b5792d598cc7ea935e90c09 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 4 Jul 2022 12:42:04 +0100 Subject: [PATCH 0566/1250] btrfs: don't fallback to buffered IO for NOWAIT direct IO writes Currently, for a direct IO write, if we need to fallback to buffered IO, either to satisfy the whole write operation or just a part of it, we do it in the current context even if it's a NOWAIT context. This is not ideal because we currently don't have support for NOWAIT semantics in the buffered IO path (we can block for several reasons), so we should instead return -EAGAIN to the caller, so that it knows it should retry (the whole operation or what's left of it) in a context where blocking is acceptable. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f406a662e94291..687fb372093fa9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1971,11 +1971,25 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) if (is_sync_write) iocb->ki_flags |= IOCB_DSYNC; - /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. 
*/ + /* + * If 'err' is -ENOTBLK or we have not written all data, then it means + * we must fallback to buffered IO. + */ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) goto out; buffered: + /* + * If we are in a NOWAIT context, then return -EAGAIN to signal the caller + * it must retry the operation in a context where blocking is acceptable, + * since we currently don't have NOWAIT semantics support for buffered IO + * and may block there for many reasons (reserving space for example). + */ + if (iocb->ki_flags & IOCB_NOWAIT) { + err = -EAGAIN; + goto out; + } + pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { From e308c85b1b928736e438ff478f5c15325221f377 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Mon, 18 Jul 2022 13:14:40 +0100 Subject: [PATCH 0567/1250] thermal/drivers/rzg2l: Fix comments This patch replaces 'Capture times'->'Total number of ADC data samples' as the former does not really explain much. It also fixes the typo * caliberation->calibration Lastly, as per the coding style /* should be on a separate line. This patch fixes this issue. 
Reported-by: Pavel Machek Signed-off-by: Biju Das Link: https://lore.kernel.org/r/20220718121440.556408-1-biju.das.jz@bp.renesas.com Signed-off-by: Daniel Lezcano --- drivers/thermal/rzg2l_thermal.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/thermal/rzg2l_thermal.c b/drivers/thermal/rzg2l_thermal.c index be07e04c692610..51ae80eda6af46 100644 --- a/drivers/thermal/rzg2l_thermal.c +++ b/drivers/thermal/rzg2l_thermal.c @@ -47,7 +47,7 @@ #define TS_CODE_AVE_SCALE(x) ((x) * 1000000) #define MCELSIUS(temp) ((temp) * MILLIDEGREE_PER_DEGREE) -#define TS_CODE_CAP_TIMES 8 /* Capture times */ +#define TS_CODE_CAP_TIMES 8 /* Total number of ADC data samples */ #define RZG2L_THERMAL_GRAN 500 /* milli Celsius */ #define RZG2L_TSU_SS_TIMEOUT_US 1000 @@ -80,7 +80,8 @@ static int rzg2l_thermal_get_temp(void *devdata, int *temp) int val, i; for (i = 0; i < TS_CODE_CAP_TIMES ; i++) { - /* TSU repeats measurement at 20 microseconds intervals and + /* + * TSU repeats measurement at 20 microseconds intervals and * automatically updates the results of measurement. As per * the HW manual for measuring temperature we need to read 8 * values consecutively and then take the average. @@ -92,16 +93,18 @@ static int rzg2l_thermal_get_temp(void *devdata, int *temp) ts_code_ave = result / TS_CODE_CAP_TIMES; - /* Calculate actual sensor value by applying curvature correction formula + /* + * Calculate actual sensor value by applying curvature correction formula * dsensor = ts_code_ave / (1 + ts_code_ave * 0.000013). Here we are doing * integer calculation by scaling all the values by 1000000. */ dsensor = TS_CODE_AVE_SCALE(ts_code_ave) / (TS_CODE_AVE_SCALE(1) + (ts_code_ave * CURVATURE_CORRECTION_CONST)); - /* The temperature Tj is calculated by the formula + /* + * The temperature Tj is calculated by the formula * Tj = (dsensor − calib1) * 165/ (calib0 − calib1) − 40 - * where calib0 and calib1 are the caliberation values. 
+ * where calib0 and calib1 are the calibration values. */ val = ((dsensor - priv->calib1) * (MCELSIUS(165) / (priv->calib0 - priv->calib1))) - MCELSIUS(40); @@ -122,7 +125,8 @@ static int rzg2l_thermal_init(struct rzg2l_thermal_priv *priv) rzg2l_thermal_write(priv, TSU_SM, TSU_SM_NORMAL_MODE); rzg2l_thermal_write(priv, TSU_ST, 0); - /* Before setting the START bit, TSU should be in normal operating + /* + * Before setting the START bit, TSU should be in normal operating * mode. As per the HW manual, it will take 60 µs to place the TSU * into normal operating mode. */ @@ -217,7 +221,7 @@ static int rzg2l_thermal_probe(struct platform_device *pdev) if (ret) goto err; - dev_dbg(dev, "TSU probed with %s caliberation values", + dev_dbg(dev, "TSU probed with %s calibration values", rzg2l_thermal_read(priv, OTPTSUTRIM_REG(0)) ? "hw" : "sw"); return 0; From d3608b23acab2f1a87f7e1c861c694dd5c479069 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 13 Jun 2022 09:40:06 -0400 Subject: [PATCH 0568/1250] NLM: Defend against file_lock changes after vfs_test_lock() Instead of trusting that struct file_lock returns completely unchanged after vfs_test_lock() when there's no conflicting lock, stash away our nlm_lockowner reference so we can properly release it for all cases. This defends against another file_lock implementation overwriting fl_owner when the return type is F_UNLCK. 
Reported-by: Roberto Bergantinos Corpas Tested-by: Roberto Bergantinos Corpas Signed-off-by: Benjamin Coddington Signed-off-by: Chuck Lever --- fs/lockd/svc4proc.c | 4 +++- fs/lockd/svclock.c | 10 +--------- fs/lockd/svcproc.c | 5 ++++- include/linux/lockd/lockd.h | 1 + 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 176b468a61c757..4f247ab8be6115 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -87,6 +87,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) struct nlm_args *argp = rqstp->rq_argp; struct nlm_host *host; struct nlm_file *file; + struct nlm_lockowner *test_owner; __be32 rc = rpc_success; dprintk("lockd: TEST4 called\n"); @@ -96,6 +97,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; + test_owner = argp->lock.fl.fl_owner; /* Now check for conflicting locks */ resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie); if (resp->status == nlm_drop_reply) @@ -103,7 +105,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) else dprintk("lockd: TEST4 status %d\n", ntohl(resp->status)); - nlmsvc_release_lockowner(&argp->lock); + nlmsvc_put_lockowner(test_owner); nlmsvc_release_host(host); nlm_release_file(file); return rc; diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index cb3658ab9b7aed..9c1aa75441e1cc 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -340,7 +340,7 @@ nlmsvc_get_lockowner(struct nlm_lockowner *lockowner) return lockowner; } -static void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner) +void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner) { if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock)) return; @@ -590,7 +590,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, int error; 
int mode; __be32 ret; - struct nlm_lockowner *test_owner; dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", nlmsvc_file_inode(file)->i_sb->s_id, @@ -604,9 +603,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } - /* If there's a conflicting lock, remember to clean up the test lock */ - test_owner = (struct nlm_lockowner *)lock->fl.fl_owner; - mode = lock_to_openmode(&lock->fl); error = vfs_test_lock(file->f_file[mode], &lock->fl); if (error) { @@ -635,10 +631,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, conflock->fl.fl_end = lock->fl.fl_end; locks_release_private(&lock->fl); - /* Clean up the test lock */ - lock->fl.fl_owner = NULL; - nlmsvc_put_lockowner(test_owner); - ret = nlm_lck_denied; out: return ret; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 4dc1b40a489a2a..b09ca35b527cc0 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -116,6 +116,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) struct nlm_args *argp = rqstp->rq_argp; struct nlm_host *host; struct nlm_file *file; + struct nlm_lockowner *test_owner; __be32 rc = rpc_success; dprintk("lockd: TEST called\n"); @@ -125,6 +126,8 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? 
rpc_drop_reply :rpc_success; + test_owner = argp->lock.fl.fl_owner; + /* Now check for conflicting locks */ resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie)); if (resp->status == nlm_drop_reply) @@ -133,7 +136,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp) dprintk("lockd: TEST status %d vers %d\n", ntohl(resp->status), rqstp->rq_vers); - nlmsvc_release_lockowner(&argp->lock); + nlmsvc_put_lockowner(test_owner); nlmsvc_release_host(host); nlm_release_file(file); return rc; diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index fcef192e5e45ed..70ce419e270935 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -292,6 +292,7 @@ void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t); __be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **, struct nlm_lock *); void nlm_release_file(struct nlm_file *); +void nlmsvc_put_lockowner(struct nlm_lockowner *); void nlmsvc_release_lockowner(struct nlm_lock *); void nlmsvc_mark_resources(struct net *); void nlmsvc_free_host_resources(struct nlm_host *); From a6ee59336e4a6c0813622601645e6e9efaa29bfb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Jun 2022 10:06:16 -0400 Subject: [PATCH 0569/1250] SUNRPC: Expand the svc_alloc_arg_err tracepoint Record not only the number of pages requested, but the number of pages that were actually allocated, to get a measure of progress (or lack thereof). 
Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 14 +++++++++----- net/sunrpc/svc_xprt.c | 2 +- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index b61d9c90fa2638..5c48be033cc765 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1989,20 +1989,24 @@ TRACE_EVENT(svc_wake_up, TRACE_EVENT(svc_alloc_arg_err, TP_PROTO( - unsigned int pages + unsigned int requested, + unsigned int allocated ), - TP_ARGS(pages), + TP_ARGS(requested, allocated), TP_STRUCT__entry( - __field(unsigned int, pages) + __field(unsigned int, requested) + __field(unsigned int, allocated) ), TP_fast_assign( - __entry->pages = pages; + __entry->requested = requested; + __entry->allocated = allocated; ), - TP_printk("pages=%u", __entry->pages) + TP_printk("requested=%u allocated=%u", + __entry->requested, __entry->allocated) ); DECLARE_EVENT_CLASS(svc_deferred_event, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 2c4dd7ca95b0cd..2106003645a78d 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -691,7 +691,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) set_current_state(TASK_RUNNING); return -EINTR; } - trace_svc_alloc_arg_err(pages); + trace_svc_alloc_arg_err(pages, ret); memalloc_retry_wait(GFP_KERNEL); } rqstp->rq_page_end = &rqstp->rq_pages[pages]; From 53e794122f2a61bb91c12d35b66a11a0f8e3e4b9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 21 Jun 2022 10:06:23 -0400 Subject: [PATCH 0570/1250] NFSD: Instrument fh_verify() Capture file handles and how they map to local inodes. In particular, NFSv4 PUTFH uses fh_verify() so we can now observe which file handles are the target of OPEN, LOOKUP, RENAME, and so on. 
Signed-off-by: Chuck Lever --- fs/nfsd/nfsfh.c | 5 +++-- fs/nfsd/trace.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index c29baa03dfafd6..5e2ed4b2a925c2 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -331,8 +331,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) struct dentry *dentry; __be32 error; - dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp)); - if (!fhp->fh_dentry) { error = nfsd_set_fh_dentry(rqstp, fhp); if (error) @@ -340,6 +338,9 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) } dentry = fhp->fh_dentry; exp = fhp->fh_export; + + trace_nfsd_fh_verify(rqstp, fhp, type, access); + /* * We still have to do all these permission checks, even when * fh_dentry is already set: diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a60ead3b227a53..8467fd8f94c2c4 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -171,6 +171,52 @@ TRACE_EVENT(nfsd_compound_encode_err, __entry->opnum, __entry->status) ); +#define show_fs_file_type(x) \ + __print_symbolic(x, \ + { S_IFLNK, "LNK" }, \ + { S_IFREG, "REG" }, \ + { S_IFDIR, "DIR" }, \ + { S_IFCHR, "CHR" }, \ + { S_IFBLK, "BLK" }, \ + { S_IFIFO, "FIFO" }, \ + { S_IFSOCK, "SOCK" }) + +TRACE_EVENT(nfsd_fh_verify, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + umode_t type, + int access + ), + TP_ARGS(rqstp, fhp, type, access), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __sockaddr(server, rqstp->rq_xprt->xpt_remotelen) + __sockaddr(client, rqstp->rq_xprt->xpt_remotelen) + __field(u32, xid) + __field(u32, fh_hash) + __field(void *, inode) + __field(unsigned long, type) + __field(unsigned long, access) + ), + TP_fast_assign( + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __assign_sockaddr(server, &rqstp->rq_xprt->xpt_local, + rqstp->rq_xprt->xpt_locallen); + __assign_sockaddr(client, 
&rqstp->rq_xprt->xpt_remote, + rqstp->rq_xprt->xpt_remotelen); + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->inode = d_inode(fhp->fh_dentry); + __entry->type = type; + __entry->access = access; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x inode=%p type=%s access=%s", + __entry->xid, __entry->fh_hash, __entry->inode, + show_fs_file_type(__entry->type), + show_nfsd_may_flags(__entry->access) + ) +); DECLARE_EVENT_CLASS(nfsd_fh_err_class, TP_PROTO(struct svc_rqst *rqstp, From f3b863afe1a555d47071e4d9e0bd6f78138e88e5 Mon Sep 17 00:00:00 2001 From: Zhang Jiaming Date: Thu, 23 Jun 2022 16:20:05 +0800 Subject: [PATCH 0571/1250] NFSD: Fix space and spelling mistake Add a blank space after ','. Change 'succesful' to 'successful'. Signed-off-by: Zhang Jiaming Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 3895eb52d2b104..d267b9bcf1fc75 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -828,7 +828,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out_umask; status = nfsd_create(rqstp, &cstate->current_fh, create->cr_name, create->cr_namelen, - &create->cr_iattr,S_IFCHR, rdev, &resfh); + &create->cr_iattr, S_IFCHR, rdev, &resfh); break; case NF4SOCK: @@ -2711,7 +2711,7 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) { /* * Don't execute this op if we couldn't encode a - * succesful reply: + * successful reply: */ u32 plen = op->opdesc->op_rsize_bop(rqstp, op); /* From 8599040cbad088922e7114cec72478154e4c1b85 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 28 Jun 2022 22:25:25 +0100 Subject: [PATCH 0572/1250] nfsd: remove redundant assignment to variable len Variable len is being assigned a value zero and this is never read, it is being re-assigned later. 
The assignment is redundant and can be removed. Cleans up clang scan-build warning: fs/nfsd/nfsctl.c:636:2: warning: Value stored to 'len' is never read Signed-off-by: Colin Ian King Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 0621c2faf24244..66c352bf61b1d4 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -633,7 +633,6 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) } /* Now write current state into reply buffer */ - len = 0; sep = ""; remaining = SIMPLE_TRANSACTION_LIMIT; for (num=2 ; num <= 4 ; num++) { From a9aeb79f92034a743ae800a24c5c16d2cdcc1c4f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 1 Jul 2022 10:37:15 -0400 Subject: [PATCH 0573/1250] SUNRPC: Fix server-side fault injection documentation Fixes: 37324e6bb120 ("SUNRPC: Cache deferral injection") Signed-off-by: Chuck Lever --- Documentation/fault-injection/fault-injection.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index eb9c2d9a4f5f39..17779a2772e51e 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -169,6 +169,13 @@ configuration of fault-injection capabilities. default is 'N', setting it to 'Y' will disable disconnect injection on the RPC server. +- /sys/kernel/debug/fail_sunrpc/ignore-cache-wait: + + Format: { 'Y' | 'N' } + + default is 'N', setting it to 'Y' will disable cache wait + injection on the RPC server. + - /sys/kernel/debug/fail_function/inject: Format: { 'function-name' | '!function-name' | '' } From 1a6121ca63c5b1af5a3ef05baf9579f644e7619b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:23:45 -0400 Subject: [PATCH 0574/1250] NFSD: Demote a WARN to a pr_warn() The call trace doesn't add much value, but it sure is noisy. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d267b9bcf1fc75..5af9f8d1feb6e7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -630,9 +630,9 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } status = nfsd4_process_open2(rqstp, resfh, open); - WARN(status && open->op_created, - "nfsd4_process_open2 failed to open newly-created file! status=%u\n", - be32_to_cpu(status)); + if (status && open->op_created) + pr_warn("nfsd4_process_open2 failed to open newly-created file: status=%u\n", + be32_to_cpu(status)); if (reclaim && !status) nn->somebody_reclaimed = true; out: From a6406d4234da87a4d565989e47ac280b6ee3c4ff Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:23:52 -0400 Subject: [PATCH 0575/1250] NFSD: Report filecache LRU size Surface the NFSD filecache's LRU list length to help field troubleshooters monitor filecache issues. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9cb2d590c0361b..a0234d194ec1cd 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1068,7 +1068,7 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { unsigned int i, count = 0, longest = 0; - unsigned long hits = 0; + unsigned long lru = 0, hits = 0; /* * No need for spinlocks here since we're not terribly interested in @@ -1081,6 +1081,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) count += nfsd_file_hashtbl[i].nfb_count; longest = max(longest, nfsd_file_hashtbl[i].nfb_count); } + lru = list_lru_count(&nfsd_file_lru); } mutex_unlock(&nfsd_mutex); @@ -1089,6 +1090,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "total entries: %u\n", count); seq_printf(m, "longest chain: %u\n", longest); + seq_printf(m, "lru entries: %lu\n", lru); seq_printf(m, "cache hits: %lu\n", hits); return 0; } From d330cdde32805d181ad0f1aaaa6c8d74a4ad45b4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:23:59 -0400 Subject: [PATCH 0576/1250] NFSD: Report count of calls to nfsd_file_acquire() Count the number of successful acquisitions that did not create a file (ie, acquisitions that do not result in a compulsory cache miss). This count can be compared directly with the reported hit count to compute a hit ratio. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a0234d194ec1cd..3359df6c7ac0d3 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -43,6 +43,7 @@ struct nfsd_fcache_bucket { }; static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); +static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); struct nfsd_fcache_disposal { struct work_struct work; @@ -975,6 +976,8 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, } out: if (status == nfs_ok) { + if (open) + this_cpu_inc(nfsd_file_acquisitions); *pnf = nf; } else { nfsd_file_put(nf); @@ -1067,8 +1070,9 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, */ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { + unsigned long hits = 0, acquisitions = 0; unsigned int i, count = 0, longest = 0; - unsigned long lru = 0, hits = 0; + unsigned long lru = 0; /* * No need for spinlocks here since we're not terribly interested in @@ -1085,13 +1089,16 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) } mutex_unlock(&nfsd_mutex); - for_each_possible_cpu(i) + for_each_possible_cpu(i) { hits += per_cpu(nfsd_file_cache_hits, i); + acquisitions += per_cpu(nfsd_file_acquisitions, i); + } seq_printf(m, "total entries: %u\n", count); seq_printf(m, "longest chain: %u\n", longest); seq_printf(m, "lru entries: %lu\n", lru); seq_printf(m, "cache hits: %lu\n", hits); + seq_printf(m, "acquisitions: %lu\n", acquisitions); return 0; } From 0187c33071abaa6ee8af59fc7ea1e6b5fc2be62a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:24:05 -0400 Subject: [PATCH 0577/1250] NFSD: Report count of freed filecache items Surface the count of freed nfsd_file items. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 3359df6c7ac0d3..c28e9577837d58 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -44,6 +44,7 @@ struct nfsd_fcache_bucket { static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); +static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); struct nfsd_fcache_disposal { struct work_struct work; @@ -202,6 +203,8 @@ nfsd_file_free(struct nfsd_file *nf) { bool flush = false; + this_cpu_inc(nfsd_file_releases); + trace_nfsd_file_put_final(nf); if (nf->nf_mark) nfsd_file_mark_put(nf->nf_mark); @@ -1070,7 +1073,7 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, */ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { - unsigned long hits = 0, acquisitions = 0; + unsigned long hits = 0, acquisitions = 0, releases = 0; unsigned int i, count = 0, longest = 0; unsigned long lru = 0; @@ -1092,6 +1095,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) for_each_possible_cpu(i) { hits += per_cpu(nfsd_file_cache_hits, i); acquisitions += per_cpu(nfsd_file_acquisitions, i); + releases += per_cpu(nfsd_file_releases, i); } seq_printf(m, "total entries: %u\n", count); @@ -1099,6 +1103,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "lru entries: %lu\n", lru); seq_printf(m, "cache hits: %lu\n", hits); seq_printf(m, "acquisitions: %lu\n", acquisitions); + seq_printf(m, "releases: %lu\n", releases); return 0; } From e6cb8fc097af8264ff8869c6526999e90a72b6d3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:24:12 -0400 Subject: [PATCH 0578/1250] NFSD: Report average age of filecache items This is a measure of how long items stay in the filecache, to help assess how efficient the cache is. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 11 ++++++++++- fs/nfsd/filecache.h | 1 + 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index c28e9577837d58..da48c51a2bf0a9 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -45,6 +45,7 @@ struct nfsd_fcache_bucket { static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); +static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); struct nfsd_fcache_disposal { struct work_struct work; @@ -178,6 +179,7 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, if (nf) { INIT_HLIST_NODE(&nf->nf_node); INIT_LIST_HEAD(&nf->nf_lru); + nf->nf_birthtime = ktime_get(); nf->nf_file = NULL; nf->nf_cred = get_current_cred(); nf->nf_net = net; @@ -201,9 +203,11 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, static bool nfsd_file_free(struct nfsd_file *nf) { + s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime)); bool flush = false; this_cpu_inc(nfsd_file_releases); + this_cpu_add(nfsd_file_total_age, age); trace_nfsd_file_put_final(nf); if (nf->nf_mark) @@ -1075,7 +1079,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { unsigned long hits = 0, acquisitions = 0, releases = 0; unsigned int i, count = 0, longest = 0; - unsigned long lru = 0; + unsigned long lru = 0, total_age = 0; /* * No need for spinlocks here since we're not terribly interested in @@ -1096,6 +1100,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) hits += per_cpu(nfsd_file_cache_hits, i); acquisitions += per_cpu(nfsd_file_acquisitions, i); releases += per_cpu(nfsd_file_releases, i); + total_age += per_cpu(nfsd_file_total_age, i); } seq_printf(m, "total entries: %u\n", count); @@ -1104,6 +1109,10 @@ static int 
nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "cache hits: %lu\n", hits); seq_printf(m, "acquisitions: %lu\n", acquisitions); seq_printf(m, "releases: %lu\n", releases); + if (releases) + seq_printf(m, "mean age (ms): %ld\n", total_age / releases); + else + seq_printf(m, "mean age (ms): -\n"); return 0; } diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 1da0c79a558048..d0c42619dc10fb 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -46,6 +46,7 @@ struct nfsd_file { refcount_t nf_ref; unsigned char nf_may; struct nfsd_file_mark *nf_mark; + ktime_t nf_birthtime; }; int nfsd_file_cache_init(void); From 99fbaf6afc9db9c2c2cecd73cece147a1a7035ec Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:24:18 -0400 Subject: [PATCH 0579/1250] NFSD: Add nfsd_file_lru_dispose_list() helper Refactor the invariant part of nfsd_file_lru_walk_list() into a separate helper function. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index da48c51a2bf0a9..b278030e0a12e0 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -457,11 +457,31 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, return LRU_SKIP; } +/* + * Unhash items on @dispose immediately, then queue them on the + * disposal workqueue to finish releasing them in the background. + * + * cel: Note that between the time list_lru_shrink_walk runs and + * now, these items are in the hash table but marked unhashed. + * Why release these outside of lru_cb ? There's no lock ordering + * problem since lru_cb currently takes no lock. 
+ */ +static void nfsd_file_gc_dispose_list(struct list_head *dispose) +{ + struct nfsd_file *nf; + + list_for_each_entry(nf, dispose, nf_lru) { + spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_do_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + } + nfsd_file_dispose_list_delayed(dispose); +} + static unsigned long nfsd_file_lru_walk_list(struct shrink_control *sc) { LIST_HEAD(head); - struct nfsd_file *nf; unsigned long ret; if (sc) @@ -471,12 +491,7 @@ nfsd_file_lru_walk_list(struct shrink_control *sc) ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX); - list_for_each_entry(nf, &head, nf_lru) { - spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); - nfsd_file_do_unhash(nf); - spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); - } - nfsd_file_dispose_list_delayed(&head); + nfsd_file_gc_dispose_list(&head); return ret; } From ffe885404315c9d7bbb989d1de10a5ed4d62f9e2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:24:25 -0400 Subject: [PATCH 0580/1250] NFSD: Refactor nfsd_file_gc() Refactor nfsd_file_gc() to use the new list_lru helper. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index b278030e0a12e0..4e1162f51a70eb 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -498,7 +498,11 @@ nfsd_file_lru_walk_list(struct shrink_control *sc) static void nfsd_file_gc(void) { - nfsd_file_lru_walk_list(NULL); + LIST_HEAD(dispose); + + list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, + &dispose, LONG_MAX); + nfsd_file_gc_dispose_list(&dispose); } static void From f89c538b3b323a00472af87a44c8c39a1716cf02 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:24:31 -0400 Subject: [PATCH 0581/1250] NFSD: Refactor nfsd_file_lru_scan() Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 4e1162f51a70eb..79cbbbdf835546 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -478,23 +478,6 @@ static void nfsd_file_gc_dispose_list(struct list_head *dispose) nfsd_file_dispose_list_delayed(dispose); } -static unsigned long -nfsd_file_lru_walk_list(struct shrink_control *sc) -{ - LIST_HEAD(head); - unsigned long ret; - - if (sc) - ret = list_lru_shrink_walk(&nfsd_file_lru, sc, - nfsd_file_lru_cb, &head); - else - ret = list_lru_walk(&nfsd_file_lru, - nfsd_file_lru_cb, - &head, LONG_MAX); - nfsd_file_gc_dispose_list(&head); - return ret; -} - static void nfsd_file_gc(void) { @@ -521,7 +504,13 @@ nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc) static unsigned long nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) { - return nfsd_file_lru_walk_list(sc); + LIST_HEAD(dispose); + unsigned long ret; + + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, + nfsd_file_lru_cb, &dispose); + nfsd_file_gc_dispose_list(&dispose); + return ret; } static struct shrinker 
nfsd_file_shrinker = { From 3d87c9f3ac0aeff1002425e60f11e75edd936d52 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:24:38 -0400 Subject: [PATCH 0582/1250] NFSD: Report the number of items evicted by the LRU walk Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 13 ++++++++++--- fs/nfsd/trace.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 79cbbbdf835546..12f5874739134b 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -46,6 +46,7 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); +static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); struct nfsd_fcache_disposal { struct work_struct work; @@ -452,6 +453,7 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, goto out_skip; list_lru_isolate_move(lru, &nf->nf_lru, head); + this_cpu_inc(nfsd_file_evictions); return LRU_REMOVED; out_skip: return LRU_SKIP; @@ -482,9 +484,11 @@ static void nfsd_file_gc(void) { LIST_HEAD(dispose); + unsigned long ret; - list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, - &dispose, LONG_MAX); + ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, + &dispose, LONG_MAX); + trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru)); nfsd_file_gc_dispose_list(&dispose); } @@ -509,6 +513,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &dispose); + trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru)); nfsd_file_gc_dispose_list(&dispose); return ret; } @@ -1085,7 +1090,7 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, */ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { - unsigned long hits 
= 0, acquisitions = 0, releases = 0; + unsigned long hits = 0, acquisitions = 0, releases = 0, evictions = 0; unsigned int i, count = 0, longest = 0; unsigned long lru = 0, total_age = 0; @@ -1109,6 +1114,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) acquisitions += per_cpu(nfsd_file_acquisitions, i); releases += per_cpu(nfsd_file_releases, i); total_age += per_cpu(nfsd_file_total_age, i); + evictions += per_cpu(nfsd_file_evictions, i); } seq_printf(m, "total entries: %u\n", count); @@ -1117,6 +1123,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "cache hits: %lu\n", hits); seq_printf(m, "acquisitions: %lu\n", acquisitions); seq_printf(m, "releases: %lu\n", releases); + seq_printf(m, "evictions: %lu\n", evictions); if (releases) seq_printf(m, "mean age (ms): %ld\n", total_age / releases); else diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 8467fd8f94c2c4..59dc5b2f4a50c9 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -897,6 +897,35 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event, __entry->nlink, __entry->mode, __entry->mask) ); +DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class, + TP_PROTO( + unsigned long removed, + unsigned long remaining + ), + TP_ARGS(removed, remaining), + TP_STRUCT__entry( + __field(unsigned long, removed) + __field(unsigned long, remaining) + ), + TP_fast_assign( + __entry->removed = removed; + __entry->remaining = remaining; + ), + TP_printk("%lu entries removed, %lu remaining", + __entry->removed, __entry->remaining) +); + +#define DEFINE_NFSD_FILE_LRUWALK_EVENT(name) \ +DEFINE_EVENT(nfsd_file_lruwalk_class, name, \ + TP_PROTO( \ + unsigned long removed, \ + unsigned long remaining \ + ), \ + TP_ARGS(removed, remaining)) + +DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed); +DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed); + #include "cache.h" TRACE_DEFINE_ENUM(RC_DROPIT); From 83a19c3b452fb830f79166499db73cf005471072 Mon Sep 17 00:00:00 2001 From: Chuck 
Lever Date: Fri, 8 Jul 2022 14:24:45 -0400 Subject: [PATCH 0583/1250] NFSD: Record number of flush calls Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 12f5874739134b..7b532449b93fcf 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -46,6 +46,7 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); +static DEFINE_PER_CPU(unsigned long, nfsd_file_pages_flushed); static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); struct nfsd_fcache_disposal { @@ -249,7 +250,12 @@ nfsd_file_check_write_error(struct nfsd_file *nf) static void nfsd_file_flush(struct nfsd_file *nf) { - if (nf->nf_file && vfs_fsync(nf->nf_file, 1) != 0) + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return; + this_cpu_add(nfsd_file_pages_flushed, file->f_mapping->nrpages); + if (vfs_fsync(file, 1) != 0) nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); } @@ -1090,7 +1096,8 @@ nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, */ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { - unsigned long hits = 0, acquisitions = 0, releases = 0, evictions = 0; + unsigned long releases = 0, pages_flushed = 0, evictions = 0; + unsigned long hits = 0, acquisitions = 0; unsigned int i, count = 0, longest = 0; unsigned long lru = 0, total_age = 0; @@ -1115,6 +1122,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) releases += per_cpu(nfsd_file_releases, i); total_age += per_cpu(nfsd_file_total_age, i); evictions += per_cpu(nfsd_file_evictions, i); + pages_flushed += per_cpu(nfsd_file_pages_flushed, i); } seq_printf(m, "total entries: %u\n", count); @@ -1128,6 
+1136,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "mean age (ms): %ld\n", total_age / releases); else seq_printf(m, "mean age (ms): -\n"); + seq_printf(m, "pages flushed: %lu\n", pages_flushed); return 0; } From 37b172b6b4527e3205134e7c4bbad61d48fe48b4 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:24:51 -0400 Subject: [PATCH 0584/1250] NFSD: Zero counters when the filecache is re-initialized If nfsd_file_cache_init() is called after a shutdown, be sure the stat counters are reset. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 7b532449b93fcf..3055a04eeabeae 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -830,6 +830,8 @@ nfsd_file_cache_shutdown_net(struct net *net) void nfsd_file_cache_shutdown(void) { + int i; + set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); lease_unregister_notifier(&nfsd_file_lease_notifier); @@ -853,6 +855,15 @@ nfsd_file_cache_shutdown(void) nfsd_file_hashtbl = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; + + for_each_possible_cpu(i) { + per_cpu(nfsd_file_cache_hits, i) = 0; + per_cpu(nfsd_file_acquisitions, i) = 0; + per_cpu(nfsd_file_releases, i) = 0; + per_cpu(nfsd_file_total_age, i) = 0; + per_cpu(nfsd_file_pages_flushed, i) = 0; + per_cpu(nfsd_file_evictions, i) = 0; + } } static bool From 0357aff15676296664dde0dc47a0d8b56c151363 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:24:58 -0400 Subject: [PATCH 0585/1250] NFSD: Hook up the filecache stat file There has always been the capability of exporting filecache metrics via /proc, but it was never hooked up. Let's surface these metrics to enable better observability of the filecache. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfsctl.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 66c352bf61b1d4..7002edbf26870e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -25,6 +25,7 @@ #include "state.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" /* * We have a single directory with several nodes in it. @@ -45,6 +46,7 @@ enum { NFSD_Ports, NFSD_MaxBlkSize, NFSD_MaxConnections, + NFSD_Filecache, NFSD_SupportedEnctypes, /* * The below MUST come last. Otherwise we leave a hole in nfsd_files[] @@ -229,6 +231,13 @@ static const struct file_operations reply_cache_stats_operations = { .release = single_release, }; +static const struct file_operations filecache_ops = { + .open = nfsd_file_cache_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + /*----------------------------------------------------------------------------*/ /* * payload - write methods @@ -1370,6 +1379,7 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc) [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO}, + [NFSD_Filecache] = {"filecache", &filecache_ops, S_IRUGO}, #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE) [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO}, #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */ From 7a57743254fa2a338d0ea41fe0d7c7767d0e009c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:25:04 -0400 Subject: [PATCH 0586/1250] NFSD: WARN when freeing an item still linked via nf_lru Add a guardrail to prevent freeing memory that is still on a list. This includes either a dispose list or the LRU list. 
This is the sign of a bug, but this class of bugs can be detected so that they don't endanger system stability, especially while debugging. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 3055a04eeabeae..8ade3699664c23 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -220,6 +220,14 @@ nfsd_file_free(struct nfsd_file *nf) fput(nf->nf_file); flush = true; } + + /* + * If this item is still linked via nf_lru, that's a bug. + * WARN and leak it to preserve system stability. + */ + if (WARN_ON_ONCE(!list_empty(&nf->nf_lru))) + return flush; + call_rcu(&nf->nf_rcu, nfsd_file_slab_free); return flush; } @@ -349,7 +357,7 @@ nfsd_file_dispose_list(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); - list_del(&nf->nf_lru); + list_del_init(&nf->nf_lru); nfsd_file_flush(nf); nfsd_file_put_noref(nf); } @@ -363,7 +371,7 @@ nfsd_file_dispose_list_sync(struct list_head *dispose) while(!list_empty(dispose)) { nf = list_first_entry(dispose, struct nfsd_file, nf_lru); - list_del(&nf->nf_lru); + list_del_init(&nf->nf_lru); nfsd_file_flush(nf); if (!refcount_dec_and_test(&nf->nf_ref)) continue; From bee249ca95d066b401d2712b337a6ed4d417896e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:25:11 -0400 Subject: [PATCH 0587/1250] NFSD: Trace filecache LRU activity Observe the operation of garbage collection and the lifetime of filecache items. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 44 +++++++++++++++++++++++++++++++------------- fs/nfsd/trace.h | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 13 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 8ade3699664c23..37373b012276b4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -267,6 +267,18 @@ nfsd_file_flush(struct nfsd_file *nf) nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); } +static void nfsd_file_lru_add(struct nfsd_file *nf) +{ + if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) + trace_nfsd_file_lru_add(nf); +} + +static void nfsd_file_lru_remove(struct nfsd_file *nf) +{ + if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) + trace_nfsd_file_lru_del(nf); +} + static void nfsd_file_do_unhash(struct nfsd_file *nf) { @@ -286,8 +298,7 @@ nfsd_file_unhash(struct nfsd_file *nf) { if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { nfsd_file_do_unhash(nf); - if (!list_empty(&nf->nf_lru)) - list_lru_del(&nfsd_file_lru, &nf->nf_lru); + nfsd_file_lru_remove(nf); return true; } return false; @@ -450,27 +461,34 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, * counter. Here we check the counter and then test and clear the flag. * That order is deliberate to ensure that we can do this locklessly. */ - if (refcount_read(&nf->nf_ref) > 1) - goto out_skip; + if (refcount_read(&nf->nf_ref) > 1) { + trace_nfsd_file_gc_in_use(nf); + return LRU_SKIP; + } /* * Don't throw out files that are still undergoing I/O or * that have uncleared errors pending. 
*/ - if (nfsd_file_check_writeback(nf)) - goto out_skip; + if (nfsd_file_check_writeback(nf)) { + trace_nfsd_file_gc_writeback(nf); + return LRU_SKIP; + } - if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) - goto out_skip; + if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) { + trace_nfsd_file_gc_referenced(nf); + return LRU_SKIP; + } - if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) - goto out_skip; + if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + trace_nfsd_file_gc_hashed(nf); + return LRU_SKIP; + } list_lru_isolate_move(lru, &nf->nf_lru, head); this_cpu_inc(nfsd_file_evictions); + trace_nfsd_file_gc_disposed(nf); return LRU_REMOVED; -out_skip: - return LRU_SKIP; } /* @@ -1037,7 +1055,7 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, refcount_inc(&nf->nf_ref); __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); - list_lru_add(&nfsd_file_lru, &nf->nf_lru); + nfsd_file_lru_add(nf); hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head); ++nfsd_file_hashtbl[hashval].nfb_count; nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 59dc5b2f4a50c9..1cc1133371eb2d 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -897,6 +897,45 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event, __entry->nlink, __entry->mode, __entry->mask) ); +DECLARE_EVENT_CLASS(nfsd_file_gc_class, + TP_PROTO( + const struct nfsd_file *nf + ), + TP_ARGS(nf), + TP_STRUCT__entry( + __field(void *, nf_inode) + __field(void *, nf_file) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + ), + TP_fast_assign( + __entry->nf_inode = nf->nf_inode; + __entry->nf_file = nf->nf_file; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + ), + TP_printk("inode=%p ref=%d nf_flags=%s nf_file=%p", + __entry->nf_inode, __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + 
__entry->nf_file + ) +); + +#define DEFINE_NFSD_FILE_GC_EVENT(name) \ +DEFINE_EVENT(nfsd_file_gc_class, name, \ + TP_PROTO( \ + const struct nfsd_file *nf \ + ), \ + TP_ARGS(nf)) + +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_hashed); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed); + DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class, TP_PROTO( unsigned long removed, From 8faf1c66c2192e36b05a93ff279703646adfeb38 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:25:17 -0400 Subject: [PATCH 0588/1250] NFSD: Leave open files out of the filecache LRU There have been reports of problems when running fstests generic/531 against Linux NFS servers with NFSv4. The NFS server that hosts the test's SCRATCH_DEV suffers from CPU soft lock-ups during the test. Analysis shows that: fs/nfsd/filecache.c 482 ret = list_lru_walk(&nfsd_file_lru, 483 nfsd_file_lru_cb, 484 &head, LONG_MAX); causes nfsd_file_gc() to walk the entire length of the filecache LRU list every time it is called (which is quite frequently). The walk holds a spinlock the entire time that prevents other nfsd threads from accessing the filecache. What's more, for NFSv4 workloads, none of the items that are visited during this walk may be evicted, since they are all files that are held OPEN by NFS clients. Address this by ensuring that open files are not kept on the LRU list. 
Reported-by: Frank van der Linden Reported-by: Wang Yugui Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=386 Suggested-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 24 +++++++++++++++++++----- fs/nfsd/trace.h | 2 ++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 37373b012276b4..6e9e186334abe7 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -269,6 +269,7 @@ nfsd_file_flush(struct nfsd_file *nf) static void nfsd_file_lru_add(struct nfsd_file *nf) { + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) trace_nfsd_file_lru_add(nf); } @@ -298,7 +299,6 @@ nfsd_file_unhash(struct nfsd_file *nf) { if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { nfsd_file_do_unhash(nf); - nfsd_file_lru_remove(nf); return true; } return false; @@ -319,6 +319,7 @@ nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *disp if (refcount_dec_not_one(&nf->nf_ref)) return true; + nfsd_file_lru_remove(nf); list_add(&nf->nf_lru, dispose); return true; } @@ -330,6 +331,7 @@ nfsd_file_put_noref(struct nfsd_file *nf) if (refcount_dec_and_test(&nf->nf_ref)) { WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags)); + nfsd_file_lru_remove(nf); nfsd_file_free(nf); } } @@ -339,7 +341,7 @@ nfsd_file_put(struct nfsd_file *nf) { might_sleep(); - set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + nfsd_file_lru_add(nf); if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) { nfsd_file_flush(nf); nfsd_file_put_noref(nf); @@ -439,8 +441,18 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) } } -/* +/** + * nfsd_file_lru_cb - Examine an entry on the LRU list + * @item: LRU entry to examine + * @lru: controlling LRU + * @lock: LRU list lock (unused) + * @arg: dispose list + * * Note this can deadlock with nfsd_file_cache_purge. 
+ * + * Return values: + * %LRU_REMOVED: @item was removed from the LRU + * %LRU_SKIP: @item cannot be evicted */ static enum lru_status nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, @@ -462,8 +474,9 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, * That order is deliberate to ensure that we can do this locklessly. */ if (refcount_read(&nf->nf_ref) > 1) { + list_lru_isolate(lru, &nf->nf_lru); trace_nfsd_file_gc_in_use(nf); - return LRU_SKIP; + return LRU_REMOVED; } /* @@ -1020,6 +1033,7 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, goto retry; } + nfsd_file_lru_remove(nf); this_cpu_inc(nfsd_file_cache_hits); if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) { @@ -1055,7 +1069,6 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, refcount_inc(&nf->nf_ref); __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); - nfsd_file_lru_add(nf); hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head); ++nfsd_file_hashtbl[hashval].nfb_count; nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, @@ -1080,6 +1093,7 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, */ if (status != nfs_ok || inode->i_nlink == 0) { bool do_free; + nfsd_file_lru_remove(nf); spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); do_free = nfsd_file_unhash(nf); spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 1cc1133371eb2d..54082b868b720e 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -929,7 +929,9 @@ DEFINE_EVENT(nfsd_file_gc_class, name, \ TP_ARGS(nf)) DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add_disposed); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback); 
DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced); From 998d114355c77948a997de0ef0f91397a545744b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:25:24 -0400 Subject: [PATCH 0589/1250] NFSD: Fix the filecache LRU shrinker Without LRU item rotation, the shrinker visits only a few items on the end of the LRU list, and those would always be long-term OPEN files for NFSv4 workloads. That makes the filecache shrinker completely ineffective. Adopt the same strategy as the inode LRU by using LRU_ROTATE. Suggested-by: Dave Chinner Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 6e9e186334abe7..87adefd4d1154e 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -452,6 +452,7 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) * * Return values: * %LRU_REMOVED: @item was removed from the LRU + * %LRU_ROTATE: @item is to be moved to the LRU tail * %LRU_SKIP: @item cannot be evicted */ static enum lru_status @@ -490,7 +491,7 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) { trace_nfsd_file_gc_referenced(nf); - return LRU_SKIP; + return LRU_ROTATE; } if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { @@ -532,7 +533,7 @@ nfsd_file_gc(void) unsigned long ret; ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, - &dispose, LONG_MAX); + &dispose, list_lru_count(&nfsd_file_lru)); trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru)); nfsd_file_gc_dispose_list(&dispose); } From 02c03044fc9eaaec2547b94928df2704826d642a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:25:30 -0400 Subject: [PATCH 0590/1250] NFSD: Never call nfsd_file_gc() in foreground paths The checks in nfsd_file_acquire() and nfsd_file_put() that directly invoke filecache garbage collection are intended to keep 
cache occupancy between a low- and high-watermark. The reason to limit the capacity of the filecache is to keep filecache lookups reasonably fast. However, invoking garbage collection at those points has some undesirable negative impacts. Files that are held open by NFSv4 clients often push the occupancy of the filecache over these watermarks. At that point: - Every call to nfsd_file_acquire() and nfsd_file_put() results in an LRU walk. This has the same effect on lookup latency as long chains in the hash table. - Garbage collection will then run on every nfsd thread, causing a lot of unnecessary lock contention. - Limiting cache capacity pushes out files used only by NFSv3 clients, which are the type of files the filecache is supposed to help. To address those negative impacts, remove the direct calls to the garbage collector. Subsequent patches will address maintaining lookup efficiency as cache capacity increases. Suggested-by: Wang Yugui Suggested-by: Dave Chinner Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 87adefd4d1154e..647d177160d6c7 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -29,8 +29,6 @@ #define NFSD_LAUNDRETTE_DELAY (2 * HZ) #define NFSD_FILE_SHUTDOWN (1) -#define NFSD_FILE_LRU_THRESHOLD (4096UL) -#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2) /* We only care about NFSD_MAY_READ/WRITE for this cache */ #define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) @@ -66,8 +64,6 @@ static struct fsnotify_group *nfsd_file_fsnotify_group; static atomic_long_t nfsd_filecache_count; static struct delayed_work nfsd_filecache_laundrette; -static void nfsd_file_gc(void); - static void nfsd_file_schedule_laundrette(void) { @@ -350,9 +346,6 @@ nfsd_file_put(struct nfsd_file *nf) nfsd_file_schedule_laundrette(); } else nfsd_file_put_noref(nf); - - if 
(atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT) - nfsd_file_gc(); } struct nfsd_file * @@ -1075,8 +1068,7 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, nfsd_file_hashtbl[hashval].nfb_count); spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); - if (atomic_long_inc_return(&nfsd_filecache_count) >= NFSD_FILE_LRU_THRESHOLD) - nfsd_file_gc(); + atomic_long_inc(&nfsd_filecache_count); nf->nf_mark = nfsd_file_mark_find_or_create(nf); if (nf->nf_mark) { From 9167eb94d98121d7f6c0fb17fcdeb862d258a75c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:25:37 -0400 Subject: [PATCH 0591/1250] NFSD: No longer record nf_hashval in the trace log I'm about to replace nfsd_file_hashtbl with an rhashtable. The individual hash values will no longer be visible or relevant, so remove them from the tracepoints. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 15 ++++++++------- fs/nfsd/trace.h | 45 +++++++++++++++++++++------------------------ 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 647d177160d6c7..c5c4130b16a206 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -595,7 +595,7 @@ nfsd_file_close_inode_sync(struct inode *inode) LIST_HEAD(dispose); __nfsd_file_close_inode(inode, hashval, &dispose); - trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose)); + trace_nfsd_file_close_inode_sync(inode, !list_empty(&dispose)); nfsd_file_dispose_list_sync(&dispose); } @@ -615,7 +615,7 @@ nfsd_file_close_inode(struct inode *inode) LIST_HEAD(dispose); __nfsd_file_close_inode(inode, hashval, &dispose); - trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose)); + trace_nfsd_file_close_inode(inode, !list_empty(&dispose)); nfsd_file_dispose_list_delayed(&dispose); } @@ -969,7 +969,7 @@ nfsd_file_is_cached(struct inode *inode) 
} } rcu_read_unlock(); - trace_nfsd_file_is_cached(inode, hashval, (int)ret); + trace_nfsd_file_is_cached(inode, (int)ret); return ret; } @@ -1001,9 +1001,8 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, new = nfsd_file_alloc(inode, may_flags, hashval, net); if (!new) { - trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, - NULL, nfserr_jukebox); - return nfserr_jukebox; + status = nfserr_jukebox; + goto out_status; } spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); @@ -1055,8 +1054,10 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, nf = NULL; } - trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status); +out_status: + trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); return status; + open_file: nf = new; /* Take reference for the hashtable */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 54082b868b720e..bb5a17ccf2cd75 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -750,7 +750,6 @@ DECLARE_EVENT_CLASS(nfsd_file_class, TP_PROTO(struct nfsd_file *nf), TP_ARGS(nf), TP_STRUCT__entry( - __field(unsigned int, nf_hashval) __field(void *, nf_inode) __field(int, nf_ref) __field(unsigned long, nf_flags) @@ -758,15 +757,13 @@ DECLARE_EVENT_CLASS(nfsd_file_class, __field(struct file *, nf_file) ), TP_fast_assign( - __entry->nf_hashval = nf->nf_hashval; __entry->nf_inode = nf->nf_inode; __entry->nf_ref = refcount_read(&nf->nf_ref); __entry->nf_flags = nf->nf_flags; __entry->nf_may = nf->nf_may; __entry->nf_file = nf->nf_file; ), - TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p", - __entry->nf_hashval, + TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p", __entry->nf_inode, __entry->nf_ref, show_nf_flags(__entry->nf_flags), @@ -786,15 +783,18 @@ DEFINE_NFSD_FILE_EVENT(nfsd_file_put); DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked); TRACE_EVENT(nfsd_file_acquire, - TP_PROTO(struct svc_rqst *rqstp, unsigned int hash, - struct inode *inode, unsigned int may_flags, - 
struct nfsd_file *nf, __be32 status), + TP_PROTO( + struct svc_rqst *rqstp, + struct inode *inode, + unsigned int may_flags, + struct nfsd_file *nf, + __be32 status + ), - TP_ARGS(rqstp, hash, inode, may_flags, nf, status), + TP_ARGS(rqstp, inode, may_flags, nf, status), TP_STRUCT__entry( __field(u32, xid) - __field(unsigned int, hash) __field(void *, inode) __field(unsigned long, may_flags) __field(int, nf_ref) @@ -806,7 +806,6 @@ TRACE_EVENT(nfsd_file_acquire, TP_fast_assign( __entry->xid = be32_to_cpu(rqstp->rq_xid); - __entry->hash = hash; __entry->inode = inode; __entry->may_flags = may_flags; __entry->nf_ref = nf ? refcount_read(&nf->nf_ref) : 0; @@ -816,8 +815,8 @@ TRACE_EVENT(nfsd_file_acquire, __entry->status = be32_to_cpu(status); ), - TP_printk("xid=0x%x hash=0x%x inode=%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=%p status=%u", - __entry->xid, __entry->hash, __entry->inode, + TP_printk("xid=0x%x inode=%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=%p status=%u", + __entry->xid, __entry->inode, show_nfsd_may_flags(__entry->may_flags), __entry->nf_ref, show_nf_flags(__entry->nf_flags), show_nfsd_may_flags(__entry->nf_may), @@ -828,7 +827,6 @@ TRACE_EVENT(nfsd_file_open, TP_PROTO(struct nfsd_file *nf, __be32 status), TP_ARGS(nf, status), TP_STRUCT__entry( - __field(unsigned int, nf_hashval) __field(void *, nf_inode) /* cannot be dereferenced */ __field(int, nf_ref) __field(unsigned long, nf_flags) @@ -836,15 +834,13 @@ TRACE_EVENT(nfsd_file_open, __field(void *, nf_file) /* cannot be dereferenced */ ), TP_fast_assign( - __entry->nf_hashval = nf->nf_hashval; __entry->nf_inode = nf->nf_inode; __entry->nf_ref = refcount_read(&nf->nf_ref); __entry->nf_flags = nf->nf_flags; __entry->nf_may = nf->nf_may; __entry->nf_file = nf->nf_file; ), - TP_printk("hash=0x%x inode=%p ref=%d flags=%s may=%s file=%p", - __entry->nf_hashval, + TP_printk("inode=%p ref=%d flags=%s may=%s file=%p", __entry->nf_inode, __entry->nf_ref, show_nf_flags(__entry->nf_flags), 
@@ -853,26 +849,27 @@ TRACE_EVENT(nfsd_file_open, ) DECLARE_EVENT_CLASS(nfsd_file_search_class, - TP_PROTO(struct inode *inode, unsigned int hash, int found), - TP_ARGS(inode, hash, found), + TP_PROTO( + struct inode *inode, + int found + ), + TP_ARGS(inode, found), TP_STRUCT__entry( __field(struct inode *, inode) - __field(unsigned int, hash) __field(int, found) ), TP_fast_assign( __entry->inode = inode; - __entry->hash = hash; __entry->found = found; ), - TP_printk("hash=0x%x inode=%p found=%d", __entry->hash, - __entry->inode, __entry->found) + TP_printk("inode=%p found=%d", + __entry->inode, __entry->found) ); #define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \ DEFINE_EVENT(nfsd_file_search_class, name, \ - TP_PROTO(struct inode *inode, unsigned int hash, int found), \ - TP_ARGS(inode, hash, found)) + TP_PROTO(struct inode *inode, int found), \ + TP_ARGS(inode, found)) DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync); DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode); From 8263f6aa088c4dd0e47f26fa78629ac988660a42 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:25:44 -0400 Subject: [PATCH 0592/1250] NFSD: Remove lockdep assertion from unhash_and_release_locked() IIUC, holding the hash bucket lock is needed only in nfsd_file_unhash, and there is already a lockdep assertion there. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index c5c4130b16a206..3402c05e2934ff 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -306,8 +306,6 @@ nfsd_file_unhash(struct nfsd_file *nf) static bool nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose) { - lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); - trace_nfsd_file_unhash_and_release_locked(nf); if (!nfsd_file_unhash(nf)) return false; From 2442e0be271abf877f81d39aa968320c5e2fdaef Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:25:50 -0400 Subject: [PATCH 0593/1250] NFSD: nfsd_file_unhash can compute hashval from nf->nf_inode Remove an unnecessary usage of nf_hashval. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 3402c05e2934ff..65591ead6339c3 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -279,13 +279,17 @@ static void nfsd_file_lru_remove(struct nfsd_file *nf) static void nfsd_file_do_unhash(struct nfsd_file *nf) { - lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + struct inode *inode = nf->nf_inode; + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + + lockdep_assert_held(&nfsd_file_hashtbl[hashval].nfb_lock); trace_nfsd_file_unhash(nf); if (nfsd_file_check_write_error(nf)) nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); - --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; + --nfsd_file_hashtbl[hashval].nfb_count; hlist_del_rcu(&nf->nf_node); atomic_long_dec(&nfsd_filecache_count); } From 2ac4da0ec1a3bb12fadd1b3f5afba9adf9900cf5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:25:57 -0400 Subject: [PATCH 0594/1250] NFSD: Refactor 
__nfsd_file_close_inode() The code that computes the hashval is the same in both callers. To prevent them from going stale, reframe the documenting comments to remove descriptions of the underlying hash table structure, which is about to be replaced. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 40 +++++++++++++++++++++------------------- fs/nfsd/trace.h | 44 +++++++++++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 30 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 65591ead6339c3..d56b434dc3776a 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -565,39 +565,44 @@ static struct shrinker nfsd_file_shrinker = { .seeks = 1, }; -static void -__nfsd_file_close_inode(struct inode *inode, unsigned int hashval, - struct list_head *dispose) +/* + * Find all cache items across all net namespaces that match @inode and + * move them to @dispose. The lookup is atomic wrt nfsd_file_acquire(). + */ +static unsigned int +__nfsd_file_close_inode(struct inode *inode, struct list_head *dispose) { + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + unsigned int count = 0; struct nfsd_file *nf; struct hlist_node *tmp; spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) { - if (inode == nf->nf_inode) + if (inode == nf->nf_inode) { nfsd_file_unhash_and_release_locked(nf, dispose); + count++; + } } spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + return count; } /** * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file * @inode: inode of the file to attempt to remove * - * Walk the whole hash bucket, looking for any files that correspond to "inode". - * If any do, then unhash them and put the hashtable reference to them and - * destroy any that had their last reference put. Also ensure that any of the - * fputs also have their final __fput done as well. 
+ * Unhash and put, then flush and fput all cache items associated with @inode. */ void nfsd_file_close_inode_sync(struct inode *inode) { - unsigned int hashval = (unsigned int)hash_long(inode->i_ino, - NFSD_FILE_HASH_BITS); LIST_HEAD(dispose); + unsigned int count; - __nfsd_file_close_inode(inode, hashval, &dispose); - trace_nfsd_file_close_inode_sync(inode, !list_empty(&dispose)); + count = __nfsd_file_close_inode(inode, &dispose); + trace_nfsd_file_close_inode_sync(inode, count); nfsd_file_dispose_list_sync(&dispose); } @@ -605,19 +610,16 @@ nfsd_file_close_inode_sync(struct inode *inode) * nfsd_file_close_inode - attempt a delayed close of a nfsd_file * @inode: inode of the file to attempt to remove * - * Walk the whole hash bucket, looking for any files that correspond to "inode". - * If any do, then unhash them and put the hashtable reference to them and - * destroy any that had their last reference put. + * Unhash and put all cache item associated with @inode. */ static void nfsd_file_close_inode(struct inode *inode) { - unsigned int hashval = (unsigned int)hash_long(inode->i_ino, - NFSD_FILE_HASH_BITS); LIST_HEAD(dispose); + unsigned int count; - __nfsd_file_close_inode(inode, hashval, &dispose); - trace_nfsd_file_close_inode(inode, !list_empty(&dispose)); + count = __nfsd_file_close_inode(inode, &dispose); + trace_nfsd_file_close_inode(inode, count); nfsd_file_dispose_list_delayed(&dispose); } diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index bb5a17ccf2cd75..af609590ac86ac 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -850,30 +850,52 @@ TRACE_EVENT(nfsd_file_open, DECLARE_EVENT_CLASS(nfsd_file_search_class, TP_PROTO( - struct inode *inode, - int found + const struct inode *inode, + unsigned int count ), - TP_ARGS(inode, found), + TP_ARGS(inode, count), TP_STRUCT__entry( - __field(struct inode *, inode) - __field(int, found) + __field(const struct inode *, inode) + __field(unsigned int, count) ), TP_fast_assign( __entry->inode = inode; - 
__entry->found = found; + __entry->count = count; ), - TP_printk("inode=%p found=%d", - __entry->inode, __entry->found) + TP_printk("inode=%p count=%u", + __entry->inode, __entry->count) ); #define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \ DEFINE_EVENT(nfsd_file_search_class, name, \ - TP_PROTO(struct inode *inode, int found), \ - TP_ARGS(inode, found)) + TP_PROTO( \ + const struct inode *inode, \ + unsigned int count \ + ), \ + TP_ARGS(inode, count)) DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync); DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode); -DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached); + +TRACE_EVENT(nfsd_file_is_cached, + TP_PROTO( + const struct inode *inode, + int found + ), + TP_ARGS(inode, found), + TP_STRUCT__entry( + __field(const struct inode *, inode) + __field(int, found) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->found = found; + ), + TP_printk("inode=%p is %scached", + __entry->inode, + __entry->found ? "" : "not " + ) +); TRACE_EVENT(nfsd_file_fsnotify_handle_event, TP_PROTO(struct inode *inode, u32 mask), From da2c9dc368357ce66c62ba7774d4b233951a9502 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:26:03 -0400 Subject: [PATCH 0595/1250] NFSD: nfsd_file_hash_remove can compute hashval Remove an unnecessary use of nf_hashval. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index d56b434dc3776a..5302c42c2ef56e 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -294,6 +294,18 @@ nfsd_file_do_unhash(struct nfsd_file *nf) atomic_long_dec(&nfsd_filecache_count); } +static void +nfsd_file_hash_remove(struct nfsd_file *nf) +{ + struct inode *inode = nf->nf_inode; + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + nfsd_file_do_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); +} + static bool nfsd_file_unhash(struct nfsd_file *nf) { @@ -513,11 +525,8 @@ static void nfsd_file_gc_dispose_list(struct list_head *dispose) { struct nfsd_file *nf; - list_for_each_entry(nf, dispose, nf_lru) { - spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); - nfsd_file_do_unhash(nf); - spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); - } + list_for_each_entry(nf, dispose, nf_lru) + nfsd_file_hash_remove(nf); nfsd_file_dispose_list_delayed(dispose); } From d7c9e4cb22b3f8b462c762174056f9240bd7adb2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:26:10 -0400 Subject: [PATCH 0596/1250] NFSD: Remove nfsd_file::nf_hashval The value in this field can always be computed from nf_inode, thus it is no longer used. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 6 ++---- fs/nfsd/filecache.h | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 5302c42c2ef56e..5e4a8cc79f4076 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -168,8 +168,7 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf) } static struct nfsd_file * -nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, - struct net *net) +nfsd_file_alloc(struct inode *inode, unsigned int may, struct net *net) { struct nfsd_file *nf; @@ -183,7 +182,6 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, nf->nf_net = net; nf->nf_flags = 0; nf->nf_inode = inode; - nf->nf_hashval = hashval; refcount_set(&nf->nf_ref, 1); nf->nf_may = may & NFSD_FILE_MAY_MASK; if (may & NFSD_MAY_NOT_BREAK_LEASE) { @@ -1012,7 +1010,7 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, if (nf) goto wait_for_construction; - new = nfsd_file_alloc(inode, may_flags, hashval, net); + new = nfsd_file_alloc(inode, may_flags, net); if (!new) { status = nfserr_jukebox; goto out_status; diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index d0c42619dc10fb..31dc65f82c753a 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -42,7 +42,6 @@ struct nfsd_file { #define NFSD_FILE_REFERENCED (4) unsigned long nf_flags; struct inode *nf_inode; - unsigned int nf_hashval; refcount_t nf_ref; unsigned char nf_may; struct nfsd_file_mark *nf_mark; From 5640a38fc6105d8c51ce7ef3a89132a313087294 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:26:16 -0400 Subject: [PATCH 0597/1250] NFSD: Replace the "init once" mechanism In a moment, the nfsd_file_hashtbl global will be replaced with an rhashtable. Replace the one or two spots that need to check if the hash table is available. We can easily reuse the SHUTDOWN flag for this purpose. 
Document that this mechanism relies on callers to hold the nfsd_mutex to prevent init, shutdown, and purging from running concurrently. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 5e4a8cc79f4076..61e43cf17523bb 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -28,7 +28,7 @@ #define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS) #define NFSD_LAUNDRETTE_DELAY (2 * HZ) -#define NFSD_FILE_SHUTDOWN (1) +#define NFSD_FILE_CACHE_UP (0) /* We only care about NFSD_MAY_READ/WRITE for this cache */ #define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) @@ -59,7 +59,7 @@ static struct kmem_cache *nfsd_file_slab; static struct kmem_cache *nfsd_file_mark_slab; static struct nfsd_fcache_bucket *nfsd_file_hashtbl; static struct list_lru nfsd_file_lru; -static long nfsd_file_lru_flags; +static unsigned long nfsd_file_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; static atomic_long_t nfsd_filecache_count; static struct delayed_work nfsd_filecache_laundrette; @@ -67,9 +67,8 @@ static struct delayed_work nfsd_filecache_laundrette; static void nfsd_file_schedule_laundrette(void) { - long count = atomic_long_read(&nfsd_filecache_count); - - if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags)) + if ((atomic_long_read(&nfsd_filecache_count) == 0) || + test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0) return; queue_delayed_work(system_wq, &nfsd_filecache_laundrette, @@ -704,9 +703,8 @@ nfsd_file_cache_init(void) int ret = -ENOMEM; unsigned int i; - clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); - - if (nfsd_file_hashtbl) + lockdep_assert_held(&nfsd_mutex); + if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) return 0; nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0); @@ -792,8 +790,8 @@ nfsd_file_cache_init(void) /* * Note
this can deadlock with nfsd_file_lru_cb. */ -void -nfsd_file_cache_purge(struct net *net) +static void +__nfsd_file_cache_purge(struct net *net) { unsigned int i; struct nfsd_file *nf; @@ -801,9 +799,6 @@ nfsd_file_cache_purge(struct net *net) LIST_HEAD(dispose); bool del; - if (!nfsd_file_hashtbl) - return; - for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i]; @@ -864,6 +859,19 @@ nfsd_file_cache_start_net(struct net *net) return nn->fcache_disposal ? 0 : -ENOMEM; } +/** + * nfsd_file_cache_purge - Remove all cache items associated with @net + * @net: target net namespace + * + */ +void +nfsd_file_cache_purge(struct net *net) +{ + lockdep_assert_held(&nfsd_mutex); + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) + __nfsd_file_cache_purge(net); +} + void nfsd_file_cache_shutdown_net(struct net *net) { @@ -876,7 +884,9 @@ nfsd_file_cache_shutdown(void) { int i; - set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + lockdep_assert_held(&nfsd_mutex); + if (test_and_clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0) + return; lease_unregister_notifier(&nfsd_file_lease_notifier); unregister_shrinker(&nfsd_file_shrinker); @@ -885,7 +895,7 @@ nfsd_file_cache_shutdown(void) * calling nfsd_file_cache_purge */ cancel_delayed_work_sync(&nfsd_filecache_laundrette); - nfsd_file_cache_purge(NULL); + __nfsd_file_cache_purge(NULL); list_lru_destroy(&nfsd_file_lru); rcu_barrier(); fsnotify_put_group(nfsd_file_fsnotify_group); @@ -1163,7 +1173,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) * don't end up racing with server shutdown */ mutex_lock(&nfsd_mutex); - if (nfsd_file_hashtbl) { + if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) { for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { count += nfsd_file_hashtbl[i].nfb_count; longest = max(longest, nfsd_file_hashtbl[i].nfb_count); From b8a6144ead6b78cb7c1556f6e8aa683bb1cd1433 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 
14:26:23 -0400 Subject: [PATCH 0598/1250] NFSD: Set up an rhashtable for the filecache Add code to initialize and tear down an rhashtable. The rhashtable is not used yet. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 160 ++++++++++++++++++++++++++++++++++++++------ fs/nfsd/filecache.h | 1 + 2 files changed, 140 insertions(+), 21 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 61e43cf17523bb..e5d6c2759bc205 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "vfs.h" #include "nfsd.h" @@ -63,6 +64,136 @@ static unsigned long nfsd_file_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; static atomic_long_t nfsd_filecache_count; static struct delayed_work nfsd_filecache_laundrette; +static struct rhashtable nfsd_file_rhash_tbl + ____cacheline_aligned_in_smp; + +enum nfsd_file_lookup_type { + NFSD_FILE_KEY_INODE, + NFSD_FILE_KEY_FULL, +}; + +struct nfsd_file_lookup_key { + struct inode *inode; + struct net *net; + const struct cred *cred; + unsigned char need; + enum nfsd_file_lookup_type type; +}; + +/* + * The returned hash value is based solely on the address of an in-code + * inode, a pointer to a slab-allocated object. The entropy in such a + * pointer is concentrated in its middle bits. 
+ */ +static u32 nfsd_file_inode_hash(const struct inode *inode, u32 seed) +{ + unsigned long ptr = (unsigned long)inode; + u32 k; + + k = ptr >> L1_CACHE_SHIFT; + k &= 0x00ffffff; + return jhash2(&k, 1, seed); +} + +/** + * nfsd_file_key_hashfn - Compute the hash value of a lookup key + * @data: key on which to compute the hash value + * @len: rhash table's key_len parameter (unused) + * @seed: rhash table's random seed of the day + * + * Return value: + * Computed 32-bit hash value + */ +static u32 nfsd_file_key_hashfn(const void *data, u32 len, u32 seed) +{ + const struct nfsd_file_lookup_key *key = data; + + return nfsd_file_inode_hash(key->inode, seed); +} + +/** + * nfsd_file_obj_hashfn - Compute the hash value of an nfsd_file + * @data: object on which to compute the hash value + * @len: rhash table's key_len parameter (unused) + * @seed: rhash table's random seed of the day + * + * Return value: + * Computed 32-bit hash value + */ +static u32 nfsd_file_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct nfsd_file *nf = data; + + return nfsd_file_inode_hash(nf->nf_inode, seed); +} + +static bool +nfsd_match_cred(const struct cred *c1, const struct cred *c2) +{ + int i; + + if (!uid_eq(c1->fsuid, c2->fsuid)) + return false; + if (!gid_eq(c1->fsgid, c2->fsgid)) + return false; + if (c1->group_info == NULL || c2->group_info == NULL) + return c1->group_info == c2->group_info; + if (c1->group_info->ngroups != c2->group_info->ngroups) + return false; + for (i = 0; i < c1->group_info->ngroups; i++) { + if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i])) + return false; + } + return true; +} + +/** + * nfsd_file_obj_cmpfn - Match a cache item against search criteria + * @arg: search criteria + * @ptr: cache item to check + * + * Return values: + * %0 - Item matches search criteria + * %1 - Item does not match search criteria + */ +static int nfsd_file_obj_cmpfn(struct rhashtable_compare_arg *arg, + const void *ptr) +{ + const struct 
nfsd_file_lookup_key *key = arg->key; + const struct nfsd_file *nf = ptr; + + switch (key->type) { + case NFSD_FILE_KEY_INODE: + if (nf->nf_inode != key->inode) + return 1; + break; + case NFSD_FILE_KEY_FULL: + if (nf->nf_inode != key->inode) + return 1; + if (nf->nf_may != key->need) + return 1; + if (nf->nf_net != key->net) + return 1; + if (!nfsd_match_cred(nf->nf_cred, key->cred)) + return 1; + if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0) + return 1; + break; + } + return 0; +} + +static const struct rhashtable_params nfsd_file_rhash_params = { + .key_len = sizeof_field(struct nfsd_file, nf_inode), + .key_offset = offsetof(struct nfsd_file, nf_inode), + .head_offset = offsetof(struct nfsd_file, nf_rhash), + .hashfn = nfsd_file_key_hashfn, + .obj_hashfn = nfsd_file_obj_hashfn, + .obj_cmpfn = nfsd_file_obj_cmpfn, + /* Reduce resizing churn on light workloads */ + .min_size = 512, /* buckets */ + .automatic_shrinking = true, +}; static void nfsd_file_schedule_laundrette(void) @@ -700,13 +831,18 @@ static const struct fsnotify_ops nfsd_file_fsnotify_ops = { int nfsd_file_cache_init(void) { - int ret = -ENOMEM; + int ret; unsigned int i; lockdep_assert_held(&nfsd_mutex); if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) return 0; + ret = rhashtable_init(&nfsd_file_rhash_tbl, &nfsd_file_rhash_params); + if (ret) + return ret; + + ret = -ENOMEM; nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0); if (!nfsd_filecache_wq) goto out; @@ -784,6 +920,7 @@ nfsd_file_cache_init(void) nfsd_file_hashtbl = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; + rhashtable_destroy(&nfsd_file_rhash_tbl); goto out; } @@ -909,6 +1046,7 @@ nfsd_file_cache_shutdown(void) nfsd_file_hashtbl = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; + rhashtable_destroy(&nfsd_file_rhash_tbl); for_each_possible_cpu(i) { per_cpu(nfsd_file_cache_hits, i) = 0; @@ -920,26 +1058,6 @@ nfsd_file_cache_shutdown(void) } } -static bool 
-nfsd_match_cred(const struct cred *c1, const struct cred *c2) -{ - int i; - - if (!uid_eq(c1->fsuid, c2->fsuid)) - return false; - if (!gid_eq(c1->fsgid, c2->fsgid)) - return false; - if (c1->group_info == NULL || c2->group_info == NULL) - return c1->group_info == c2->group_info; - if (c1->group_info->ngroups != c2->group_info->ngroups) - return false; - for (i = 0; i < c1->group_info->ngroups; i++) { - if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i])) - return false; - } - return true; -} - static struct nfsd_file * nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, unsigned int hashval, struct net *net) diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 31dc65f82c753a..7fc017e7b09efb 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -29,6 +29,7 @@ struct nfsd_file_mark { * never be dereferenced, only used for comparison. */ struct nfsd_file { + struct rhash_head nf_rhash; struct hlist_node nf_node; struct list_head nf_lru; struct rcu_head nf_rcu; From 7315a712aa37d83d84a6387aa45a66b989c2db2c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:26:30 -0400 Subject: [PATCH 0599/1250] NFSD: Convert the filecache to use rhashtable Enable the filecache hash table to start small, then grow with the workload. Smaller server deployments benefit because there should be lower memory utilization. Larger server deployments should see improved scaling with the number of open files. 
Suggested-by: Jeff Layton Suggested-by: Dave Chinner Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 265 +++++++++++++++++++------------------------- fs/nfsd/trace.h | 63 ++++++++++- 2 files changed, 179 insertions(+), 149 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index e5d6c2759bc205..b0f30d10e17f43 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -62,7 +62,6 @@ static struct nfsd_fcache_bucket *nfsd_file_hashtbl; static struct list_lru nfsd_file_lru; static unsigned long nfsd_file_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; -static atomic_long_t nfsd_filecache_count; static struct delayed_work nfsd_filecache_laundrette; static struct rhashtable nfsd_file_rhash_tbl ____cacheline_aligned_in_smp; @@ -198,7 +197,7 @@ static const struct rhashtable_params nfsd_file_rhash_params = { static void nfsd_file_schedule_laundrette(void) { - if ((atomic_long_read(&nfsd_filecache_count) == 0) || + if ((atomic_read(&nfsd_file_rhash_tbl.nelems) == 0) || test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0) return; @@ -298,7 +297,7 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf) } static struct nfsd_file * -nfsd_file_alloc(struct inode *inode, unsigned int may, struct net *net) +nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) { struct nfsd_file *nf; @@ -309,11 +308,14 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, struct net *net) nf->nf_birthtime = ktime_get(); nf->nf_file = NULL; nf->nf_cred = get_current_cred(); - nf->nf_net = net; + nf->nf_net = key->net; nf->nf_flags = 0; - nf->nf_inode = inode; - refcount_set(&nf->nf_ref, 1); - nf->nf_may = may & NFSD_FILE_MAY_MASK; + __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); + __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); + nf->nf_inode = key->inode; + /* nf_ref is pre-incremented for hash table */ + refcount_set(&nf->nf_ref, 2); + nf->nf_may = key->need; if (may & NFSD_MAY_NOT_BREAK_LEASE) { if (may & 
NFSD_MAY_WRITE) __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags); @@ -405,40 +407,21 @@ static void nfsd_file_lru_remove(struct nfsd_file *nf) } static void -nfsd_file_do_unhash(struct nfsd_file *nf) +nfsd_file_hash_remove(struct nfsd_file *nf) { - struct inode *inode = nf->nf_inode; - unsigned int hashval = (unsigned int)hash_long(inode->i_ino, - NFSD_FILE_HASH_BITS); - - lockdep_assert_held(&nfsd_file_hashtbl[hashval].nfb_lock); - trace_nfsd_file_unhash(nf); if (nfsd_file_check_write_error(nf)) nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); - --nfsd_file_hashtbl[hashval].nfb_count; - hlist_del_rcu(&nf->nf_node); - atomic_long_dec(&nfsd_filecache_count); -} - -static void -nfsd_file_hash_remove(struct nfsd_file *nf) -{ - struct inode *inode = nf->nf_inode; - unsigned int hashval = (unsigned int)hash_long(inode->i_ino, - NFSD_FILE_HASH_BITS); - - spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); - nfsd_file_do_unhash(nf); - spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, + nfsd_file_rhash_params); } static bool nfsd_file_unhash(struct nfsd_file *nf) { if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - nfsd_file_do_unhash(nf); + nfsd_file_hash_remove(nf); return true; } return false; @@ -448,9 +431,9 @@ nfsd_file_unhash(struct nfsd_file *nf) * Return true if the file was unhashed. 
*/ static bool -nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose) +nfsd_file_unhash_and_dispose(struct nfsd_file *nf, struct list_head *dispose) { - trace_nfsd_file_unhash_and_release_locked(nf); + trace_nfsd_file_unhash_and_dispose(nf); if (!nfsd_file_unhash(nf)) return false; /* keep final reference for nfsd_file_lru_dispose */ @@ -709,20 +692,23 @@ static struct shrinker nfsd_file_shrinker = { static unsigned int __nfsd_file_close_inode(struct inode *inode, struct list_head *dispose) { - unsigned int hashval = (unsigned int)hash_long(inode->i_ino, - NFSD_FILE_HASH_BITS); - unsigned int count = 0; - struct nfsd_file *nf; - struct hlist_node *tmp; - - spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); - hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) { - if (inode == nf->nf_inode) { - nfsd_file_unhash_and_release_locked(nf, dispose); - count++; - } - } - spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + struct nfsd_file_lookup_key key = { + .type = NFSD_FILE_KEY_INODE, + .inode = inode, + }; + unsigned int count = 0; + struct nfsd_file *nf; + + rcu_read_lock(); + do { + nf = rhashtable_lookup(&nfsd_file_rhash_tbl, &key, + nfsd_file_rhash_params); + if (!nf) + break; + nfsd_file_unhash_and_dispose(nf, dispose); + count++; + } while (1); + rcu_read_unlock(); return count; } @@ -930,30 +916,35 @@ nfsd_file_cache_init(void) static void __nfsd_file_cache_purge(struct net *net) { - unsigned int i; - struct nfsd_file *nf; - struct hlist_node *next; + struct rhashtable_iter iter; + struct nfsd_file *nf; LIST_HEAD(dispose); bool del; - for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { - struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i]; + rhashtable_walk_enter(&nfsd_file_rhash_tbl, &iter); + do { + rhashtable_walk_start(&iter); - spin_lock(&nfb->nfb_lock); - hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) { + nf = rhashtable_walk_next(&iter); + while (!IS_ERR_OR_NULL(nf)) { if (net && 
nf->nf_net != net) continue; - del = nfsd_file_unhash_and_release_locked(nf, &dispose); + del = nfsd_file_unhash_and_dispose(nf, &dispose); /* * Deadlock detected! Something marked this entry as * unhased, but hasn't removed it from the hash list. */ WARN_ON_ONCE(!del); + + nf = rhashtable_walk_next(&iter); } - spin_unlock(&nfb->nfb_lock); - nfsd_file_dispose_list(&dispose); - } + + rhashtable_walk_stop(&iter); + } while (nf == ERR_PTR(-EAGAIN)); + rhashtable_walk_exit(&iter); + + nfsd_file_dispose_list(&dispose); } static struct nfsd_fcache_disposal * @@ -1058,56 +1049,29 @@ nfsd_file_cache_shutdown(void) } } -static struct nfsd_file * -nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, - unsigned int hashval, struct net *net) -{ - struct nfsd_file *nf; - unsigned char need = may_flags & NFSD_FILE_MAY_MASK; - - hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, - nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) { - if (nf->nf_may != need) - continue; - if (nf->nf_inode != inode) - continue; - if (nf->nf_net != net) - continue; - if (!nfsd_match_cred(nf->nf_cred, current_cred())) - continue; - if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) - continue; - if (nfsd_file_get(nf) != NULL) - return nf; - } - return NULL; -} - /** - * nfsd_file_is_cached - are there any cached open files for this fh? - * @inode: inode of the file to check + * nfsd_file_is_cached - are there any cached open files for this inode? + * @inode: inode to check + * + * The lookup matches inodes in all net namespaces and is atomic wrt + * nfsd_file_acquire(). * - * Scan the hashtable for open files that match this fh. Returns true if there - * are any, and false if not. 
+ * Return values: + * %true: filecache contains at least one file matching this inode + * %false: filecache contains no files matching this inode */ bool nfsd_file_is_cached(struct inode *inode) { - bool ret = false; - struct nfsd_file *nf; - unsigned int hashval; - - hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); - - rcu_read_lock(); - hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, - nf_node) { - if (inode == nf->nf_inode) { - ret = true; - break; - } - } - rcu_read_unlock(); + struct nfsd_file_lookup_key key = { + .type = NFSD_FILE_KEY_INODE, + .inode = inode, + }; + bool ret = false; + + if (rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, + nfsd_file_rhash_params) != NULL) + ret = true; trace_nfsd_file_is_cached(inode, (int)ret); return ret; } @@ -1116,39 +1080,51 @@ static __be32 nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf, bool open) { - __be32 status; - struct net *net = SVC_NET(rqstp); + struct nfsd_file_lookup_key key = { + .type = NFSD_FILE_KEY_FULL, + .need = may_flags & NFSD_FILE_MAY_MASK, + .net = SVC_NET(rqstp), + }; struct nfsd_file *nf, *new; - struct inode *inode; - unsigned int hashval; bool retry = true; + __be32 status; - /* FIXME: skip this if fh_dentry is already set? 
*/ status = fh_verify(rqstp, fhp, S_IFREG, may_flags|NFSD_MAY_OWNER_OVERRIDE); if (status != nfs_ok) return status; + key.inode = d_inode(fhp->fh_dentry); + key.cred = get_current_cred(); - inode = d_inode(fhp->fh_dentry); - hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); retry: - rcu_read_lock(); - nf = nfsd_file_find_locked(inode, may_flags, hashval, net); - rcu_read_unlock(); + /* Avoid allocation if the item is already in cache */ + nf = rhashtable_lookup_fast(&nfsd_file_rhash_tbl, &key, + nfsd_file_rhash_params); + if (nf) + nf = nfsd_file_get(nf); if (nf) goto wait_for_construction; - new = nfsd_file_alloc(inode, may_flags, net); + new = nfsd_file_alloc(&key, may_flags); if (!new) { status = nfserr_jukebox; goto out_status; } - spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); - nf = nfsd_file_find_locked(inode, may_flags, hashval, net); - if (nf == NULL) + nf = rhashtable_lookup_get_insert_key(&nfsd_file_rhash_tbl, + &key, &new->nf_rhash, + nfsd_file_rhash_params); + if (!nf) { + nf = new; + goto open_file; + } + if (IS_ERR(nf)) + goto insert_err; + nf = nfsd_file_get(nf); + if (nf == NULL) { + nf = new; goto open_file; - spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + } nfsd_file_slab_free(&new->nf_rcu); wait_for_construction: @@ -1156,6 +1132,7 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, /* Did construction of this file fail? 
*/ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + trace_nfsd_file_cons_err(rqstp, key.inode, may_flags, nf); if (!retry) { status = nfserr_jukebox; goto out; @@ -1194,22 +1171,11 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, } out_status: - trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); + put_cred(key.cred); + trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); return status; open_file: - nf = new; - /* Take reference for the hashtable */ - refcount_inc(&nf->nf_ref); - __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); - __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); - hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head); - ++nfsd_file_hashtbl[hashval].nfb_count; - nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, - nfsd_file_hashtbl[hashval].nfb_count); - spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); - atomic_long_inc(&nfsd_filecache_count); - nf->nf_mark = nfsd_file_mark_find_or_create(nf); if (nf->nf_mark) { if (open) { @@ -1224,19 +1190,20 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, * If construction failed, or we raced with a call to unlink() * then unhash. 
*/ - if (status != nfs_ok || inode->i_nlink == 0) { - bool do_free; - nfsd_file_lru_remove(nf); - spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); - do_free = nfsd_file_unhash(nf); - spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); - if (do_free) + if (status != nfs_ok || key.inode->i_nlink == 0) + if (nfsd_file_unhash(nf)) nfsd_file_put_noref(nf); - } clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); smp_mb__after_atomic(); wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); goto out; + +insert_err: + nfsd_file_slab_free(&new->nf_rcu); + trace_nfsd_file_insert_err(rqstp, key.inode, may_flags, PTR_ERR(nf)); + nf = NULL; + status = nfserr_jukebox; + goto out_status; } /** @@ -1282,21 +1249,23 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) { unsigned long releases = 0, pages_flushed = 0, evictions = 0; unsigned long hits = 0, acquisitions = 0; - unsigned int i, count = 0, longest = 0; + unsigned int i, count = 0, buckets = 0; unsigned long lru = 0, total_age = 0; - /* - * No need for spinlocks here since we're not terribly interested in - * accuracy. 
We do take the nfsd_mutex simply to ensure that we - * don't end up racing with server shutdown - */ + /* Serialize with server shutdown */ mutex_lock(&nfsd_mutex); if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) { - for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { - count += nfsd_file_hashtbl[i].nfb_count; - longest = max(longest, nfsd_file_hashtbl[i].nfb_count); - } + struct bucket_table *tbl; + struct rhashtable *ht; + lru = list_lru_count(&nfsd_file_lru); + + rcu_read_lock(); + ht = &nfsd_file_rhash_tbl; + count = atomic_read(&ht->nelems); + tbl = rht_dereference_rcu(ht->tbl, ht); + buckets = tbl->size; + rcu_read_unlock(); } mutex_unlock(&nfsd_mutex); @@ -1310,7 +1279,7 @@ static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) } seq_printf(m, "total entries: %u\n", count); - seq_printf(m, "longest chain: %u\n", longest); + seq_printf(m, "hash buckets: %u\n", buckets); seq_printf(m, "lru entries: %lu\n", lru); seq_printf(m, "cache hits: %lu\n", hits); seq_printf(m, "acquisitions: %lu\n", acquisitions); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index af609590ac86ac..68b02497233d22 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -780,7 +780,7 @@ DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc); DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); DEFINE_NFSD_FILE_EVENT(nfsd_file_put); -DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose); TRACE_EVENT(nfsd_file_acquire, TP_PROTO( @@ -823,6 +823,67 @@ TRACE_EVENT(nfsd_file_acquire, __entry->nf_file, __entry->status) ); +TRACE_EVENT(nfsd_file_insert_err, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct inode *inode, + unsigned int may_flags, + long error + ), + TP_ARGS(rqstp, inode, may_flags, error), + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, inode) + __field(unsigned long, may_flags) + __field(long, error) + ), + TP_fast_assign( + __entry->xid = 
be32_to_cpu(rqstp->rq_xid); + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->error = error; + ), + TP_printk("xid=0x%x inode=%p may_flags=%s error=%ld", + __entry->xid, __entry->inode, + show_nfsd_may_flags(__entry->may_flags), + __entry->error + ) +); + +TRACE_EVENT(nfsd_file_cons_err, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct inode *inode, + unsigned int may_flags, + const struct nfsd_file *nf + ), + TP_ARGS(rqstp, inode, may_flags, nf), + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, inode) + __field(unsigned long, may_flags) + __field(unsigned int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(const void *, nf_file) + ), + TP_fast_assign( + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p", + __entry->xid, __entry->inode, + show_nfsd_may_flags(__entry->may_flags), __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), __entry->nf_file + ) +); + TRACE_EVENT(nfsd_file_open, TP_PROTO(struct nfsd_file *nf, __be32 status), TP_ARGS(nf, status), From 1b5a6bb08d96d26e48190c1ac945a08840b1125d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:26:36 -0400 Subject: [PATCH 0600/1250] NFSD: Clean up unused code after rhashtable conversion Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 33 +-------------------------------- fs/nfsd/filecache.h | 1 - 2 files changed, 1 insertion(+), 33 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index b0f30d10e17f43..05a1e757e535bb 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -22,11 +22,6 @@ #include "filecache.h" #include "trace.h" 
-#define NFSDDBG_FACILITY NFSDDBG_FH - -/* FIXME: dynamically size this for the machine somehow? */ -#define NFSD_FILE_HASH_BITS 12 -#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS) #define NFSD_LAUNDRETTE_DELAY (2 * HZ) #define NFSD_FILE_CACHE_UP (0) @@ -34,13 +29,6 @@ /* We only care about NFSD_MAY_READ/WRITE for this cache */ #define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) -struct nfsd_fcache_bucket { - struct hlist_head nfb_head; - spinlock_t nfb_lock; - unsigned int nfb_count; - unsigned int nfb_maxcount; -}; - static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions); static DEFINE_PER_CPU(unsigned long, nfsd_file_releases); @@ -58,7 +46,6 @@ static struct workqueue_struct *nfsd_filecache_wq __read_mostly; static struct kmem_cache *nfsd_file_slab; static struct kmem_cache *nfsd_file_mark_slab; -static struct nfsd_fcache_bucket *nfsd_file_hashtbl; static struct list_lru nfsd_file_lru; static unsigned long nfsd_file_flags; static struct fsnotify_group *nfsd_file_fsnotify_group; @@ -303,7 +290,6 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); if (nf) { - INIT_HLIST_NODE(&nf->nf_node); INIT_LIST_HEAD(&nf->nf_lru); nf->nf_birthtime = ktime_get(); nf->nf_file = NULL; @@ -817,8 +803,7 @@ static const struct fsnotify_ops nfsd_file_fsnotify_ops = { int nfsd_file_cache_init(void) { - int ret; - unsigned int i; + int ret; lockdep_assert_held(&nfsd_mutex); if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) @@ -833,13 +818,6 @@ nfsd_file_cache_init(void) if (!nfsd_filecache_wq) goto out; - nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE, - sizeof(*nfsd_file_hashtbl), GFP_KERNEL); - if (!nfsd_file_hashtbl) { - pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n"); - goto out_err; - } - nfsd_file_slab = kmem_cache_create("nfsd_file", sizeof(struct nfsd_file), 0, 0, NULL); if (!nfsd_file_slab) { @@ -883,11 
+861,6 @@ nfsd_file_cache_init(void) goto out_notifier; } - for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { - INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head); - spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock); - } - INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker); out: return ret; @@ -902,8 +875,6 @@ nfsd_file_cache_init(void) nfsd_file_slab = NULL; kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - kvfree(nfsd_file_hashtbl); - nfsd_file_hashtbl = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; rhashtable_destroy(&nfsd_file_rhash_tbl); @@ -1033,8 +1004,6 @@ nfsd_file_cache_shutdown(void) fsnotify_wait_marks_destroyed(); kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - kvfree(nfsd_file_hashtbl); - nfsd_file_hashtbl = NULL; destroy_workqueue(nfsd_filecache_wq); nfsd_filecache_wq = NULL; rhashtable_destroy(&nfsd_file_rhash_tbl); diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 7fc017e7b09efb..7b40d5b446e12a 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -30,7 +30,6 @@ struct nfsd_file_mark { */ struct nfsd_file { struct rhash_head nf_rhash; - struct hlist_node nf_node; struct list_head nf_lru; struct rcu_head nf_rcu; struct file *nf_file; From 7d3d44756e19705d8bdc3a708115abf467113ce5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:26:43 -0400 Subject: [PATCH 0601/1250] NFSD: Separate tracepoints for acquire and create These tracepoints collect different information: the create case does not open a file, so there's no nf_file available. 
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 9 ++++---- fs/nfsd/nfs4state.c | 1 + fs/nfsd/trace.h | 54 ++++++++++++++++++++++++++++++++++++++------- 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 05a1e757e535bb..a5a35fb2a7edbe 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -1046,7 +1046,7 @@ nfsd_file_is_cached(struct inode *inode) } static __be32 -nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, +nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf, bool open) { struct nfsd_file_lookup_key key = { @@ -1141,7 +1141,8 @@ nfsd_do_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, out_status: put_cred(key.cred); - trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); + if (open) + trace_nfsd_file_acquire(rqstp, key.inode, may_flags, nf, status); return status; open_file: @@ -1189,7 +1190,7 @@ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, true); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, true); } /** @@ -1206,7 +1207,7 @@ __be32 nfsd_file_create(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **pnf) { - return nfsd_do_file_acquire(rqstp, fhp, may_flags, pnf, false); + return nfsd_file_do_acquire(rqstp, fhp, may_flags, pnf, false); } /* diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9409a0dc1b7674..3a05c095dfe536 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5104,6 +5104,7 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, goto out_put_access; nf->nf_file = open->op_filp; open->op_filp = NULL; + trace_nfsd_file_create(rqstp, access, nf); } spin_lock(&fp->fi_lock); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 
68b02497233d22..1c4cf9d2dd8e79 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -784,10 +784,10 @@ DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose); TRACE_EVENT(nfsd_file_acquire, TP_PROTO( - struct svc_rqst *rqstp, - struct inode *inode, + const struct svc_rqst *rqstp, + const struct inode *inode, unsigned int may_flags, - struct nfsd_file *nf, + const struct nfsd_file *nf, __be32 status ), @@ -795,12 +795,12 @@ TRACE_EVENT(nfsd_file_acquire, TP_STRUCT__entry( __field(u32, xid) - __field(void *, inode) + __field(const void *, inode) __field(unsigned long, may_flags) - __field(int, nf_ref) + __field(unsigned int, nf_ref) __field(unsigned long, nf_flags) __field(unsigned long, nf_may) - __field(struct file *, nf_file) + __field(const void *, nf_file) __field(u32, status) ), @@ -815,12 +815,50 @@ TRACE_EVENT(nfsd_file_acquire, __entry->status = be32_to_cpu(status); ), - TP_printk("xid=0x%x inode=%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=%p status=%u", + TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p status=%u", __entry->xid, __entry->inode, show_nfsd_may_flags(__entry->may_flags), __entry->nf_ref, show_nf_flags(__entry->nf_flags), show_nfsd_may_flags(__entry->nf_may), - __entry->nf_file, __entry->status) + __entry->nf_file, __entry->status + ) +); + +TRACE_EVENT(nfsd_file_create, + TP_PROTO( + const struct svc_rqst *rqstp, + unsigned int may_flags, + const struct nfsd_file *nf + ), + + TP_ARGS(rqstp, may_flags, nf), + + TP_STRUCT__entry( + __field(const void *, nf_inode) + __field(const void *, nf_file) + __field(unsigned long, may_flags) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(unsigned int, nf_ref) + __field(u32, xid) + ), + + TP_fast_assign( + __entry->nf_inode = nf->nf_inode; + __entry->nf_file = nf->nf_file; + __entry->may_flags = may_flags; + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->xid = 
be32_to_cpu(rqstp->rq_xid); + ), + + TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p", + __entry->xid, __entry->nf_inode, + show_nfsd_may_flags(__entry->may_flags), + __entry->nf_ref, show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may), __entry->nf_file + ) ); TRACE_EVENT(nfsd_file_insert_err, From 2d044dc9398775d7fe5eb25e75a1695d65723a60 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:26:49 -0400 Subject: [PATCH 0602/1250] NFSD: Move nfsd_file_trace_alloc() tracepoint Avoid recording the allocation of an nfsd_file item that is immediately released because a matching item was already inserted in the hash. Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 2 +- fs/nfsd/trace.h | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index a5a35fb2a7edbe..8dd15ddfb03b57 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -309,7 +309,6 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); } nf->nf_mark = NULL; - trace_nfsd_file_alloc(nf); } return nf; } @@ -1146,6 +1145,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, return status; open_file: + trace_nfsd_file_alloc(nf); nf->nf_mark = nfsd_file_mark_find_or_create(nf); if (nf->nf_mark) { if (open) { diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 1c4cf9d2dd8e79..96bb6629541e29 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -776,12 +776,35 @@ DEFINE_EVENT(nfsd_file_class, name, \ TP_PROTO(struct nfsd_file *nf), \ TP_ARGS(nf)) -DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc); DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); DEFINE_NFSD_FILE_EVENT(nfsd_file_put); DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_dispose); +TRACE_EVENT(nfsd_file_alloc, + TP_PROTO( + const struct nfsd_file *nf + ), + TP_ARGS(nf), + 
TP_STRUCT__entry( + __field(const void *, nf_inode) + __field(unsigned long, nf_flags) + __field(unsigned long, nf_may) + __field(unsigned int, nf_ref) + ), + TP_fast_assign( + __entry->nf_inode = nf->nf_inode; + __entry->nf_flags = nf->nf_flags; + __entry->nf_ref = refcount_read(&nf->nf_ref); + __entry->nf_may = nf->nf_may; + ), + TP_printk("inode=%p ref=%u flags=%s may=%s", + __entry->nf_inode, __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nfsd_may_flags(__entry->nf_may) + ) +); + TRACE_EVENT(nfsd_file_acquire, TP_PROTO( const struct svc_rqst *rqstp, From 4ba22226f3beae8f9d4764baee6e5e3cf6878659 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:27:02 -0400 Subject: [PATCH 0603/1250] NFSD: NFSv4 CLOSE should release an nfsd_file immediately The last close of a file should enable other accessors to open and use that file immediately. Leaving the file open in the filecache prevents other users from accessing that file until the filecache garbage-collects the file -- sometimes that takes several seconds. Reported-by: Wang Yugui Link: https://bugzilla.linux-nfs.org/show_bug.cgi?id=387 Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 18 ++++++++++++++++++ fs/nfsd/filecache.h | 1 + fs/nfsd/nfs4state.c | 4 ++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 8dd15ddfb03b57..fef2f6570f0a62 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -458,6 +458,24 @@ nfsd_file_put(struct nfsd_file *nf) nfsd_file_put_noref(nf); } +/** + * nfsd_file_close - Close an nfsd_file + * @nf: nfsd_file to close + * + * If this is the final reference for @nf, free it immediately. + * This reflects an on-the-wire CLOSE or DELEGRETURN into the + * VFS and exported filesystem.
+ */ +void nfsd_file_close(struct nfsd_file *nf) +{ + nfsd_file_put(nf); + if (refcount_dec_if_one(&nf->nf_ref)) { + nfsd_file_unhash(nf); + nfsd_file_lru_remove(nf); + nfsd_file_free(nf); + } +} + struct nfsd_file * nfsd_file_get(struct nfsd_file *nf) { diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 7b40d5b446e12a..c5ddc877116b50 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -54,6 +54,7 @@ void nfsd_file_cache_shutdown(void); int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); +void nfsd_file_close(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); bool nfsd_file_is_cached(struct inode *inode); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3a05c095dfe536..9d1a3e131c49e7 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -820,9 +820,9 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) - nfsd_file_put(f1); + nfsd_file_close(f1); if (f2) - nfsd_file_put(f2); + nfsd_file_close(f2); } } From 49dd50b0bad5d8c8c6d57887b0be91567d6a78b7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 8 Jul 2022 14:27:09 -0400 Subject: [PATCH 0604/1250] NFSD: Ensure nf_inode is never dereferenced The documenting comment for struct nfsd_file states: /* * A representation of a file that has been opened by knfsd. These are hashed * in the hashtable by inode pointer value. Note that this object doesn't * hold a reference to the inode by itself, so the nf_inode pointer should * never be dereferenced, only used for comparison. */ Replace the two existing dereferences to make the comment always true.
Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/filecache.c | 5 ++--- fs/nfsd/filecache.h | 2 +- fs/nfsd/nfs4state.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index fef2f6570f0a62..4758c2a3fcf8fd 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -228,12 +228,11 @@ nfsd_file_mark_put(struct nfsd_file_mark *nfm) } static struct nfsd_file_mark * -nfsd_file_mark_find_or_create(struct nfsd_file *nf) +nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode) { int err; struct fsnotify_mark *mark; struct nfsd_file_mark *nfm = NULL, *new; - struct inode *inode = nf->nf_inode; do { fsnotify_group_lock(nfsd_file_fsnotify_group); @@ -1164,7 +1163,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, open_file: trace_nfsd_file_alloc(nf); - nf->nf_mark = nfsd_file_mark_find_or_create(nf); + nf->nf_mark = nfsd_file_mark_find_or_create(nf, key.inode); if (nf->nf_mark) { if (open) { status = nfsd_open_verified(rqstp, fhp, may_flags, diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index c5ddc877116b50..d534b76cb65b8b 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -41,7 +41,7 @@ struct nfsd_file { #define NFSD_FILE_BREAK_WRITE (3) #define NFSD_FILE_REFERENCED (4) unsigned long nf_flags; - struct inode *nf_inode; + struct inode *nf_inode; /* don't deref */ refcount_t nf_ref; unsigned char nf_may; struct nfsd_file_mark *nf_mark; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9d1a3e131c49e7..994bd11bafe039 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2564,7 +2564,7 @@ static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f) static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { - struct inode *inode = f->nf_inode; + struct inode *inode = file_inode(f->nf_file); seq_printf(s, "superblock: \"%02x:%02x:%ld\"", MAJOR(inode->i_sb->s_dev), From 
a505db20ee1b690e4b301d426344c3f412111dbf Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 15 Jul 2022 16:54:51 -0700 Subject: [PATCH 0605/1250] NFSD: refactoring v4 specific code to a helper in nfs4state.c This patch moves the v4 specific code from nfsd_init_net() to nfsd4_init_leases_net() helper in nfs4state.c Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 12 ++++++++++++ fs/nfsd/nfsctl.c | 9 +-------- fs/nfsd/nfsd.h | 4 ++++ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 994bd11bafe039..8676f4c71fa408 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4330,6 +4330,18 @@ nfsd4_init_slabs(void) return -ENOMEM; } +void nfsd4_init_leases_net(struct nfsd_net *nn) +{ + nn->nfsd4_lease = 90; /* default lease time */ + nn->nfsd4_grace = 90; + nn->somebody_reclaimed = false; + nn->track_reclaim_completes = false; + nn->clverifier_counter = prandom_u32(); + nn->clientid_base = prandom_u32(); + nn->clientid_counter = nn->clientid_base + 1; + nn->s2s_cp_cl_id = nn->clientid_counter++; +} + static void init_nfs4_replay(struct nfs4_replay *rp) { rp->rp_status = nfserr_serverfault; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7002edbf26870e..164c822ae3ae9d 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1484,14 +1484,7 @@ static __net_init int nfsd_init_net(struct net *net) retval = nfsd_reply_cache_init(nn); if (retval) goto out_drc_error; - nn->nfsd4_lease = 90; /* default lease time */ - nn->nfsd4_grace = 90; - nn->somebody_reclaimed = false; - nn->track_reclaim_completes = false; - nn->clverifier_counter = prandom_u32(); - nn->clientid_base = prandom_u32(); - nn->clientid_counter = nn->clientid_base + 1; - nn->s2s_cp_cl_id = nn->clientid_counter++; + nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 
9a8b09afc17333..ef8087691138a4 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -496,12 +496,16 @@ extern void unregister_cld_notifier(void); extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn); #endif +extern void nfsd4_init_leases_net(struct nfsd_net *nn); + #else /* CONFIG_NFSD_V4 */ static inline int nfsd4_is_junction(struct dentry *dentry) { return 0; } +static inline void nfsd4_init_leases_net(struct nfsd_net *nn) {}; + #define register_cld_notifier() 0 #define unregister_cld_notifier() do { } while(0) From ec56473c25fc81ae3cbd115c580515f5b8207d17 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 15 Jul 2022 16:54:52 -0700 Subject: [PATCH 0606/1250] NFSD: keep track of the number of v4 clients in the system Add counter nfs4_client_count to keep track of the total number of v4 clients, including courtesy clients, in the system. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 2 ++ fs/nfsd/nfs4state.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 1b1a962a18041c..ce864f001a3eeb 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -189,6 +189,8 @@ struct nfsd_net { struct nfsd_fcache_disposal *fcache_disposal; siphash_key_t siphash_key; + + atomic_t nfs4_client_count; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8676f4c71fa408..4842cc827f082a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2053,7 +2053,8 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) * This type of memory management is somewhat inefficient, but we use it * anyway since SETCLIENTID is not a common operation. 
*/ -static struct nfs4_client *alloc_client(struct xdr_netobj name) +static struct nfs4_client *alloc_client(struct xdr_netobj name, + struct nfsd_net *nn) { struct nfs4_client *clp; int i; @@ -2076,6 +2077,7 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) atomic_set(&clp->cl_rpc_users, 0); clp->cl_cb_state = NFSD4_CB_UNKNOWN; clp->cl_state = NFSD4_ACTIVE; + atomic_inc(&nn->nfs4_client_count); atomic_set(&clp->cl_delegs_in_recall, 0); INIT_LIST_HEAD(&clp->cl_idhash); INIT_LIST_HEAD(&clp->cl_openowners); @@ -2183,6 +2185,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp) static void __destroy_client(struct nfs4_client *clp) { + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); int i; struct nfs4_openowner *oo; struct nfs4_delegation *dp; @@ -2226,6 +2229,7 @@ __destroy_client(struct nfs4_client *clp) nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) svc_xprt_put(clp->cl_cb_conn.cb_xprt); + atomic_add_unless(&nn->nfs4_client_count, -1, 0); free_client(clp); wake_up_all(&expiry_wq); } @@ -2848,7 +2852,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct dentry *dentries[ARRAY_SIZE(client_files)]; - clp = alloc_client(name); + clp = alloc_client(name, nn); if (clp == NULL) return NULL; @@ -4340,6 +4344,8 @@ void nfsd4_init_leases_net(struct nfsd_net *nn) nn->clientid_base = prandom_u32(); nn->clientid_counter = nn->clientid_base + 1; nn->s2s_cp_cl_id = nn->clientid_counter++; + + atomic_set(&nn->nfs4_client_count, 0); } static void init_nfs4_replay(struct nfs4_replay *rp) From 2a1bd2f6b336916824b6e537f75d48e4e4b4a56f Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Fri, 15 Jul 2022 16:54:53 -0700 Subject: [PATCH 0607/1250] NFSD: limit the number of v4 clients to 1024 per 1GB of system memory Currently there is no limit on how many v4 clients are supported by the system. 
This can be a problem in systems with a small memory configuration, which may fail to function properly when a very large number of clients exist and create memory shortage conditions. This patch enforces a limit of 1024 NFSv4 clients, including courtesy clients, per 1GB of system memory. When the number of clients reaches the limit, requests that create new clients are returned with NFS4ERR_DELAY and the laundromat is kick-started to trim old clients. Due to the overhead of the upcall to remove the client record, the maximum number of clients the laundromat removes on each run is limited to 128. This is done to ensure the laundromat can still process the other tasks in a timely manner. Since there is now a limit on the number of clients, the 24-hr idle time limit of courtesy clients is no longer needed and was removed. Signed-off-by: Dai Ngo Signed-off-by: Chuck Lever --- fs/nfsd/netns.h | 1 + fs/nfsd/nfs4state.c | 27 +++++++++++++++++++++------ fs/nfsd/nfsd.h | 2 ++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ce864f001a3eeb..ffe17743cc74be 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -191,6 +191,7 @@ struct nfsd_net { siphash_key_t siphash_key; atomic_t nfs4_client_count; + int nfs4_max_clients; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4842cc827f082a..e46e3392d5577e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2059,6 +2059,10 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name, struct nfs4_client *clp; int i; + if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) { + mod_delayed_work(laundry_wq, &nn->laundromat_work, 0); + return NULL; + } clp = kmem_cache_zalloc(client_slab, GFP_KERNEL); if (clp == NULL) return NULL; @@ -4336,6 +4340,9 @@ nfsd4_init_slabs(void) void nfsd4_init_leases_net(struct nfsd_net *nn) { + struct sysinfo si; + u64 max_clients; + nn->nfsd4_lease = 90;
/* default lease time */ nn->nfsd4_grace = 90; nn->somebody_reclaimed = false; @@ -4346,6 +4353,10 @@ void nfsd4_init_leases_net(struct nfsd_net *nn) nn->s2s_cp_cl_id = nn->clientid_counter++; atomic_set(&nn->nfs4_client_count, 0); + si_meminfo(&si); + max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024); + max_clients *= NFS4_CLIENTS_PER_GB; + nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB); } static void init_nfs4_replay(struct nfs4_replay *rp) @@ -5811,9 +5822,12 @@ static void nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, struct laundry_time *lt) { + unsigned int maxreap, reapcnt = 0; struct list_head *pos, *next; struct nfs4_client *clp; + maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ? + NFSD_CLIENT_MAX_TRIM_PER_RUN : 0; INIT_LIST_HEAD(reaplist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { @@ -5824,14 +5838,15 @@ nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist, break; if (!atomic_read(&clp->cl_rpc_users)) clp->cl_state = NFSD4_COURTESY; - if (!client_has_state(clp) || - ktime_get_boottime_seconds() >= - (clp->cl_time + NFSD_COURTESY_CLIENT_TIMEOUT)) + if (!client_has_state(clp)) goto exp_client; - if (nfs4_anylock_blockers(clp)) { + if (!nfs4_anylock_blockers(clp)) + if (reapcnt >= maxreap) + continue; exp_client: - if (!mark_client_expired_locked(clp)) - list_add(&clp->cl_lru, reaplist); + if (!mark_client_expired_locked(clp)) { + list_add(&clp->cl_lru, reaplist); + reapcnt++; } } spin_unlock(&nn->client_lock); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index ef8087691138a4..57a468ed85c358 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -341,6 +341,8 @@ void nfsd_lockd_shutdown(void); #define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */ #define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */ +#define NFSD_CLIENT_MAX_TRIM_PER_RUN 128 +#define NFS4_CLIENTS_PER_GB 1024 /* * The following attributes are 
currently not supported by the NFSv4 server: From 70ccfc6f852c76c8aca6e0fa8510b959207a48cd Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 18 Jul 2022 17:29:38 +0200 Subject: [PATCH 0608/1250] parisc: Clean up names in hardware database Stop guessing and just use the names for the hardware we know so far. Signed-off-by: Helge Deller --- arch/parisc/kernel/hardware.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/parisc/kernel/hardware.c b/arch/parisc/kernel/hardware.c index 17161e72ea2991..357d9cdab7ce4d 100644 --- a/arch/parisc/kernel/hardware.c +++ b/arch/parisc/kernel/hardware.c @@ -23,9 +23,6 @@ * HP PARISC Hardware Database * Access to this database is only possible during bootup * so don't reference this table after starting the init process - * - * NOTE: Product names which are listed here and ends with a '?' - * are guessed. If you know the correct name, please let us know. */ static struct hp_hardware hp_hardware_list[] __initdata = { @@ -212,7 +209,7 @@ static struct hp_hardware hp_hardware_list[] __initdata = { {HPHW_NPROC,0x5DD,0x4,0x81,"Duet W2"}, {HPHW_NPROC,0x5DE,0x4,0x81,"Piccolo W+"}, {HPHW_NPROC,0x5DF,0x4,0x81,"Cantata W2"}, - {HPHW_NPROC,0x5DF,0x0,0x00,"Marcato W+ (rp5470)?"}, + {HPHW_NPROC,0x5DF,0x0,0x00,"Marcato W+ (rp5470)"}, {HPHW_NPROC,0x5E0,0x4,0x91,"Cantata DC- W2"}, {HPHW_NPROC,0x5E1,0x4,0x91,"Crescendo DC- W2"}, {HPHW_NPROC,0x5E2,0x4,0x91,"Crescendo 650 W2"}, @@ -266,11 +263,11 @@ static struct hp_hardware hp_hardware_list[] __initdata = { {HPHW_NPROC,0x888,0x4,0x91,"Storm Peak Fast DC-"}, {HPHW_NPROC,0x889,0x4,0x91,"Storm Peak Fast"}, {HPHW_NPROC,0x88A,0x4,0x91,"Crestone Peak Slow"}, - {HPHW_NPROC,0x88B,0x4,0x91,"Crestone Peak Fast?"}, + {HPHW_NPROC,0x88B,0x4,0x91,"Crestone Peak Fast"}, {HPHW_NPROC,0x88C,0x4,0x91,"Orca Mako+"}, {HPHW_NPROC,0x88D,0x4,0x91,"Rainier/Medel Mako+ Slow"}, {HPHW_NPROC,0x88E,0x4,0x91,"Rainier/Medel Mako+ Fast"}, - {HPHW_NPROC,0x892,0x4,0x91,"Mt. 
Hamilton Slow Mako+?"}, + {HPHW_NPROC,0x892,0x4,0x91,"Mt. Hamilton Slow Mako+"}, {HPHW_NPROC,0x894,0x4,0x91,"Mt. Hamilton Fast Mako+"}, {HPHW_NPROC,0x895,0x4,0x91,"Storm Peak Slow Mako+"}, {HPHW_NPROC,0x896,0x4,0x91,"Storm Peak Fast Mako+"}, @@ -1198,7 +1195,7 @@ static struct hp_hardware hp_hardware_list[] __initdata = { {HPHW_FIO, 0x004, 0x00340, 0x0, "BARCO CX4500 VME Grphx Cnsl"}, {HPHW_FIO, 0x004, 0x00360, 0x0, "Hughes TOG VME FDDI"}, {HPHW_FIO, 0x076, 0x000AD, 0x0, "Crestone Peak Core RS-232"}, - {HPHW_FIO, 0x077, 0x000AD, 0x0, "Crestone Peak Fast? Core RS-232"}, + {HPHW_FIO, 0x077, 0x000AD, 0x0, "Crestone Peak Fast Core RS-232"}, {HPHW_IOA, 0x185, 0x0000B, 0x00, "Java BC Summit Port"}, {HPHW_IOA, 0x1FF, 0x0000B, 0x00, "Hitachi Ghostview Summit Port"}, {HPHW_IOA, 0x580, 0x0000B, 0x10, "U2-IOA BC Runway Port"}, From 8c21b490b959e8385d732c754056bd94b76e9176 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 18 Jul 2022 17:06:47 +0200 Subject: [PATCH 0609/1250] parisc: Show device names in /proc/iomem Improve the output of /proc/iomem to show the real hardware device name including the pa_pathname, e.g. "Merlin 160 Core Centronics [8:16:0]". Up to now only the pa_pathname ("[8:16.0]") was shown. Signed-off-by: Helge Deller --- arch/parisc/kernel/drivers.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index 776d624a7207b1..d126e78e101ae0 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -520,7 +520,6 @@ alloc_pa_dev(unsigned long hpa, struct hardware_path *mod_path) dev->id.hversion_rev = iodc_data[1] & 0x0f; dev->id.sversion = ((iodc_data[4] & 0x0f) << 16) | (iodc_data[5] << 8) | iodc_data[6]; - dev->hpa.name = parisc_pathname(dev); dev->hpa.start = hpa; /* This is awkward. The STI spec says that gfx devices may occupy * 32MB or 64MB. 
Unfortunately, we don't know how to tell whether @@ -534,10 +533,10 @@ alloc_pa_dev(unsigned long hpa, struct hardware_path *mod_path) dev->hpa.end = hpa + 0xfff; } dev->hpa.flags = IORESOURCE_MEM; - name = parisc_hardware_description(&dev->id); - if (name) { - strlcpy(dev->name, name, sizeof(dev->name)); - } + dev->hpa.name = dev->name; + name = parisc_hardware_description(&dev->id) ? : "unknown"; + snprintf(dev->name, sizeof(dev->name), "%s [%s]", + name, parisc_pathname(dev)); /* Silently fail things like mouse ports which are subsumed within * the keyboard controller From ecc5ca8e5107c7f84bcee4f1418455726304808b Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Fri, 15 Jul 2022 12:49:15 +0800 Subject: [PATCH 0610/1250] parisc: Fix comment typo in fault.c The double `the' is duplicated in line 41, remove one. Signed-off-by: Jason Wang Signed-off-by: Helge Deller --- arch/parisc/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 84bc437be5cd1f..c9d48bcdc933f6 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -38,7 +38,7 @@ int show_unhandled_signals = 1; /* * parisc_acctyp(unsigned int inst) -- * Given a PA-RISC memory access instruction, determine if the - * the instruction would perform a memory read or memory write + * instruction would perform a memory read or memory write * operation. * * This function assumes that the given instruction is a memory access From 210fe48b578ae43b048a217e1c0c0a353e74f2e2 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 22 Jun 2022 10:43:26 -0700 Subject: [PATCH 0611/1250] ntb_perf: Fix 64-bit division on 32-bit architectures When compiling for a 32-bit architecture, such as arm, an error occurs during modpost: ERROR: modpost: "__aeabi_uldivmod" [drivers/ntb/test/ntb_perf.ko] undefined! The tries member of struct perf_thread is u64 so a 64-bit division helper is needed. 
Use div_u64_rem() to get the remainder of the division so that it can be checked against zero. Fixes: dc150dfb081f ("ntb_perf: extend with burst latency measurement") Reported-by: kernel test robot Signed-off-by: Nathan Chancellor Signed-off-by: Jon Mason --- drivers/ntb/test/ntb_perf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index 23e154bd41b94b..4e05c7aa070d39 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -1118,6 +1118,7 @@ static int perf_run_latency(struct perf_thread *pthr) void __iomem *flt_dst, *bnd_dst; void *flt_src; u64 stop_at; + u32 rem; int ret; pthr->tries = 0; @@ -1146,7 +1147,8 @@ static int perf_run_latency(struct perf_thread *pthr) } /* Avoid processor soft lock-ups */ - if (!(pthr->tries % RESCHEDULE_RATIO)) + div_u64_rem(pthr->tries, RESCHEDULE_RATIO, &rem); + if (!rem) schedule(); } From f0f5670fdc4fa88ee3aa5a38a31b06aa99899b20 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 11 Jul 2022 16:01:48 -0700 Subject: [PATCH 0612/1250] ntb: idt: fix clang -Wformat warnings When building with Clang we encounter these warnings: | drivers/ntb/hw/idt/ntb_hw_idt.c:2409:28: error: format specifies type | 'unsigned char' but the argument has type 'int' [-Werror,-Wformat] | "\t%hhu-%hhu.\t", idx + cnt - 1); - | drivers/ntb/hw/idt/ntb_hw_idt.c:2438:29: error: format specifies type | 'unsigned char' but the argument has type 'int' [-Werror,-Wformat] | "\t%hhu-%hhu.\t", idx + cnt - 1); - | drivers/ntb/hw/idt/ntb_hw_idt.c:2484:15: error: format specifies type | 'unsigned char' but the argument has type 'int' [-Werror,-Wformat], src); For the first two warnings the format specifier used is `%hhu` which describes a u8. Both `idx` and `cnt` are u8 as well. However, the expression as a whole is promoted to an int as you cannot get smaller-than-int from addition. 
Therefore, to fix the warning, use the promoted-to-type's format specifier -- in this case `%d`. example: `` uint8_t a = 4, b = 7; int size = sizeof(a + b - 1); printf("%d\n", size); // output: 4 ``` For the last warning, src is of type `int` while the format specifier describes a u8. The fix here is just to use the proper specifier `%d`. See more: (https://wiki.sei.cmu.edu/confluence/display/c/INT02-C.+Understand+integer+conversion+rules) "Integer types smaller than int are promoted when an operation is performed on them. If all values of the original type can be represented as an int, the value of the smaller type is converted to an int; otherwise, it is converted to an unsigned int." Link: https://github.com/ClangBuiltLinux/linux/issues/378 Signed-off-by: Justin Stitt Acked-by: Serge Semin Signed-off-by: Jon Mason --- drivers/ntb/hw/idt/ntb_hw_idt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/ntb/hw/idt/ntb_hw_idt.c b/drivers/ntb/hw/idt/ntb_hw_idt.c index 733557231ed0bd..0ed6f809ff2eeb 100644 --- a/drivers/ntb/hw/idt/ntb_hw_idt.c +++ b/drivers/ntb/hw/idt/ntb_hw_idt.c @@ -2406,7 +2406,7 @@ static ssize_t idt_dbgfs_info_read(struct file *filp, char __user *ubuf, "\t%hhu.\t", idx); else off += scnprintf(strbuf + off, size - off, - "\t%hhu-%hhu.\t", idx, idx + cnt - 1); + "\t%hhu-%d.\t", idx, idx + cnt - 1); off += scnprintf(strbuf + off, size - off, "%s BAR%hhu, ", idt_get_mw_name(data), ndev->mws[idx].bar); @@ -2435,7 +2435,7 @@ static ssize_t idt_dbgfs_info_read(struct file *filp, char __user *ubuf, "\t%hhu.\t", idx); else off += scnprintf(strbuf + off, size - off, - "\t%hhu-%hhu.\t", idx, idx + cnt - 1); + "\t%hhu-%d.\t", idx, idx + cnt - 1); off += scnprintf(strbuf + off, size - off, "%s BAR%hhu, ", idt_get_mw_name(data), @@ -2480,7 +2480,7 @@ static ssize_t idt_dbgfs_info_read(struct file *filp, char __user *ubuf, int src; data = idt_ntb_msg_read(&ndev->ntb, &src, idx); off += scnprintf(strbuf + off, size - off, - "\t%hhu. 
0x%08x from peer %hhu (Port %hhu)\n", + "\t%hhu. 0x%08x from peer %d (Port %hhu)\n", idx, data, src, ndev->peers[src].port); } off += scnprintf(strbuf + off, size - off, "\n"); From 875d329ae141f95fad37c6537096c6b625869425 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Tue, 12 Jul 2022 13:50:06 -0700 Subject: [PATCH 0613/1250] ntb: intel: add GNR support for Intel PCIe gen5 NTB Add Intel Granite Rapids NTB PCI device ID and related enabling. Expectation is same hardware interface as Sapphire Rapids Xeon platforms. Signed-off-by: Dave Jiang Signed-off-by: Jon Mason --- drivers/ntb/hw/intel/ntb_hw_gen1.c | 4 +++- drivers/ntb/hw/intel/ntb_hw_gen4.c | 2 +- drivers/ntb/hw/intel/ntb_hw_intel.h | 9 +++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.c b/drivers/ntb/hw/intel/ntb_hw_gen1.c index e5f14e20a9ff71..72e2027a71c4dd 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.c @@ -1874,7 +1874,7 @@ static int intel_ntb_pci_probe(struct pci_dev *pdev, rc = gen3_init_dev(ndev); if (rc) goto err_init_dev; - } else if (pdev_is_gen4(pdev)) { + } else if (pdev_is_gen4(pdev) || pdev_is_gen5(pdev)) { ndev->ntb.ops = &intel_ntb4_ops; rc = intel_ntb_init_pci(ndev, pdev); if (rc) @@ -2047,6 +2047,8 @@ static const struct pci_device_id intel_ntb_pci_tbl[] = { /* GEN4 */ {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_ICX)}, + /* SPR has same dev id has ICX but different revision id */ + {PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_NTB_B2B_GNR)}, {0} }; MODULE_DEVICE_TABLE(pci, intel_ntb_pci_tbl); diff --git a/drivers/ntb/hw/intel/ntb_hw_gen4.c b/drivers/ntb/hw/intel/ntb_hw_gen4.c index 4081fc538ff45a..22cac7975b3c66 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen4.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen4.c @@ -197,7 +197,7 @@ int gen4_init_dev(struct intel_ntb_dev *ndev) ppd1 = ioread32(ndev->self_mmio + GEN4_PPD1_OFFSET); if (pdev_is_ICX(pdev)) ndev->ntb.topo = gen4_ppd_topo(ndev, ppd1); - else if
(pdev_is_SPR(pdev) || pdev_is_gen5(pdev)) ndev->ntb.topo = spr_ppd_topo(ndev, ppd1); dev_dbg(&pdev->dev, "ppd %#x topo %s\n", ppd1, ntb_topo_string(ndev->ntb.topo)); diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h index b233d1c6ba2ddb..380ec0d8e0d9db 100644 --- a/drivers/ntb/hw/intel/ntb_hw_intel.h +++ b/drivers/ntb/hw/intel/ntb_hw_intel.h @@ -70,6 +70,8 @@ #define PCI_DEVICE_ID_INTEL_NTB_SS_BDX 0x6F0F #define PCI_DEVICE_ID_INTEL_NTB_B2B_SKX 0x201C #define PCI_DEVICE_ID_INTEL_NTB_B2B_ICX 0x347e +#define PCI_DEVICE_ID_INTEL_NTB_B2B_SPR 0x347e +#define PCI_DEVICE_ID_INTEL_NTB_B2B_GNR 0x0db4 /* Ntb control and link status */ #define NTB_CTL_CFG_LOCK BIT(0) @@ -225,7 +227,14 @@ static inline int pdev_is_gen4(struct pci_dev *pdev) { if (pdev->device == PCI_DEVICE_ID_INTEL_NTB_B2B_ICX) return 1; + return 0; +} +static inline int pdev_is_gen5(struct pci_dev *pdev) +{ + if (pdev->device == PCI_DEVICE_ID_INTEL_NTB_B2B_GNR) + return 1; return 0; } + #endif From ef685262346b7a3c5092cb3234918f19c2c6f776 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 14 Feb 2022 23:38:41 -0600 Subject: [PATCH 0614/1250] PCI: designware-ep: Allow pcie_ep_set_bar change inbound map address ntb_transfer will set memory map windows after probe, so the inbound map address needs to be updated dynamically.
Signed-off-by: Frank Li Signed-off-by: Jon Mason --- drivers/pci/controller/dwc/pcie-designware-ep.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 0eda8236c125a0..b4cb65d851cd89 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -162,7 +162,11 @@ static int dw_pcie_ep_inbound_atu(struct dw_pcie_ep *ep, u8 func_no, u32 free_win; struct dw_pcie *pci = to_dw_pcie_from_ep(ep); - free_win = find_first_zero_bit(ep->ib_window_map, pci->num_ib_windows); + if (!ep->bar_to_atu[bar]) + free_win = find_first_zero_bit(ep->ib_window_map, pci->num_ib_windows); + else + free_win = ep->bar_to_atu[bar]; + if (free_win >= pci->num_ib_windows) { dev_err(pci->dev, "No free inbound window\n"); return -EINVAL; @@ -216,6 +220,7 @@ static void dw_pcie_ep_clear_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, dw_pcie_disable_atu(pci, atu_index, DW_PCIE_REGION_INBOUND); clear_bit(atu_index, ep->ib_window_map); ep->epf_bar[bar] = NULL; + ep->bar_to_atu[bar] = 0; } static int dw_pcie_ep_set_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, @@ -245,6 +250,9 @@ static int dw_pcie_ep_set_bar(struct pci_epc *epc, u8 func_no, u8 vfunc_no, if (ret) return ret; + if (ep->epf_bar[bar]) + return 0; + dw_pcie_dbi_ro_wr_en(pci); dw_pcie_writel_dbi2(pci, reg, lower_32_bits(size - 1)); From 8a78545603bda792045a3aeb23131a29817f5018 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 14 Feb 2022 23:38:42 -0600 Subject: [PATCH 0615/1250] NTB: epf: Added more flexible memory map method Supported below memory map method bar 0: config and spad data bar 2: door bell bar 4: memory map windows Signed-off-by: Frank Li Signed-off-by: Jon Mason --- drivers/ntb/hw/epf/ntb_hw_epf.c | 48 ++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 13 deletions(-) diff --git a/drivers/ntb/hw/epf/ntb_hw_epf.c 
b/drivers/ntb/hw/epf/ntb_hw_epf.c index b019755e4e21bd..3ece49cb18ffa4 100644 --- a/drivers/ntb/hw/epf/ntb_hw_epf.c +++ b/drivers/ntb/hw/epf/ntb_hw_epf.c @@ -45,7 +45,6 @@ #define NTB_EPF_MIN_DB_COUNT 3 #define NTB_EPF_MAX_DB_COUNT 31 -#define NTB_EPF_MW_OFFSET 2 #define NTB_EPF_COMMAND_TIMEOUT 1000 /* 1 Sec */ @@ -67,6 +66,7 @@ struct ntb_epf_dev { enum pci_barno ctrl_reg_bar; enum pci_barno peer_spad_reg_bar; enum pci_barno db_reg_bar; + enum pci_barno mw_bar; unsigned int mw_count; unsigned int spad_count; @@ -92,6 +92,8 @@ struct ntb_epf_data { enum pci_barno peer_spad_reg_bar; /* BAR that contains Doorbell region and Memory window '1' */ enum pci_barno db_reg_bar; + /* BAR that contains memory windows*/ + enum pci_barno mw_bar; }; static int ntb_epf_send_command(struct ntb_epf_dev *ndev, u32 command, @@ -411,7 +413,7 @@ static int ntb_epf_mw_set_trans(struct ntb_dev *ntb, int pidx, int idx, return -EINVAL; } - bar = idx + NTB_EPF_MW_OFFSET; + bar = idx + ndev->mw_bar; mw_size = pci_resource_len(ntb->pdev, bar); @@ -453,7 +455,7 @@ static int ntb_epf_peer_mw_get_addr(struct ntb_dev *ntb, int idx, if (idx == 0) offset = readl(ndev->ctrl_reg + NTB_EPF_MW1_OFFSET); - bar = idx + NTB_EPF_MW_OFFSET; + bar = idx + ndev->mw_bar; if (base) *base = pci_resource_start(ndev->ntb.pdev, bar) + offset; @@ -565,6 +567,7 @@ static int ntb_epf_init_pci(struct ntb_epf_dev *ndev, struct pci_dev *pdev) { struct device *dev = ndev->dev; + size_t spad_sz, spad_off; int ret; pci_set_drvdata(pdev, ndev); @@ -599,10 +602,16 @@ static int ntb_epf_init_pci(struct ntb_epf_dev *ndev, goto err_dma_mask; } - ndev->peer_spad_reg = pci_iomap(pdev, ndev->peer_spad_reg_bar, 0); - if (!ndev->peer_spad_reg) { - ret = -EIO; - goto err_dma_mask; + if (ndev->peer_spad_reg_bar) { + ndev->peer_spad_reg = pci_iomap(pdev, ndev->peer_spad_reg_bar, 0); + if (!ndev->peer_spad_reg) { + ret = -EIO; + goto err_dma_mask; + } + } else { + spad_sz = 4 * readl(ndev->ctrl_reg + NTB_EPF_SPAD_COUNT); + spad_off = 
readl(ndev->ctrl_reg + NTB_EPF_SPAD_OFFSET); + ndev->peer_spad_reg = ndev->ctrl_reg + spad_off + spad_sz; } ndev->db_reg = pci_iomap(pdev, ndev->db_reg_bar, 0); @@ -657,6 +666,7 @@ static int ntb_epf_pci_probe(struct pci_dev *pdev, enum pci_barno peer_spad_reg_bar = BAR_1; enum pci_barno ctrl_reg_bar = BAR_0; enum pci_barno db_reg_bar = BAR_2; + enum pci_barno mw_bar = BAR_2; struct device *dev = &pdev->dev; struct ntb_epf_data *data; struct ntb_epf_dev *ndev; @@ -671,17 +681,16 @@ static int ntb_epf_pci_probe(struct pci_dev *pdev, data = (struct ntb_epf_data *)id->driver_data; if (data) { - if (data->peer_spad_reg_bar) - peer_spad_reg_bar = data->peer_spad_reg_bar; - if (data->ctrl_reg_bar) - ctrl_reg_bar = data->ctrl_reg_bar; - if (data->db_reg_bar) - db_reg_bar = data->db_reg_bar; + peer_spad_reg_bar = data->peer_spad_reg_bar; + ctrl_reg_bar = data->ctrl_reg_bar; + db_reg_bar = data->db_reg_bar; + mw_bar = data->mw_bar; } ndev->peer_spad_reg_bar = peer_spad_reg_bar; ndev->ctrl_reg_bar = ctrl_reg_bar; ndev->db_reg_bar = db_reg_bar; + ndev->mw_bar = mw_bar; ndev->dev = dev; ntb_epf_init_struct(ndev, pdev); @@ -729,6 +738,14 @@ static const struct ntb_epf_data j721e_data = { .ctrl_reg_bar = BAR_0, .peer_spad_reg_bar = BAR_1, .db_reg_bar = BAR_2, + .mw_bar = BAR_2, +}; + +static const struct ntb_epf_data mx8_data = { + .ctrl_reg_bar = BAR_0, + .peer_spad_reg_bar = BAR_0, + .db_reg_bar = BAR_2, + .mw_bar = BAR_4, }; static const struct pci_device_id ntb_epf_pci_tbl[] = { @@ -737,6 +754,11 @@ static const struct pci_device_id ntb_epf_pci_tbl[] = { .class = PCI_CLASS_MEMORY_RAM << 8, .class_mask = 0xffff00, .driver_data = (kernel_ulong_t)&j721e_data, }, + { + PCI_DEVICE(PCI_VENDOR_ID_FREESCALE, 0x0809), + .class = PCI_CLASS_MEMORY_RAM << 8, .class_mask = 0xffff00, + .driver_data = (kernel_ulong_t)&mx8_data, + }, { }, }; From da51fd247424648b1b4611ad14c091ff0d946681 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 14 Feb 2022 23:38:43 -0600 Subject: [PATCH 0616/1250] 
NTB: EPF: support NTB transfer between PCI RC and EP connection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add NTB function driver and virtual PCI BUS and Virtual NTB driver to implement communication between PCIe RC and PCIe EP devices ┌────────────┐ ┌─────────────────────────────────────┐ │ │ │ │ ├────────────┤ │ ┌──────────────┤ │ NTB │ │ │ NTB │ │ NetDev │ │ │ NetDev │ ├────────────┤ │ ├──────────────┤ │ NTB │ │ │ NTB │ │ Transfer │ │ │ Transfer │ ├────────────┤ │ ├──────────────┤ │ │ │ │ │ │ PCI NTB │ │ │ │ │ EPF │ │ │ │ │ Driver │ │ │ PCI Virtual │ │ │ ├───────────────┐ │ NTB Driver │ │ │ │ PCI EP NTB │◄────►│ │ │ │ │ FN Driver │ │ │ ├────────────┤ ├───────────────┤ ├──────────────┤ │ │ │ │ │ │ │ PCI BUS │ ◄─────► │ PCI EP BUS │ │ Virtual PCI │ │ │ PCI │ │ │ BUS │ └────────────┘ └───────────────┴──────┴──────────────┘ PCI RC PCI EP This driver include 3 part: 1 PCI EP NTB function driver 2 Virtual PCI bus 3 PCI virutal NTB driver, which is loaded only by above virtual pci bus Signed-off-by: Frank Li Reported-by: kernel test robot Signed-off-by: Jon Mason --- drivers/pci/endpoint/functions/Kconfig | 11 + drivers/pci/endpoint/functions/Makefile | 1 + drivers/pci/endpoint/functions/pci-epf-vntb.c | 1425 +++++++++++++++++ 3 files changed, 1437 insertions(+) create mode 100644 drivers/pci/endpoint/functions/pci-epf-vntb.c diff --git a/drivers/pci/endpoint/functions/Kconfig b/drivers/pci/endpoint/functions/Kconfig index 5f1242ca2f4e45..362555b024e8fd 100644 --- a/drivers/pci/endpoint/functions/Kconfig +++ b/drivers/pci/endpoint/functions/Kconfig @@ -25,3 +25,14 @@ config PCI_EPF_NTB device tree. If in doubt, say "N" to disable Endpoint NTB driver. + +config PCI_EPF_VNTB + tristate "PCI Endpoint NTB driver" + depends on PCI_ENDPOINT + select CONFIGFS_FS + help + Select this configuration option to enable the Non-Transparent + Bridge (NTB) driver for PCI Endpoint. 
NTB driver implements NTB + between PCI host and PCIe Endpoint. + + If in doubt, say "N" to disable Endpoint NTB driver. diff --git a/drivers/pci/endpoint/functions/Makefile b/drivers/pci/endpoint/functions/Makefile index 96ab932a537a24..5c13001deaba12 100644 --- a/drivers/pci/endpoint/functions/Makefile +++ b/drivers/pci/endpoint/functions/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_PCI_EPF_TEST) += pci-epf-test.o obj-$(CONFIG_PCI_EPF_NTB) += pci-epf-ntb.o +obj-$(CONFIG_PCI_EPF_VNTB) += pci-epf-vntb.o diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c new file mode 100644 index 00000000000000..ebf7e243eefa44 --- /dev/null +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -0,0 +1,1425 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Endpoint Function Driver to implement Non-Transparent Bridge functionality + * Between PCI RC and EP + * + * Copyright (C) 2020 Texas Instruments + * Copyright (C) 2022 NXP + * + * Based on pci-epf-ntb.c + * Author: Frank Li + * Author: Kishon Vijay Abraham I + */ + +/** + * +------------+ +---------------------------------------+ + * | | | | + * +------------+ | +--------------+ + * | NTB | | | NTB | + * | NetDev | | | NetDev | + * +------------+ | +--------------+ + * | NTB | | | NTB | + * | Transfer | | | Transfer | + * +------------+ | +--------------+ + * | | | | | + * | PCI NTB | | | | + * | EPF | | | | + * | Driver | | | PCI Virtual | + * | | +---------------+ | NTB Driver | + * | | | PCI EP NTB |<------>| | + * | | | FN Driver | | | + * +------------+ +---------------+ +--------------+ + * | | | | | | + * | PCI BUS | <-----> | PCI EP BUS | | Virtual PCI | + * | | PCI | | | BUS | + * +------------+ +---------------+--------+--------------+ + * PCI RC PCI EP + */ + +#include +#include +#include +#include + +#include +#include +#include + +static struct workqueue_struct *kpcintb_workqueue; + +#define COMMAND_CONFIGURE_DOORBELL 1 +#define COMMAND_TEARDOWN_DOORBELL 2 +#define 
COMMAND_CONFIGURE_MW 3 +#define COMMAND_TEARDOWN_MW 4 +#define COMMAND_LINK_UP 5 +#define COMMAND_LINK_DOWN 6 + +#define COMMAND_STATUS_OK 1 +#define COMMAND_STATUS_ERROR 2 + +#define LINK_STATUS_UP BIT(0) + +#define SPAD_COUNT 64 +#define DB_COUNT 4 +#define NTB_MW_OFFSET 2 +#define DB_COUNT_MASK GENMASK(15, 0) +#define MSIX_ENABLE BIT(16) +#define MAX_DB_COUNT 32 +#define MAX_MW 4 + +#define VNTB_VID 0x1957 +#define VNTB_PID 0x080A + +enum epf_ntb_bar { + BAR_CONFIG, + BAR_DB, + BAR_MW0, + BAR_MW1, + BAR_MW2, +}; + +/* + * +--------------------------------------------------+ Base + * | | + * | | + * | | + * | Common Control Register | + * | | + * | | + * | | + * +-----------------------+--------------------------+ Base+span_offset + * | | | + * | Peer Span Space | Span Space | + * | | | + * | | | + * +-----------------------+--------------------------+ Base+span_offset + * | | | +span_count * 4 + * | | | + * | Span Space | Peer Span Space | + * | | | + * +-----------------------+--------------------------+ + * Virtual PCI Pcie Endpoint + * NTB Driver NTB Driver + */ +struct epf_ntb_ctrl { + u32 command; + u32 argument; + u16 command_status; + u16 link_status; + u32 topology; + u64 addr; + u64 size; + u32 num_mws; + u32 reserved; + u32 spad_offset; + u32 spad_count; + u32 db_entry_size; + u32 db_data[MAX_DB_COUNT]; + u32 db_offset[MAX_DB_COUNT]; +} __packed; + +struct epf_ntb { + struct ntb_dev ntb; + struct pci_epf *epf; + struct config_group group; + + u32 num_mws; + u32 db_count; + u32 spad_count; + u64 mws_size[MAX_MW]; + u64 db; + u32 vbus_number; + + bool linkup; + u32 spad_size; + + enum pci_barno epf_ntb_bar[6]; + + struct epf_ntb_ctrl *reg; + + phys_addr_t epf_db_phy; + void __iomem *epf_db; + + phys_addr_t vpci_mw_phy[MAX_MW]; + void __iomem *vpci_mw_addr[MAX_MW]; + + struct delayed_work cmd_handler; +}; + +#define to_epf_ntb(epf_group) container_of((epf_group), struct epf_ntb, group) +#define ntb_ndev(__ntb) container_of(__ntb, struct epf_ntb, ntb) + 
+static struct pci_epf_header epf_ntb_header = { + .vendorid = PCI_ANY_ID, + .deviceid = PCI_ANY_ID, + .baseclass_code = PCI_BASE_CLASS_MEMORY, + .interrupt_pin = PCI_INTERRUPT_INTA, +}; + +/** + * epf_ntb_link_up() - Raise link_up interrupt to Virtual Host + * @ntb: NTB device that facilitates communication between HOST and VHOST + * @link_up: true or false indicating Link is UP or Down + * + * Once NTB function in HOST invoke ntb_link_enable(), + * this NTB function driver will trigger a link event to vhost. + */ +static int epf_ntb_link_up(struct epf_ntb *ntb, bool link_up) +{ + if (link_up) + ntb->reg->link_status |= LINK_STATUS_UP; + else + ntb->reg->link_status &= ~LINK_STATUS_UP; + + ntb_link_event(&ntb->ntb); + return 0; +} + +/** + * epf_ntb_configure_mw() - Configure the Outbound Address Space for vhost + * to access the memory window of host + * @ntb: NTB device that facilitates communication between host and vhost + * @mw: Index of the memory window (either 0, 1, 2 or 3) + * + * EP Outbound Window + * +--------+ +-----------+ + * | | | | + * | | | | + * | | | | + * | | | | + * | | +-----------+ + * | Virtual| | Memory Win| + * | NTB | -----------> | | + * | Driver | | | + * | | +-----------+ + * | | | | + * | | | | + * +--------+ +-----------+ + * VHost PCI EP + */ +static int epf_ntb_configure_mw(struct epf_ntb *ntb, u32 mw) +{ + phys_addr_t phys_addr; + u8 func_no, vfunc_no; + u64 addr, size; + int ret = 0; + + phys_addr = ntb->vpci_mw_phy[mw]; + addr = ntb->reg->addr; + size = ntb->reg->size; + + func_no = ntb->epf->func_no; + vfunc_no = ntb->epf->vfunc_no; + + ret = pci_epc_map_addr(ntb->epf->epc, func_no, vfunc_no, phys_addr, addr, size); + if (ret) + dev_err(&ntb->epf->epc->dev, + "intf: Failed to map memory window %d address\n", mw); + return ret; +} + +/** + * epf_ntb_teardown_mw() - Teardown the configured OB ATU + * @ntb: NTB device that facilitates communication between HOST and vHOST + * @mw: Index of the memory window (either 0, 1, 2 or 3) 
+ * + * Teardown the configured OB ATU configured in epf_ntb_configure_mw() using + * pci_epc_unmap_addr() + */ +static void epf_ntb_teardown_mw(struct epf_ntb *ntb, u32 mw) +{ + pci_epc_unmap_addr(ntb->epf->epc, + ntb->epf->func_no, + ntb->epf->vfunc_no, + ntb->vpci_mw_phy[mw]); +} + +/** + * epf_ntb_cmd_handler() - Handle commands provided by the NTB Host + * @work: work_struct for the epf_ntb_epc + * + * Workqueue function that gets invoked for the two epf_ntb_epc + * periodically (once every 5ms) to see if it has received any commands + * from NTB host. The host can send commands to configure doorbell or + * configure memory window or to update link status. + */ +static void epf_ntb_cmd_handler(struct work_struct *work) +{ + struct epf_ntb_ctrl *ctrl; + u32 command, argument; + struct epf_ntb *ntb; + struct device *dev; + int ret; + int i; + + ntb = container_of(work, struct epf_ntb, cmd_handler.work); + + for (i = 1; i < ntb->db_count; i++) { + if (readl(ntb->epf_db + i * 4)) { + if (readl(ntb->epf_db + i * 4)) + ntb->db |= 1 << (i - 1); + + ntb_db_event(&ntb->ntb, i); + writel(0, ntb->epf_db + i * 4); + } + } + + ctrl = ntb->reg; + command = ctrl->command; + if (!command) + goto reset_handler; + argument = ctrl->argument; + + ctrl->command = 0; + ctrl->argument = 0; + + ctrl = ntb->reg; + dev = &ntb->epf->dev; + + switch (command) { + case COMMAND_CONFIGURE_DOORBELL: + ctrl->command_status = COMMAND_STATUS_OK; + break; + case COMMAND_TEARDOWN_DOORBELL: + ctrl->command_status = COMMAND_STATUS_OK; + break; + case COMMAND_CONFIGURE_MW: + ret = epf_ntb_configure_mw(ntb, argument); + if (ret < 0) + ctrl->command_status = COMMAND_STATUS_ERROR; + else + ctrl->command_status = COMMAND_STATUS_OK; + break; + case COMMAND_TEARDOWN_MW: + epf_ntb_teardown_mw(ntb, argument); + ctrl->command_status = COMMAND_STATUS_OK; + break; + case COMMAND_LINK_UP: + ntb->linkup = true; + ret = epf_ntb_link_up(ntb, true); + if (ret < 0) + ctrl->command_status = COMMAND_STATUS_ERROR; + 
else + ctrl->command_status = COMMAND_STATUS_OK; + goto reset_handler; + case COMMAND_LINK_DOWN: + ntb->linkup = false; + ret = epf_ntb_link_up(ntb, false); + if (ret < 0) + ctrl->command_status = COMMAND_STATUS_ERROR; + else + ctrl->command_status = COMMAND_STATUS_OK; + break; + default: + dev_err(dev, "intf UNKNOWN command: %d\n", command); + break; + } + +reset_handler: + queue_delayed_work(kpcintb_workqueue, &ntb->cmd_handler, + msecs_to_jiffies(5)); +} + +/** + * epf_ntb_config_sspad_bar_clear() - Clear Config + Self scratchpad BAR + * @ntb_epc: EPC associated with one of the HOST which holds peer's outbound + * address. + * + * Clear BAR0 of EP CONTROLLER 1 which contains the HOST1's config and + * self scratchpad region (removes inbound ATU configuration). While BAR0 is + * the default self scratchpad BAR, an NTB could have other BARs for self + * scratchpad (because of reserved BARs). This function can get the exact BAR + * used for self scratchpad from epf_ntb_bar[BAR_CONFIG]. + * + * Please note the self scratchpad region and config region is combined to + * a single region and mapped using the same BAR. Also note HOST2's peer + * scratchpad is HOST1's self scratchpad. + */ +static void epf_ntb_config_sspad_bar_clear(struct epf_ntb *ntb) +{ + struct pci_epf_bar *epf_bar; + enum pci_barno barno; + + barno = ntb->epf_ntb_bar[BAR_CONFIG]; + epf_bar = &ntb->epf->bar[barno]; + + pci_epc_clear_bar(ntb->epf->epc, ntb->epf->func_no, ntb->epf->vfunc_no, epf_bar); +} + +/** + * epf_ntb_config_sspad_bar_set() - Set Config + Self scratchpad BAR + * @ntb: NTB device that facilitates communication between HOST and vHOST + * + * Map BAR0 of EP CONTROLLER 1 which contains the HOST1's config and + * self scratchpad region. + * + * Please note the self scratchpad region and config region is combined to + * a single region and mapped using the same BAR. 
+ */ +static int epf_ntb_config_sspad_bar_set(struct epf_ntb *ntb) +{ + struct pci_epf_bar *epf_bar; + enum pci_barno barno; + u8 func_no, vfunc_no; + struct device *dev; + int ret; + + dev = &ntb->epf->dev; + func_no = ntb->epf->func_no; + vfunc_no = ntb->epf->vfunc_no; + barno = ntb->epf_ntb_bar[BAR_CONFIG]; + epf_bar = &ntb->epf->bar[barno]; + + ret = pci_epc_set_bar(ntb->epf->epc, func_no, vfunc_no, epf_bar); + if (ret) { + dev_err(dev, "inft: Config/Status/SPAD BAR set failed\n"); + return ret; + } + return 0; +} + +/** + * epf_ntb_config_spad_bar_free() - Free the physical memory associated with + * config + scratchpad region + * @ntb: NTB device that facilitates communication between HOST and vHOST + */ +static void epf_ntb_config_spad_bar_free(struct epf_ntb *ntb) +{ + enum pci_barno barno; + + barno = ntb->epf_ntb_bar[BAR_CONFIG]; + pci_epf_free_space(ntb->epf, ntb->reg, barno, 0); +} + +/** + * epf_ntb_config_spad_bar_alloc() - Allocate memory for config + scratchpad + * region + * @ntb: NTB device that facilitates communication between HOST1 and HOST2 + * + * Allocate the Local Memory mentioned in the above diagram. The size of + * CONFIG REGION is sizeof(struct epf_ntb_ctrl) and size of SCRATCHPAD REGION + * is obtained from "spad-count" configfs entry. 
+ */ +static int epf_ntb_config_spad_bar_alloc(struct epf_ntb *ntb) +{ + size_t align; + enum pci_barno barno; + struct epf_ntb_ctrl *ctrl; + u32 spad_size, ctrl_size; + u64 size; + struct pci_epf *epf = ntb->epf; + struct device *dev = &epf->dev; + u32 spad_count; + void *base; + int i; + const struct pci_epc_features *epc_features = pci_epc_get_features(epf->epc, + epf->func_no, + epf->vfunc_no); + barno = ntb->epf_ntb_bar[BAR_CONFIG]; + size = epc_features->bar_fixed_size[barno]; + align = epc_features->align; + + if ((!IS_ALIGNED(size, align))) + return -EINVAL; + + spad_count = ntb->spad_count; + + ctrl_size = sizeof(struct epf_ntb_ctrl); + spad_size = 2 * spad_count * 4; + + if (!align) { + ctrl_size = roundup_pow_of_two(ctrl_size); + spad_size = roundup_pow_of_two(spad_size); + } else { + ctrl_size = ALIGN(ctrl_size, align); + spad_size = ALIGN(spad_size, align); + } + + if (!size) + size = ctrl_size + spad_size; + else if (size < ctrl_size + spad_size) + return -EINVAL; + + base = pci_epf_alloc_space(epf, size, barno, align, 0); + if (!base) { + dev_err(dev, "intf: Config/Status/SPAD alloc region fail\n"); + return -ENOMEM; + } + + ntb->reg = base; + + ctrl = ntb->reg; + ctrl->spad_offset = ctrl_size; + + ctrl->spad_count = spad_count; + ctrl->num_mws = ntb->num_mws; + ntb->spad_size = spad_size; + + ctrl->db_entry_size = 4; + + for (i = 0; i < ntb->db_count; i++) { + ntb->reg->db_data[i] = 1 + i; + ntb->reg->db_offset[i] = 0; + } + + return 0; +} + +/** + * epf_ntb_configure_interrupt() - Configure MSI/MSI-X capaiblity + * @ntb: NTB device that facilitates communication between HOST and vHOST + * + * Configure MSI/MSI-X capability for each interface with number of + * interrupts equal to "db_count" configfs entry. 
+ */ +static int epf_ntb_configure_interrupt(struct epf_ntb *ntb) +{ + const struct pci_epc_features *epc_features; + bool msix_capable, msi_capable; + u8 func_no, vfunc_no; + struct device *dev; + u32 db_count; + int ret; + + dev = &ntb->epf->dev; + + epc_features = pci_epc_get_features(ntb->epf->epc, ntb->epf->func_no, ntb->epf->vfunc_no); + msix_capable = epc_features->msix_capable; + msi_capable = epc_features->msi_capable; + + if (!(msix_capable || msi_capable)) { + dev_err(dev, "MSI or MSI-X is required for doorbell\n"); + return -EINVAL; + } + + func_no = ntb->epf->func_no; + vfunc_no = ntb->epf->vfunc_no; + + db_count = ntb->db_count; + if (db_count > MAX_DB_COUNT) { + dev_err(dev, "DB count cannot be more than %d\n", MAX_DB_COUNT); + return -EINVAL; + } + + ntb->db_count = db_count; + + if (msi_capable) { + ret = pci_epc_set_msi(ntb->epf->epc, func_no, vfunc_no, 16); + if (ret) { + dev_err(dev, "intf: MSI configuration failed\n"); + return ret; + } + } + + return 0; +} + +/** + * epf_ntb_db_bar_init() - Configure Doorbell window BARs + * @ntb: NTB device that facilitates communication between HOST and vHOST + * + */ +static int epf_ntb_db_bar_init(struct epf_ntb *ntb) +{ + const struct pci_epc_features *epc_features; + u32 align; + struct device *dev = &ntb->epf->dev; + int ret; + struct pci_epf_bar *epf_bar; + void __iomem *mw_addr; + enum pci_barno barno; + size_t size = 4 * ntb->db_count; + + epc_features = pci_epc_get_features(ntb->epf->epc, + ntb->epf->func_no, + ntb->epf->vfunc_no); + align = epc_features->align; + + if (size < 128) + size = 128; + + if (align) + size = ALIGN(size, align); + else + size = roundup_pow_of_two(size); + + barno = ntb->epf_ntb_bar[BAR_DB]; + + mw_addr = pci_epf_alloc_space(ntb->epf, size, barno, align, 0); + if (!mw_addr) { + dev_err(dev, "intf: Failed to allocate OB address\n"); + return -ENOMEM; + } + + ntb->epf_db = mw_addr; + + epf_bar = &ntb->epf->bar[barno]; + + ret = pci_epc_set_bar(ntb->epf->epc, 
ntb->epf->func_no, ntb->epf->vfunc_no, epf_bar); + if (ret) { + dev_err(dev, "intf: DoorBell BAR set failed\n"); + goto err_alloc_peer_mem; + } + return ret; + +err_alloc_peer_mem: + pci_epc_mem_free_addr(ntb->epf->epc, epf_bar->phys_addr, mw_addr, epf_bar->size); + return -1; +} + +/** + * epf_ntb_db_bar_clear() - Clear doorbell BAR and free memory + * allocated in peers outbound address space + * @ntb: NTB device that facilitates communication between HOST and vHOST + */ +static void epf_ntb_db_bar_clear(struct epf_ntb *ntb) +{ + enum pci_barno barno; + + barno = ntb->epf_ntb_bar[BAR_DB]; + pci_epf_free_space(ntb->epf, ntb->epf_db, barno, 0); + pci_epc_clear_bar(ntb->epf->epc, + ntb->epf->func_no, + ntb->epf->vfunc_no, + &ntb->epf->bar[barno]); +} + +/** + * epf_ntb_mw_bar_init() - Configure Memory window BARs + * @ntb: NTB device that facilitates communication between HOST and vHOST + * + */ +static int epf_ntb_mw_bar_init(struct epf_ntb *ntb) +{ + int ret = 0; + int i; + u64 size; + enum pci_barno barno; + struct device *dev = &ntb->epf->dev; + + for (i = 0; i < ntb->num_mws; i++) { + + size = ntb->mws_size[i]; + + barno = ntb->epf_ntb_bar[BAR_MW0 + i]; + + ntb->epf->bar[barno].barno = barno; + ntb->epf->bar[barno].size = size; + ntb->epf->bar[barno].addr = 0; + ntb->epf->bar[barno].phys_addr = 0; + ntb->epf->bar[barno].flags |= upper_32_bits(size) ? 
+ PCI_BASE_ADDRESS_MEM_TYPE_64 : + PCI_BASE_ADDRESS_MEM_TYPE_32; + + ret = pci_epc_set_bar(ntb->epf->epc, + ntb->epf->func_no, + ntb->epf->vfunc_no, + &ntb->epf->bar[barno]); + if (ret) { + dev_err(dev, "intf: MW set failed\n"); + goto err_alloc_mem; + } + + /* allocate epc outbound memory windows to vpci vntb device */ + ntb->vpci_mw_addr[i] = pci_epc_mem_alloc_addr(ntb->epf->epc, + &ntb->vpci_mw_phy[i], + size); + if (!ntb->vpci_mw_addr[i]) { + dev_err(dev, "Failed to allocate source address\n"); + goto err_alloc_mem; + } + } + + return ret; +err_alloc_mem: + return ret; +} + +/** + * epf_ntb_mw_bar_clear() - Clear Memory window BARs + * @ntb: NTB device that facilitates communication between HOST and vHOST + * + */ +static void epf_ntb_mw_bar_clear(struct epf_ntb *ntb) +{ + enum pci_barno barno; + int i; + + for (i = 0; i < ntb->num_mws; i++) { + barno = ntb->epf_ntb_bar[BAR_MW0 + i]; + pci_epc_clear_bar(ntb->epf->epc, + ntb->epf->func_no, + ntb->epf->vfunc_no, + &ntb->epf->bar[barno]); + + pci_epc_mem_free_addr(ntb->epf->epc, + ntb->vpci_mw_phy[i], + ntb->vpci_mw_addr[i], + ntb->mws_size[i]); + } +} + +/** + * epf_ntb_epc_destroy() - Cleanup NTB EPC interface + * @ntb: NTB device that facilitates communication between HOST and vHOST + * + * Wrapper for epf_ntb_epc_destroy_interface() to cleanup all the NTB interfaces + */ +static void epf_ntb_epc_destroy(struct epf_ntb *ntb) +{ + pci_epc_remove_epf(ntb->epf->epc, ntb->epf, 0); + pci_epc_put(ntb->epf->epc); +} + +/** + * epf_ntb_init_epc_bar() - Identify BARs to be used for each of the NTB + * constructs (scratchpad region, doorbell, memorywindow) + * @ntb: NTB device that facilitates communication between HOST and vHOST + * + */ +static int epf_ntb_init_epc_bar(struct epf_ntb *ntb) +{ + const struct pci_epc_features *epc_features; + enum pci_barno barno; + enum epf_ntb_bar bar; + struct device *dev; + u32 num_mws; + int i; + + barno = BAR_0; + num_mws = ntb->num_mws; + dev = &ntb->epf->dev; + epc_features = 
pci_epc_get_features(ntb->epf->epc, ntb->epf->func_no, ntb->epf->vfunc_no); + + /* These are required BARs which are mandatory for NTB functionality */ + for (bar = BAR_CONFIG; bar <= BAR_MW0; bar++, barno++) { + barno = pci_epc_get_next_free_bar(epc_features, barno); + if (barno < 0) { + dev_err(dev, "intf: Fail to get NTB function BAR\n"); + return barno; + } + ntb->epf_ntb_bar[bar] = barno; + } + + /* These are optional BARs which don't impact NTB functionality */ + for (bar = BAR_MW1, i = 1; i < num_mws; bar++, barno++, i++) { + barno = pci_epc_get_next_free_bar(epc_features, barno); + if (barno < 0) { + ntb->num_mws = i; + dev_dbg(dev, "BAR not available for > MW%d\n", i + 1); + } + ntb->epf_ntb_bar[bar] = barno; + } + + return 0; +} + +/** + * epf_ntb_epc_init() - Initialize NTB interface + * @ntb: NTB device that facilitates communication between HOST and vHOST2 + * + * Wrapper to initialize a particular EPC interface and start the workqueue + * to check for commands from host. This function will write to the + * EP controller HW for configuring it. 
+ */ +static int epf_ntb_epc_init(struct epf_ntb *ntb) +{ + u8 func_no, vfunc_no; + struct pci_epc *epc; + struct pci_epf *epf; + struct device *dev; + int ret; + + epf = ntb->epf; + dev = &epf->dev; + epc = epf->epc; + func_no = ntb->epf->func_no; + vfunc_no = ntb->epf->vfunc_no; + + ret = epf_ntb_config_sspad_bar_set(ntb); + if (ret) { + dev_err(dev, "intf: Config/self SPAD BAR init failed"); + return ret; + } + + ret = epf_ntb_configure_interrupt(ntb); + if (ret) { + dev_err(dev, "intf: Interrupt configuration failed\n"); + goto err_config_interrupt; + } + + ret = epf_ntb_db_bar_init(ntb); + if (ret) { + dev_err(dev, "intf: DB BAR init failed\n"); + goto err_db_bar_init; + } + + ret = epf_ntb_mw_bar_init(ntb); + if (ret) { + dev_err(dev, "intf: MW BAR init failed\n"); + goto err_mw_bar_init; + } + + if (vfunc_no <= 1) { + ret = pci_epc_write_header(epc, func_no, vfunc_no, epf->header); + if (ret) { + dev_err(dev, "intf: Configuration header write failed\n"); + goto err_write_header; + } + } + + INIT_DELAYED_WORK(&ntb->cmd_handler, epf_ntb_cmd_handler); + queue_work(kpcintb_workqueue, &ntb->cmd_handler.work); + + return 0; + +err_write_header: + epf_ntb_mw_bar_clear(ntb); +err_mw_bar_init: + epf_ntb_db_bar_clear(ntb); +err_db_bar_init: +err_config_interrupt: + epf_ntb_config_sspad_bar_clear(ntb); + + return ret; +} + + +/** + * epf_ntb_epc_cleanup() - Cleanup all NTB interfaces + * @ntb: NTB device that facilitates communication between HOST1 and HOST2 + * + * Wrapper to cleanup all NTB interfaces. 
+ */ +static void epf_ntb_epc_cleanup(struct epf_ntb *ntb) +{ + epf_ntb_db_bar_clear(ntb); + epf_ntb_mw_bar_clear(ntb); +} + +#define EPF_NTB_R(_name) \ +static ssize_t epf_ntb_##_name##_show(struct config_item *item, \ + char *page) \ +{ \ + struct config_group *group = to_config_group(item); \ + struct epf_ntb *ntb = to_epf_ntb(group); \ + \ + return sprintf(page, "%d\n", ntb->_name); \ +} + +#define EPF_NTB_W(_name) \ +static ssize_t epf_ntb_##_name##_store(struct config_item *item, \ + const char *page, size_t len) \ +{ \ + struct config_group *group = to_config_group(item); \ + struct epf_ntb *ntb = to_epf_ntb(group); \ + u32 val; \ + int ret; \ + \ + ret = kstrtou32(page, 0, &val); \ + if (ret) \ + return ret; \ + \ + ntb->_name = val; \ + \ + return len; \ +} + +#define EPF_NTB_MW_R(_name) \ +static ssize_t epf_ntb_##_name##_show(struct config_item *item, \ + char *page) \ +{ \ + struct config_group *group = to_config_group(item); \ + struct epf_ntb *ntb = to_epf_ntb(group); \ + int win_no; \ + \ + sscanf(#_name, "mw%d", &win_no); \ + \ + return sprintf(page, "%lld\n", ntb->mws_size[win_no - 1]); \ +} + +#define EPF_NTB_MW_W(_name) \ +static ssize_t epf_ntb_##_name##_store(struct config_item *item, \ + const char *page, size_t len) \ +{ \ + struct config_group *group = to_config_group(item); \ + struct epf_ntb *ntb = to_epf_ntb(group); \ + struct device *dev = &ntb->epf->dev; \ + int win_no; \ + u64 val; \ + int ret; \ + \ + ret = kstrtou64(page, 0, &val); \ + if (ret) \ + return ret; \ + \ + if (sscanf(#_name, "mw%d", &win_no) != 1) \ + return -EINVAL; \ + \ + if (ntb->num_mws < win_no) { \ + dev_err(dev, "Invalid num_nws: %d value\n", ntb->num_mws); \ + return -EINVAL; \ + } \ + \ + ntb->mws_size[win_no - 1] = val; \ + \ + return len; \ +} + +static ssize_t epf_ntb_num_mws_store(struct config_item *item, + const char *page, size_t len) +{ + struct config_group *group = to_config_group(item); + struct epf_ntb *ntb = to_epf_ntb(group); + u32 val; + int ret; 
+ + ret = kstrtou32(page, 0, &val); + if (ret) + return ret; + + if (val > MAX_MW) + return -EINVAL; + + ntb->num_mws = val; + + return len; +} + +EPF_NTB_R(spad_count) +EPF_NTB_W(spad_count) +EPF_NTB_R(db_count) +EPF_NTB_W(db_count) +EPF_NTB_R(num_mws) +EPF_NTB_R(vbus_number) +EPF_NTB_W(vbus_number) +EPF_NTB_MW_R(mw1) +EPF_NTB_MW_W(mw1) +EPF_NTB_MW_R(mw2) +EPF_NTB_MW_W(mw2) +EPF_NTB_MW_R(mw3) +EPF_NTB_MW_W(mw3) +EPF_NTB_MW_R(mw4) +EPF_NTB_MW_W(mw4) + +CONFIGFS_ATTR(epf_ntb_, spad_count); +CONFIGFS_ATTR(epf_ntb_, db_count); +CONFIGFS_ATTR(epf_ntb_, num_mws); +CONFIGFS_ATTR(epf_ntb_, mw1); +CONFIGFS_ATTR(epf_ntb_, mw2); +CONFIGFS_ATTR(epf_ntb_, mw3); +CONFIGFS_ATTR(epf_ntb_, mw4); +CONFIGFS_ATTR(epf_ntb_, vbus_number); + +static struct configfs_attribute *epf_ntb_attrs[] = { + &epf_ntb_attr_spad_count, + &epf_ntb_attr_db_count, + &epf_ntb_attr_num_mws, + &epf_ntb_attr_mw1, + &epf_ntb_attr_mw2, + &epf_ntb_attr_mw3, + &epf_ntb_attr_mw4, + &epf_ntb_attr_vbus_number, + NULL, +}; + +static const struct config_item_type ntb_group_type = { + .ct_attrs = epf_ntb_attrs, + .ct_owner = THIS_MODULE, +}; + +/** + * epf_ntb_add_cfs() - Add configfs directory specific to NTB + * @epf: NTB endpoint function device + * @group: A pointer to the config_group structure referencing a group of + * config_items of a specific type that belong to a specific sub-system. + * + * Add configfs directory specific to NTB. 
This directory will hold + * NTB specific properties like db_count, spad_count, num_mws etc., + */ +static struct config_group *epf_ntb_add_cfs(struct pci_epf *epf, + struct config_group *group) +{ + struct epf_ntb *ntb = epf_get_drvdata(epf); + struct config_group *ntb_group = &ntb->group; + struct device *dev = &epf->dev; + + config_group_init_type_name(ntb_group, dev_name(dev), &ntb_group_type); + + return ntb_group; +} + +/*==== virtual PCI bus driver, which only load virutal ntb pci driver ====*/ + +#define VPCI_BUS_NUM 0x10 + +uint32_t pci_space[] = { + (VNTB_VID | (VNTB_PID << 16)), //DeviceID, Vendor ID + 0, // status, Command + 0xffffffff, // Class code, subclass, prog if, revision id + 0x40, //bist, header type, latency Timer, cache line size + 0, //bar 0 + 0, //bar 1 + 0, //bar 2 + 0, //bar 3 + 0, //bar 4 + 0, //bar 5 + 0, //cardbus cis point + 0, //Subsystem ID Subystem vendor id + 0, //ROM Base Address + 0, //Reserved, Cap. Point + 0, //Reserved, + 0, //Max Lat, Min Gnt, interrupt pin, interrupt line +}; + +int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) +{ + if (devfn == 0) { + memcpy(val, ((uint8_t *)pci_space) + where, size); + return 0; + } + return -1; +} + +int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) +{ + return 0; +} + +struct pci_ops vpci_ops = { + .read = pci_read, + .write = pci_write, +}; + +static int vpci_bus(void *sysdata) +{ + struct pci_bus *vpci_bus; + + vpci_bus = pci_scan_bus(VPCI_BUS_NUM, &vpci_ops, sysdata); + if (vpci_bus) + pr_err("create pci bus\n"); + + pci_bus_add_devices(vpci_bus); + + return 0; +} + +/*==================== Virtual PCIe NTB driver ==========================*/ + +static int vntb_epf_mw_count(struct ntb_dev *ntb, int pidx) +{ + struct epf_ntb *ndev = ntb_ndev(ntb); + + return ndev->num_mws; +} + +static int vntb_epf_spad_count(struct ntb_dev *ntb) +{ + return ntb_ndev(ntb)->spad_count; +} + +static int vntb_epf_peer_mw_count(struct 
ntb_dev *ntb) +{ + return ntb_ndev(ntb)->num_mws; +} + +static u64 vntb_epf_db_valid_mask(struct ntb_dev *ntb) +{ + return BIT_ULL(ntb_ndev(ntb)->db_count) - 1; +} + +static int vntb_epf_db_set_mask(struct ntb_dev *ntb, u64 db_bits) +{ + return 0; +} + +static int vntb_epf_mw_set_trans(struct ntb_dev *ndev, int pidx, int idx, + dma_addr_t addr, resource_size_t size) +{ + struct epf_ntb *ntb = ntb_ndev(ndev); + struct pci_epf_bar *epf_bar; + enum pci_barno barno; + int ret; + struct device *dev; + + dev = &ntb->ntb.dev; + barno = ntb->epf_ntb_bar[BAR_MW0 + idx]; + epf_bar = &ntb->epf->bar[barno]; + epf_bar->phys_addr = addr; + epf_bar->barno = barno; + epf_bar->size = size; + + ret = pci_epc_set_bar(ntb->epf->epc, 0, 0, epf_bar); + if (ret) { + dev_err(dev, "failure set mw trans\n"); + return ret; + } + return 0; +} + +static int vntb_epf_mw_clear_trans(struct ntb_dev *ntb, int pidx, int idx) +{ + return 0; +} + +static int vntb_epf_peer_mw_get_addr(struct ntb_dev *ndev, int idx, + phys_addr_t *base, resource_size_t *size) +{ + + struct epf_ntb *ntb = ntb_ndev(ndev); + + if (base) + *base = ntb->vpci_mw_phy[idx]; + + if (size) + *size = ntb->mws_size[idx]; + + return 0; +} + +static int vntb_epf_link_enable(struct ntb_dev *ntb, + enum ntb_speed max_speed, + enum ntb_width max_width) +{ + return 0; +} + +static u32 vntb_epf_spad_read(struct ntb_dev *ndev, int idx) +{ + struct epf_ntb *ntb = ntb_ndev(ndev); + int off = ntb->reg->spad_offset, ct = ntb->reg->spad_count * 4; + u32 val; + void __iomem *base = ntb->reg; + + val = readl(base + off + ct + idx * 4); + return val; +} + +static int vntb_epf_spad_write(struct ntb_dev *ndev, int idx, u32 val) +{ + struct epf_ntb *ntb = ntb_ndev(ndev); + struct epf_ntb_ctrl *ctrl = ntb->reg; + int off = ctrl->spad_offset, ct = ctrl->spad_count * 4; + void __iomem *base = ntb->reg; + + writel(val, base + off + ct + idx * 4); + return 0; +} + +static u32 vntb_epf_peer_spad_read(struct ntb_dev *ndev, int pidx, int idx) +{ + struct 
epf_ntb *ntb = ntb_ndev(ndev); + struct epf_ntb_ctrl *ctrl = ntb->reg; + int off = ctrl->spad_offset; + void __iomem *base = ntb->reg; + u32 val; + + val = readl(base + off + idx * 4); + return val; +} + +static int vntb_epf_peer_spad_write(struct ntb_dev *ndev, int pidx, int idx, u32 val) +{ + struct epf_ntb *ntb = ntb_ndev(ndev); + struct epf_ntb_ctrl *ctrl = ntb->reg; + int off = ctrl->spad_offset; + void __iomem *base = ntb->reg; + + writel(val, base + off + idx * 4); + return 0; +} + +static int vntb_epf_peer_db_set(struct ntb_dev *ndev, u64 db_bits) +{ + u32 interrupt_num = ffs(db_bits) + 1; + struct epf_ntb *ntb = ntb_ndev(ndev); + u8 func_no, vfunc_no; + int ret; + + func_no = ntb->epf->func_no; + vfunc_no = ntb->epf->vfunc_no; + + ret = pci_epc_raise_irq(ntb->epf->epc, + func_no, + vfunc_no, + PCI_EPC_IRQ_MSI, + interrupt_num + 1); + if (ret) { + dev_err(&ntb->ntb.dev, "intf: Failed to raise IRQ\n"); + return ret; + } + + return 0; +} + +static u64 vntb_epf_db_read(struct ntb_dev *ndev) +{ + struct epf_ntb *ntb = ntb_ndev(ndev); + + return ntb->db; +} + +static int vntb_epf_mw_get_align(struct ntb_dev *ndev, int pidx, int idx, + resource_size_t *addr_align, + resource_size_t *size_align, + resource_size_t *size_max) +{ + struct epf_ntb *ntb = ntb_ndev(ndev); + + if (addr_align) + *addr_align = SZ_4K; + + if (size_align) + *size_align = 1; + + if (size_max) + *size_max = ntb->mws_size[idx]; + + return 0; +} + +static u64 vntb_epf_link_is_up(struct ntb_dev *ndev, + enum ntb_speed *speed, + enum ntb_width *width) +{ + struct epf_ntb *ntb = ntb_ndev(ndev); + + return ntb->reg->link_status; +} + +static int vntb_epf_db_clear_mask(struct ntb_dev *ndev, u64 db_bits) +{ + return 0; +} + +static int vntb_epf_db_clear(struct ntb_dev *ndev, u64 db_bits) +{ + struct epf_ntb *ntb = ntb_ndev(ndev); + + ntb->db &= ~db_bits; + return 0; +} + +static int vntb_epf_link_disable(struct ntb_dev *ntb) +{ + return 0; +} + +static const struct ntb_dev_ops vntb_epf_ops = { + 
.mw_count = vntb_epf_mw_count, + .spad_count = vntb_epf_spad_count, + .peer_mw_count = vntb_epf_peer_mw_count, + .db_valid_mask = vntb_epf_db_valid_mask, + .db_set_mask = vntb_epf_db_set_mask, + .mw_set_trans = vntb_epf_mw_set_trans, + .mw_clear_trans = vntb_epf_mw_clear_trans, + .peer_mw_get_addr = vntb_epf_peer_mw_get_addr, + .link_enable = vntb_epf_link_enable, + .spad_read = vntb_epf_spad_read, + .spad_write = vntb_epf_spad_write, + .peer_spad_read = vntb_epf_peer_spad_read, + .peer_spad_write = vntb_epf_peer_spad_write, + .peer_db_set = vntb_epf_peer_db_set, + .db_read = vntb_epf_db_read, + .mw_get_align = vntb_epf_mw_get_align, + .link_is_up = vntb_epf_link_is_up, + .db_clear_mask = vntb_epf_db_clear_mask, + .db_clear = vntb_epf_db_clear, + .link_disable = vntb_epf_link_disable, +}; + +static int pci_vntb_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + int ret; + struct epf_ntb *ndev = (struct epf_ntb *)pdev->sysdata; + struct device *dev = &pdev->dev; + + ndev->ntb.pdev = pdev; + ndev->ntb.topo = NTB_TOPO_NONE; + ndev->ntb.ops = &vntb_epf_ops; + + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); + if (ret) { + dev_err(dev, "Cannot set DMA mask\n"); + return -1; + } + + ret = ntb_register_device(&ndev->ntb); + if (ret) { + dev_err(dev, "Failed to register NTB device\n"); + goto err_register_dev; + } + + dev_info(dev, "PCI Virtual NTB driver loaded\n"); + return 0; + +err_register_dev: + return -1; +} + +static const struct pci_device_id pci_vntb_table[] = { + { + PCI_DEVICE(VNTB_VID, VNTB_PID), + }, + {}, +}; + +static struct pci_driver vntb_pci_driver = { + .name = "pci-vntb", + .id_table = pci_vntb_table, + .probe = pci_vntb_probe, +}; + +/* ============ PCIe EPF Driver Bind ====================*/ + +/** + * epf_ntb_bind() - Initialize endpoint controller to provide NTB functionality + * @epf: NTB endpoint function device + * + * Initialize both the endpoint controllers associated with NTB function device. 
+ * Invoked when a primary interface or secondary interface is bound to EPC + * device. This function will succeed only when EPC is bound to both the + * interfaces. + */ +static int epf_ntb_bind(struct pci_epf *epf) +{ + struct epf_ntb *ntb = epf_get_drvdata(epf); + struct device *dev = &epf->dev; + int ret; + + if (!epf->epc) { + dev_dbg(dev, "PRIMARY EPC interface not yet bound\n"); + return 0; + } + + ret = epf_ntb_init_epc_bar(ntb); + if (ret) { + dev_err(dev, "Failed to create NTB EPC\n"); + goto err_bar_init; + } + + ret = epf_ntb_config_spad_bar_alloc(ntb); + if (ret) { + dev_err(dev, "Failed to allocate BAR memory\n"); + goto err_bar_alloc; + } + + ret = epf_ntb_epc_init(ntb); + if (ret) { + dev_err(dev, "Failed to initialize EPC\n"); + goto err_bar_alloc; + } + + epf_set_drvdata(epf, ntb); + + if (pci_register_driver(&vntb_pci_driver)) { + dev_err(dev, "failure register vntb pci driver\n"); + goto err_bar_alloc; + } + + vpci_bus(ntb); + + return 0; + +err_bar_alloc: + epf_ntb_config_spad_bar_free(ntb); + +err_bar_init: + epf_ntb_epc_destroy(ntb); + + return ret; +} + +/** + * epf_ntb_unbind() - Cleanup the initialization from epf_ntb_bind() + * @epf: NTB endpoint function device + * + * Cleanup the initialization from epf_ntb_bind() + */ +static void epf_ntb_unbind(struct pci_epf *epf) +{ + struct epf_ntb *ntb = epf_get_drvdata(epf); + + epf_ntb_epc_cleanup(ntb); + epf_ntb_config_spad_bar_free(ntb); + epf_ntb_epc_destroy(ntb); + + pci_unregister_driver(&vntb_pci_driver); +} + +// EPF driver probe +static struct pci_epf_ops epf_ntb_ops = { + .bind = epf_ntb_bind, + .unbind = epf_ntb_unbind, + .add_cfs = epf_ntb_add_cfs, +}; + +/** + * epf_ntb_probe() - Probe NTB function driver + * @epf: NTB endpoint function device + * + * Probe NTB function driver when endpoint function bus detects a NTB + * endpoint function. 
+ */ +static int epf_ntb_probe(struct pci_epf *epf) +{ + struct epf_ntb *ntb; + struct device *dev; + + dev = &epf->dev; + + ntb = devm_kzalloc(dev, sizeof(*ntb), GFP_KERNEL); + if (!ntb) + return -ENOMEM; + + epf->header = &epf_ntb_header; + ntb->epf = epf; + epf_set_drvdata(epf, ntb); + + dev_info(dev, "pci-ep epf driver loaded\n"); + return 0; +} + +static const struct pci_epf_device_id epf_ntb_ids[] = { + { + .name = "pci_epf_vntb", + }, + {}, +}; + +static struct pci_epf_driver epf_ntb_driver = { + .driver.name = "pci_epf_vntb", + .probe = epf_ntb_probe, + .id_table = epf_ntb_ids, + .ops = &epf_ntb_ops, + .owner = THIS_MODULE, +}; + + +static int __init epf_ntb_init(void) +{ + int ret; + + kpcintb_workqueue = alloc_workqueue("kpcintb", WQ_MEM_RECLAIM | + WQ_HIGHPRI, 0); + ret = pci_epf_register_driver(&epf_ntb_driver); + if (ret) { + destroy_workqueue(kpcintb_workqueue); + pr_err("Failed to register pci epf ntb driver --> %d\n", ret); + return ret; + } + + return 0; +} +module_init(epf_ntb_init); + +static void __exit epf_ntb_exit(void) +{ + pci_epf_unregister_driver(&epf_ntb_driver); + destroy_workqueue(kpcintb_workqueue); +} +module_exit(epf_ntb_exit); + +MODULE_DESCRIPTION("PCI EPF NTB DRIVER"); +MODULE_AUTHOR("Frank Li "); +MODULE_LICENSE("GPL v2"); From fffebd168d38b8ff6f5b919d0935788e901678ea Mon Sep 17 00:00:00 2001 From: Frank Li Date: Mon, 14 Feb 2022 23:38:44 -0600 Subject: [PATCH 0617/1250] Documentation: PCI: Add specification for the PCI vNTB function device Add specification for the PCI vNTB function device. The endpoint function driver and the host PCI driver should be created based on this specification. 
Signed-off-by: Frank Li Signed-off-by: Jon Mason --- Documentation/PCI/endpoint/index.rst | 2 + .../PCI/endpoint/pci-vntb-function.rst | 126 ++++++++++++++ Documentation/PCI/endpoint/pci-vntb-howto.rst | 161 ++++++++++++++++++ 3 files changed, 289 insertions(+) create mode 100644 Documentation/PCI/endpoint/pci-vntb-function.rst create mode 100644 Documentation/PCI/endpoint/pci-vntb-howto.rst diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst index 38ea1f604b6d32..4d2333e7ae0671 100644 --- a/Documentation/PCI/endpoint/index.rst +++ b/Documentation/PCI/endpoint/index.rst @@ -13,6 +13,8 @@ PCI Endpoint Framework pci-test-howto pci-ntb-function pci-ntb-howto + pci-vntb-function + pci-vntb-howto function/binding/pci-test function/binding/pci-ntb diff --git a/Documentation/PCI/endpoint/pci-vntb-function.rst b/Documentation/PCI/endpoint/pci-vntb-function.rst new file mode 100644 index 00000000000000..cad8013e88390e --- /dev/null +++ b/Documentation/PCI/endpoint/pci-vntb-function.rst @@ -0,0 +1,126 @@ +.. SPDX-License-Identifier: GPL-2.0 + +================= +PCI vNTB Function +================= + +:Author: Frank Li + +The difference between PCI NTB function and PCI vNTB function is: + +PCI NTB function needs at least two endpoint instances and connects HOST1 +and HOST2. + +PCI vNTB function uses only one host and one endpoint (EP), and uses NTB to +connect the EP and the PCI host. + +.. 
code-block:: text + + + +------------+ +---------------------------------------+ + | | | | + +------------+ | +--------------+ + | NTB | | | NTB | + | NetDev | | | NetDev | + +------------+ | +--------------+ + | NTB | | | NTB | + | Transfer | | | Transfer | + +------------+ | +--------------+ + | | | | | + | PCI NTB | | | | + | EPF | | | | + | Driver | | | PCI Virtual | + | | +---------------+ | NTB Driver | + | | | PCI EP NTB |<------>| | + | | | FN Driver | | | + +------------+ +---------------+ +--------------+ + | | | | | | + | PCI BUS | <-----> | PCI EP BUS | | Virtual PCI | + | | PCI | | | BUS | + +------------+ +---------------+--------+--------------+ + PCI RC PCI EP + +Constructs used for Implementing vNTB +===================================== + + 1) Config Region + 2) Self Scratchpad Registers + 3) Peer Scratchpad Registers + 4) Doorbell (DB) Registers + 5) Memory Window (MW) + + +Config Region: +-------------- + +It is same as PCI NTB Function driver + +Scratchpad Registers: +--------------------- + + It is appended after Config region. + + +--------------------------------------------------+ Base + | | + | | + | | + | Common Config Register | + | | + | | + | | + +-----------------------+--------------------------+ Base + span_offset + | | | + | Peer Span Space | Span Space | + | | | + | | | + +-----------------------+--------------------------+ Base + span_offset + | | | + span_count * 4 + | | | + | Span Space | Peer Span Space | + | | | + +-----------------------+--------------------------+ + Virtual PCI Pcie Endpoint + NTB Driver NTB Driver + + +Doorbell Registers: +------------------- + + Doorbell Registers are used by the hosts to interrupt each other. + +Memory Window: +-------------- + + Actual transfer of data between the two hosts will happen using the + memory window. + +Modeling Constructs: +==================== + +32-bit BARs. 
+ +====== =============== +BAR NO CONSTRUCTS USED +====== =============== +BAR0 Config Region +BAR1 Doorbell +BAR2 Memory Window 1 +BAR3 Memory Window 2 +BAR4 Memory Window 3 +BAR5 Memory Window 4 +====== =============== + +64-bit BARs. + +====== =============================== +BAR NO CONSTRUCTS USED +====== =============================== +BAR0 Config Region + Scratchpad +BAR1 +BAR2 Doorbell +BAR3 +BAR4 Memory Window 1 +BAR5 +====== =============================== + + diff --git a/Documentation/PCI/endpoint/pci-vntb-howto.rst b/Documentation/PCI/endpoint/pci-vntb-howto.rst new file mode 100644 index 00000000000000..b4a679144692a8 --- /dev/null +++ b/Documentation/PCI/endpoint/pci-vntb-howto.rst @@ -0,0 +1,161 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================================================================== +PCI Non-Transparent Bridge (NTB) Endpoint Function (EPF) User Guide +=================================================================== + +:Author: Frank Li + +This document is a guide to help users use pci-epf-vntb function driver +and ntb_hw_epf host driver for NTB functionality. The list of steps to +be followed in the host side and EP side is given below. 
For the hardware +configuration and internals of NTB using configurable endpoints see +Documentation/PCI/endpoint/pci-vntb-function.rst + +Endpoint Device +=============== + +Endpoint Controller Devices +--------------------------- + +To find the list of endpoint controller devices in the system:: + + # ls /sys/class/pci_epc/ + 5f010000.pcie_ep + +If PCI_ENDPOINT_CONFIGFS is enabled:: + + # ls /sys/kernel/config/pci_ep/controllers + 5f010000.pcie_ep + +Endpoint Function Drivers +------------------------- + +To find the list of endpoint function drivers in the system:: + + # ls /sys/bus/pci-epf/drivers + pci_epf_ntb pci_epf_test pci_epf_vntb + +If PCI_ENDPOINT_CONFIGFS is enabled:: + + # ls /sys/kernel/config/pci_ep/functions + pci_epf_ntb pci_epf_test pci_epf_vntb + + +Creating pci-epf-vntb Device +---------------------------- + +PCI endpoint function device can be created using the configfs. To create +pci-epf-vntb device, the following commands can be used:: + + # mount -t configfs none /sys/kernel/config + # cd /sys/kernel/config/pci_ep/ + # mkdir functions/pci_epf_vntb/func1 + +The "mkdir func1" above creates the pci-epf-vntb function device that will +be probed by pci_epf_vntb driver. + +The PCI endpoint framework populates the directory with the following +configurable fields:: + + # ls functions/pci_epf_vntb/func1 + baseclass_code deviceid msi_interrupts pci-epf-ntb.0 + progif_code secondary subsys_id vendorid + cache_line_size interrupt_pin msix_interrupts primary + revid subclass_code subsys_vendor_id + +The PCI endpoint function driver populates these entries with default values +when the device is bound to the driver. 
The pci-epf-vntb driver populates +vendorid with 0xffff and interrupt_pin with 0x0001:: + + # cat functions/pci_epf_vntb/func1/vendorid + 0xffff + # cat functions/pci_epf_vntb/func1/interrupt_pin + 0x0001 + + +Configuring pci-epf-vntb Device +------------------------------- + +The user can configure the pci-epf-vntb device using its configfs entry. In order +to change the vendorid and the deviceid, the following +commands can be used:: + + # echo 0x1957 > functions/pci_epf_vntb/func1/vendorid + # echo 0x0809 > functions/pci_epf_vntb/func1/deviceid + +In order to configure NTB specific attributes, a new sub-directory to func1 +should be created:: + + # mkdir functions/pci_epf_vntb/func1/pci_epf_vntb.0/ + +The NTB function driver will populate this directory with various attributes +that can be configured by the user:: + + # ls functions/pci_epf_vntb/func1/pci_epf_vntb.0/ + db_count mw1 mw2 mw3 mw4 num_mws + spad_count + +A sample configuration for NTB function is given below:: + + # echo 4 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/db_count + # echo 128 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/spad_count + # echo 1 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/num_mws + # echo 0x100000 > functions/pci_epf_vntb/func1/pci_epf_vntb.0/mw1 + +Binding pci-epf-vntb Device to EP Controller +-------------------------------------------- + +NTB function device should be attached to PCI endpoint controllers +connected to the host:: + + # ln -s controllers/5f010000.pcie_ep functions/pci_epf_vntb/func1/primary + +Once the above step is completed, the PCI endpoint controllers are ready to +establish a link with the host. + + +Start the Link +-------------- + +In order for the endpoint device to establish a link with the host, the _start_ +field should be populated with '1'. 
For NTB, both the PCI endpoint controllers +should establish a link with the host (imx8 doesn't need this step):: + + # echo 1 > controllers/5f010000.pcie_ep/start + +RootComplex Device +================== + +lspci Output at Host side +------------------------- + +Note that the devices listed here correspond to the values populated in +"Creating pci-epf-vntb Device" section above:: + + # lspci + 00:00.0 PCI bridge: Freescale Semiconductor Inc Device 0000 (rev 01) + 01:00.0 RAM memory: Freescale Semiconductor Inc Device 0809 + +Endpoint Device / Virtual PCI bus +================================= + +lspci Output at EP Side / Virtual PCI bus +----------------------------------------- + +Note that the devices listed here correspond to the values populated in +"Creating pci-epf-vntb Device" section above:: + + # lspci + 10:00.0 Unassigned class [ffff]: Dawicontrol Computersysteme GmbH Device 1234 (rev ff) + +Using ntb_hw_epf Device +----------------------- + +The host side software follows the standard NTB software architecture in Linux. +All the existing client side NTB utilities like NTB Transport Client and NTB +Netdev, NTB Ping Pong Test Client and NTB Tool Test Client can be used with NTB +function device. + +For more information on NTB see +:doc:`Non-Transparent Bridge <../../driver-api/ntb>` From 4394582f60f75948fc30efd41e88e5dc61c3c472 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 27 Jun 2022 08:57:10 -0700 Subject: [PATCH 0618/1250] drivers/ntb/test: avoid 64-bit modulus operation Redefine RESCHEDULE_RATIO to a closest power of 2 so that the following code in the perf_run_latency /* Avoid processor soft lock-ups */ if (!(pthr->tries % RESCHEDULE_RATIO)) schedule(); doesn't do 64-bit modulus operation. This fixes the following build failures on 32-bit architectures visible in linux-next: ERROR: modpost: "__umoddi3" [drivers/ntb/test/ntb_perf.ko] undefined! 
Fixes: dc150dfb081f ("ntb_perf: extend with burst latency measurement") Signed-off-by: Max Filippov Signed-off-by: Jon Mason --- drivers/ntb/test/ntb_perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index 4e05c7aa070d39..af7d3de65eb358 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -126,7 +126,7 @@ MODULE_DESCRIPTION("PCIe NTB Performance Measurement Tool"); #define PERF_BUF_LEN 1024 #define LAT_MIN_TRIES 20 -#define RESCHEDULE_RATIO 10000 +#define RESCHEDULE_RATIO 8192 /* power of 2, to avoid actual division */ static unsigned long max_mw_size; module_param(max_mw_size, ulong, 0644); From e8c04e435db5fc4bd6d55dc01d121ab75e01a134 Mon Sep 17 00:00:00 2001 From: "Souptick Joarder (HPE)" Date: Fri, 8 Jul 2022 07:30:35 +0530 Subject: [PATCH 0619/1250] NTB: EPF: Mark pci_read and pci_write as static kernel test robot throws below warning -> drivers/pci/endpoint/functions/pci-epf-vntb.c:975:5: warning: no previous prototype for 'pci_read' [-Wmissing-prototypes] drivers/pci/endpoint/functions/pci-epf-vntb.c:984:5: warning: no previous prototype for 'pci_write' [-Wmissing-prototypes] mark them as static. 
Reported-by: kernel test robot Signed-off-by: Souptick Joarder (HPE) Signed-off-by: Jon Mason --- drivers/pci/endpoint/functions/pci-epf-vntb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/endpoint/functions/pci-epf-vntb.c b/drivers/pci/endpoint/functions/pci-epf-vntb.c index ebf7e243eefa44..111568089d4588 100644 --- a/drivers/pci/endpoint/functions/pci-epf-vntb.c +++ b/drivers/pci/endpoint/functions/pci-epf-vntb.c @@ -972,7 +972,7 @@ uint32_t pci_space[] = { 0, //Max Lat, Min Gnt, interrupt pin, interrupt line }; -int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) +static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { if (devfn == 0) { memcpy(val, ((uint8_t *)pci_space) + where, size); @@ -981,7 +981,7 @@ int pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 * return -1; } -int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) +static int pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { return 0; } From 53c0fd4057dfca1e5356c9cb6acdadd1a13d66f9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 18 Jul 2022 20:06:24 +0100 Subject: [PATCH 0620/1250] cifs: Fix memory leak when using fscache If we hit the 'index == next_cached' case, we leak a refcount on the struct page. Fix this by using readahead_folio() which takes care of the refcount for you. 
Fixes: 0174ee9947bd ("cifs: Implement cache I/O by accessing the cache directly") Cc: David Howells Cc: Jeff Layton Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Steve French --- fs/cifs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index e64cda7a761012..6985710e14c28b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -4459,10 +4459,10 @@ static void cifs_readahead(struct readahead_control *ractl) * TODO: Send a whole batch of pages to be read * by the cache. */ - page = readahead_page(ractl); - last_batch_size = 1 << thp_order(page); + struct folio *folio = readahead_folio(ractl); + last_batch_size = folio_nr_pages(folio); if (cifs_readpage_from_fscache(ractl->mapping->host, - page) < 0) { + &folio->page) < 0) { /* * TODO: Deal with cache read failure * here, but for the moment, delegate @@ -4470,7 +4470,7 @@ static void cifs_readahead(struct readahead_control *ractl) */ caching = false; } - unlock_page(page); + folio_unlock(folio); next_cached++; cache_nr_pages--; if (cache_nr_pages == 0) From e2dd36933f810645a6a704f806ed655776798d6a Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:02 +0800 Subject: [PATCH 0621/1250] RDMA: Add ERDMA to rdma_driver_id definition Link: https://lore.kernel.org/r/20220713094212.30943-2-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/ib_user_ioctl_verbs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 3072e5d6b69266..7dd56210226f53 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -250,6 +250,7 @@ enum rdma_driver_id { RDMA_DRIVER_QIB, RDMA_DRIVER_EFA, RDMA_DRIVER_SIW, + RDMA_DRIVER_ERDMA, }; enum ib_uverbs_gid_type { From ca52aa2d9f9a83b07e5a6ece47d6325adad5ae48 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:03 +0800 
Subject: [PATCH 0622/1250] RDMA/erdma: Add the hardware related definitions ERDMA is a PCIe device, and this file provides ERDMA hardware related definitions, mainly including PCIe device capabilities restrictions, device registers definitions, doorbell space, doorbell structure definitions and WQE definitions. Link: https://lore.kernel.org/r/20220713094212.30943-3-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_hw.h | 508 +++++++++++++++++++++++++ 1 file changed, 508 insertions(+) create mode 100644 drivers/infiniband/hw/erdma/erdma_hw.h diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h new file mode 100644 index 00000000000000..b210c49c669fca --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -0,0 +1,508 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#ifndef __ERDMA_HW_H__ +#define __ERDMA_HW_H__ + +#include +#include + +/* PCIe device related definition. */ +#define PCI_VENDOR_ID_ALIBABA 0x1ded + +#define ERDMA_PCI_WIDTH 64 +#define ERDMA_FUNC_BAR 0 +#define ERDMA_MISX_BAR 2 + +#define ERDMA_BAR_MASK (BIT(ERDMA_FUNC_BAR) | BIT(ERDMA_MISX_BAR)) + +/* MSI-X related. */ +#define ERDMA_NUM_MSIX_VEC 32U +#define ERDMA_MSIX_VECTOR_CMDQ 0 + +/* PCIe Bar0 Registers. 
*/ +#define ERDMA_REGS_VERSION_REG 0x0 +#define ERDMA_REGS_DEV_CTRL_REG 0x10 +#define ERDMA_REGS_DEV_ST_REG 0x14 +#define ERDMA_REGS_NETDEV_MAC_L_REG 0x18 +#define ERDMA_REGS_NETDEV_MAC_H_REG 0x1C +#define ERDMA_REGS_CMDQ_SQ_ADDR_L_REG 0x20 +#define ERDMA_REGS_CMDQ_SQ_ADDR_H_REG 0x24 +#define ERDMA_REGS_CMDQ_CQ_ADDR_L_REG 0x28 +#define ERDMA_REGS_CMDQ_CQ_ADDR_H_REG 0x2C +#define ERDMA_REGS_CMDQ_DEPTH_REG 0x30 +#define ERDMA_REGS_CMDQ_EQ_DEPTH_REG 0x34 +#define ERDMA_REGS_CMDQ_EQ_ADDR_L_REG 0x38 +#define ERDMA_REGS_CMDQ_EQ_ADDR_H_REG 0x3C +#define ERDMA_REGS_AEQ_ADDR_L_REG 0x40 +#define ERDMA_REGS_AEQ_ADDR_H_REG 0x44 +#define ERDMA_REGS_AEQ_DEPTH_REG 0x48 +#define ERDMA_REGS_GRP_NUM_REG 0x4c +#define ERDMA_REGS_AEQ_DB_REG 0x50 +#define ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG 0x60 +#define ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG 0x68 +#define ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG 0x70 +#define ERDMA_AEQ_DB_HOST_ADDR_REG 0x78 +#define ERDMA_REGS_STATS_TSO_IN_PKTS_REG 0x80 +#define ERDMA_REGS_STATS_TSO_OUT_PKTS_REG 0x88 +#define ERDMA_REGS_STATS_TSO_OUT_BYTES_REG 0x90 +#define ERDMA_REGS_STATS_TX_DROP_PKTS_REG 0x98 +#define ERDMA_REGS_STATS_TX_BPS_METER_DROP_PKTS_REG 0xa0 +#define ERDMA_REGS_STATS_TX_PPS_METER_DROP_PKTS_REG 0xa8 +#define ERDMA_REGS_STATS_RX_PKTS_REG 0xc0 +#define ERDMA_REGS_STATS_RX_BYTES_REG 0xc8 +#define ERDMA_REGS_STATS_RX_DROP_PKTS_REG 0xd0 +#define ERDMA_REGS_STATS_RX_BPS_METER_DROP_PKTS_REG 0xd8 +#define ERDMA_REGS_STATS_RX_PPS_METER_DROP_PKTS_REG 0xe0 +#define ERDMA_REGS_CEQ_DB_BASE_REG 0x100 +#define ERDMA_CMDQ_SQDB_REG 0x200 +#define ERDMA_CMDQ_CQDB_REG 0x300 + +/* DEV_CTRL_REG details. */ +#define ERDMA_REG_DEV_CTRL_RESET_MASK 0x00000001 +#define ERDMA_REG_DEV_CTRL_INIT_MASK 0x00000002 + +/* DEV_ST_REG details. */ +#define ERDMA_REG_DEV_ST_RESET_DONE_MASK 0x00000001U +#define ERDMA_REG_DEV_ST_INIT_DONE_MASK 0x00000002U + +/* eRDMA PCIe DBs definition. 
*/ +#define ERDMA_BAR_DB_SPACE_BASE 4096 + +#define ERDMA_BAR_SQDB_SPACE_OFFSET ERDMA_BAR_DB_SPACE_BASE +#define ERDMA_BAR_SQDB_SPACE_SIZE (384 * 1024) + +#define ERDMA_BAR_RQDB_SPACE_OFFSET \ + (ERDMA_BAR_SQDB_SPACE_OFFSET + ERDMA_BAR_SQDB_SPACE_SIZE) +#define ERDMA_BAR_RQDB_SPACE_SIZE (96 * 1024) + +#define ERDMA_BAR_CQDB_SPACE_OFFSET \ + (ERDMA_BAR_RQDB_SPACE_OFFSET + ERDMA_BAR_RQDB_SPACE_SIZE) + +/* Doorbell page resources related. */ +/* + * Max # of parallelly issued directSQE is 3072 per device, + * hardware organizes this into 24 group, per group has 128 credits. + */ +#define ERDMA_DWQE_MAX_GRP_CNT 24 +#define ERDMA_DWQE_NUM_PER_GRP 128 + +#define ERDMA_DWQE_TYPE0_CNT 64 +#define ERDMA_DWQE_TYPE1_CNT 496 +/* type1 DB contains 2 DBs, takes 256Byte. */ +#define ERDMA_DWQE_TYPE1_CNT_PER_PAGE 16 + +#define ERDMA_SDB_SHARED_PAGE_INDEX 95 + +/* Doorbell related. */ +#define ERDMA_DB_SIZE 8 + +#define ERDMA_CQDB_IDX_MASK GENMASK_ULL(63, 56) +#define ERDMA_CQDB_CQN_MASK GENMASK_ULL(55, 32) +#define ERDMA_CQDB_ARM_MASK BIT_ULL(31) +#define ERDMA_CQDB_SOL_MASK BIT_ULL(30) +#define ERDMA_CQDB_CMDSN_MASK GENMASK_ULL(29, 28) +#define ERDMA_CQDB_CI_MASK GENMASK_ULL(23, 0) + +#define ERDMA_EQDB_ARM_MASK BIT(31) +#define ERDMA_EQDB_CI_MASK GENMASK_ULL(23, 0) + +#define ERDMA_PAGE_SIZE_SUPPORT 0x7FFFF000 + +/* WQE related. */ +#define EQE_SIZE 16 +#define EQE_SHIFT 4 +#define RQE_SIZE 32 +#define RQE_SHIFT 5 +#define CQE_SIZE 32 +#define CQE_SHIFT 5 +#define SQEBB_SIZE 32 +#define SQEBB_SHIFT 5 +#define SQEBB_MASK (~(SQEBB_SIZE - 1)) +#define SQEBB_ALIGN(size) ((size + SQEBB_SIZE - 1) & SQEBB_MASK) +#define SQEBB_COUNT(size) (SQEBB_ALIGN(size) >> SQEBB_SHIFT) + +#define ERDMA_MAX_SQE_SIZE 128 +#define ERDMA_MAX_WQEBB_PER_SQE 4 + +/* CMDQ related. */ +#define ERDMA_CMDQ_MAX_OUTSTANDING 128 +#define ERDMA_CMDQ_SQE_SIZE 64 + +/* cmdq sub module definition. 
*/ +enum CMDQ_WQE_SUB_MOD { + CMDQ_SUBMOD_RDMA = 0, + CMDQ_SUBMOD_COMMON = 1 +}; + +enum CMDQ_RDMA_OPCODE { + CMDQ_OPCODE_QUERY_DEVICE = 0, + CMDQ_OPCODE_CREATE_QP = 1, + CMDQ_OPCODE_DESTROY_QP = 2, + CMDQ_OPCODE_MODIFY_QP = 3, + CMDQ_OPCODE_CREATE_CQ = 4, + CMDQ_OPCODE_DESTROY_CQ = 5, + CMDQ_OPCODE_REG_MR = 8, + CMDQ_OPCODE_DEREG_MR = 9 +}; + +enum CMDQ_COMMON_OPCODE { + CMDQ_OPCODE_CREATE_EQ = 0, + CMDQ_OPCODE_DESTROY_EQ = 1, + CMDQ_OPCODE_QUERY_FW_INFO = 2, +}; + +/* cmdq-SQE HDR */ +#define ERDMA_CMD_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52) +#define ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK GENMASK_ULL(47, 32) +#define ERDMA_CMD_HDR_SUB_MOD_MASK GENMASK_ULL(25, 24) +#define ERDMA_CMD_HDR_OPCODE_MASK GENMASK_ULL(23, 16) +#define ERDMA_CMD_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) + +struct erdma_cmdq_destroy_cq_req { + u64 hdr; + u32 cqn; +}; + +#define ERDMA_EQ_TYPE_AEQ 0 +#define ERDMA_EQ_TYPE_CEQ 1 + +struct erdma_cmdq_create_eq_req { + u64 hdr; + u64 qbuf_addr; + u8 vector_idx; + u8 eqn; + u8 depth; + u8 qtype; + u32 db_dma_addr_l; + u32 db_dma_addr_h; +}; + +struct erdma_cmdq_destroy_eq_req { + u64 hdr; + u64 rsvd0; + u8 vector_idx; + u8 eqn; + u8 rsvd1; + u8 qtype; +}; + +/* create_cq cfg0 */ +#define ERDMA_CMD_CREATE_CQ_DEPTH_MASK GENMASK(31, 24) +#define ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK GENMASK(23, 20) +#define ERDMA_CMD_CREATE_CQ_CQN_MASK GENMASK(19, 0) + +/* create_cq cfg1 */ +#define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16) +#define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15) +#define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0) + +struct erdma_cmdq_create_cq_req { + u64 hdr; + u32 cfg0; + u32 qbuf_addr_l; + u32 qbuf_addr_h; + u32 cfg1; + u64 cq_db_info_addr; + u32 first_page_offset; +}; + +/* regmr/deregmr cfg0 */ +#define ERDMA_CMD_MR_VALID_MASK BIT(31) +#define ERDMA_CMD_MR_KEY_MASK GENMASK(27, 20) +#define ERDMA_CMD_MR_MPT_IDX_MASK GENMASK(19, 0) + +/* regmr cfg1 */ +#define ERDMA_CMD_REGMR_PD_MASK GENMASK(31, 12) +#define ERDMA_CMD_REGMR_TYPE_MASK 
GENMASK(7, 6) +#define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 2) +#define ERDMA_CMD_REGMR_ACC_MODE_MASK GENMASK(1, 0) + +/* regmr cfg2 */ +#define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27) +#define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20) +#define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0) + +struct erdma_cmdq_reg_mr_req { + u64 hdr; + u32 cfg0; + u32 cfg1; + u64 start_va; + u32 size; + u32 cfg2; + u64 phy_addr[4]; +}; + +struct erdma_cmdq_dereg_mr_req { + u64 hdr; + u32 cfg; +}; + +/* modify qp cfg */ +#define ERDMA_CMD_MODIFY_QP_STATE_MASK GENMASK(31, 24) +#define ERDMA_CMD_MODIFY_QP_CC_MASK GENMASK(23, 20) +#define ERDMA_CMD_MODIFY_QP_QPN_MASK GENMASK(19, 0) + +struct erdma_cmdq_modify_qp_req { + u64 hdr; + u32 cfg; + u32 cookie; + __be32 dip; + __be32 sip; + __be16 sport; + __be16 dport; + u32 send_nxt; + u32 recv_nxt; +}; + +/* create qp cfg0 */ +#define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20) +#define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0) + +/* create qp cfg1 */ +#define ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK GENMASK(31, 20) +#define ERDMA_CMD_CREATE_QP_PD_MASK GENMASK(19, 0) + +/* create qp cqn_mtt_cfg */ +#define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28) +#define ERDMA_CMD_CREATE_QP_CQN_MASK GENMASK(23, 0) + +/* create qp mtt_cfg */ +#define ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK GENMASK(31, 12) +#define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1) +#define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0) + +#define ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK GENMASK_ULL(31, 0) + +struct erdma_cmdq_create_qp_req { + u64 hdr; + u32 cfg0; + u32 cfg1; + u32 sq_cqn_mtt_cfg; + u32 rq_cqn_mtt_cfg; + u64 sq_buf_addr; + u64 rq_buf_addr; + u32 sq_mtt_cfg; + u32 rq_mtt_cfg; + u64 sq_db_info_dma_addr; + u64 rq_db_info_dma_addr; +}; + +struct erdma_cmdq_destroy_qp_req { + u64 hdr; + u32 qpn; +}; + +/* cap qword 0 definition */ +#define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) +#define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) 
+#define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0) + +/* cap qword 1 definition */ +#define ERDMA_CMD_DEV_CAP_DMA_LOCAL_KEY_MASK GENMASK_ULL(63, 32) +#define ERDMA_CMD_DEV_CAP_DEFAULT_CC_MASK GENMASK_ULL(31, 28) +#define ERDMA_CMD_DEV_CAP_QBLOCK_MASK GENMASK_ULL(27, 16) +#define ERDMA_CMD_DEV_CAP_MAX_MW_MASK GENMASK_ULL(7, 0) + +#define ERDMA_NQP_PER_QBLOCK 1024 + +#define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0) + +/* CQE hdr */ +#define ERDMA_CQE_HDR_OWNER_MASK BIT(31) +#define ERDMA_CQE_HDR_OPCODE_MASK GENMASK(23, 16) +#define ERDMA_CQE_HDR_QTYPE_MASK GENMASK(15, 8) +#define ERDMA_CQE_HDR_SYNDROME_MASK GENMASK(7, 0) + +#define ERDMA_CQE_QTYPE_SQ 0 +#define ERDMA_CQE_QTYPE_RQ 1 +#define ERDMA_CQE_QTYPE_CMDQ 2 + +struct erdma_cqe { + __be32 hdr; + __be32 qe_idx; + __be32 qpn; + union { + __le32 imm_data; + __be32 inv_rkey; + }; + __be32 size; + __be32 rsvd[3]; +}; + +struct erdma_sge { + __aligned_le64 laddr; + __le32 length; + __le32 lkey; +}; + +/* Receive Queue Element */ +struct erdma_rqe { + __le16 qe_idx; + __le16 rsvd0; + __le32 qpn; + __le32 rsvd1; + __le32 rsvd2; + __le64 to; + __le32 length; + __le32 stag; +}; + +/* SQE */ +#define ERDMA_SQE_HDR_SGL_LEN_MASK GENMASK_ULL(63, 56) +#define ERDMA_SQE_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52) +#define ERDMA_SQE_HDR_QPN_MASK GENMASK_ULL(51, 32) +#define ERDMA_SQE_HDR_OPCODE_MASK GENMASK_ULL(31, 27) +#define ERDMA_SQE_HDR_DWQE_MASK BIT_ULL(26) +#define ERDMA_SQE_HDR_INLINE_MASK BIT_ULL(25) +#define ERDMA_SQE_HDR_FENCE_MASK BIT_ULL(24) +#define ERDMA_SQE_HDR_SE_MASK BIT_ULL(23) +#define ERDMA_SQE_HDR_CE_MASK BIT_ULL(22) +#define ERDMA_SQE_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) + +/* REG MR attrs */ +#define ERDMA_SQE_MR_MODE_MASK GENMASK(1, 0) +#define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 2) +#define ERDMA_SQE_MR_MTT_TYPE_MASK GENMASK(7, 6) +#define ERDMA_SQE_MR_MTT_CNT_MASK GENMASK(31, 12) + +struct erdma_write_sqe { + __le64 hdr; + __be32 imm_data; + __le32 length; + + __le32 sink_stag; + __le32 
sink_to_l; + __le32 sink_to_h; + + __le32 rsvd; + + struct erdma_sge sgl[0]; +}; + +struct erdma_send_sqe { + __le64 hdr; + union { + __be32 imm_data; + __le32 invalid_stag; + }; + + __le32 length; + struct erdma_sge sgl[0]; +}; + +struct erdma_readreq_sqe { + __le64 hdr; + __le32 invalid_stag; + __le32 length; + __le32 sink_stag; + __le32 sink_to_l; + __le32 sink_to_h; + __le32 rsvd; +}; + +struct erdma_reg_mr_sqe { + __le64 hdr; + __le64 addr; + __le32 length; + __le32 stag; + __le32 attrs; + __le32 rsvd; +}; + +/* EQ related. */ +#define ERDMA_DEFAULT_EQ_DEPTH 256 + +/* ceqe */ +#define ERDMA_CEQE_HDR_DB_MASK BIT_ULL(63) +#define ERDMA_CEQE_HDR_PI_MASK GENMASK_ULL(55, 32) +#define ERDMA_CEQE_HDR_O_MASK BIT_ULL(31) +#define ERDMA_CEQE_HDR_CQN_MASK GENMASK_ULL(19, 0) + +/* aeqe */ +#define ERDMA_AEQE_HDR_O_MASK BIT(31) +#define ERDMA_AEQE_HDR_TYPE_MASK GENMASK(23, 16) +#define ERDMA_AEQE_HDR_SUBTYPE_MASK GENMASK(7, 0) + +#define ERDMA_AE_TYPE_QP_FATAL_EVENT 0 +#define ERDMA_AE_TYPE_QP_ERQ_ERR_EVENT 1 +#define ERDMA_AE_TYPE_ACC_ERR_EVENT 2 +#define ERDMA_AE_TYPE_CQ_ERR 3 +#define ERDMA_AE_TYPE_OTHER_ERROR 4 + +struct erdma_aeqe { + __le32 hdr; + __le32 event_data0; + __le32 event_data1; + __le32 rsvd; +}; + +enum erdma_opcode { + ERDMA_OP_WRITE = 0, + ERDMA_OP_READ = 1, + ERDMA_OP_SEND = 2, + ERDMA_OP_SEND_WITH_IMM = 3, + + ERDMA_OP_RECEIVE = 4, + ERDMA_OP_RECV_IMM = 5, + ERDMA_OP_RECV_INV = 6, + + ERDMA_OP_REQ_ERR = 7, + ERDMA_OP_READ_RESPONSE = 8, + ERDMA_OP_WRITE_WITH_IMM = 9, + + ERDMA_OP_RECV_ERR = 10, + + ERDMA_OP_INVALIDATE = 11, + ERDMA_OP_RSP_SEND_IMM = 12, + ERDMA_OP_SEND_WITH_INV = 13, + + ERDMA_OP_REG_MR = 14, + ERDMA_OP_LOCAL_INV = 15, + ERDMA_OP_READ_WITH_INV = 16, + ERDMA_NUM_OPCODES = 17, + ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 +}; + +enum erdma_wc_status { + ERDMA_WC_SUCCESS = 0, + ERDMA_WC_GENERAL_ERR = 1, + ERDMA_WC_RECV_WQE_FORMAT_ERR = 2, + ERDMA_WC_RECV_STAG_INVALID_ERR = 3, + ERDMA_WC_RECV_ADDR_VIOLATION_ERR = 4, + 
ERDMA_WC_RECV_RIGHT_VIOLATION_ERR = 5, + ERDMA_WC_RECV_PDID_ERR = 6, + ERDMA_WC_RECV_WARRPING_ERR = 7, + ERDMA_WC_SEND_WQE_FORMAT_ERR = 8, + ERDMA_WC_SEND_WQE_ORD_EXCEED = 9, + ERDMA_WC_SEND_STAG_INVALID_ERR = 10, + ERDMA_WC_SEND_ADDR_VIOLATION_ERR = 11, + ERDMA_WC_SEND_RIGHT_VIOLATION_ERR = 12, + ERDMA_WC_SEND_PDID_ERR = 13, + ERDMA_WC_SEND_WARRPING_ERR = 14, + ERDMA_WC_FLUSH_ERR = 15, + ERDMA_WC_RETRY_EXC_ERR = 16, + ERDMA_NUM_WC_STATUS +}; + +enum erdma_vendor_err { + ERDMA_WC_VENDOR_NO_ERR = 0, + ERDMA_WC_VENDOR_INVALID_RQE = 1, + ERDMA_WC_VENDOR_RQE_INVALID_STAG = 2, + ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION = 3, + ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR = 4, + ERDMA_WC_VENDOR_RQE_INVALID_PD = 5, + ERDMA_WC_VENDOR_RQE_WRAP_ERR = 6, + ERDMA_WC_VENDOR_INVALID_SQE = 0x20, + ERDMA_WC_VENDOR_ZERO_ORD = 0x21, + ERDMA_WC_VENDOR_SQE_INVALID_STAG = 0x30, + ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION = 0x31, + ERDMA_WC_VENDOR_SQE_ACCESS_ERR = 0x32, + ERDMA_WC_VENDOR_SQE_INVALID_PD = 0x33, + ERDMA_WC_VENDOR_SQE_WARP_ERR = 0x34 +}; + +#endif From cf38d4d2c53f6ba0bc4f29245bfbd625022e76df Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:04 +0800 Subject: [PATCH 0623/1250] RDMA/erdma: Add main include file Add ERDMA driver main header file, defining internal used data structures and operations. The defined data structures includes *cmdq*, which is used as the communication channel between ERDMA driver and hardware. 
Link: https://lore.kernel.org/r/20220713094212.30943-4-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma.h | 287 ++++++++++++++++++++++++++++ 1 file changed, 287 insertions(+) create mode 100644 drivers/infiniband/hw/erdma/erdma.h diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h new file mode 100644 index 00000000000000..2aae635c1c8da8 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -0,0 +1,287 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#ifndef __ERDMA_H__ +#define __ERDMA_H__ + +#include +#include +#include +#include + +#include "erdma_hw.h" + +#define DRV_MODULE_NAME "erdma" +#define ERDMA_NODE_DESC "Elastic RDMA(iWARP) stack" + +struct erdma_eq { + void *qbuf; + dma_addr_t qbuf_dma_addr; + + spinlock_t lock; + + u32 depth; + + u16 ci; + u16 rsvd; + + atomic64_t event_num; + atomic64_t notify_num; + + u64 __iomem *db_addr; + u64 *db_record; +}; + +struct erdma_cmdq_sq { + void *qbuf; + dma_addr_t qbuf_dma_addr; + + spinlock_t lock; + + u32 depth; + u16 ci; + u16 pi; + + u16 wqebb_cnt; + + u64 *db_record; +}; + +struct erdma_cmdq_cq { + void *qbuf; + dma_addr_t qbuf_dma_addr; + + spinlock_t lock; + + u32 depth; + u32 ci; + u32 cmdsn; + + u64 *db_record; + + atomic64_t armed_num; +}; + +enum { + ERDMA_CMD_STATUS_INIT, + ERDMA_CMD_STATUS_ISSUED, + ERDMA_CMD_STATUS_FINISHED, + ERDMA_CMD_STATUS_TIMEOUT +}; + +struct erdma_comp_wait { + struct completion wait_event; + u32 cmd_status; + u32 ctx_id; + u16 sq_pi; + u8 comp_status; + u8 rsvd; + u32 comp_data[4]; +}; + +enum { + ERDMA_CMDQ_STATE_OK_BIT = 0, + ERDMA_CMDQ_STATE_TIMEOUT_BIT = 1, + ERDMA_CMDQ_STATE_CTX_ERR_BIT = 2, +}; + +#define ERDMA_CMDQ_TIMEOUT_MS 15000 +#define ERDMA_REG_ACCESS_WAIT_MS 20 +#define ERDMA_WAIT_DEV_DONE_CNT 500 + +struct erdma_cmdq { + unsigned long *comp_wait_bitmap; 
+	struct erdma_comp_wait *wait_pool;
+	spinlock_t lock; /* taken around comp_wait_bitmap updates */
+
+	bool use_event; /* false: completions are polled; true: wait on wait_event */
+
+	struct erdma_cmdq_sq sq;
+	struct erdma_cmdq_cq cq;
+	struct erdma_eq eq;
+
+	unsigned long state; /* ERDMA_CMDQ_STATE_*_BIT flags */
+
+	struct semaphore credits; /* initialised to max_outstandings */
+	u16 max_outstandings;
+};
+
+#define COMPROMISE_CC ERDMA_CC_CUBIC
+/* Congestion control algorithms; ERDMA_CC_METHODS_NUM is the entry count. */
+enum erdma_cc_alg {
+	ERDMA_CC_NEWRENO = 0,
+	ERDMA_CC_CUBIC,
+	ERDMA_CC_HPCC_RTT,
+	ERDMA_CC_HPCC_ECN,
+	ERDMA_CC_HPCC_INT,
+	ERDMA_CC_METHODS_NUM
+};
+
+struct erdma_devattr {
+	u32 fw_version;
+
+	unsigned char peer_addr[ETH_ALEN];
+
+	int numa_node;
+	enum erdma_cc_alg cc;
+	u32 grp_num;
+	u32 irq_num;
+
+	bool disable_dwqe;
+	u16 dwqe_pages;
+	u16 dwqe_entries;
+
+	u32 max_qp;
+	u32 max_send_wr;
+	u32 max_recv_wr;
+	u32 max_ord;
+	u32 max_ird;
+
+	u32 max_send_sge;
+	u32 max_recv_sge;
+	u32 max_sge_rd;
+	u32 max_cq;
+	u32 max_cqe;
+	u64 max_mr_size;
+	u32 max_mr;
+	u32 max_pd;
+	u32 max_mw;
+	u32 local_dma_key;
+};
+
+#define ERDMA_IRQNAME_SIZE 50
+
+struct erdma_irq {
+	char name[ERDMA_IRQNAME_SIZE];
+	u32 msix_vector;
+	cpumask_t affinity_hint_mask;
+};
+
+/* Per-CEQ control block: the EQ itself plus its IRQ and tasklet. */
+struct erdma_eq_cb {
+	bool ready;
+	void *dev; /* All EQs use this field to get the erdma_dev struct */
+	struct erdma_irq irq;
+	struct erdma_eq eq;
+	struct tasklet_struct tasklet;
+};
+
+struct erdma_resource_cb {
+	unsigned long *bitmap;
+	spinlock_t lock;
+	u32 next_alloc_idx;
+	u32 max_cap;
+};
+
+/* Resource types; ERDMA_RES_CNT sizes erdma_dev.res_cb[]. */
+enum {
+	ERDMA_RES_TYPE_PD = 0,
+	ERDMA_RES_TYPE_STAG_IDX = 1,
+	ERDMA_RES_CNT = 2,
+};
+
+/*
+ * Queue buffers are allocated with ERDMA_EXTRA_BUFFER_SIZE extra bytes after
+ * the queue entries; the cmdq/EQ init code points db_record at that tail
+ * (qbuf + buf_size).
+ */
+#define ERDMA_EXTRA_BUFFER_SIZE ERDMA_DB_SIZE
+#define WARPPED_BUFSIZE(size) ((size) + ERDMA_EXTRA_BUFFER_SIZE)
+
+struct erdma_dev {
+	struct ib_device ibdev;
+	struct net_device *netdev;
+	struct pci_dev *pdev;
+	struct notifier_block netdev_nb;
+
+	resource_size_t func_bar_addr;
+	resource_size_t func_bar_len;
+	u8 __iomem *func_bar;
+
+	struct erdma_devattr attrs;
+	/* physical port state (only one port per device) */
+	enum ib_port_state state;
+
+	/* cmdq and aeq use the same msix vector */
+	struct erdma_irq comm_irq;
+	struct erdma_cmdq
cmdq;
+	struct erdma_eq aeq;
+	struct erdma_eq_cb ceqs[ERDMA_NUM_MSIX_VEC - 1];
+
+	spinlock_t lock;
+	struct erdma_resource_cb res_cb[ERDMA_RES_CNT];
+	struct xarray qp_xa;
+	struct xarray cq_xa;
+
+	u32 next_alloc_qpn;
+	u32 next_alloc_cqn;
+
+	spinlock_t db_bitmap_lock;
+	/* We provide max 64 uContexts that each has one SQ doorbell Page. */
+	DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT);
+	/*
+	 * We provide max 496 uContexts that each has one SQ normal Db,
+	 * and one directWQE db.
+	 */
+	DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT);
+
+	atomic_t num_ctx;
+	struct list_head cep_list;
+};
+
+/*
+ * Return the address of entry @idx in the ring buffer @qbuf.
+ *
+ * @idx is wrapped by masking with (@depth - 1), so the result is only a
+ * correct modulo when @depth is a power of two. Each entry occupies
+ * (1 << @shift) bytes.
+ */
+static inline void *get_queue_entry(void *qbuf, u32 idx, u32 depth, u32 shift)
+{
+	idx &= (depth - 1);
+
+	return qbuf + (idx << shift);
+}
+
+/* Map an ib_device embedded in an erdma_dev back to its container. */
+static inline struct erdma_dev *to_edev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct erdma_dev, ibdev);
+}
+
+/* MMIO accessors: @reg is a byte offset from dev->func_bar. */
+static inline u32 erdma_reg_read32(struct erdma_dev *dev, u32 reg)
+{
+	return readl(dev->func_bar + reg);
+}
+
+static inline u64 erdma_reg_read64(struct erdma_dev *dev, u32 reg)
+{
+	return readq(dev->func_bar + reg);
+}
+
+static inline void erdma_reg_write32(struct erdma_dev *dev, u32 reg, u32 value)
+{
+	writel(value, dev->func_bar + reg);
+}
+
+static inline void erdma_reg_write64(struct erdma_dev *dev, u32 reg, u64 value)
+{
+	writeq(value, dev->func_bar + reg);
+}
+
+/*
+ * Read the 32-bit register at @reg and extract the bit-field selected by
+ * @filed_mask ("filed" is a misspelling of "field"; the name is kept because
+ * callers use it).
+ */
+static inline u32 erdma_reg_read32_filed(struct erdma_dev *dev, u32 reg,
+					 u32 filed_mask)
+{
+	u32 val = erdma_reg_read32(dev, reg);
+
+	return FIELD_GET(filed_mask, val);
+}
+
+int erdma_cmdq_init(struct erdma_dev *dev);
+void erdma_finish_cmdq_init(struct erdma_dev *dev);
+void erdma_cmdq_destroy(struct erdma_dev *dev);
+
+void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op);
+int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size,
+			u64 *resp0, u64 *resp1);
+void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq);
+
+int erdma_ceqs_init(struct erdma_dev *dev);
+void erdma_ceqs_uninit(struct erdma_dev
*dev); +void notify_eq(struct erdma_eq *eq); +void *get_next_valid_eqe(struct erdma_eq *eq); + +int erdma_aeq_init(struct erdma_dev *dev); +void erdma_aeq_destroy(struct erdma_dev *dev); + +void erdma_aeq_event_handler(struct erdma_dev *dev); +void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb); + +#endif From d94b0e502f1cd41cdf734af419ef192d4bab5c5d Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:05 +0800 Subject: [PATCH 0624/1250] RDMA/erdma: Add cmdq implementation Cmdq is the main control plane channel between erdma driver and hardware. After erdma device is initialized, the cmdq channel will be active in the whole lifecycle of this driver. This commit also includes two modifications from Christophe, one is using the bitmap API to allocate bitmaps instead of hand-writing, and another is using the non-atomic bitmap API when applicable. Link: https://lore.kernel.org/r/20220713094212.30943-5-chengyou@linux.alibaba.com Signed-off-by: Christophe JAILLET Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_cmdq.c | 493 +++++++++++++++++++++++ 1 file changed, 493 insertions(+) create mode 100644 drivers/infiniband/hw/erdma/erdma_cmdq.c diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c new file mode 100644 index 00000000000000..57da0c67047205 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. 
*/ + +#include +#include +#include + +#include "erdma.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +static void arm_cmdq_cq(struct erdma_cmdq *cmdq) +{ + struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); + u64 db_data = FIELD_PREP(ERDMA_CQDB_CI_MASK, cmdq->cq.ci) | + FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | + FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cmdq->cq.cmdsn) | + FIELD_PREP(ERDMA_CQDB_IDX_MASK, cmdq->cq.cmdsn); + + *cmdq->cq.db_record = db_data; + writeq(db_data, dev->func_bar + ERDMA_CMDQ_CQDB_REG); + + atomic64_inc(&cmdq->cq.armed_num); +} + +static void kick_cmdq_db(struct erdma_cmdq *cmdq) +{ + struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); + u64 db_data = FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi); + + *cmdq->sq.db_record = db_data; + writeq(db_data, dev->func_bar + ERDMA_CMDQ_SQDB_REG); +} + +static struct erdma_comp_wait *get_comp_wait(struct erdma_cmdq *cmdq) +{ + int comp_idx; + + spin_lock(&cmdq->lock); + comp_idx = find_first_zero_bit(cmdq->comp_wait_bitmap, + cmdq->max_outstandings); + if (comp_idx == cmdq->max_outstandings) { + spin_unlock(&cmdq->lock); + return ERR_PTR(-ENOMEM); + } + + __set_bit(comp_idx, cmdq->comp_wait_bitmap); + spin_unlock(&cmdq->lock); + + return &cmdq->wait_pool[comp_idx]; +} + +static void put_comp_wait(struct erdma_cmdq *cmdq, + struct erdma_comp_wait *comp_wait) +{ + int used; + + cmdq->wait_pool[comp_wait->ctx_id].cmd_status = ERDMA_CMD_STATUS_INIT; + spin_lock(&cmdq->lock); + used = __test_and_clear_bit(comp_wait->ctx_id, cmdq->comp_wait_bitmap); + spin_unlock(&cmdq->lock); + + WARN_ON(!used); +} + +static int erdma_cmdq_wait_res_init(struct erdma_dev *dev, + struct erdma_cmdq *cmdq) +{ + int i; + + cmdq->wait_pool = + devm_kcalloc(&dev->pdev->dev, cmdq->max_outstandings, + sizeof(struct erdma_comp_wait), GFP_KERNEL); + if (!cmdq->wait_pool) + return -ENOMEM; + + spin_lock_init(&cmdq->lock); + cmdq->comp_wait_bitmap = devm_bitmap_zalloc( + &dev->pdev->dev, 
cmdq->max_outstandings, GFP_KERNEL); + if (!cmdq->comp_wait_bitmap) + return -ENOMEM; + + for (i = 0; i < cmdq->max_outstandings; i++) { + init_completion(&cmdq->wait_pool[i].wait_event); + cmdq->wait_pool[i].ctx_id = i; + } + + return 0; +} + +static int erdma_cmdq_sq_init(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + struct erdma_cmdq_sq *sq = &cmdq->sq; + u32 buf_size; + + sq->wqebb_cnt = SQEBB_COUNT(ERDMA_CMDQ_SQE_SIZE); + sq->depth = cmdq->max_outstandings * sq->wqebb_cnt; + + buf_size = sq->depth << SQEBB_SHIFT; + + sq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &sq->qbuf_dma_addr, GFP_KERNEL); + if (!sq->qbuf) + return -ENOMEM; + + sq->db_record = (u64 *)(sq->qbuf + buf_size); + + spin_lock_init(&sq->lock); + + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_H_REG, + upper_32_bits(sq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_L_REG, + lower_32_bits(sq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth); + erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, + sq->qbuf_dma_addr + buf_size); + + return 0; +} + +static int erdma_cmdq_cq_init(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + struct erdma_cmdq_cq *cq = &cmdq->cq; + u32 buf_size; + + cq->depth = cmdq->sq.depth; + buf_size = cq->depth << CQE_SHIFT; + + cq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &cq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!cq->qbuf) + return -ENOMEM; + + spin_lock_init(&cq->lock); + + cq->db_record = (u64 *)(cq->qbuf + buf_size); + + atomic64_set(&cq->armed_num, 0); + + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_H_REG, + upper_32_bits(cq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG, + lower_32_bits(cq->qbuf_dma_addr)); + erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, + cq->qbuf_dma_addr + buf_size); + + return 0; +} + +static int erdma_cmdq_eq_init(struct erdma_dev *dev) +{ + 
struct erdma_cmdq *cmdq = &dev->cmdq;
+	struct erdma_eq *eq = &cmdq->eq;
+	u32 buf_size;
+
+	eq->depth = cmdq->max_outstandings;
+	buf_size = eq->depth << EQE_SHIFT;
+
+	/* One allocation holds the EQ entries followed by the doorbell record. */
+	eq->qbuf =
+		dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
+				   &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
+	if (!eq->qbuf)
+		return -ENOMEM;
+
+	spin_lock_init(&eq->lock);
+	atomic64_set(&eq->event_num, 0);
+	/*
+	 * notify_eq() increments notify_num for this EQ as well; initialise it
+	 * explicitly, as erdma_aeq_init() and erdma_ceq_init_one() do.
+	 */
+	atomic64_set(&eq->notify_num, 0);
+
+	eq->db_addr =
+		(u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG);
+	eq->db_record = (u64 *)(eq->qbuf + buf_size);
+
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG,
+			  upper_32_bits(eq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_L_REG,
+			  lower_32_bits(eq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth);
+	erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG,
+			  eq->qbuf_dma_addr + buf_size);
+
+	return 0;
+}
+
+/*
+ * Set up the command queue (wait contexts, SQ, CQ, EQ), tell the device to
+ * initialise, and poll until it reports "init done".
+ *
+ * The cmdq starts in polling mode (use_event = false). The status register is
+ * polled up to ERDMA_WAIT_DEV_DONE_CNT times, sleeping
+ * ERDMA_REG_ACCESS_WAIT_MS between reads.
+ *
+ * Return: 0 on success, -ETIMEDOUT if the device never reports init done, or
+ * the error from one of the sub-initialisers. Queue buffers allocated so far
+ * are freed on failure.
+ */
+int erdma_cmdq_init(struct erdma_dev *dev)
+{
+	int err, i;
+	struct erdma_cmdq *cmdq = &dev->cmdq;
+	u32 sts, ctrl;
+
+	cmdq->max_outstandings = ERDMA_CMDQ_MAX_OUTSTANDING;
+	cmdq->use_event = false;
+
+	sema_init(&cmdq->credits, cmdq->max_outstandings);
+
+	err = erdma_cmdq_wait_res_init(dev, cmdq);
+	if (err)
+		return err;
+
+	err = erdma_cmdq_sq_init(dev);
+	if (err)
+		return err;
+
+	err = erdma_cmdq_cq_init(dev);
+	if (err)
+		goto err_destroy_sq;
+
+	err = erdma_cmdq_eq_init(dev);
+	if (err)
+		goto err_destroy_cq;
+
+	ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_INIT_MASK, 1);
+	erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl);
+
+	for (i = 0; i < ERDMA_WAIT_DEV_DONE_CNT; i++) {
+		sts = erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG,
+					     ERDMA_REG_DEV_ST_INIT_DONE_MASK);
+		if (sts)
+			break;
+
+		msleep(ERDMA_REG_ACCESS_WAIT_MS);
+	}
+
+	if (i == ERDMA_WAIT_DEV_DONE_CNT) {
+		dev_err(&dev->pdev->dev, "wait init done failed.\n");
+		err = -ETIMEDOUT;
+		goto err_destroy_eq;
+	}
+
+	set_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
+
+	return 0;
+
+err_destroy_eq:
+
dma_free_coherent(&dev->pdev->dev,
+			  WARPPED_BUFSIZE(cmdq->eq.depth << EQE_SHIFT),
+			  cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr);
+
+err_destroy_cq:
+	dma_free_coherent(&dev->pdev->dev,
+			  WARPPED_BUFSIZE(cmdq->cq.depth << CQE_SHIFT),
+			  cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr);
+
+err_destroy_sq:
+	dma_free_coherent(&dev->pdev->dev,
+			  WARPPED_BUFSIZE(cmdq->sq.depth << SQEBB_SHIFT),
+			  cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr);
+
+	return err;
+}
+
+/* Switch the cmdq from polling to event mode and arm its CQ. */
+void erdma_finish_cmdq_init(struct erdma_dev *dev)
+{
+	struct erdma_cmdq *cmdq = &dev->cmdq;
+
+	/* Called once device init has succeeded: completions now raise events. */
+	cmdq->use_event = true;
+	arm_cmdq_cq(cmdq);
+}
+
+/*
+ * Mark the cmdq unusable and release the EQ, SQ and CQ buffers. Each buffer
+ * is freed with the same size it was allocated with: the queue entries plus
+ * the trailing doorbell-record area.
+ */
+void erdma_cmdq_destroy(struct erdma_dev *dev)
+{
+	struct erdma_cmdq *cmdq = &dev->cmdq;
+
+	clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state);
+
+	dma_free_coherent(&dev->pdev->dev,
+			  WARPPED_BUFSIZE(cmdq->eq.depth << EQE_SHIFT),
+			  cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr);
+	dma_free_coherent(&dev->pdev->dev,
+			  WARPPED_BUFSIZE(cmdq->sq.depth << SQEBB_SHIFT),
+			  cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr);
+	dma_free_coherent(&dev->pdev->dev,
+			  WARPPED_BUFSIZE(cmdq->cq.depth << CQE_SHIFT),
+			  cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr);
+}
+
+static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq)
+{
+	__be32 *cqe = get_queue_entry(cmdq->cq.qbuf, cmdq->cq.ci,
+				      cmdq->cq.depth, CQE_SHIFT);
+	u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK,
+			      __be32_to_cpu(READ_ONCE(*cqe)));
+
+	return owner ^ !!(cmdq->cq.ci & cmdq->cq.depth) ?
cqe : NULL; +} + +static void push_cmdq_sqe(struct erdma_cmdq *cmdq, u64 *req, size_t req_len, + struct erdma_comp_wait *comp_wait) +{ + __le64 *wqe; + u64 hdr = *req; + + comp_wait->cmd_status = ERDMA_CMD_STATUS_ISSUED; + reinit_completion(&comp_wait->wait_event); + comp_wait->sq_pi = cmdq->sq.pi; + + wqe = get_queue_entry(cmdq->sq.qbuf, cmdq->sq.pi, cmdq->sq.depth, + SQEBB_SHIFT); + memcpy(wqe, req, req_len); + + cmdq->sq.pi += cmdq->sq.wqebb_cnt; + hdr |= FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi) | + FIELD_PREP(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, + comp_wait->ctx_id) | + FIELD_PREP(ERDMA_CMD_HDR_WQEBB_CNT_MASK, cmdq->sq.wqebb_cnt - 1); + *wqe = cpu_to_le64(hdr); + + kick_cmdq_db(cmdq); +} + +static int erdma_poll_single_cmd_completion(struct erdma_cmdq *cmdq) +{ + struct erdma_comp_wait *comp_wait; + u32 hdr0, sqe_idx; + __be32 *cqe; + u16 ctx_id; + u64 *sqe; + int i; + + cqe = get_next_valid_cmdq_cqe(cmdq); + if (!cqe) + return -EAGAIN; + + cmdq->cq.ci++; + + dma_rmb(); + hdr0 = __be32_to_cpu(*cqe); + sqe_idx = __be32_to_cpu(*(cqe + 1)); + + sqe = get_queue_entry(cmdq->sq.qbuf, sqe_idx, cmdq->sq.depth, + SQEBB_SHIFT); + ctx_id = FIELD_GET(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, *sqe); + comp_wait = &cmdq->wait_pool[ctx_id]; + if (comp_wait->cmd_status != ERDMA_CMD_STATUS_ISSUED) + return -EIO; + + comp_wait->cmd_status = ERDMA_CMD_STATUS_FINISHED; + comp_wait->comp_status = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, hdr0); + cmdq->sq.ci += cmdq->sq.wqebb_cnt; + + for (i = 0; i < 4; i++) + comp_wait->comp_data[i] = __be32_to_cpu(*(cqe + 2 + i)); + + if (cmdq->use_event) + complete(&comp_wait->wait_event); + + return 0; +} + +static void erdma_polling_cmd_completions(struct erdma_cmdq *cmdq) +{ + unsigned long flags; + u16 comp_num; + + spin_lock_irqsave(&cmdq->cq.lock, flags); + + /* We must have less than # of max_outstandings + * completions at one time. 
+ */ + for (comp_num = 0; comp_num < cmdq->max_outstandings; comp_num++) + if (erdma_poll_single_cmd_completion(cmdq)) + break; + + if (comp_num && cmdq->use_event) + arm_cmdq_cq(cmdq); + + spin_unlock_irqrestore(&cmdq->cq.lock, flags); +} + +void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) +{ + int got_event = 0; + + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state) || + !cmdq->use_event) + return; + + while (get_next_valid_eqe(&cmdq->eq)) { + cmdq->eq.ci++; + got_event++; + } + + if (got_event) { + cmdq->cq.cmdsn++; + erdma_polling_cmd_completions(cmdq); + } + + notify_eq(&cmdq->eq); +} + +static int erdma_poll_cmd_completion(struct erdma_comp_wait *comp_ctx, + struct erdma_cmdq *cmdq, u32 timeout) +{ + unsigned long comp_timeout = jiffies + msecs_to_jiffies(timeout); + + while (1) { + erdma_polling_cmd_completions(cmdq); + if (comp_ctx->cmd_status != ERDMA_CMD_STATUS_ISSUED) + break; + + if (time_is_before_jiffies(comp_timeout)) + return -ETIME; + + msleep(20); + } + + return 0; +} + +static int erdma_wait_cmd_completion(struct erdma_comp_wait *comp_ctx, + struct erdma_cmdq *cmdq, u32 timeout) +{ + unsigned long flags = 0; + + wait_for_completion_timeout(&comp_ctx->wait_event, + msecs_to_jiffies(timeout)); + + if (unlikely(comp_ctx->cmd_status != ERDMA_CMD_STATUS_FINISHED)) { + spin_lock_irqsave(&cmdq->cq.lock, flags); + comp_ctx->cmd_status = ERDMA_CMD_STATUS_TIMEOUT; + spin_unlock_irqrestore(&cmdq->cq.lock, flags); + return -ETIME; + } + + return 0; +} + +void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op) +{ + *hdr = FIELD_PREP(ERDMA_CMD_HDR_SUB_MOD_MASK, mod) | + FIELD_PREP(ERDMA_CMD_HDR_OPCODE_MASK, op); +} + +int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size, + u64 *resp0, u64 *resp1) +{ + struct erdma_comp_wait *comp_wait; + int ret; + + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) + return -ENODEV; + + down(&cmdq->credits); + + comp_wait = get_comp_wait(cmdq); + if (IS_ERR(comp_wait)) { + 
clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + set_bit(ERDMA_CMDQ_STATE_CTX_ERR_BIT, &cmdq->state); + up(&cmdq->credits); + return PTR_ERR(comp_wait); + } + + spin_lock(&cmdq->sq.lock); + push_cmdq_sqe(cmdq, req, req_size, comp_wait); + spin_unlock(&cmdq->sq.lock); + + if (cmdq->use_event) + ret = erdma_wait_cmd_completion(comp_wait, cmdq, + ERDMA_CMDQ_TIMEOUT_MS); + else + ret = erdma_poll_cmd_completion(comp_wait, cmdq, + ERDMA_CMDQ_TIMEOUT_MS); + + if (ret) { + set_bit(ERDMA_CMDQ_STATE_TIMEOUT_BIT, &cmdq->state); + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + goto out; + } + + if (comp_wait->comp_status) + ret = -EIO; + + if (resp0 && resp1) { + *resp0 = *((u64 *)&comp_wait->comp_data[0]); + *resp1 = *((u64 *)&comp_wait->comp_data[2]); + } + put_comp_wait(cmdq, comp_wait); + +out: + up(&cmdq->credits); + + return ret; +} From 59798a2a497db1503328e68e1c8dfc2e9a7f28c9 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:06 +0800 Subject: [PATCH 0625/1250] RDMA/erdma: Add event queue implementation Event queue (EQ) is the main notification way from erdma hardware to its driver. Each erdma device contains 2 kinds EQs: asynchronous EQ (AEQ) and completion EQ (CEQ). Per device has 1 AEQ, which used for RDMA async event report, and max to 32 CEQs (numbered for CEQ0 to CEQ31). CEQ0 is used for cmdq completion event report, and the rest CEQs are used for RDMA completion event report. 
Link: https://lore.kernel.org/r/20220713094212.30943-6-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_eq.c | 329 +++++++++++++++++++++++++ 1 file changed, 329 insertions(+) create mode 100644 drivers/infiniband/hw/erdma/erdma_eq.c diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c new file mode 100644 index 00000000000000..8f2d094e02279c --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include +#include +#include + +#include "erdma.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +#define MAX_POLL_CHUNK_SIZE 16 + +void notify_eq(struct erdma_eq *eq) +{ + u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) | + FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1); + + *eq->db_record = db_data; + writeq(db_data, eq->db_addr); + + atomic64_inc(&eq->notify_num); +} + +void *get_next_valid_eqe(struct erdma_eq *eq) +{ + u64 *eqe = get_queue_entry(eq->qbuf, eq->ci, eq->depth, EQE_SHIFT); + u32 owner = FIELD_GET(ERDMA_CEQE_HDR_O_MASK, READ_ONCE(*eqe)); + + return owner ^ !!(eq->ci & eq->depth) ? 
eqe : NULL; +} + +void erdma_aeq_event_handler(struct erdma_dev *dev) +{ + struct erdma_aeqe *aeqe; + u32 cqn, qpn; + struct erdma_qp *qp; + struct erdma_cq *cq; + struct ib_event event; + u32 poll_cnt = 0; + + memset(&event, 0, sizeof(event)); + + while (poll_cnt < MAX_POLL_CHUNK_SIZE) { + aeqe = get_next_valid_eqe(&dev->aeq); + if (!aeqe) + break; + + dma_rmb(); + + dev->aeq.ci++; + atomic64_inc(&dev->aeq.event_num); + poll_cnt++; + + if (FIELD_GET(ERDMA_AEQE_HDR_TYPE_MASK, + le32_to_cpu(aeqe->hdr)) == ERDMA_AE_TYPE_CQ_ERR) { + cqn = le32_to_cpu(aeqe->event_data0); + cq = find_cq_by_cqn(dev, cqn); + if (!cq) + continue; + + event.device = cq->ibcq.device; + event.element.cq = &cq->ibcq; + event.event = IB_EVENT_CQ_ERR; + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&event, + cq->ibcq.cq_context); + } else { + qpn = le32_to_cpu(aeqe->event_data0); + qp = find_qp_by_qpn(dev, qpn); + if (!qp) + continue; + + event.device = qp->ibqp.device; + event.element.qp = &qp->ibqp; + event.event = IB_EVENT_QP_FATAL; + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&event, + qp->ibqp.qp_context); + } + } + + notify_eq(&dev->aeq); +} + +int erdma_aeq_init(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + u32 buf_size; + + eq->depth = ERDMA_DEFAULT_EQ_DEPTH; + buf_size = eq->depth << EQE_SHIFT; + + eq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!eq->qbuf) + return -ENOMEM; + + spin_lock_init(&eq->lock); + atomic64_set(&eq->event_num, 0); + atomic64_set(&eq->notify_num, 0); + + eq->db_addr = (u64 __iomem *)(dev->func_bar + ERDMA_REGS_AEQ_DB_REG); + eq->db_record = (u64 *)(eq->qbuf + buf_size); + + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, + upper_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG, + lower_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); + erdma_reg_write64(dev, 
ERDMA_AEQ_DB_HOST_ADDR_REG, + eq->qbuf_dma_addr + buf_size); + + return 0; +} + +void erdma_aeq_destroy(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + + dma_free_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(eq->depth << EQE_SHIFT), eq->qbuf, + eq->qbuf_dma_addr); +} + +void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) +{ + struct erdma_dev *dev = ceq_cb->dev; + struct erdma_cq *cq; + u32 poll_cnt = 0; + u64 *ceqe; + int cqn; + + if (!ceq_cb->ready) + return; + + while (poll_cnt < MAX_POLL_CHUNK_SIZE) { + ceqe = get_next_valid_eqe(&ceq_cb->eq); + if (!ceqe) + break; + + dma_rmb(); + ceq_cb->eq.ci++; + poll_cnt++; + cqn = FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, READ_ONCE(*ceqe)); + + cq = find_cq_by_cqn(dev, cqn); + if (!cq) + continue; + + if (rdma_is_kernel_res(&cq->ibcq.res)) + cq->kern_cq.cmdsn++; + + if (cq->ibcq.comp_handler) + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + } + + notify_eq(&ceq_cb->eq); +} + +static irqreturn_t erdma_intr_ceq_handler(int irq, void *data) +{ + struct erdma_eq_cb *ceq_cb = data; + + tasklet_schedule(&ceq_cb->tasklet); + + return IRQ_HANDLED; +} + +static void erdma_intr_ceq_task(unsigned long data) +{ + erdma_ceq_completion_handler((struct erdma_eq_cb *)data); +} + +static int erdma_set_ceq_irq(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq_cb *eqc = &dev->ceqs[ceqn]; + int err; + + snprintf(eqc->irq.name, ERDMA_IRQNAME_SIZE, "erdma-ceq%u@pci:%s", ceqn, + pci_name(dev->pdev)); + eqc->irq.msix_vector = pci_irq_vector(dev->pdev, ceqn + 1); + + tasklet_init(&dev->ceqs[ceqn].tasklet, erdma_intr_ceq_task, + (unsigned long)&dev->ceqs[ceqn]); + + cpumask_set_cpu(cpumask_local_spread(ceqn + 1, dev->attrs.numa_node), + &eqc->irq.affinity_hint_mask); + + err = request_irq(eqc->irq.msix_vector, erdma_intr_ceq_handler, 0, + eqc->irq.name, eqc); + if (err) { + dev_err(&dev->pdev->dev, "failed to request_irq(%d)\n", err); + return err; + } + + irq_set_affinity_hint(eqc->irq.msix_vector, + 
&eqc->irq.affinity_hint_mask);
+
+	return 0;
+}
+
+/* Drop the affinity hint and release the IRQ requested for CEQ @ceqn. */
+static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
+
+	irq_set_affinity_hint(eqc->irq.msix_vector, NULL);
+	free_irq(eqc->irq.msix_vector, eqc);
+}
+
+/*
+ * Post a CREATE_EQ command registering @eq with the device as CEQ @eqn.
+ *
+ * The doorbell-record address handed to the device is the first byte after
+ * the eq->depth queue entries. Returns the result of erdma_post_cmd_wait().
+ */
+static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq)
+{
+	struct erdma_cmdq_create_eq_req req;
+	dma_addr_t db_info_dma_addr;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
+				CMDQ_OPCODE_CREATE_EQ);
+	req.eqn = eqn;
+	req.depth = ilog2(eq->depth);
+	req.qbuf_addr = eq->qbuf_dma_addr;
+	req.qtype = ERDMA_EQ_TYPE_CEQ;
+	/* Vector index is the same as EQN. */
+	req.vector_idx = eqn;
+	db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT);
+	req.db_dma_addr_l = lower_32_bits(db_info_dma_addr);
+	req.db_dma_addr_h = upper_32_bits(db_info_dma_addr);
+
+	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req,
+				   sizeof(struct erdma_cmdq_create_eq_req),
+				   NULL, NULL);
+}
+
+/*
+ * Allocate and register completion EQ @ceqn (0-based; the device numbers it
+ * @ceqn + 1 because EQ 0 is reserved for the cmdq).
+ *
+ * Return: 0 on success, -ENOMEM if the queue buffer cannot be allocated, or
+ * the error from the CREATE_EQ command. On failure no buffer is left
+ * allocated and the CEQ is marked not ready.
+ */
+static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
+	int ret;
+
+	eq->qbuf =
+		dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
+				   &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO);
+	if (!eq->qbuf)
+		return -ENOMEM;
+
+	spin_lock_init(&eq->lock);
+	atomic64_set(&eq->event_num, 0);
+	atomic64_set(&eq->notify_num, 0);
+
+	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
+	eq->db_addr =
+		(u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG +
+				(ceqn + 1) * ERDMA_DB_SIZE);
+	eq->db_record = (u64 *)(eq->qbuf + buf_size);
+	eq->ci = 0;
+	dev->ceqs[ceqn].dev = dev;
+
+	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
+	ret = create_eq_cmd(dev, ceqn + 1, eq);
+	if (ret) {
+		/*
+		 * erdma_ceqs_init() only unwinds the CEQs before the failing
+		 * one, so the buffer of this CEQ must be released here or it
+		 * is leaked.
+		 */
+		dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
+				  eq->qbuf, eq->qbuf_dma_addr);
+		eq->qbuf = NULL;
+	}
+	dev->ceqs[ceqn].ready = ret ?
false : true; + + return ret; +} + +static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq *eq = &dev->ceqs[ceqn].eq; + u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; + struct erdma_cmdq_destroy_eq_req req; + int err; + + dev->ceqs[ceqn].ready = 0; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_DESTROY_EQ); + /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ + req.eqn = ceqn + 1; + req.qtype = ERDMA_EQ_TYPE_CEQ; + req.vector_idx = ceqn + 1; + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (err) + return; + + dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf, + eq->qbuf_dma_addr); +} + +int erdma_ceqs_init(struct erdma_dev *dev) +{ + u32 i, j; + int err; + + for (i = 0; i < dev->attrs.irq_num - 1; i++) { + err = erdma_ceq_init_one(dev, i); + if (err) + goto out_err; + + err = erdma_set_ceq_irq(dev, i); + if (err) { + erdma_ceq_uninit_one(dev, i); + goto out_err; + } + } + + return 0; + +out_err: + for (j = 0; j < i; j++) { + erdma_free_ceq_irq(dev, j); + erdma_ceq_uninit_one(dev, j); + } + + return err; +} + +void erdma_ceqs_uninit(struct erdma_dev *dev) +{ + u32 i; + + for (i = 0; i < dev->attrs.irq_num - 1; i++) { + erdma_free_ceq_irq(dev, i); + erdma_ceq_uninit_one(dev, i); + } +} From cafde184e628802f5a39c29af7b3d3a782e1657b Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:07 +0800 Subject: [PATCH 0626/1250] RDMA/erdma: Add verbs header file This header file defines the main structures and functions used for RDMA Verbs, including qp, cq, mr ucontext, etc,. 
Link: https://lore.kernel.org/r/20220713094212.30943-7-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_verbs.h | 342 ++++++++++++++++++++++ 1 file changed, 342 insertions(+) create mode 100644 drivers/infiniband/hw/erdma/erdma_verbs.h diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h new file mode 100644 index 00000000000000..c7baddb1f292db --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -0,0 +1,342 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#ifndef __ERDMA_VERBS_H__ +#define __ERDMA_VERBS_H__ + +#include + +#include +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_hw.h" + +/* RDMA Capability. */ +#define ERDMA_MAX_PD (128 * 1024) +#define ERDMA_MAX_SEND_WR 4096 +#define ERDMA_MAX_ORD 128 +#define ERDMA_MAX_IRD 128 +#define ERDMA_MAX_SGE_RD 1 +#define ERDMA_MAX_CONTEXT (128 * 1024) +#define ERDMA_MAX_SEND_SGE 6 +#define ERDMA_MAX_RECV_SGE 1 +#define ERDMA_MAX_INLINE (sizeof(struct erdma_sge) * (ERDMA_MAX_SEND_SGE)) +#define ERDMA_MAX_FRMR_PA 512 + +enum { + ERDMA_MMAP_IO_NC = 0, /* no cache */ +}; + +struct erdma_user_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; + u64 address; + u8 mmap_flag; +}; + +struct erdma_ucontext { + struct ib_ucontext ibucontext; + + u32 sdb_type; + u32 sdb_idx; + u32 sdb_page_idx; + u32 sdb_page_off; + u64 sdb; + u64 rdb; + u64 cdb; + + struct rdma_user_mmap_entry *sq_db_mmap_entry; + struct rdma_user_mmap_entry *rq_db_mmap_entry; + struct rdma_user_mmap_entry *cq_db_mmap_entry; + + /* doorbell records */ + struct list_head dbrecords_page_list; + struct mutex dbrecords_page_mutex; +}; + +struct erdma_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +/* + * MemoryRegion definition. 
+ */
+#define ERDMA_MAX_INLINE_MTT_ENTRIES 4
+/*
+ * Size in bytes of @mtt_cnt MTT entries (8 bytes each). The argument is
+ * parenthesised so that expressions such as MTT_SIZE(a + b) shift the whole
+ * sum rather than only the last operand.
+ */
+#define MTT_SIZE(mtt_cnt) ((mtt_cnt) << 3)
+#define ERDMA_MR_MAX_MTT_CNT 524288
+#define ERDMA_MTT_ENTRY_SIZE 8
+
+#define ERDMA_MR_TYPE_NORMAL 0
+#define ERDMA_MR_TYPE_FRMR 1
+#define ERDMA_MR_TYPE_DMA 2
+
+#define ERDMA_MR_INLINE_MTT 0
+#define ERDMA_MR_INDIRECT_MTT 1
+
+#define ERDMA_MR_ACC_LR BIT(0)
+#define ERDMA_MR_ACC_LW BIT(1)
+#define ERDMA_MR_ACC_RR BIT(2)
+#define ERDMA_MR_ACC_RW BIT(3)
+
+/*
+ * Translate IB_ACCESS_* flags into ERDMA_MR_ACC_* bits.
+ *
+ * Only REMOTE_READ, LOCAL_WRITE and REMOTE_WRITE are translated; every other
+ * bit in @access is ignored and ERDMA_MR_ACC_LR is never set here.
+ */
+static inline u8 to_erdma_access_flags(int access)
+{
+	return (access & IB_ACCESS_REMOTE_READ ? ERDMA_MR_ACC_RR : 0) |
+	       (access & IB_ACCESS_LOCAL_WRITE ? ERDMA_MR_ACC_LW : 0) |
+	       (access & IB_ACCESS_REMOTE_WRITE ? ERDMA_MR_ACC_RW : 0);
+}
+
+struct erdma_mem {
+	struct ib_umem *umem;
+	void *mtt_buf;
+	u32 mtt_type;
+	u32 page_size;
+	u32 page_offset;
+	u32 page_cnt;
+	u32 mtt_nents;
+
+	u64 va;
+	u64 len;
+
+	u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES];
+};
+
+struct erdma_mr {
+	struct ib_mr ibmr;
+	struct erdma_mem mem;
+	u8 type;
+	u8 access;
+	u8 valid;
+};
+
+struct erdma_user_dbrecords_page {
+	struct list_head list;
+	struct ib_umem *umem;
+	u64 va;
+	int refcnt;
+};
+
+/* User-space QP state; shares a union with struct erdma_kqp in erdma_qp. */
+struct erdma_uqp {
+	struct erdma_mem sq_mtt;
+	struct erdma_mem rq_mtt;
+
+	dma_addr_t sq_db_info_dma_addr;
+	dma_addr_t rq_db_info_dma_addr;
+
+	struct erdma_user_dbrecords_page *user_dbr_page;
+
+	u32 rq_offset;
+};
+
+/* Kernel QP state; shares a union with struct erdma_uqp in erdma_qp. */
+struct erdma_kqp {
+	u16 sq_pi;
+	u16 sq_ci;
+
+	u16 rq_pi;
+	u16 rq_ci;
+
+	u64 *swr_tbl;
+	u64 *rwr_tbl;
+
+	void __iomem *hw_sq_db;
+	void __iomem *hw_rq_db;
+
+	void *sq_buf;
+	dma_addr_t sq_buf_dma_addr;
+
+	void *rq_buf;
+	dma_addr_t rq_buf_dma_addr;
+
+	void *sq_db_info;
+	void *rq_db_info;
+
+	u8 sig_all;
+};
+
+enum erdma_qp_state {
+	ERDMA_QP_STATE_IDLE = 0,
+	ERDMA_QP_STATE_RTR = 1,
+	ERDMA_QP_STATE_RTS = 2,
+	ERDMA_QP_STATE_CLOSING = 3,
+	ERDMA_QP_STATE_TERMINATE = 4,
+	ERDMA_QP_STATE_ERROR = 5,
+	ERDMA_QP_STATE_UNDEF = 7,
+	ERDMA_QP_STATE_COUNT = 8
+};
+
+enum erdma_qp_attr_mask {
+
ERDMA_QP_ATTR_STATE = (1 << 0), + ERDMA_QP_ATTR_LLP_HANDLE = (1 << 2), + ERDMA_QP_ATTR_ORD = (1 << 3), + ERDMA_QP_ATTR_IRD = (1 << 4), + ERDMA_QP_ATTR_SQ_SIZE = (1 << 5), + ERDMA_QP_ATTR_RQ_SIZE = (1 << 6), + ERDMA_QP_ATTR_MPA = (1 << 7) +}; + +struct erdma_qp_attrs { + enum erdma_qp_state state; + enum erdma_cc_alg cc; /* Congestion control algorithm */ + u32 sq_size; + u32 rq_size; + u32 orq_size; + u32 irq_size; + u32 max_send_sge; + u32 max_recv_sge; + u32 cookie; +#define ERDMA_QP_ACTIVE 0 +#define ERDMA_QP_PASSIVE 1 + u8 qp_type; + u8 pd_len; +}; + +struct erdma_qp { + struct ib_qp ibqp; + struct kref ref; + struct completion safe_free; + struct erdma_dev *dev; + struct erdma_cep *cep; + struct rw_semaphore state_lock; + + union { + struct erdma_kqp kern_qp; + struct erdma_uqp user_qp; + }; + + struct erdma_cq *scq; + struct erdma_cq *rcq; + + struct erdma_qp_attrs attrs; + spinlock_t lock; +}; + +struct erdma_kcq_info { + void *qbuf; + dma_addr_t qbuf_dma_addr; + u32 ci; + u32 cmdsn; + u32 notify_cnt; + + spinlock_t lock; + u8 __iomem *db; + u64 *db_record; +}; + +struct erdma_ucq_info { + struct erdma_mem qbuf_mtt; + struct erdma_user_dbrecords_page *user_dbr_page; + dma_addr_t db_info_dma_addr; +}; + +struct erdma_cq { + struct ib_cq ibcq; + u32 cqn; + + u32 depth; + u32 assoc_eqn; + + union { + struct erdma_kcq_info kern_cq; + struct erdma_ucq_info user_cq; + }; +}; + +#define QP_ID(qp) ((qp)->ibqp.qp_num) + +static inline struct erdma_qp *find_qp_by_qpn(struct erdma_dev *dev, int id) +{ + return (struct erdma_qp *)xa_load(&dev->qp_xa, id); +} + +static inline struct erdma_cq *find_cq_by_cqn(struct erdma_dev *dev, int id) +{ + return (struct erdma_cq *)xa_load(&dev->cq_xa, id); +} + +void erdma_qp_get(struct erdma_qp *qp); +void erdma_qp_put(struct erdma_qp *qp); +int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask); +void erdma_qp_llp_close(struct erdma_qp *qp); +void erdma_qp_cm_drop(struct 
erdma_qp *qp); + +static inline struct erdma_ucontext *to_ectx(struct ib_ucontext *ibctx) +{ + return container_of(ibctx, struct erdma_ucontext, ibucontext); +} + +static inline struct erdma_pd *to_epd(struct ib_pd *pd) +{ + return container_of(pd, struct erdma_pd, ibpd); +} + +static inline struct erdma_mr *to_emr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct erdma_mr, ibmr); +} + +static inline struct erdma_qp *to_eqp(struct ib_qp *qp) +{ + return container_of(qp, struct erdma_qp, ibqp); +} + +static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct erdma_cq, ibcq); +} + +static inline struct erdma_user_mmap_entry * +to_emmap(struct rdma_user_mmap_entry *ibmmap) +{ + return container_of(ibmmap, struct erdma_user_mmap_entry, rdma_entry); +} + +int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *data); +void erdma_dealloc_ucontext(struct ib_ucontext *ibctx); +int erdma_query_device(struct ib_device *dev, struct ib_device_attr *attr, + struct ib_udata *data); +int erdma_get_port_immutable(struct ib_device *dev, u32 port, + struct ib_port_immutable *ib_port_immutable); +int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *data); +int erdma_query_port(struct ib_device *dev, u32 port, + struct ib_port_attr *attr); +int erdma_query_gid(struct ib_device *dev, u32 port, int idx, + union ib_gid *gid); +int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *data); +int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *data); +int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_qp_init_attr *init_attr); +int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_udata *data); +int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata 
*udata); +int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, + u64 virt, int access, struct ib_udata *udata); +struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int rights); +int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *data); +int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma); +void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +void erdma_qp_get_ref(struct ib_qp *ibqp); +void erdma_qp_put_ref(struct ib_qp *ibqp); +struct ib_qp *erdma_get_ibqp(struct ib_device *dev, int id); +int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, + const struct ib_send_wr **bad_send_wr); +int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr); +int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, + u32 max_num_sg); +int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason); + +#endif From b7babc7c31c6c33de8cadd51f9d3b3c2346ec74e Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:08 +0800 Subject: [PATCH 0627/1250] RDMA/erdma: Add verbs implementation The RDMA verbs implementation of erdma is divided into three files: erdma_qp.c, erdma_cq.c, and erdma_verbs.c. Internal used functions and datapath functions of QP/CQ are put in erdma_qp.c and erdma_cq.c, the rest is in erdma_verbs.c. This commit also fixes some static check warnings. 
Link: https://lore.kernel.org/r/20220713094212.30943-8-chengyou@linux.alibaba.com Reported-by: Dan Carpenter Reported-by: Abaci Robot Signed-off-by: Yang Li Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_cq.c | 205 +++ drivers/infiniband/hw/erdma/erdma_qp.c | 566 ++++++++ drivers/infiniband/hw/erdma/erdma_verbs.c | 1460 +++++++++++++++++++++ 3 files changed, 2231 insertions(+) create mode 100644 drivers/infiniband/hw/erdma/erdma_cq.c create mode 100644 drivers/infiniband/hw/erdma/erdma_qp.c create mode 100644 drivers/infiniband/hw/erdma/erdma_verbs.c diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c new file mode 100644 index 00000000000000..751c7f9f0de70d --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include + +#include "erdma_hw.h" +#include "erdma_verbs.h" + +static void *get_next_valid_cqe(struct erdma_cq *cq) +{ + __be32 *cqe = get_queue_entry(cq->kern_cq.qbuf, cq->kern_cq.ci, + cq->depth, CQE_SHIFT); + u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, + __be32_to_cpu(READ_ONCE(*cqe))); + + return owner ^ !!(cq->kern_cq.ci & cq->depth) ? 
cqe : NULL; +} + +static void notify_cq(struct erdma_cq *cq, u8 solcitied) +{ + u64 db_data = + FIELD_PREP(ERDMA_CQDB_IDX_MASK, (cq->kern_cq.notify_cnt)) | + FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->cqn) | + FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | + FIELD_PREP(ERDMA_CQDB_SOL_MASK, solcitied) | + FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) | + FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci); + + *cq->kern_cq.db_record = db_data; + writeq(db_data, cq->kern_cq.db); +} + +int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct erdma_cq *cq = to_ecq(ibcq); + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->kern_cq.lock, irq_flags); + + notify_cq(cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED); + + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && get_next_valid_cqe(cq)) + ret = 1; + + cq->kern_cq.notify_cnt++; + + spin_unlock_irqrestore(&cq->kern_cq.lock, irq_flags); + + return ret; +} + +static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { + [ERDMA_OP_WRITE] = IB_WC_RDMA_WRITE, + [ERDMA_OP_READ] = IB_WC_RDMA_READ, + [ERDMA_OP_SEND] = IB_WC_SEND, + [ERDMA_OP_SEND_WITH_IMM] = IB_WC_SEND, + [ERDMA_OP_RECEIVE] = IB_WC_RECV, + [ERDMA_OP_RECV_IMM] = IB_WC_RECV_RDMA_WITH_IMM, + [ERDMA_OP_RECV_INV] = IB_WC_RECV, + [ERDMA_OP_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [ERDMA_OP_INVALIDATE] = IB_WC_LOCAL_INV, + [ERDMA_OP_RSP_SEND_IMM] = IB_WC_RECV, + [ERDMA_OP_SEND_WITH_INV] = IB_WC_SEND, + [ERDMA_OP_REG_MR] = IB_WC_REG_MR, + [ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV, + [ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ, +}; + +static const struct { + enum erdma_wc_status erdma; + enum ib_wc_status base; + enum erdma_vendor_err vendor; +} map_cqe_status[ERDMA_NUM_WC_STATUS] = { + { ERDMA_WC_SUCCESS, IB_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR }, + { ERDMA_WC_GENERAL_ERR, IB_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR }, + { ERDMA_WC_RECV_WQE_FORMAT_ERR, IB_WC_GENERAL_ERR, + ERDMA_WC_VENDOR_INVALID_RQE }, + { ERDMA_WC_RECV_STAG_INVALID_ERR, 
IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_INVALID_STAG }, + { ERDMA_WC_RECV_ADDR_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION }, + { ERDMA_WC_RECV_RIGHT_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR }, + { ERDMA_WC_RECV_PDID_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_INVALID_PD }, + { ERDMA_WC_RECV_WARRPING_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_WRAP_ERR }, + { ERDMA_WC_SEND_WQE_FORMAT_ERR, IB_WC_LOC_QP_OP_ERR, + ERDMA_WC_VENDOR_INVALID_SQE }, + { ERDMA_WC_SEND_WQE_ORD_EXCEED, IB_WC_GENERAL_ERR, + ERDMA_WC_VENDOR_ZERO_ORD }, + { ERDMA_WC_SEND_STAG_INVALID_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_INVALID_STAG }, + { ERDMA_WC_SEND_ADDR_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION }, + { ERDMA_WC_SEND_RIGHT_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_ACCESS_ERR }, + { ERDMA_WC_SEND_PDID_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_INVALID_PD }, + { ERDMA_WC_SEND_WARRPING_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_WARP_ERR }, + { ERDMA_WC_FLUSH_ERR, IB_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR }, + { ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR }, +}; + +#define ERDMA_POLLCQ_NO_QP 1 + +static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) +{ + struct erdma_dev *dev = to_edev(cq->ibcq.device); + u8 opcode, syndrome, qtype; + struct erdma_kqp *kern_qp; + struct erdma_cqe *cqe; + struct erdma_qp *qp; + u16 wqe_idx, depth; + u32 qpn, cqe_hdr; + u64 *id_table; + u64 *wqe_hdr; + + cqe = get_next_valid_cqe(cq); + if (!cqe) + return -EAGAIN; + + cq->kern_cq.ci++; + + /* cqbuf should be ready when we poll */ + dma_rmb(); + + qpn = be32_to_cpu(cqe->qpn); + wqe_idx = be32_to_cpu(cqe->qe_idx); + cqe_hdr = be32_to_cpu(cqe->hdr); + + qp = find_qp_by_qpn(dev, qpn); + if (!qp) + return ERDMA_POLLCQ_NO_QP; + + kern_qp = &qp->kern_qp; + + qtype = FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr); + syndrome = 
FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr); + opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr); + + if (qtype == ERDMA_CQE_QTYPE_SQ) { + id_table = kern_qp->swr_tbl; + depth = qp->attrs.sq_size; + wqe_hdr = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, + qp->attrs.sq_size, SQEBB_SHIFT); + kern_qp->sq_ci = + FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *wqe_hdr) + + wqe_idx + 1; + } else { + id_table = kern_qp->rwr_tbl; + depth = qp->attrs.rq_size; + } + wc->wr_id = id_table[wqe_idx & (depth - 1)]; + wc->byte_len = be32_to_cpu(cqe->size); + + wc->wc_flags = 0; + + wc->opcode = wc_mapping_table[opcode]; + if (opcode == ERDMA_OP_RECV_IMM || opcode == ERDMA_OP_RSP_SEND_IMM) { + wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->imm_data)); + wc->wc_flags |= IB_WC_WITH_IMM; + } else if (opcode == ERDMA_OP_RECV_INV) { + wc->ex.invalidate_rkey = be32_to_cpu(cqe->inv_rkey); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + + if (syndrome >= ERDMA_NUM_WC_STATUS) + syndrome = ERDMA_WC_GENERAL_ERR; + + wc->status = map_cqe_status[syndrome].base; + wc->vendor_err = map_cqe_status[syndrome].vendor; + wc->qp = &qp->ibqp; + + return 0; +} + +int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct erdma_cq *cq = to_ecq(ibcq); + unsigned long flags; + int npolled, ret; + + spin_lock_irqsave(&cq->kern_cq.lock, flags); + + for (npolled = 0; npolled < num_entries;) { + ret = erdma_poll_one_cqe(cq, wc + npolled); + + if (ret == -EAGAIN) /* no received new CQEs. */ + break; + else if (ret) /* ignore invalid CQEs. 
*/ + continue; + + npolled++; + } + + spin_unlock_irqrestore(&cq->kern_cq.lock, flags); + + return npolled; +} diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c new file mode 100644 index 00000000000000..72f08171a28a72 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2021, Alibaba Group */ +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include + +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_verbs.h" + +void erdma_qp_llp_close(struct erdma_qp *qp) +{ + struct erdma_qp_attrs qp_attrs; + + down_write(&qp->state_lock); + + switch (qp->attrs.state) { + case ERDMA_QP_STATE_RTS: + case ERDMA_QP_STATE_RTR: + case ERDMA_QP_STATE_IDLE: + case ERDMA_QP_STATE_TERMINATE: + qp_attrs.state = ERDMA_QP_STATE_CLOSING; + erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + break; + case ERDMA_QP_STATE_CLOSING: + qp->attrs.state = ERDMA_QP_STATE_IDLE; + break; + default: + break; + } + + if (qp->cep) { + erdma_cep_put(qp->cep); + qp->cep = NULL; + } + + up_write(&qp->state_lock); +} + +struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id) +{ + struct erdma_qp *qp = find_qp_by_qpn(to_edev(ibdev), id); + + if (qp) + return &qp->ibqp; + + return NULL; +} + +static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, + struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask) +{ + int ret; + struct erdma_dev *dev = qp->dev; + struct erdma_cmdq_modify_qp_req req; + struct tcp_sock *tp; + struct erdma_cep *cep = qp->cep; + struct sockaddr_storage local_addr, remote_addr; + + if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE)) + return -EINVAL; + + if (!(mask & ERDMA_QP_ATTR_MPA)) + return -EINVAL; + + ret = getname_local(cep->sock, &local_addr); + if (ret < 0) + return ret; + + ret = 
getname_peer(cep->sock, &remote_addr); + if (ret < 0) + return ret; + + qp->attrs.state = ERDMA_QP_STATE_RTS; + + tp = tcp_sk(qp->cep->sock->sk); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_MODIFY_QP); + + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); + + req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie); + req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr; + req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr; + req.dport = to_sockaddr_in(remote_addr).sin_port; + req.sport = to_sockaddr_in(local_addr).sin_port; + + req.send_nxt = tp->snd_nxt; + /* rsvd tcp seq for mpa-rsp in server. */ + if (qp->attrs.qp_type == ERDMA_QP_PASSIVE) + req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len; + req.recv_nxt = tp->rcv_nxt; + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp, + struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask) +{ + struct erdma_dev *dev = qp->dev; + struct erdma_cmdq_modify_qp_req req; + + qp->attrs.state = attrs->state; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_MODIFY_QP); + + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask) +{ + int drop_conn, ret = 0; + + if (!mask) + return 0; + + if (!(mask & ERDMA_QP_ATTR_STATE)) + return 0; + + switch (qp->attrs.state) { + case ERDMA_QP_STATE_IDLE: + case ERDMA_QP_STATE_RTR: + if (attrs->state == ERDMA_QP_STATE_RTS) { + ret = erdma_modify_qp_state_to_rts(qp, attrs, mask); + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { + 
qp->attrs.state = ERDMA_QP_STATE_ERROR; + if (qp->cep) { + erdma_cep_put(qp->cep); + qp->cep = NULL; + } + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + } + break; + case ERDMA_QP_STATE_RTS: + drop_conn = 0; + + if (attrs->state == ERDMA_QP_STATE_CLOSING) { + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + drop_conn = 1; + } else if (attrs->state == ERDMA_QP_STATE_TERMINATE) { + qp->attrs.state = ERDMA_QP_STATE_TERMINATE; + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + drop_conn = 1; + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + qp->attrs.state = ERDMA_QP_STATE_ERROR; + drop_conn = 1; + } + + if (drop_conn) + erdma_qp_cm_drop(qp); + + break; + case ERDMA_QP_STATE_TERMINATE: + if (attrs->state == ERDMA_QP_STATE_ERROR) + qp->attrs.state = ERDMA_QP_STATE_ERROR; + break; + case ERDMA_QP_STATE_CLOSING: + if (attrs->state == ERDMA_QP_STATE_IDLE) { + qp->attrs.state = ERDMA_QP_STATE_IDLE; + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + qp->attrs.state = ERDMA_QP_STATE_ERROR; + } else if (attrs->state != ERDMA_QP_STATE_CLOSING) { + return -ECONNABORTED; + } + break; + default: + break; + } + + return ret; +} + +static void erdma_qp_safe_free(struct kref *ref) +{ + struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref); + + complete(&qp->safe_free); +} + +void erdma_qp_put(struct erdma_qp *qp) +{ + WARN_ON(kref_read(&qp->ref) < 1); + kref_put(&qp->ref, erdma_qp_safe_free); +} + +void erdma_qp_get(struct erdma_qp *qp) +{ + kref_get(&qp->ref); +} + +static int fill_inline_data(struct erdma_qp *qp, + const struct ib_send_wr *send_wr, u16 wqe_idx, + u32 sgl_offset, __le32 *length_field) +{ + u32 remain_size, copy_size, data_off, bytes = 0; + char *data; + int i = 0; + + wqe_idx += (sgl_offset >> SQEBB_SHIFT); + sgl_offset &= (SQEBB_SIZE - 1); + data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, qp->attrs.sq_size, + 
SQEBB_SHIFT); + + while (i < send_wr->num_sge) { + bytes += send_wr->sg_list[i].length; + if (bytes > (int)ERDMA_MAX_INLINE) + return -EINVAL; + + remain_size = send_wr->sg_list[i].length; + data_off = 0; + + while (1) { + copy_size = min(remain_size, SQEBB_SIZE - sgl_offset); + + memcpy(data + sgl_offset, + (void *)(uintptr_t)send_wr->sg_list[i].addr + + data_off, + copy_size); + remain_size -= copy_size; + data_off += copy_size; + sgl_offset += copy_size; + wqe_idx += (sgl_offset >> SQEBB_SHIFT); + sgl_offset &= (SQEBB_SIZE - 1); + + data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, + qp->attrs.sq_size, SQEBB_SHIFT); + if (!remain_size) + break; + } + + i++; + } + *length_field = cpu_to_le32(bytes); + + return bytes; +} + +static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr, + u16 wqe_idx, u32 sgl_offset, __le32 *length_field) +{ + int i = 0; + u32 bytes = 0; + char *sgl; + + if (send_wr->num_sge > qp->dev->attrs.max_send_sge) + return -EINVAL; + + if (sgl_offset & 0xF) + return -EINVAL; + + while (i < send_wr->num_sge) { + wqe_idx += (sgl_offset >> SQEBB_SHIFT); + sgl_offset &= (SQEBB_SIZE - 1); + sgl = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, + qp->attrs.sq_size, SQEBB_SHIFT); + + bytes += send_wr->sg_list[i].length; + memcpy(sgl + sgl_offset, &send_wr->sg_list[i], + sizeof(struct ib_sge)); + + sgl_offset += sizeof(struct ib_sge); + i++; + } + + *length_field = cpu_to_le32(bytes); + return 0; +} + +static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, + const struct ib_send_wr *send_wr) +{ + u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset; + u32 idx = *pi & (qp->attrs.sq_size - 1); + enum ib_wr_opcode op = send_wr->opcode; + struct erdma_readreq_sqe *read_sqe; + struct erdma_reg_mr_sqe *regmr_sge; + struct erdma_write_sqe *write_sqe; + struct erdma_send_sqe *send_sqe; + struct ib_rdma_wr *rdma_wr; + struct erdma_mr *mr; + __le32 *length_field; + u64 wqe_hdr, *entry; + struct ib_sge *sge; + u32 attrs; + int ret; + + entry = 
get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size, + SQEBB_SHIFT); + + /* Clear the SQE header section. */ + *entry = 0; + + qp->kern_qp.swr_tbl[idx] = send_wr->wr_id; + flags = send_wr->send_flags; + wqe_hdr = FIELD_PREP( + ERDMA_SQE_HDR_CE_MASK, + ((flags & IB_SEND_SIGNALED) || qp->kern_qp.sig_all) ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SE_MASK, + flags & IB_SEND_SOLICITED ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK, + flags & IB_SEND_FENCE ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK, + flags & IB_SEND_INLINE ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)); + + switch (op) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + hw_op = ERDMA_OP_WRITE; + if (op == IB_WR_RDMA_WRITE_WITH_IMM) + hw_op = ERDMA_OP_WRITE_WITH_IMM; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); + rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr); + write_sqe = (struct erdma_write_sqe *)entry; + + write_sqe->imm_data = send_wr->ex.imm_data; + write_sqe->sink_stag = cpu_to_le32(rdma_wr->rkey); + write_sqe->sink_to_h = + cpu_to_le32(upper_32_bits(rdma_wr->remote_addr)); + write_sqe->sink_to_l = + cpu_to_le32(lower_32_bits(rdma_wr->remote_addr)); + + length_field = &write_sqe->length; + wqe_size = sizeof(struct erdma_write_sqe); + sgl_offset = wqe_size; + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + read_sqe = (struct erdma_readreq_sqe *)entry; + if (unlikely(send_wr->num_sge != 1)) + return -EINVAL; + hw_op = ERDMA_OP_READ; + if (op == IB_WR_RDMA_READ_WITH_INV) { + hw_op = ERDMA_OP_READ_WITH_INV; + read_sqe->invalid_stag = + cpu_to_le32(send_wr->ex.invalidate_rkey); + } + + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); + rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr); + read_sqe->length = cpu_to_le32(send_wr->sg_list[0].length); + read_sqe->sink_stag = cpu_to_le32(send_wr->sg_list[0].lkey); + read_sqe->sink_to_l = + 
cpu_to_le32(lower_32_bits(send_wr->sg_list[0].addr)); + read_sqe->sink_to_h = + cpu_to_le32(upper_32_bits(send_wr->sg_list[0].addr)); + + sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1, + qp->attrs.sq_size, SQEBB_SHIFT); + sge->addr = rdma_wr->remote_addr; + sge->lkey = rdma_wr->rkey; + sge->length = send_wr->sg_list[0].length; + wqe_size = sizeof(struct erdma_readreq_sqe) + + send_wr->num_sge * sizeof(struct ib_sge); + + goto out; + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + send_sqe = (struct erdma_send_sqe *)entry; + hw_op = ERDMA_OP_SEND; + if (op == IB_WR_SEND_WITH_IMM) { + hw_op = ERDMA_OP_SEND_WITH_IMM; + send_sqe->imm_data = send_wr->ex.imm_data; + } else if (op == IB_WR_SEND_WITH_INV) { + hw_op = ERDMA_OP_SEND_WITH_INV; + send_sqe->invalid_stag = + cpu_to_le32(send_wr->ex.invalidate_rkey); + } + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); + length_field = &send_sqe->length; + wqe_size = sizeof(struct erdma_send_sqe); + sgl_offset = wqe_size; + + break; + case IB_WR_REG_MR: + wqe_hdr |= + FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_REG_MR); + regmr_sge = (struct erdma_reg_mr_sqe *)entry; + mr = to_emr(reg_wr(send_wr)->mr); + + mr->access = ERDMA_MR_ACC_LR | + to_erdma_access_flags(reg_wr(send_wr)->access); + regmr_sge->addr = cpu_to_le64(mr->ibmr.iova); + regmr_sge->length = cpu_to_le32(mr->ibmr.length); + regmr_sge->stag = cpu_to_le32(mr->ibmr.lkey); + attrs = FIELD_PREP(ERDMA_SQE_MR_MODE_MASK, 0) | + FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) | + FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK, + mr->mem.mtt_nents); + + if (mr->mem.mtt_nents < ERDMA_MAX_INLINE_MTT_ENTRIES) { + attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 0); + /* Copy SGLs to SQE content to accelerate */ + memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1, + qp->attrs.sq_size, SQEBB_SHIFT), + mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents)); + wqe_size = sizeof(struct erdma_reg_mr_sqe) + + MTT_SIZE(mr->mem.mtt_nents); + } else { + attrs |= 
FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 1); + wqe_size = sizeof(struct erdma_reg_mr_sqe); + } + + regmr_sge->attrs = cpu_to_le32(attrs); + goto out; + case IB_WR_LOCAL_INV: + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, + ERDMA_OP_LOCAL_INV); + regmr_sge = (struct erdma_reg_mr_sqe *)entry; + regmr_sge->stag = cpu_to_le32(send_wr->ex.invalidate_rkey); + wqe_size = sizeof(struct erdma_reg_mr_sqe); + goto out; + default: + return -EOPNOTSUPP; + } + + if (flags & IB_SEND_INLINE) { + ret = fill_inline_data(qp, send_wr, idx, sgl_offset, + length_field); + if (ret < 0) + return -EINVAL; + wqe_size += ret; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, ret); + } else { + ret = fill_sgl(qp, send_wr, idx, sgl_offset, length_field); + if (ret) + return -EINVAL; + wqe_size += send_wr->num_sge * sizeof(struct ib_sge); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, + send_wr->num_sge); + } + +out: + wqebb_cnt = SQEBB_COUNT(wqe_size); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); + *pi += wqebb_cnt; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, *pi); + + *entry = wqe_hdr; + + return 0; +} + +static void kick_sq_db(struct erdma_qp *qp, u16 pi) +{ + u64 db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) | + FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi); + + *(u64 *)qp->kern_qp.sq_db_info = db_data; + writeq(db_data, qp->kern_qp.hw_sq_db); +} + +int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, + const struct ib_send_wr **bad_send_wr) +{ + struct erdma_qp *qp = to_eqp(ibqp); + int ret = 0; + const struct ib_send_wr *wr = send_wr; + unsigned long flags; + u16 sq_pi; + + if (!send_wr) + return -EINVAL; + + spin_lock_irqsave(&qp->lock, flags); + sq_pi = qp->kern_qp.sq_pi; + + while (wr) { + if ((u16)(sq_pi - qp->kern_qp.sq_ci) >= qp->attrs.sq_size) { + ret = -ENOMEM; + *bad_send_wr = wr; /* the WR that did not fit, not the list head: earlier WRs were already posted */ + break; + } + + ret = erdma_push_one_sqe(qp, &sq_pi, wr); + if (ret) { + *bad_send_wr = wr; + break; + } + 
qp->kern_qp.sq_pi = sq_pi; + kick_sq_db(qp, sq_pi); + + wr = wr->next; + } + spin_unlock_irqrestore(&qp->lock, flags); + + return ret; +} + +static int erdma_post_recv_one(struct erdma_qp *qp, + const struct ib_recv_wr *recv_wr) +{ + struct erdma_rqe *rqe = + get_queue_entry(qp->kern_qp.rq_buf, qp->kern_qp.rq_pi, + qp->attrs.rq_size, RQE_SHIFT); + + rqe->qe_idx = cpu_to_le16(qp->kern_qp.rq_pi + 1); + rqe->qpn = cpu_to_le32(QP_ID(qp)); + + if (recv_wr->num_sge == 0) { + rqe->length = 0; + } else if (recv_wr->num_sge == 1) { + rqe->stag = cpu_to_le32(recv_wr->sg_list[0].lkey); + rqe->to = cpu_to_le64(recv_wr->sg_list[0].addr); + rqe->length = cpu_to_le32(recv_wr->sg_list[0].length); + } else { + return -EINVAL; + } + + *(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe; + writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db); + + qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] = + recv_wr->wr_id; + qp->kern_qp.rq_pi++; + + return 0; +} + +int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr) +{ + const struct ib_recv_wr *wr = recv_wr; + struct erdma_qp *qp = to_eqp(ibqp); + unsigned long flags; + int ret = 0; /* an empty WR list posts nothing and must not return an indeterminate value */ + + spin_lock_irqsave(&qp->lock, flags); + + while (wr) { + ret = erdma_post_recv_one(qp, wr); + if (ret) { + *bad_recv_wr = wr; + break; + } + wr = wr->next; + } + + spin_unlock_irqrestore(&qp->lock, flags); + return ret; +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c new file mode 100644 index 00000000000000..a7a3d42e201676 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -0,0 +1,1460 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +/* Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp) +{ + struct erdma_cmdq_create_qp_req req; + struct erdma_pd *pd = to_epd(qp->ibqp.pd); + struct erdma_uqp *user_qp; + u64 resp0, resp1; + int err; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_CREATE_QP); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK, + ilog2(qp->attrs.sq_size)) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_QPN_MASK, QP_ID(qp)); + req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK, + ilog2(qp->attrs.rq_size)) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_PD_MASK, pd->pdn); + + if (rdma_is_kernel_res(&qp->ibqp.res)) { + u32 pgsz_range = ilog2(SZ_1M) - PAGE_SHIFT; + + req.sq_cqn_mtt_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + pgsz_range) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn); + req.rq_cqn_mtt_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + pgsz_range) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn); + + req.sq_mtt_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK, 0) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, 1) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, + ERDMA_MR_INLINE_MTT); + req.rq_mtt_cfg = req.sq_mtt_cfg; + + req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr; + req.sq_buf_addr = qp->kern_qp.sq_buf_dma_addr; + req.sq_db_info_dma_addr = qp->kern_qp.sq_buf_dma_addr + + (qp->attrs.sq_size << SQEBB_SHIFT); + req.rq_db_info_dma_addr = qp->kern_qp.rq_buf_dma_addr + + (qp->attrs.rq_size << RQE_SHIFT); + } else { + user_qp = &qp->user_qp; + req.sq_cqn_mtt_cfg = FIELD_PREP( + ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + ilog2(user_qp->sq_mtt.page_size) - PAGE_SHIFT); + req.sq_cqn_mtt_cfg |= + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn); + + req.rq_cqn_mtt_cfg = FIELD_PREP( + 
ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + ilog2(user_qp->rq_mtt.page_size) - PAGE_SHIFT); + req.rq_cqn_mtt_cfg |= + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn); + + req.sq_mtt_cfg = user_qp->sq_mtt.page_offset; + req.sq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, + user_qp->sq_mtt.mtt_nents) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, + user_qp->sq_mtt.mtt_type); + + req.rq_mtt_cfg = user_qp->rq_mtt.page_offset; + req.rq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, + user_qp->rq_mtt.mtt_nents) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, + user_qp->rq_mtt.mtt_type); + + req.sq_buf_addr = user_qp->sq_mtt.mtt_entry[0]; + req.rq_buf_addr = user_qp->rq_mtt.mtt_entry[0]; + + req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr; + req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr; + } + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), &resp0, + &resp1); + if (!err) + qp->attrs.cookie = + FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0); + + return err; +} + +static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) +{ + struct erdma_cmdq_reg_mr_req req; + struct erdma_pd *pd = to_epd(mr->ibmr.pd); + u64 *phy_addr; + int i; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_REG_MR); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) | + FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, mr->ibmr.lkey & 0xFF) | + FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8); + req.cfg1 = FIELD_PREP(ERDMA_CMD_REGMR_PD_MASK, pd->pdn) | + FIELD_PREP(ERDMA_CMD_REGMR_TYPE_MASK, mr->type) | + FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access) | + FIELD_PREP(ERDMA_CMD_REGMR_ACC_MODE_MASK, 0); + req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK, + ilog2(mr->mem.page_size)) | + FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) | + FIELD_PREP(ERDMA_CMD_REGMR_MTT_CNT_MASK, mr->mem.page_cnt); + + if (mr->type == ERDMA_MR_TYPE_DMA) + goto post_cmd; + + if (mr->type == ERDMA_MR_TYPE_NORMAL) 
{ + req.start_va = mr->mem.va; + req.size = mr->mem.len; + } + + if (mr->type == ERDMA_MR_TYPE_FRMR || + mr->mem.mtt_type == ERDMA_MR_INDIRECT_MTT) { + phy_addr = req.phy_addr; + *phy_addr = mr->mem.mtt_entry[0]; + } else { + phy_addr = req.phy_addr; + for (i = 0; i < mr->mem.mtt_nents; i++) + *phy_addr++ = mr->mem.mtt_entry[i]; + } + +post_cmd: + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +static int create_cq_cmd(struct erdma_dev *dev, struct erdma_cq *cq) +{ + struct erdma_cmdq_create_cq_req req; + u32 page_size; + struct erdma_mem *mtt; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_CREATE_CQ); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_CQN_MASK, cq->cqn) | + FIELD_PREP(ERDMA_CMD_CREATE_CQ_DEPTH_MASK, ilog2(cq->depth)); + req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_EQN_MASK, cq->assoc_eqn); + + if (rdma_is_kernel_res(&cq->ibcq.res)) { + page_size = SZ_32M; + req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK, + ilog2(page_size) - PAGE_SHIFT); + req.qbuf_addr_l = lower_32_bits(cq->kern_cq.qbuf_dma_addr); + req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr); + + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) | + FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK, + ERDMA_MR_INLINE_MTT); + + req.first_page_offset = 0; + req.cq_db_info_addr = + cq->kern_cq.qbuf_dma_addr + (cq->depth << CQE_SHIFT); + } else { + mtt = &cq->user_cq.qbuf_mtt; + req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK, + ilog2(mtt->page_size) - PAGE_SHIFT); + if (mtt->mtt_nents == 1) { + req.qbuf_addr_l = lower_32_bits(*(u64 *)mtt->mtt_buf); + req.qbuf_addr_h = upper_32_bits(*(u64 *)mtt->mtt_buf); + } else { + req.qbuf_addr_l = lower_32_bits(mtt->mtt_entry[0]); + req.qbuf_addr_h = upper_32_bits(mtt->mtt_entry[0]); + } + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, + mtt->mtt_nents); + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK, + mtt->mtt_type); + + req.first_page_offset = 
mtt->page_offset; + req.cq_db_info_addr = cq->user_cq.db_info_dma_addr; + } + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +static int erdma_alloc_idx(struct erdma_resource_cb *res_cb) +{ + int idx; + unsigned long flags; + + spin_lock_irqsave(&res_cb->lock, flags); + idx = find_next_zero_bit(res_cb->bitmap, res_cb->max_cap, + res_cb->next_alloc_idx); + if (idx == res_cb->max_cap) { + idx = find_first_zero_bit(res_cb->bitmap, res_cb->max_cap); + if (idx == res_cb->max_cap) { + res_cb->next_alloc_idx = 1; + spin_unlock_irqrestore(&res_cb->lock, flags); + return -ENOSPC; + } + } + + set_bit(idx, res_cb->bitmap); + res_cb->next_alloc_idx = idx + 1; + spin_unlock_irqrestore(&res_cb->lock, flags); + + return idx; +} + +static inline void erdma_free_idx(struct erdma_resource_cb *res_cb, u32 idx) +{ + unsigned long flags; + u32 used; + + spin_lock_irqsave(&res_cb->lock, flags); + used = __test_and_clear_bit(idx, res_cb->bitmap); + spin_unlock_irqrestore(&res_cb->lock, flags); + WARN_ON(!used); +} + +static struct rdma_user_mmap_entry * +erdma_user_mmap_entry_insert(struct erdma_ucontext *uctx, void *address, + u32 size, u8 mmap_flag, u64 *mmap_offset) +{ + struct erdma_user_mmap_entry *entry = + kzalloc(sizeof(*entry), GFP_KERNEL); + int ret; + + if (!entry) + return NULL; + + entry->address = (u64)address; + entry->mmap_flag = mmap_flag; + + size = PAGE_ALIGN(size); + + ret = rdma_user_mmap_entry_insert(&uctx->ibucontext, &entry->rdma_entry, + size); + if (ret) { + kfree(entry); + return NULL; + } + + *mmap_offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} + +int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, + struct ib_udata *unused) +{ + struct erdma_dev *dev = to_edev(ibdev); + + memset(attr, 0, sizeof(*attr)); + + attr->max_mr_size = dev->attrs.max_mr_size; + attr->vendor_id = PCI_VENDOR_ID_ALIBABA; + attr->vendor_part_id = dev->pdev->device; + attr->hw_ver = 
dev->pdev->revision; + attr->max_qp = dev->attrs.max_qp; + attr->max_qp_wr = min(dev->attrs.max_send_wr, dev->attrs.max_recv_wr); + attr->max_qp_rd_atom = dev->attrs.max_ord; + attr->max_qp_init_rd_atom = dev->attrs.max_ird; + attr->max_res_rd_atom = dev->attrs.max_qp * dev->attrs.max_ird; + attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; + ibdev->local_dma_lkey = dev->attrs.local_dma_key; + attr->max_send_sge = dev->attrs.max_send_sge; + attr->max_recv_sge = dev->attrs.max_recv_sge; + attr->max_sge_rd = dev->attrs.max_sge_rd; + attr->max_cq = dev->attrs.max_cq; + attr->max_cqe = dev->attrs.max_cqe; + attr->max_mr = dev->attrs.max_mr; + attr->max_pd = dev->attrs.max_pd; + attr->max_mw = dev->attrs.max_mw; + attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; + attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; + attr->fw_ver = dev->attrs.fw_version; + + if (dev->netdev) + addrconf_addr_eui48((u8 *)&attr->sys_image_guid, + dev->netdev->dev_addr); + + return 0; +} + +int erdma_query_gid(struct ib_device *ibdev, u32 port, int idx, + union ib_gid *gid) +{ + struct erdma_dev *dev = to_edev(ibdev); + + memset(gid, 0, sizeof(*gid)); + ether_addr_copy(gid->raw, dev->attrs.peer_addr); + + return 0; +} + +int erdma_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *attr) +{ + struct erdma_dev *dev = to_edev(ibdev); + struct net_device *ndev = dev->netdev; + + memset(attr, 0, sizeof(*attr)); + + attr->gid_tbl_len = 1; + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; + attr->max_msg_sz = -1; + + if (!ndev) + goto out; + + ib_get_eth_speed(ibdev, port, &attr->active_speed, &attr->active_width); + attr->max_mtu = ib_mtu_int_to_enum(ndev->mtu); + attr->active_mtu = ib_mtu_int_to_enum(ndev->mtu); + if (netif_running(ndev) && netif_carrier_ok(ndev)) + dev->state = IB_PORT_ACTIVE; + else + dev->state = IB_PORT_DOWN; + attr->state = dev->state; + +out: + if (dev->state == IB_PORT_ACTIVE) + 
attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + else + attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + + return 0; +} + +int erdma_get_port_immutable(struct ib_device *ibdev, u32 port, + struct ib_port_immutable *port_immutable) +{ + port_immutable->gid_tbl_len = 1; + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + return 0; +} + +int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct erdma_pd *pd = to_epd(ibpd); + struct erdma_dev *dev = to_edev(ibpd->device); + int pdn; + + pdn = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_PD]); + if (pdn < 0) + return pdn; + + pd->pdn = pdn; + + return 0; +} + +int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct erdma_pd *pd = to_epd(ibpd); + struct erdma_dev *dev = to_edev(ibpd->device); + + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_PD], pd->pdn); + + return 0; +} + +static int erdma_qp_validate_cap(struct erdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if ((attrs->cap.max_send_wr > dev->attrs.max_send_wr) || + (attrs->cap.max_recv_wr > dev->attrs.max_recv_wr) || + (attrs->cap.max_send_sge > dev->attrs.max_send_sge) || + (attrs->cap.max_recv_sge > dev->attrs.max_recv_sge) || + (attrs->cap.max_inline_data > ERDMA_MAX_INLINE) || + !attrs->cap.max_send_wr || !attrs->cap.max_recv_wr) { + return -EINVAL; + } + + return 0; +} + +static int erdma_qp_validate_attr(struct erdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if (attrs->qp_type != IB_QPT_RC) + return -EOPNOTSUPP; + + if (attrs->srq) + return -EOPNOTSUPP; + + if (!attrs->send_cq || !attrs->recv_cq) + return -EOPNOTSUPP; + + return 0; +} + +static void free_kernel_qp(struct erdma_qp *qp) +{ + struct erdma_dev *dev = qp->dev; + + vfree(qp->kern_qp.swr_tbl); + vfree(qp->kern_qp.rwr_tbl); + + if (qp->kern_qp.sq_buf) + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), + qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + + if (qp->kern_qp.rq_buf) + dma_free_coherent( + 
&dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), + qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); +} + +static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, + struct ib_qp_init_attr *attrs) +{ + struct erdma_kqp *kqp = &qp->kern_qp; + int size; + + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) + kqp->sig_all = 1; + + kqp->sq_pi = 0; + kqp->sq_ci = 0; + kqp->rq_pi = 0; + kqp->rq_ci = 0; + kqp->hw_sq_db = + dev->func_bar + (ERDMA_SDB_SHARED_PAGE_INDEX << PAGE_SHIFT); + kqp->hw_rq_db = dev->func_bar + ERDMA_BAR_RQDB_SPACE_OFFSET; + + kqp->swr_tbl = vmalloc(qp->attrs.sq_size * sizeof(u64)); + kqp->rwr_tbl = vmalloc(qp->attrs.rq_size * sizeof(u64)); + if (!kqp->swr_tbl || !kqp->rwr_tbl) + goto err_out; + + size = (qp->attrs.sq_size << SQEBB_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + kqp->sq_buf = dma_alloc_coherent(&dev->pdev->dev, size, + &kqp->sq_buf_dma_addr, GFP_KERNEL); + if (!kqp->sq_buf) + goto err_out; + + size = (qp->attrs.rq_size << RQE_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + kqp->rq_buf = dma_alloc_coherent(&dev->pdev->dev, size, + &kqp->rq_buf_dma_addr, GFP_KERNEL); + if (!kqp->rq_buf) + goto err_out; + + kqp->sq_db_info = kqp->sq_buf + (qp->attrs.sq_size << SQEBB_SHIFT); + kqp->rq_db_info = kqp->rq_buf + (qp->attrs.rq_size << RQE_SHIFT); + + return 0; + +err_out: + free_kernel_qp(qp); + return -ENOMEM; +} + +static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem, + u64 start, u64 len, int access, u64 virt, + unsigned long req_page_size, u8 force_indirect_mtt) +{ + struct ib_block_iter biter; + uint64_t *phy_addr = NULL; + int ret = 0; + + mem->umem = ib_umem_get(&dev->ibdev, start, len, access); + if (IS_ERR(mem->umem)) { + ret = PTR_ERR(mem->umem); + mem->umem = NULL; + return ret; + } + + mem->va = virt; + mem->len = len; + mem->page_size = ib_umem_find_best_pgsz(mem->umem, req_page_size, virt); + mem->page_offset = start & (mem->page_size - 1); + mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, 
mem->page_size); + mem->page_cnt = mem->mtt_nents; + + if (mem->page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES || + force_indirect_mtt) { + mem->mtt_type = ERDMA_MR_INDIRECT_MTT; + mem->mtt_buf = + alloc_pages_exact(MTT_SIZE(mem->page_cnt), GFP_KERNEL); + if (!mem->mtt_buf) { + ret = -ENOMEM; + goto error_ret; + } + phy_addr = mem->mtt_buf; + } else { + mem->mtt_type = ERDMA_MR_INLINE_MTT; + phy_addr = mem->mtt_entry; + } + + rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size) { + *phy_addr = rdma_block_iter_dma_address(&biter); + phy_addr++; + } + + if (mem->mtt_type == ERDMA_MR_INDIRECT_MTT) { + mem->mtt_entry[0] = + dma_map_single(&dev->pdev->dev, mem->mtt_buf, + MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, mem->mtt_entry[0])) { + free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt)); + mem->mtt_buf = NULL; + ret = -ENOMEM; + goto error_ret; + } + } + + return 0; + +error_ret: + if (mem->umem) { + ib_umem_release(mem->umem); + mem->umem = NULL; + } + + return ret; +} + +static void put_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem) +{ + if (mem->mtt_buf) { + dma_unmap_single(&dev->pdev->dev, mem->mtt_entry[0], + MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE); + free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt)); + } + + if (mem->umem) { + ib_umem_release(mem->umem); + mem->umem = NULL; + } +} + +static int erdma_map_user_dbrecords(struct erdma_ucontext *ctx, + u64 dbrecords_va, + struct erdma_user_dbrecords_page **dbr_page, + dma_addr_t *dma_addr) +{ + struct erdma_user_dbrecords_page *page = NULL; + int rv = 0; + + mutex_lock(&ctx->dbrecords_page_mutex); + + list_for_each_entry(page, &ctx->dbrecords_page_list, list) + if (page->va == (dbrecords_va & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + rv = -ENOMEM; + goto out; + } + + page->va = (dbrecords_va & PAGE_MASK); + page->refcnt = 0; + + page->umem = ib_umem_get(ctx->ibucontext.device, + dbrecords_va & PAGE_MASK, 
PAGE_SIZE, 0); + if (IS_ERR(page->umem)) { + rv = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &ctx->dbrecords_page_list); + +found: + *dma_addr = sg_dma_address(page->umem->sgt_append.sgt.sgl) + + (dbrecords_va & ~PAGE_MASK); + *dbr_page = page; + page->refcnt++; + +out: + mutex_unlock(&ctx->dbrecords_page_mutex); + return rv; +} + +static void +erdma_unmap_user_dbrecords(struct erdma_ucontext *ctx, + struct erdma_user_dbrecords_page **dbr_page) +{ + if (!ctx || !(*dbr_page)) + return; + + mutex_lock(&ctx->dbrecords_page_mutex); + if (--(*dbr_page)->refcnt == 0) { + list_del(&(*dbr_page)->list); + ib_umem_release((*dbr_page)->umem); + kfree(*dbr_page); + } + + *dbr_page = NULL; + mutex_unlock(&ctx->dbrecords_page_mutex); +} + +static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, + u64 va, u32 len, u64 db_info_va) +{ + dma_addr_t db_info_dma_addr; + u32 rq_offset; + int ret; + + if (len < (PAGE_ALIGN(qp->attrs.sq_size * SQEBB_SIZE) + + qp->attrs.rq_size * RQE_SIZE)) + return -EINVAL; + + ret = get_mtt_entries(qp->dev, &qp->user_qp.sq_mtt, va, + qp->attrs.sq_size << SQEBB_SHIFT, 0, va, + (SZ_1M - SZ_4K), 1); + if (ret) + return ret; + + rq_offset = PAGE_ALIGN(qp->attrs.sq_size << SQEBB_SHIFT); + qp->user_qp.rq_offset = rq_offset; + + ret = get_mtt_entries(qp->dev, &qp->user_qp.rq_mtt, va + rq_offset, + qp->attrs.rq_size << RQE_SHIFT, 0, va + rq_offset, + (SZ_1M - SZ_4K), 1); + if (ret) + goto put_sq_mtt; + + ret = erdma_map_user_dbrecords(uctx, db_info_va, + &qp->user_qp.user_dbr_page, + &db_info_dma_addr); + if (ret) + goto put_rq_mtt; + + qp->user_qp.sq_db_info_dma_addr = db_info_dma_addr; + qp->user_qp.rq_db_info_dma_addr = db_info_dma_addr + ERDMA_DB_SIZE; + + return 0; + +put_rq_mtt: + put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt); + +put_sq_mtt: + put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt); + + return ret; +} + +static void free_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx) +{ + 
put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt); + put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt); + erdma_unmap_user_dbrecords(uctx, &qp->user_qp.user_dbr_page); +} + +/* + * Create a QP: validate the requested caps/attrs, reserve a QPN in qp_xa, + * set up the user or kernel queue buffers, then post the create-QP command. + * Returns 0 on success or a negative errno. + */ +int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct erdma_qp *qp = to_eqp(ibqp); + struct erdma_dev *dev = to_edev(ibqp->device); + struct erdma_ucontext *uctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + struct erdma_ureq_create_qp ureq; + struct erdma_uresp_create_qp uresp; + int ret; + + ret = erdma_qp_validate_cap(dev, attrs); + if (ret) + goto err_out; + + ret = erdma_qp_validate_attr(dev, attrs); + if (ret) + goto err_out; + + qp->scq = to_ecq(attrs->send_cq); + qp->rcq = to_ecq(attrs->recv_cq); + qp->dev = dev; + qp->attrs.cc = dev->attrs.cc; + + init_rwsem(&qp->state_lock); + kref_init(&qp->ref); + init_completion(&qp->safe_free); + + ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, + XA_LIMIT(1, dev->attrs.max_qp - 1), + &dev->next_alloc_qpn, GFP_KERNEL); + if (ret < 0) { + ret = -ENOMEM; + goto err_out; + } + + qp->attrs.sq_size = roundup_pow_of_two(attrs->cap.max_send_wr * + ERDMA_MAX_WQEBB_PER_SQE); + qp->attrs.rq_size = roundup_pow_of_two(attrs->cap.max_recv_wr); + + if (uctx) { + ret = ib_copy_from_udata(&ureq, udata, + min(sizeof(ureq), udata->inlen)); + if (ret) + goto err_out_xa; + + ret = init_user_qp(qp, uctx, ureq.qbuf_va, ureq.qbuf_len, + ureq.db_record_va); + if (ret) + goto err_out_xa; + + memset(&uresp, 0, sizeof(uresp)); + + uresp.num_sqe = qp->attrs.sq_size; + uresp.num_rqe = qp->attrs.rq_size; + uresp.qp_id = QP_ID(qp); + uresp.rq_offset = qp->user_qp.rq_offset; + + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (ret) + goto err_out_cmd; + } else { + /* + * init_kernel_qp() releases its own partial allocations on + * failure, so only the QPN has to be unwound here. + */ + ret = init_kernel_qp(dev, qp, attrs); + if (ret) + goto err_out_xa; + } + + qp->attrs.max_send_sge = attrs->cap.max_send_sge; + qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; + qp->attrs.state = ERDMA_QP_STATE_IDLE; + + ret = create_qp_cmd(dev, qp); + if 
(ret) + goto err_out_cmd; + + spin_lock_init(&qp->lock); + + return 0; + +err_out_cmd: + if (uctx) + free_user_qp(qp, uctx); + else + free_kernel_qp(qp); +err_out_xa: + xa_erase(&dev->qp_xa, QP_ID(qp)); +err_out: + return ret; +} + +static int erdma_create_stag(struct erdma_dev *dev, u32 *stag) +{ + int stag_idx; + + stag_idx = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX]); + if (stag_idx < 0) + return stag_idx; + + /* For now, we always let key field be zero. */ + *stag = (stag_idx << 8); + + return 0; +} + +struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int acc) +{ + struct erdma_dev *dev = to_edev(ibpd->device); + struct erdma_mr *mr; + u32 stag; + int ret; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + ret = erdma_create_stag(dev, &stag); + if (ret) + goto out_free; + + mr->type = ERDMA_MR_TYPE_DMA; + + mr->ibmr.lkey = stag; + mr->ibmr.rkey = stag; + mr->ibmr.pd = ibpd; + mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(acc); + ret = regmr_cmd(dev, mr); + if (ret) + goto out_remove_stag; + + return &mr->ibmr; + +out_remove_stag: + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], + mr->ibmr.lkey >> 8); + +out_free: + kfree(mr); + + return ERR_PTR(ret); +} + +struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct erdma_mr *mr; + struct erdma_dev *dev = to_edev(ibpd->device); + int ret; + u32 stag; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EOPNOTSUPP); + + if (max_num_sg > ERDMA_MR_MAX_MTT_CNT) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + ret = erdma_create_stag(dev, &stag); + if (ret) + goto out_free; + + mr->type = ERDMA_MR_TYPE_FRMR; + + mr->ibmr.lkey = stag; + mr->ibmr.rkey = stag; + mr->ibmr.pd = ibpd; + /* update it in FRMR. */ + mr->access = ERDMA_MR_ACC_LR | ERDMA_MR_ACC_LW | ERDMA_MR_ACC_RR | + ERDMA_MR_ACC_RW; + + mr->mem.page_size = PAGE_SIZE; /* update it later. 
*/ + mr->mem.page_cnt = max_num_sg; + mr->mem.mtt_type = ERDMA_MR_INDIRECT_MTT; + mr->mem.mtt_buf = + alloc_pages_exact(MTT_SIZE(mr->mem.page_cnt), GFP_KERNEL); + if (!mr->mem.mtt_buf) { + ret = -ENOMEM; + goto out_remove_stag; + } + + mr->mem.mtt_entry[0] = + dma_map_single(&dev->pdev->dev, mr->mem.mtt_buf, + MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, mr->mem.mtt_entry[0])) { + ret = -ENOMEM; + goto out_free_mtt; + } + + ret = regmr_cmd(dev, mr); + if (ret) + goto out_dma_unmap; + + return &mr->ibmr; + +out_dma_unmap: + dma_unmap_single(&dev->pdev->dev, mr->mem.mtt_entry[0], + MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE); +out_free_mtt: + free_pages_exact(mr->mem.mtt_buf, MTT_SIZE(mr->mem.page_cnt)); + +out_remove_stag: + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], + mr->ibmr.lkey >> 8); + +out_free: + kfree(mr); + + return ERR_PTR(ret); +} + +static int erdma_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct erdma_mr *mr = to_emr(ibmr); + + if (mr->mem.mtt_nents >= mr->mem.page_cnt) + return -1; + + *((u64 *)mr->mem.mtt_buf + mr->mem.mtt_nents) = addr; + mr->mem.mtt_nents++; + + return 0; +} + +int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct erdma_mr *mr = to_emr(ibmr); + int num; + + mr->mem.mtt_nents = 0; + + num = ib_sg_to_pages(&mr->ibmr, sg, sg_nents, sg_offset, + erdma_set_page); + + return num; +} + +struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, + u64 virt, int access, struct ib_udata *udata) +{ + struct erdma_mr *mr = NULL; + struct erdma_dev *dev = to_edev(ibpd->device); + u32 stag; + int ret; + + if (!len || len > dev->attrs.max_mr_size) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + ret = get_mtt_entries(dev, &mr->mem, start, len, access, virt, + SZ_2G - SZ_4K, 0); + if (ret) + goto err_out_free; + + ret = erdma_create_stag(dev, &stag); + if 
(ret) + goto err_out_put_mtt; + + mr->ibmr.lkey = mr->ibmr.rkey = stag; + mr->ibmr.pd = ibpd; + mr->mem.va = virt; + mr->mem.len = len; + mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(access); + mr->valid = 1; + mr->type = ERDMA_MR_TYPE_NORMAL; + + ret = regmr_cmd(dev, mr); + if (ret) + goto err_out_mr; + + return &mr->ibmr; + +err_out_mr: + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], + mr->ibmr.lkey >> 8); + +err_out_put_mtt: + put_mtt_entries(dev, &mr->mem); + +err_out_free: + kfree(mr); + + return ERR_PTR(ret); +} + +int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct erdma_mr *mr; + struct erdma_dev *dev = to_edev(ibmr->device); + struct erdma_cmdq_dereg_mr_req req; + int ret; + + mr = to_emr(ibmr); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DEREG_MR); + + req.cfg = FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, ibmr->lkey >> 8) | + FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, ibmr->lkey & 0xFF); + + ret = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (ret) + return ret; + + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], ibmr->lkey >> 8); + + put_mtt_entries(dev, &mr->mem); + + kfree(mr); + return 0; +} + +int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct erdma_cq *cq = to_ecq(ibcq); + struct erdma_dev *dev = to_edev(ibcq->device); + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + int err; + struct erdma_cmdq_destroy_cq_req req; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DESTROY_CQ); + req.cqn = cq->cqn; + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (err) + return err; + + if (rdma_is_kernel_res(&cq->ibcq.res)) { + dma_free_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + } else { + erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); + 
put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); + } + + xa_erase(&dev->cq_xa, cq->cqn); + + return 0; +} + +int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct erdma_qp *qp = to_eqp(ibqp); + struct erdma_dev *dev = to_edev(ibqp->device); + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + struct erdma_qp_attrs qp_attrs; + int err; + struct erdma_cmdq_destroy_qp_req req; + + down_write(&qp->state_lock); + qp_attrs.state = ERDMA_QP_STATE_ERROR; + erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + up_write(&qp->state_lock); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DESTROY_QP); + req.qpn = QP_ID(qp); + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (err) + return err; + + erdma_qp_put(qp); + wait_for_completion(&qp->safe_free); + + if (rdma_is_kernel_res(&qp->ibqp.res)) { + vfree(qp->kern_qp.swr_tbl); + vfree(qp->kern_qp.rwr_tbl); + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), + qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), + qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + } else { + put_mtt_entries(dev, &qp->user_qp.sq_mtt); + put_mtt_entries(dev, &qp->user_qp.rq_mtt); + erdma_unmap_user_dbrecords(ctx, &qp->user_qp.user_dbr_page); + } + + if (qp->cep) + erdma_cep_put(qp->cep); + xa_erase(&dev->qp_xa, QP_ID(qp)); + + return 0; +} + +void erdma_qp_get_ref(struct ib_qp *ibqp) +{ + erdma_qp_get(to_eqp(ibqp)); +} + +void erdma_qp_put_ref(struct ib_qp *ibqp) +{ + erdma_qp_put(to_eqp(ibqp)); +} + +int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) +{ + struct rdma_user_mmap_entry *rdma_entry; + struct erdma_user_mmap_entry *entry; + pgprot_t prot; + int err; + + rdma_entry = rdma_user_mmap_entry_get(ctx, vma); + if (!rdma_entry) + return -EINVAL; + + entry = 
to_emmap(rdma_entry); + + switch (entry->mmap_flag) { + case ERDMA_MMAP_IO_NC: + /* map doorbell. */ + prot = pgprot_device(vma->vm_page_prot); + break; + default: + /* Unknown mapping type: still drop the reference taken by rdma_user_mmap_entry_get(). */ + err = -EINVAL; + goto put_entry; + } + + err = rdma_user_mmap_io(ctx, vma, PFN_DOWN(entry->address), PAGE_SIZE, + prot, rdma_entry); + +put_entry: + rdma_user_mmap_entry_put(rdma_entry); + return err; +} + +void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct erdma_user_mmap_entry *entry = to_emmap(rdma_entry); + + kfree(entry); +} + +#define ERDMA_SDB_PAGE 0 +#define ERDMA_SDB_ENTRY 1 +#define ERDMA_SDB_SHARED 2 + +static void alloc_db_resources(struct erdma_dev *dev, + struct erdma_ucontext *ctx) +{ + u32 bitmap_idx; + struct erdma_devattr *attrs = &dev->attrs; + + if (attrs->disable_dwqe) + goto alloc_normal_db; + + /* Try to alloc independent SDB page. */ + spin_lock(&dev->db_bitmap_lock); + bitmap_idx = find_first_zero_bit(dev->sdb_page, attrs->dwqe_pages); + if (bitmap_idx != attrs->dwqe_pages) { + set_bit(bitmap_idx, dev->sdb_page); + spin_unlock(&dev->db_bitmap_lock); + + ctx->sdb_type = ERDMA_SDB_PAGE; + ctx->sdb_idx = bitmap_idx; + ctx->sdb_page_idx = bitmap_idx; + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + + (bitmap_idx << PAGE_SHIFT); + ctx->sdb_page_off = 0; + + return; + } + + bitmap_idx = find_first_zero_bit(dev->sdb_entry, attrs->dwqe_entries); + if (bitmap_idx != attrs->dwqe_entries) { + set_bit(bitmap_idx, dev->sdb_entry); + spin_unlock(&dev->db_bitmap_lock); + + ctx->sdb_type = ERDMA_SDB_ENTRY; + ctx->sdb_idx = bitmap_idx; + ctx->sdb_page_idx = attrs->dwqe_pages + + bitmap_idx / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + ctx->sdb_page_off = bitmap_idx % ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + + (ctx->sdb_page_idx << PAGE_SHIFT); + + return; + } + + spin_unlock(&dev->db_bitmap_lock); + +alloc_normal_db: + ctx->sdb_type = ERDMA_SDB_SHARED; + ctx->sdb_idx = 0; + ctx->sdb_page_idx = ERDMA_SDB_SHARED_PAGE_INDEX; + ctx->sdb_page_off 
= 0; + + ctx->sdb = dev->func_bar_addr + (ctx->sdb_page_idx << PAGE_SHIFT); +} + +static void erdma_uctx_user_mmap_entries_remove(struct erdma_ucontext *uctx) +{ + rdma_user_mmap_entry_remove(uctx->sq_db_mmap_entry); + rdma_user_mmap_entry_remove(uctx->rq_db_mmap_entry); + rdma_user_mmap_entry_remove(uctx->cq_db_mmap_entry); +} + +/* + * Set up a user context: account it against ERDMA_MAX_CONTEXT, pick the SQ + * doorbell slot, publish the SQ/RQ/CQ doorbell mmap entries and copy their + * offsets to user space. Returns 0 on success or a negative errno; on + * failure everything acquired here is released again. + */ +int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) +{ + struct erdma_ucontext *ctx = to_ectx(ibctx); + struct erdma_dev *dev = to_edev(ibctx->device); + int ret; + struct erdma_uresp_alloc_ctx uresp = {}; + + if (atomic_inc_return(&dev->num_ctx) > ERDMA_MAX_CONTEXT) { + ret = -ENOMEM; + goto err_out; + } + + INIT_LIST_HEAD(&ctx->dbrecords_page_list); + mutex_init(&ctx->dbrecords_page_mutex); + + alloc_db_resources(dev, ctx); + + ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; + ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; + + if (udata->outlen < sizeof(uresp)) { + ret = -EINVAL; + goto err_free_db; + } + + ctx->sq_db_mmap_entry = erdma_user_mmap_entry_insert( + ctx, (void *)ctx->sdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.sdb); + if (!ctx->sq_db_mmap_entry) { + ret = -ENOMEM; + goto err_free_db; + } + + ctx->rq_db_mmap_entry = erdma_user_mmap_entry_insert( + ctx, (void *)ctx->rdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.rdb); + if (!ctx->rq_db_mmap_entry) { + ret = -EINVAL; + goto err_free_db; + } + + ctx->cq_db_mmap_entry = erdma_user_mmap_entry_insert( + ctx, (void *)ctx->cdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.cdb); + if (!ctx->cq_db_mmap_entry) { + ret = -EINVAL; + goto err_free_db; + } + + uresp.dev_id = dev->pdev->device; + uresp.sdb_type = ctx->sdb_type; + uresp.sdb_offset = ctx->sdb_page_off; + + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (ret) + goto err_free_db; + + return 0; + +err_free_db: + erdma_uctx_user_mmap_entries_remove(ctx); + /* Give back the doorbell slot claimed by alloc_db_resources(). */ + spin_lock(&dev->db_bitmap_lock); + if (ctx->sdb_type == ERDMA_SDB_PAGE) + clear_bit(ctx->sdb_idx, dev->sdb_page); + else if (ctx->sdb_type == ERDMA_SDB_ENTRY) + clear_bit(ctx->sdb_idx, dev->sdb_entry); + spin_unlock(&dev->db_bitmap_lock); +err_out: + atomic_dec(&dev->num_ctx); + return ret; +} + +void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + struct erdma_ucontext *ctx = to_ectx(ibctx); + 
struct erdma_dev *dev = to_edev(ibctx->device); + + spin_lock(&dev->db_bitmap_lock); + if (ctx->sdb_type == ERDMA_SDB_PAGE) + clear_bit(ctx->sdb_idx, dev->sdb_page); + else if (ctx->sdb_type == ERDMA_SDB_ENTRY) + clear_bit(ctx->sdb_idx, dev->sdb_entry); + + erdma_uctx_user_mmap_entries_remove(ctx); + + spin_unlock(&dev->db_bitmap_lock); + + atomic_dec(&dev->num_ctx); +} + +static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = ERDMA_QP_STATE_IDLE, + [IB_QPS_INIT] = ERDMA_QP_STATE_IDLE, + [IB_QPS_RTR] = ERDMA_QP_STATE_RTR, + [IB_QPS_RTS] = ERDMA_QP_STATE_RTS, + [IB_QPS_SQD] = ERDMA_QP_STATE_CLOSING, + [IB_QPS_SQE] = ERDMA_QP_STATE_TERMINATE, + [IB_QPS_ERR] = ERDMA_QP_STATE_ERROR +}; + +int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + struct erdma_qp_attrs new_attrs; + enum erdma_qp_attr_mask erdma_attr_mask = 0; + struct erdma_qp *qp = to_eqp(ibqp); + int ret = 0; + + if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + return -EOPNOTSUPP; + + memset(&new_attrs, 0, sizeof(new_attrs)); + + if (attr_mask & IB_QP_STATE) { + new_attrs.state = ib_qp_state_to_erdma_qp_state[attr->qp_state]; + + erdma_attr_mask |= ERDMA_QP_ATTR_STATE; + } + + down_write(&qp->state_lock); + + ret = erdma_modify_qp_internal(qp, &new_attrs, erdma_attr_mask); + + up_write(&qp->state_lock); + + return ret; +} + +int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct erdma_qp *qp; + struct erdma_dev *dev; + + if (ibqp && qp_attr && qp_init_attr) { + qp = to_eqp(ibqp); + dev = to_edev(ibqp->device); + } else { + return -EINVAL; + } + + qp_attr->cap.max_inline_data = ERDMA_MAX_INLINE; + qp_init_attr->cap.max_inline_data = ERDMA_MAX_INLINE; + + qp_attr->cap.max_send_wr = qp->attrs.sq_size; + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; + qp_attr->cap.max_send_sge = qp->attrs.max_send_sge; + qp_attr->cap.max_recv_sge = 
qp->attrs.max_recv_sge; + + qp_attr->path_mtu = ib_mtu_int_to_enum(dev->netdev->mtu); + qp_attr->max_rd_atomic = qp->attrs.irq_size; + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; + + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq, + struct erdma_ureq_create_cq *ureq) +{ + int ret; + struct erdma_dev *dev = to_edev(cq->ibcq.device); + + ret = get_mtt_entries(dev, &cq->user_cq.qbuf_mtt, ureq->qbuf_va, + ureq->qbuf_len, 0, ureq->qbuf_va, SZ_64M - SZ_4K, + 1); + if (ret) + return ret; + + ret = erdma_map_user_dbrecords(ctx, ureq->db_record_va, + &cq->user_cq.user_dbr_page, + &cq->user_cq.db_info_dma_addr); + if (ret) + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); + + return ret; +} + +static int erdma_init_kernel_cq(struct erdma_cq *cq) +{ + struct erdma_dev *dev = to_edev(cq->ibcq.device); + + cq->kern_cq.qbuf = + dma_alloc_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + &cq->kern_cq.qbuf_dma_addr, GFP_KERNEL); + if (!cq->kern_cq.qbuf) + return -ENOMEM; + + cq->kern_cq.db_record = + (u64 *)(cq->kern_cq.qbuf + (cq->depth << CQE_SHIFT)); + spin_lock_init(&cq->kern_cq.lock); + /* use default cqdb addr */ + cq->kern_cq.db = dev->func_bar + ERDMA_BAR_CQDB_SPACE_OFFSET; + + return 0; +} + +int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct erdma_cq *cq = to_ecq(ibcq); + struct erdma_dev *dev = to_edev(ibcq->device); + unsigned int depth = attr->cqe; + int ret; + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + + if (depth > dev->attrs.max_cqe) + return -EINVAL; + + depth = roundup_pow_of_two(depth); + cq->ibcq.cqe = depth; + cq->depth = depth; + cq->assoc_eqn = attr->comp_vector + 1; + + ret = xa_alloc_cyclic(&dev->cq_xa, &cq->cqn, cq, + 
XA_LIMIT(1, dev->attrs.max_cq - 1), + &dev->next_alloc_cqn, GFP_KERNEL); + if (ret < 0) + return ret; + + if (!rdma_is_kernel_res(&ibcq->res)) { + struct erdma_ureq_create_cq ureq; + struct erdma_uresp_create_cq uresp; + + ret = ib_copy_from_udata(&ureq, udata, + min(udata->inlen, sizeof(ureq))); + if (ret) + goto err_out_xa; + + ret = erdma_init_user_cq(ctx, cq, &ureq); + if (ret) + goto err_out_xa; + + uresp.cq_id = cq->cqn; + uresp.num_cqe = depth; + + ret = ib_copy_to_udata(udata, &uresp, + min(sizeof(uresp), udata->outlen)); + if (ret) + goto err_free_res; + } else { + ret = erdma_init_kernel_cq(cq); + if (ret) + goto err_out_xa; + } + + ret = create_cq_cmd(dev, cq); + if (ret) + goto err_free_res; + + return 0; + +err_free_res: + if (!rdma_is_kernel_res(&ibcq->res)) { + erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); + } else { + dma_free_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(depth << CQE_SHIFT), + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + } + +err_out_xa: + xa_erase(&dev->cq_xa, cq->cqn); + + return ret; +} + +void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason) +{ + struct ib_event event; + + event.device = &dev->ibdev; + event.element.port_num = 1; + event.event = reason; + + ib_dispatch_event(&event); +} From e2692bf4bc6384223d67f7a1f70f637b3d2038fe Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:09 +0800 Subject: [PATCH 0628/1250] RDMA/erdma: Add connection management (CM) support ERDMA's transport protocol is iWarp, so the driver must support CM interface. In CM part, we use the same way as SoftiWarp: using kernel socket to set up the connection, then performing MPA negotiation in kernel. So, this part of code mainly comes from SoftiWarp, base on it, we add some more features, such as non-blocking iw_connect implementation. This commit also fixes a duplicated include issue reported by Abaci Robot. 
Link: https://lore.kernel.org/r/20220713094212.30943-9-chengyou@linux.alibaba.com Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_cm.c | 1430 ++++++++++++++++++++++++ drivers/infiniband/hw/erdma/erdma_cm.h | 167 +++ 2 files changed, 1597 insertions(+) create mode 100644 drivers/infiniband/hw/erdma/erdma_cm.c create mode 100644 drivers/infiniband/hw/erdma/erdma_cm.h diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c new file mode 100644 index 00000000000000..f13f16479ecadf --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -0,0 +1,1430 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +/* Authors: Bernard Metzler */ +/* Fredy Neeser */ +/* Greg Joyce */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. 
*/ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_verbs.h" + +static struct workqueue_struct *erdma_cm_wq; + +static void erdma_cm_llp_state_change(struct sock *sk); +static void erdma_cm_llp_data_ready(struct sock *sk); +static void erdma_cm_llp_error_report(struct sock *sk); + +static void erdma_sk_assign_cm_upcalls(struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + sk->sk_state_change = erdma_cm_llp_state_change; + sk->sk_data_ready = erdma_cm_llp_data_ready; + sk->sk_error_report = erdma_cm_llp_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void erdma_sk_save_upcalls(struct sock *sk) +{ + struct erdma_cep *cep = sk_to_cep(sk); + + write_lock_bh(&sk->sk_callback_lock); + cep->sk_state_change = sk->sk_state_change; + cep->sk_data_ready = sk->sk_data_ready; + cep->sk_error_report = sk->sk_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void erdma_sk_restore_upcalls(struct sock *sk, struct erdma_cep *cep) +{ + sk->sk_state_change = cep->sk_state_change; + sk->sk_data_ready = cep->sk_data_ready; + sk->sk_error_report = cep->sk_error_report; + sk->sk_user_data = NULL; +} + +static void erdma_socket_disassoc(struct socket *s) +{ + struct sock *sk = s->sk; + struct erdma_cep *cep; + + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + cep = sk_to_cep(sk); + if (cep) { + erdma_sk_restore_upcalls(sk, cep); + erdma_cep_put(cep); + } else { + WARN_ON_ONCE(1); + } + write_unlock_bh(&sk->sk_callback_lock); + } else { + WARN_ON_ONCE(1); + } +} + +static void erdma_cep_socket_assoc(struct erdma_cep *cep, struct socket *s) +{ + cep->sock = s; + erdma_cep_get(cep); + s->sk->sk_user_data = cep; + + erdma_sk_save_upcalls(s->sk); + erdma_sk_assign_cm_upcalls(s->sk); +} + +static void erdma_disassoc_listen_cep(struct erdma_cep *cep) +{ + if (cep->listen_cep) { + erdma_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } +} + 
+static struct erdma_cep *erdma_cep_alloc(struct erdma_dev *dev) +{ + struct erdma_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); + unsigned long flags; + + if (!cep) + return NULL; + + INIT_LIST_HEAD(&cep->listenq); + INIT_LIST_HEAD(&cep->devq); + INIT_LIST_HEAD(&cep->work_freelist); + + kref_init(&cep->ref); + cep->state = ERDMA_EPSTATE_IDLE; + init_waitqueue_head(&cep->waitq); + spin_lock_init(&cep->lock); + cep->dev = dev; + + spin_lock_irqsave(&dev->lock, flags); + list_add_tail(&cep->devq, &dev->cep_list); + spin_unlock_irqrestore(&dev->lock, flags); + + return cep; +} + +static void erdma_cm_free_work(struct erdma_cep *cep) +{ + struct list_head *w, *tmp; + struct erdma_cm_work *work; + + list_for_each_safe(w, tmp, &cep->work_freelist) { + work = list_entry(w, struct erdma_cm_work, list); + list_del(&work->list); + kfree(work); + } +} + +static void erdma_cancel_mpatimer(struct erdma_cep *cep) +{ + spin_lock_bh(&cep->lock); + if (cep->mpa_timer) { + if (cancel_delayed_work(&cep->mpa_timer->work)) { + erdma_cep_put(cep); + kfree(cep->mpa_timer); + } + cep->mpa_timer = NULL; + } + spin_unlock_bh(&cep->lock); +} + +static void erdma_put_work(struct erdma_cm_work *work) +{ + INIT_LIST_HEAD(&work->list); + spin_lock_bh(&work->cep->lock); + list_add(&work->list, &work->cep->work_freelist); + spin_unlock_bh(&work->cep->lock); +} + +static void erdma_cep_set_inuse(struct erdma_cep *cep) +{ + unsigned long flags; + + spin_lock_irqsave(&cep->lock, flags); + while (cep->in_use) { + spin_unlock_irqrestore(&cep->lock, flags); + wait_event_interruptible(cep->waitq, !cep->in_use); + if (signal_pending(current)) + flush_signals(current); + + spin_lock_irqsave(&cep->lock, flags); + } + + cep->in_use = 1; + spin_unlock_irqrestore(&cep->lock, flags); +} + +static void erdma_cep_set_free(struct erdma_cep *cep) +{ + unsigned long flags; + + spin_lock_irqsave(&cep->lock, flags); + cep->in_use = 0; + spin_unlock_irqrestore(&cep->lock, flags); + + wake_up(&cep->waitq); +} + +static 
void __erdma_cep_dealloc(struct kref *ref) +{ + struct erdma_cep *cep = container_of(ref, struct erdma_cep, ref); + struct erdma_dev *dev = cep->dev; + unsigned long flags; + + WARN_ON(cep->listen_cep); + + kfree(cep->private_data); + kfree(cep->mpa.pdata); + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) + erdma_cm_free_work(cep); + spin_unlock_bh(&cep->lock); + + spin_lock_irqsave(&dev->lock, flags); + list_del(&cep->devq); + spin_unlock_irqrestore(&dev->lock, flags); + kfree(cep); +} + +static struct erdma_cm_work *erdma_get_work(struct erdma_cep *cep) +{ + struct erdma_cm_work *work = NULL; + + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) { + work = list_entry(cep->work_freelist.next, struct erdma_cm_work, + list); + list_del_init(&work->list); + } + + spin_unlock_bh(&cep->lock); + return work; +} + +static int erdma_cm_alloc_work(struct erdma_cep *cep, int num) +{ + struct erdma_cm_work *work; + + while (num--) { + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + if (!(list_empty(&cep->work_freelist))) + erdma_cm_free_work(cep); + return -ENOMEM; + } + work->cep = cep; + INIT_LIST_HEAD(&work->list); + list_add(&work->list, &cep->work_freelist); + } + + return 0; +} + +static int erdma_cm_upcall(struct erdma_cep *cep, enum iw_cm_event_type reason, + int status) +{ + struct iw_cm_event event; + struct iw_cm_id *cm_id; + + memset(&event, 0, sizeof(event)); + event.status = status; + event.event = reason; + + if (reason == IW_CM_EVENT_CONNECT_REQUEST) { + event.provider_data = cep; + cm_id = cep->listen_cep->cm_id; + + event.ird = cep->dev->attrs.max_ird; + event.ord = cep->dev->attrs.max_ord; + } else { + cm_id = cep->cm_id; + } + + if (reason == IW_CM_EVENT_CONNECT_REQUEST || + reason == IW_CM_EVENT_CONNECT_REPLY) { + u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); + + if (pd_len && cep->mpa.pdata) { + event.private_data_len = pd_len; + event.private_data = cep->mpa.pdata; + } + + getname_local(cep->sock, 
&event.local_addr); + getname_peer(cep->sock, &event.remote_addr); + } + + return cm_id->event_handler(cm_id, &event); +} + +void erdma_qp_cm_drop(struct erdma_qp *qp) +{ + struct erdma_cep *cep = qp->cep; + + if (!qp->cep) + return; + + erdma_cep_set_inuse(cep); + + /* already closed. */ + if (cep->state == ERDMA_EPSTATE_CLOSED) + goto out; + + if (cep->cm_id) { + switch (cep->state) { + case ERDMA_EPSTATE_AWAIT_MPAREP: + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -EINVAL); + break; + case ERDMA_EPSTATE_RDMA_MODE: + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + break; + case ERDMA_EPSTATE_IDLE: + case ERDMA_EPSTATE_LISTENING: + case ERDMA_EPSTATE_CONNECTING: + case ERDMA_EPSTATE_AWAIT_MPAREQ: + case ERDMA_EPSTATE_RECVD_MPAREQ: + case ERDMA_EPSTATE_CLOSED: + default: + break; + } + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + erdma_cep_put(cep); + } + cep->state = ERDMA_EPSTATE_CLOSED; + + if (cep->sock) { + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + + if (cep->qp) { + cep->qp = NULL; + erdma_qp_put(qp); + } +out: + erdma_cep_set_free(cep); +} + +void erdma_cep_put(struct erdma_cep *cep) +{ + WARN_ON(kref_read(&cep->ref) < 1); + kref_put(&cep->ref, __erdma_cep_dealloc); +} + +void erdma_cep_get(struct erdma_cep *cep) +{ + kref_get(&cep->ref); +} + +static int erdma_send_mpareqrep(struct erdma_cep *cep, const void *pdata, + u8 pd_len) +{ + struct socket *s = cep->sock; + struct mpa_rr *rr = &cep->mpa.hdr; + struct kvec iov[3]; + struct msghdr msg; + int iovec_num = 0; + int ret; + int mpa_len; + + memset(&msg, 0, sizeof(msg)); + + rr->params.pd_len = cpu_to_be16(pd_len); + + iov[iovec_num].iov_base = rr; + iov[iovec_num].iov_len = sizeof(*rr); + iovec_num++; + mpa_len = sizeof(*rr); + + iov[iovec_num].iov_base = &cep->mpa.ext_data; + iov[iovec_num].iov_len = sizeof(cep->mpa.ext_data); + iovec_num++; + mpa_len += sizeof(cep->mpa.ext_data); + + if (pd_len) { + iov[iovec_num].iov_base = (char *)pdata; + 
iov[iovec_num].iov_len = pd_len; + mpa_len += pd_len; + iovec_num++; + } + + ret = kernel_sendmsg(s, &msg, iov, iovec_num, mpa_len); + + return ret < 0 ? ret : 0; +} + +static inline int ksock_recv(struct socket *sock, char *buf, size_t size, + int flags) +{ + struct kvec iov = { buf, size }; + struct msghdr msg = { .msg_name = NULL, .msg_flags = flags }; + + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); +} + +static int __recv_mpa_hdr(struct erdma_cep *cep, int hdr_rcvd, char *hdr, + int hdr_size, int *rcvd_out) +{ + struct socket *s = cep->sock; + int rcvd; + + *rcvd_out = 0; + if (hdr_rcvd < hdr_size) { + rcvd = ksock_recv(s, hdr + hdr_rcvd, hdr_size - hdr_rcvd, + MSG_DONTWAIT); + if (rcvd == -EAGAIN) + return -EAGAIN; + + if (rcvd <= 0) + return -ECONNABORTED; + + hdr_rcvd += rcvd; + *rcvd_out = rcvd; + + if (hdr_rcvd < hdr_size) + return -EAGAIN; + } + + return 0; +} + +static void __mpa_rr_set_revision(__be16 *bits, u8 rev) +{ + *bits = (*bits & ~MPA_RR_MASK_REVISION) | + (cpu_to_be16(rev) & MPA_RR_MASK_REVISION); +} + +static u8 __mpa_rr_revision(__be16 mpa_rr_bits) +{ + __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION; + + return (u8)be16_to_cpu(rev); +} + +static void __mpa_ext_set_cc(__be32 *bits, u32 cc) +{ + *bits = (*bits & ~MPA_EXT_FLAG_CC) | + (cpu_to_be32(cc) & MPA_EXT_FLAG_CC); +} + +static u8 __mpa_ext_cc(__be32 mpa_ext_bits) +{ + __be32 cc = mpa_ext_bits & MPA_EXT_FLAG_CC; + + return (u8)be32_to_cpu(cc); +} + +/* + * Receive MPA Request/Reply header. + * + * Returns 0 if complete MPA Request/Reply header including + * any private data was received. Returns -EAGAIN if + * header was partially received or negative error code otherwise.
+ * + * Context: May be called in process context only + */ +static int erdma_recv_mpa_rr(struct erdma_cep *cep) +{ + struct mpa_rr *hdr = &cep->mpa.hdr; + struct socket *s = cep->sock; + u16 pd_len; + int rcvd, to_rcv, ret, pd_rcvd; + + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { + ret = __recv_mpa_hdr(cep, cep->mpa.bytes_rcvd, + (char *)&cep->mpa.hdr, + sizeof(struct mpa_rr), &rcvd); + cep->mpa.bytes_rcvd += rcvd; + if (ret) + return ret; + } + + if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA || + __mpa_rr_revision(hdr->params.bits) != MPA_REVISION_EXT_1) + return -EPROTO; + + if (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) < + sizeof(struct erdma_mpa_ext)) { + ret = __recv_mpa_hdr( + cep, cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), + (char *)&cep->mpa.ext_data, + sizeof(struct erdma_mpa_ext), &rcvd); + cep->mpa.bytes_rcvd += rcvd; + if (ret) + return ret; + } + + pd_len = be16_to_cpu(hdr->params.pd_len); + pd_rcvd = cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) - + sizeof(struct erdma_mpa_ext); + to_rcv = pd_len - pd_rcvd; + + if (!to_rcv) { + /* + * We have received the whole MPA Request/Reply message. + * Check against peer protocol violation. + */ + u32 word; + + ret = __recv_mpa_hdr(cep, 0, (char *)&word, sizeof(word), + &rcvd); + if (ret == -EAGAIN && rcvd == 0) + return 0; + + if (ret) + return ret; + + return -EPROTO; + } + + /* + * At this point, MPA header has been fully received, and pd_len != 0. + * So, begin to receive private data. + */ + if (!cep->mpa.pdata) { + cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); + if (!cep->mpa.pdata) + return -ENOMEM; + } + + rcvd = ksock_recv(s, cep->mpa.pdata + pd_rcvd, to_rcv + 4, + MSG_DONTWAIT); + if (rcvd < 0) + return rcvd; + + if (rcvd > to_rcv) + return -EPROTO; + + cep->mpa.bytes_rcvd += rcvd; + + if (to_rcv == rcvd) + return 0; + + return -EAGAIN; +} + +/* + * erdma_proc_mpareq() + * + * Read MPA Request from socket and signal new connection to IWCM + * if success. 
Caller must hold lock on corresponding listening CEP. + */ +static int erdma_proc_mpareq(struct erdma_cep *cep) +{ + struct mpa_rr *req; + int ret; + + ret = erdma_recv_mpa_rr(cep); + if (ret) + return ret; + + req = &cep->mpa.hdr; + + if (memcmp(req->key, MPA_KEY_REQ, MPA_KEY_SIZE)) + return -EPROTO; + + memcpy(req->key, MPA_KEY_REP, MPA_KEY_SIZE); + + /* Currently does not support marker and crc. */ + if (req->params.bits & MPA_RR_FLAG_MARKERS || + req->params.bits & MPA_RR_FLAG_CRC) + goto reject_conn; + + cep->state = ERDMA_EPSTATE_RECVD_MPAREQ; + + /* Keep reference until IWCM accepts/rejects */ + erdma_cep_get(cep); + ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); + if (ret) + erdma_cep_put(cep); + + return ret; + +reject_conn: + req->params.bits &= ~MPA_RR_FLAG_MARKERS; + req->params.bits |= MPA_RR_FLAG_REJECT; + req->params.bits &= ~MPA_RR_FLAG_CRC; + + kfree(cep->mpa.pdata); + cep->mpa.pdata = NULL; + erdma_send_mpareqrep(cep, NULL, 0); + + return -EOPNOTSUPP; +} + +static int erdma_proc_mpareply(struct erdma_cep *cep) +{ + struct erdma_qp_attrs qp_attrs; + struct erdma_qp *qp = cep->qp; + struct mpa_rr *rep; + int ret; + + ret = erdma_recv_mpa_rr(cep); + if (ret) + goto out_err; + + erdma_cancel_mpatimer(cep); + + rep = &cep->mpa.hdr; + + if (memcmp(rep->key, MPA_KEY_REP, MPA_KEY_SIZE)) { + ret = -EPROTO; + goto out_err; + } + + if (rep->params.bits & MPA_RR_FLAG_REJECT) { + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); + return -ECONNRESET; + } + + /* Currently does not support marker and crc. 
*/ + if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || + (rep->params.bits & MPA_RR_FLAG_CRC)) { + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); + return -EINVAL; + } + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.irq_size = cep->ird; + qp_attrs.orq_size = cep->ord; + qp_attrs.state = ERDMA_QP_STATE_RTS; + + down_write(&qp->state_lock); + if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto out_err; + } + + qp->attrs.qp_type = ERDMA_QP_ACTIVE; + if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) + qp->attrs.cc = COMPROMISE_CC; + + ret = erdma_modify_qp_internal(qp, &qp_attrs, + ERDMA_QP_ATTR_STATE | + ERDMA_QP_ATTR_LLP_HANDLE | + ERDMA_QP_ATTR_MPA); + + up_write(&qp->state_lock); + + if (!ret) { + ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); + if (!ret) + cep->state = ERDMA_EPSTATE_RDMA_MODE; + + return 0; + } + +out_err: + if (ret != -EAGAIN) + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); + + return ret; +} + +static void erdma_accept_newconn(struct erdma_cep *cep) +{ + struct socket *s = cep->sock; + struct socket *new_s = NULL; + struct erdma_cep *new_cep = NULL; + int ret = 0; + + if (cep->state != ERDMA_EPSTATE_LISTENING) + goto error; + + new_cep = erdma_cep_alloc(cep->dev); + if (!new_cep) + goto error; + + /* + * 4: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout. 
+ */ + if (erdma_cm_alloc_work(new_cep, 4) != 0) + goto error; + + /* + * Copy saved socket callbacks from listening CEP + * and assign new socket with new CEP + */ + new_cep->sk_state_change = cep->sk_state_change; + new_cep->sk_data_ready = cep->sk_data_ready; + new_cep->sk_error_report = cep->sk_error_report; + + ret = kernel_accept(s, &new_s, O_NONBLOCK); + if (ret != 0) + goto error; + + new_cep->sock = new_s; + erdma_cep_get(new_cep); + new_s->sk->sk_user_data = new_cep; + + tcp_sock_set_nodelay(new_s->sk); + new_cep->state = ERDMA_EPSTATE_AWAIT_MPAREQ; + + ret = erdma_cm_queue_work(new_cep, ERDMA_CM_WORK_MPATIMEOUT); + if (ret) + goto error; + + new_cep->listen_cep = cep; + erdma_cep_get(cep); + + if (atomic_read(&new_s->sk->sk_rmem_alloc)) { + /* MPA REQ already queued */ + erdma_cep_set_inuse(new_cep); + ret = erdma_proc_mpareq(new_cep); + if (ret != -EAGAIN) { + erdma_cep_put(cep); + new_cep->listen_cep = NULL; + if (ret) { + erdma_cep_set_free(new_cep); + goto error; + } + } + erdma_cep_set_free(new_cep); + } + return; + +error: + if (new_cep) { + new_cep->state = ERDMA_EPSTATE_CLOSED; + erdma_cancel_mpatimer(new_cep); + + erdma_cep_put(new_cep); + new_cep->sock = NULL; + } + + if (new_s) { + erdma_socket_disassoc(new_s); + sock_release(new_s); + } +} + +static int erdma_newconn_connected(struct erdma_cep *cep) +{ + int ret = 0; + + cep->mpa.hdr.params.bits = 0; + __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1); + + memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + __mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc); + + ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len); + cep->state = ERDMA_EPSTATE_AWAIT_MPAREP; + cep->mpa.hdr.params.pd_len = 0; + + if (ret >= 0) + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_MPATIMEOUT); + + return ret; +} + +static void erdma_cm_work_handler(struct work_struct *w) +{ + struct erdma_cm_work *work; + struct 
erdma_cep *cep; + int release_cep = 0, ret = 0; + + work = container_of(w, struct erdma_cm_work, work.work); + cep = work->cep; + + erdma_cep_set_inuse(cep); + + switch (work->type) { + case ERDMA_CM_WORK_CONNECTED: + erdma_cancel_mpatimer(cep); + if (cep->state == ERDMA_EPSTATE_CONNECTING) { + ret = erdma_newconn_connected(cep); + if (ret) { + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -EIO); + release_cep = 1; + } + } + break; + case ERDMA_CM_WORK_CONNECTTIMEOUT: + if (cep->state == ERDMA_EPSTATE_CONNECTING) { + cep->mpa_timer = NULL; + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ETIMEDOUT); + release_cep = 1; + } + break; + case ERDMA_CM_WORK_ACCEPT: + erdma_accept_newconn(cep); + break; + case ERDMA_CM_WORK_READ_MPAHDR: + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { + if (cep->listen_cep) { + erdma_cep_set_inuse(cep->listen_cep); + + if (cep->listen_cep->state == + ERDMA_EPSTATE_LISTENING) + ret = erdma_proc_mpareq(cep); + else + ret = -EFAULT; + + erdma_cep_set_free(cep->listen_cep); + + if (ret != -EAGAIN) { + erdma_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + if (ret) + erdma_cep_put(cep); + } + } + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { + ret = erdma_proc_mpareply(cep); + } + + if (ret && ret != -EAGAIN) + release_cep = 1; + break; + case ERDMA_CM_WORK_CLOSE_LLP: + if (cep->cm_id) + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + release_cep = 1; + break; + case ERDMA_CM_WORK_PEER_CLOSE: + if (cep->cm_id) { + if (cep->state == ERDMA_EPSTATE_CONNECTING || + cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { + /* + * MPA reply not received, but connection drop + */ + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + } else if (cep->state == ERDMA_EPSTATE_RDMA_MODE) { + /* + * NOTE: IW_CM_EVENT_DISCONNECT is given just + * to transition IWCM into CLOSING. 
+ */ + erdma_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + } + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { + /* Socket close before MPA request received. */ + erdma_disassoc_listen_cep(cep); + erdma_cep_put(cep); + } + release_cep = 1; + break; + case ERDMA_CM_WORK_MPATIMEOUT: + cep->mpa_timer = NULL; + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { + /* + * MPA request timed out: + * Hide any partially received private data and signal + * timeout + */ + cep->mpa.hdr.params.pd_len = 0; + + if (cep->cm_id) + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ETIMEDOUT); + release_cep = 1; + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { + /* No MPA req received after peer TCP stream setup. */ + erdma_disassoc_listen_cep(cep); + + erdma_cep_put(cep); + release_cep = 1; + } + break; + default: + WARN(1, "Undefined CM work type: %d\n", work->type); + } + + if (release_cep) { + erdma_cancel_mpatimer(cep); + cep->state = ERDMA_EPSTATE_CLOSED; + if (cep->qp) { + struct erdma_qp *qp = cep->qp; + /* + * Serialize a potential race with application + * closing the QP and calling erdma_qp_cm_drop() + */ + erdma_qp_get(qp); + erdma_cep_set_free(cep); + + erdma_qp_llp_close(qp); + erdma_qp_put(qp); + + erdma_cep_set_inuse(cep); + cep->qp = NULL; + erdma_qp_put(qp); + } + + if (cep->sock) { + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + if (cep->state != ERDMA_EPSTATE_LISTENING) + erdma_cep_put(cep); + } + } + erdma_cep_set_free(cep); + erdma_put_work(work); + erdma_cep_put(cep); +} + +int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type) +{ + struct erdma_cm_work *work = erdma_get_work(cep); + unsigned long delay = 0; + + if (!work) + return -ENOMEM; + + work->type = type; + work->cep = cep; + + erdma_cep_get(cep); + + INIT_DELAYED_WORK(&work->work, erdma_cm_work_handler); + + 
if (type == ERDMA_CM_WORK_MPATIMEOUT) { + cep->mpa_timer = work; + + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) + delay = MPAREP_TIMEOUT; + else + delay = MPAREQ_TIMEOUT; + } else if (type == ERDMA_CM_WORK_CONNECTTIMEOUT) { + cep->mpa_timer = work; + + delay = CONNECT_TIMEOUT; + } + + queue_delayed_work(erdma_cm_wq, &work->work, delay); + + return 0; +} + +static void erdma_cm_llp_data_ready(struct sock *sk) +{ + struct erdma_cep *cep; + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) + goto out; + + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ || + cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) + erdma_cm_queue_work(cep, ERDMA_CM_WORK_READ_MPAHDR); + +out: + read_unlock(&sk->sk_callback_lock); +} + +static void erdma_cm_llp_error_report(struct sock *sk) +{ + struct erdma_cep *cep = sk_to_cep(sk); + + if (cep) + cep->sk_error_report(sk); +} + +static void erdma_cm_llp_state_change(struct sock *sk) +{ + struct erdma_cep *cep; + void (*orig_state_change)(struct sock *sk); + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + read_unlock(&sk->sk_callback_lock); + return; + } + orig_state_change = cep->sk_state_change; + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + if (cep->state == ERDMA_EPSTATE_CONNECTING) + erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED); + else + erdma_cm_queue_work(cep, ERDMA_CM_WORK_ACCEPT); + break; + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + if (cep->state != ERDMA_EPSTATE_LISTENING) + erdma_cm_queue_work(cep, ERDMA_CM_WORK_PEER_CLOSE); + break; + default: + break; + } + read_unlock(&sk->sk_callback_lock); + orig_state_change(sk); +} + +static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, + int laddrlen, struct sockaddr *raddr, + int raddrlen, int flags) +{ + int ret; + + sock_set_reuseaddr(s->sk); + ret = s->ops->bind(s, laddr, laddrlen); + if (ret) + return ret; + ret = s->ops->connect(s, raddr, raddrlen, flags); + return ret < 0 ? 
ret : 0; +} + +int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct erdma_dev *dev = to_edev(id->device); + struct erdma_qp *qp; + struct erdma_cep *cep = NULL; + struct socket *s = NULL; + struct sockaddr *laddr = (struct sockaddr *)&id->m_local_addr; + struct sockaddr *raddr = (struct sockaddr *)&id->m_remote_addr; + u16 pd_len = params->private_data_len; + int ret; + + if (pd_len > MPA_MAX_PRIVDATA) + return -EINVAL; + + if (params->ird > dev->attrs.max_ird || + params->ord > dev->attrs.max_ord) + return -EINVAL; + + if (laddr->sa_family != AF_INET || raddr->sa_family != AF_INET) + return -EAFNOSUPPORT; + + qp = find_qp_by_qpn(dev, params->qpn); + if (!qp) + return -ENOENT; + erdma_qp_get(qp); + + ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s); + if (ret < 0) + goto error_put_qp; + + cep = erdma_cep_alloc(dev); + if (!cep) { + ret = -ENOMEM; + goto error_release_sock; + } + + erdma_cep_set_inuse(cep); + + /* Associate QP with CEP */ + erdma_cep_get(cep); + qp->cep = cep; + cep->qp = qp; + + /* Associate cm_id with CEP */ + id->add_ref(id); + cep->cm_id = id; + + /* + * 6: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout, connected event + * and connect timeout. 
+ */ + ret = erdma_cm_alloc_work(cep, 6); + if (ret != 0) { + ret = -ENOMEM; + goto error_release_cep; + } + + cep->ird = params->ird; + cep->ord = params->ord; + cep->state = ERDMA_EPSTATE_CONNECTING; + + erdma_cep_socket_assoc(cep, s); + + if (pd_len) { + cep->pd_len = pd_len; + cep->private_data = kmalloc(pd_len, GFP_KERNEL); + if (!cep->private_data) { + ret = -ENOMEM; + goto error_disassoc; + } + + memcpy(cep->private_data, params->private_data, + params->private_data_len); + } + + ret = kernel_bindconnect(s, laddr, sizeof(*laddr), raddr, + sizeof(*raddr), O_NONBLOCK); + if (ret != -EINPROGRESS && ret != 0) { + goto error_disassoc; + } else if (ret == 0) { + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED); + if (ret) + goto error_disassoc; + } else { + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTTIMEOUT); + if (ret) + goto error_disassoc; + } + + erdma_cep_set_free(cep); + return 0; + +error_disassoc: + kfree(cep->private_data); + cep->private_data = NULL; + cep->pd_len = 0; + + erdma_socket_disassoc(s); + +error_release_cep: + /* disassoc with cm_id */ + cep->cm_id = NULL; + id->rem_ref(id); + + /* disassoc with qp */ + qp->cep = NULL; + erdma_cep_put(cep); + cep->qp = NULL; + + cep->state = ERDMA_EPSTATE_CLOSED; + + erdma_cep_set_free(cep); + + /* release the cep. 
*/ + erdma_cep_put(cep); + +error_release_sock: + if (s) + sock_release(s); +error_put_qp: + erdma_qp_put(qp); + + return ret; +} + +int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct erdma_dev *dev = to_edev(id->device); + struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; + struct erdma_qp *qp; + struct erdma_qp_attrs qp_attrs; + int ret; + + erdma_cep_set_inuse(cep); + erdma_cep_put(cep); + + /* Free lingering inbound private data */ + if (cep->mpa.hdr.params.pd_len) { + cep->mpa.hdr.params.pd_len = 0; + kfree(cep->mpa.pdata); + cep->mpa.pdata = NULL; + } + erdma_cancel_mpatimer(cep); + + if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) { + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return -ECONNRESET; + } + + qp = find_qp_by_qpn(dev, params->qpn); + if (!qp) + return -ENOENT; + erdma_qp_get(qp); + + down_write(&qp->state_lock); + if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + + if (params->ord > dev->attrs.max_ord || + params->ird > dev->attrs.max_ord) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + + if (params->private_data_len > MPA_MAX_PRIVDATA) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + + cep->ird = params->ird; + cep->ord = params->ord; + + cep->cm_id = id; + id->add_ref(id); + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.orq_size = params->ord; + qp_attrs.irq_size = params->ird; + + qp_attrs.state = ERDMA_QP_STATE_RTS; + + /* Associate QP with CEP */ + erdma_cep_get(cep); + qp->cep = cep; + cep->qp = qp; + + cep->state = ERDMA_EPSTATE_RDMA_MODE; + + qp->attrs.qp_type = ERDMA_QP_PASSIVE; + qp->attrs.pd_len = params->private_data_len; + + if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) + qp->attrs.cc = COMPROMISE_CC; + + /* move to rts */ + ret = erdma_modify_qp_internal(qp, &qp_attrs, + ERDMA_QP_ATTR_STATE | + ERDMA_QP_ATTR_ORD | + ERDMA_QP_ATTR_LLP_HANDLE | + ERDMA_QP_ATTR_IRD | + 
ERDMA_QP_ATTR_MPA); + up_write(&qp->state_lock); + + if (ret) + goto error; + + cep->mpa.ext_data.bits = 0; + __mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + + ret = erdma_send_mpareqrep(cep, params->private_data, + params->private_data_len); + if (!ret) { + ret = erdma_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); + if (ret) + goto error; + + erdma_cep_set_free(cep); + + return 0; + } + +error: + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = ERDMA_EPSTATE_CLOSED; + + if (cep->cm_id) { + cep->cm_id->rem_ref(id); + cep->cm_id = NULL; + } + + if (qp->cep) { + erdma_cep_put(cep); + qp->cep = NULL; + } + + cep->qp = NULL; + erdma_qp_put(qp); + + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return ret; +} + +int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen) +{ + struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; + + erdma_cep_set_inuse(cep); + erdma_cep_put(cep); + + erdma_cancel_mpatimer(cep); + + if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) { + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return -ECONNRESET; + } + + if (__mpa_rr_revision(cep->mpa.hdr.params.bits) == MPA_REVISION_EXT_1) { + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ + erdma_send_mpareqrep(cep, pdata, plen); + } + + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = ERDMA_EPSTATE_CLOSED; + + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return 0; +} + +int erdma_create_listen(struct iw_cm_id *id, int backlog) +{ + struct socket *s; + struct erdma_cep *cep = NULL; + int ret = 0; + struct erdma_dev *dev = to_edev(id->device); + int addr_family = id->local_addr.ss_family; + struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); + + if (addr_family != AF_INET) + return -EAFNOSUPPORT; + + ret = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); + if (ret < 0) + 
return ret; + + sock_set_reuseaddr(s->sk); + + /* For wildcard addr, limit binding to current device only */ + if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) + s->sk->sk_bound_dev_if = dev->netdev->ifindex; + + ret = s->ops->bind(s, (struct sockaddr *)laddr, + sizeof(struct sockaddr_in)); + if (ret) + goto error; + + cep = erdma_cep_alloc(dev); + if (!cep) { + ret = -ENOMEM; + goto error; + } + erdma_cep_socket_assoc(cep, s); + + ret = erdma_cm_alloc_work(cep, backlog); + if (ret) + goto error; + + ret = s->ops->listen(s, backlog); + if (ret) + goto error; + + cep->cm_id = id; + id->add_ref(id); + + if (!id->provider_data) { + id->provider_data = + kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!id->provider_data) { + ret = -ENOMEM; + goto error; + } + INIT_LIST_HEAD((struct list_head *)id->provider_data); + } + + list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); + cep->state = ERDMA_EPSTATE_LISTENING; + + return 0; + +error: + if (cep) { + erdma_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + cep->sock = NULL; + erdma_socket_disassoc(s); + cep->state = ERDMA_EPSTATE_CLOSED; + + erdma_cep_set_free(cep); + erdma_cep_put(cep); + } + sock_release(s); + + return ret; +} + +static void erdma_drop_listeners(struct iw_cm_id *id) +{ + struct list_head *p, *tmp; + /* + * In case of a wildcard rdma_listen on a multi-homed device, + * a listener's IWCM id is associated with more than one listening CEP. 
+ */ + list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { + struct erdma_cep *cep = + list_entry(p, struct erdma_cep, listenq); + + list_del(p); + + erdma_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + if (cep->sock) { + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + cep->state = ERDMA_EPSTATE_CLOSED; + erdma_cep_set_free(cep); + erdma_cep_put(cep); + } +} + +int erdma_destroy_listen(struct iw_cm_id *id) +{ + if (!id->provider_data) + return 0; + + erdma_drop_listeners(id); + kfree(id->provider_data); + id->provider_data = NULL; + + return 0; +} + +int erdma_cm_init(void) +{ + erdma_cm_wq = create_singlethread_workqueue("erdma_cm_wq"); + if (!erdma_cm_wq) + return -ENOMEM; + + return 0; +} + +void erdma_cm_exit(void) +{ + if (erdma_cm_wq) + destroy_workqueue(erdma_cm_wq); +} diff --git a/drivers/infiniband/hw/erdma/erdma_cm.h b/drivers/infiniband/hw/erdma/erdma_cm.h new file mode 100644 index 00000000000000..8a3f998fec9bda --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cm.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +/* Authors: Bernard Metzler */ +/* Greg Joyce */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. 
*/ + +#ifndef __ERDMA_CM_H__ +#define __ERDMA_CM_H__ + +#include +#include +#include + +/* iWarp MPA protocol defs */ +#define MPA_REVISION_EXT_1 129 +#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" +#define MPA_KEY_SIZE 16 +#define MPA_DEFAULT_HDR_LEN 28 + +struct mpa_rr_params { + __be16 bits; + __be16 pd_len; +}; + +/* + * MPA request/response Hdr bits & fields + */ +enum { + MPA_RR_FLAG_MARKERS = __cpu_to_be16(0x8000), + MPA_RR_FLAG_CRC = __cpu_to_be16(0x4000), + MPA_RR_FLAG_REJECT = __cpu_to_be16(0x2000), + MPA_RR_RESERVED = __cpu_to_be16(0x1f00), + MPA_RR_MASK_REVISION = __cpu_to_be16(0x00ff) +}; + +/* + * MPA request/reply header + */ +struct mpa_rr { + u8 key[16]; + struct mpa_rr_params params; +}; + +struct erdma_mpa_ext { + __be32 cookie; + __be32 bits; +}; + +enum { + MPA_EXT_FLAG_CC = cpu_to_be32(0x0000000f), +}; + +struct erdma_mpa_info { + struct mpa_rr hdr; /* peer mpa hdr in host byte order */ + struct erdma_mpa_ext ext_data; + char *pdata; + int bytes_rcvd; +}; + +struct erdma_sk_upcalls { + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk, int bytes); + void (*sk_error_report)(struct sock *sk); +}; + +struct erdma_dev; + +enum erdma_cep_state { + ERDMA_EPSTATE_IDLE = 1, + ERDMA_EPSTATE_LISTENING, + ERDMA_EPSTATE_CONNECTING, + ERDMA_EPSTATE_AWAIT_MPAREQ, + ERDMA_EPSTATE_RECVD_MPAREQ, + ERDMA_EPSTATE_AWAIT_MPAREP, + ERDMA_EPSTATE_RDMA_MODE, + ERDMA_EPSTATE_CLOSED +}; + +struct erdma_cep { + struct iw_cm_id *cm_id; + struct erdma_dev *dev; + struct list_head devq; + spinlock_t lock; + struct kref ref; + int in_use; + wait_queue_head_t waitq; + enum erdma_cep_state state; + + struct list_head listenq; + struct erdma_cep *listen_cep; + + struct erdma_qp *qp; + struct socket *sock; + + struct erdma_cm_work *mpa_timer; + struct list_head work_freelist; + + struct erdma_mpa_info mpa; + int ord; + int ird; + + int pd_len; + /* hold user's private 
data. */ + void *private_data; + + /* Saved upcalls of socket llp.sock */ + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk); + void (*sk_error_report)(struct sock *sk); +}; + +#define MPAREQ_TIMEOUT (HZ * 20) +#define MPAREP_TIMEOUT (HZ * 10) +#define CONNECT_TIMEOUT (HZ * 10) + +enum erdma_work_type { + ERDMA_CM_WORK_ACCEPT = 1, + ERDMA_CM_WORK_READ_MPAHDR, + ERDMA_CM_WORK_CLOSE_LLP, /* close socket */ + ERDMA_CM_WORK_PEER_CLOSE, /* socket indicated peer close */ + ERDMA_CM_WORK_MPATIMEOUT, + ERDMA_CM_WORK_CONNECTED, + ERDMA_CM_WORK_CONNECTTIMEOUT +}; + +struct erdma_cm_work { + struct delayed_work work; + struct list_head list; + enum erdma_work_type type; + struct erdma_cep *cep; +}; + +#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a))) + +static inline int getname_peer(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 1); +} + +static inline int getname_local(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 0); +} + +int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *param); +int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param); +int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen); +int erdma_create_listen(struct iw_cm_id *id, int backlog); +int erdma_destroy_listen(struct iw_cm_id *id); + +void erdma_cep_get(struct erdma_cep *ceq); +void erdma_cep_put(struct erdma_cep *ceq); +int erdma_cm_queue_work(struct erdma_cep *ceq, enum erdma_work_type type); + +int erdma_cm_init(void); +void erdma_cm_exit(void); + +#define sk_to_cep(sk) ((struct erdma_cep *)((sk)->sk_user_data)) + +#endif From a6cd49345d4ecc7d84befd8c5632e1fc0c73130f Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:10 +0800 Subject: [PATCH 0629/1250] RDMA/erdma: Add the erdma module Add the main erdma module, which provides interface to infiniband subsystem. 
This commit includes a modification from Christophe, that using the bitmap API to allocate bitmaps instead of hand-writing. And the commit also fixes warnings reported by static checkers. Link: https://lore.kernel.org/r/20220713094212.30943-10-chengyou@linux.alibaba.com Signed-off-by: Christophe JAILLET Signed-off-by: Dan Carpenter Reported-by: kernel test robot Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/erdma/erdma_main.c | 608 +++++++++++++++++++++++ 1 file changed, 608 insertions(+) create mode 100644 drivers/infiniband/hw/erdma/erdma_main.c diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c new file mode 100644 index 00000000000000..07e743d248470f --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -0,0 +1,608 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +MODULE_AUTHOR("Cheng Xu "); +MODULE_DESCRIPTION("Alibaba elasticRDMA adapter driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +static int erdma_netdev_event(struct notifier_block *nb, unsigned long event, + void *arg) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(arg); + struct erdma_dev *dev = container_of(nb, struct erdma_dev, netdev_nb); + + if (dev->netdev == NULL || dev->netdev != netdev) + goto done; + + switch (event) { + case NETDEV_UP: + dev->state = IB_PORT_ACTIVE; + erdma_port_event(dev, IB_EVENT_PORT_ACTIVE); + break; + case NETDEV_DOWN: + dev->state = IB_PORT_DOWN; + erdma_port_event(dev, IB_EVENT_PORT_ERR); + break; + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGEMTU: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGE: + default: + break; + } + 
+done: + return NOTIFY_OK; +} + +static int erdma_enum_and_get_netdev(struct erdma_dev *dev) +{ + struct net_device *netdev; + int ret = -ENODEV; + + /* Already bound to a net_device, so we skip. */ + if (dev->netdev) + return 0; + + rtnl_lock(); + for_each_netdev(&init_net, netdev) { + /* + * In erdma, the paired netdev and ibdev should have the same + * MAC address. erdma can get the value from its PCIe bar + * registers. Since erdma can not get the paired netdev + * reference directly, we do a traverse here to get the paired + * netdev. + */ + if (ether_addr_equal_unaligned(netdev->perm_addr, + dev->attrs.peer_addr)) { + ret = ib_device_set_netdev(&dev->ibdev, netdev, 1); + if (ret) { + rtnl_unlock(); + ibdev_warn(&dev->ibdev, + "failed (%d) to link netdev", ret); + return ret; + } + + dev->netdev = netdev; + break; + } + } + + rtnl_unlock(); + + return ret; +} + +static int erdma_device_register(struct erdma_dev *dev) +{ + struct ib_device *ibdev = &dev->ibdev; + int ret; + + ret = erdma_enum_and_get_netdev(dev); + if (ret) + return ret; + + addrconf_addr_eui48((u8 *)&ibdev->node_guid, dev->netdev->dev_addr); + + ret = ib_register_device(ibdev, "erdma_%d", &dev->pdev->dev); + if (ret) { + dev_err(&dev->pdev->dev, + "ib_register_device failed: ret = %d\n", ret); + return ret; + } + + dev->netdev_nb.notifier_call = erdma_netdev_event; + ret = register_netdevice_notifier(&dev->netdev_nb); + if (ret) { + ibdev_err(&dev->ibdev, "failed to register notifier.\n"); + ib_unregister_device(ibdev); + } + + return ret; +} + +static irqreturn_t erdma_comm_irq_handler(int irq, void *data) +{ + struct erdma_dev *dev = data; + + erdma_cmdq_completion_handler(&dev->cmdq); + erdma_aeq_event_handler(dev); + + return IRQ_HANDLED; +} + +static void erdma_dwqe_resource_init(struct erdma_dev *dev) +{ + int total_pages, type0, type1; + + dev->attrs.grp_num = erdma_reg_read32(dev, ERDMA_REGS_GRP_NUM_REG); + + if (dev->attrs.grp_num < 4) + dev->attrs.disable_dwqe = true; + else +
dev->attrs.disable_dwqe = false; + + /* One page contains 4 groups. */ + total_pages = dev->attrs.grp_num * 4; + + if (dev->attrs.grp_num >= ERDMA_DWQE_MAX_GRP_CNT) { + dev->attrs.grp_num = ERDMA_DWQE_MAX_GRP_CNT; + type0 = ERDMA_DWQE_TYPE0_CNT; + type1 = ERDMA_DWQE_TYPE1_CNT / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + } else { + type1 = total_pages / 3; + type0 = total_pages - type1 - 1; + } + + dev->attrs.dwqe_pages = type0; + dev->attrs.dwqe_entries = type1 * ERDMA_DWQE_TYPE1_CNT_PER_PAGE; +} + +static int erdma_request_vectors(struct erdma_dev *dev) +{ + int expect_irq_num = min(num_possible_cpus() + 1, ERDMA_NUM_MSIX_VEC); + int ret; + + ret = pci_alloc_irq_vectors(dev->pdev, 1, expect_irq_num, PCI_IRQ_MSIX); + if (ret < 0) { + dev_err(&dev->pdev->dev, "request irq vectors failed(%d)\n", + ret); + return ret; + } + dev->attrs.irq_num = ret; + + return 0; +} + +static int erdma_comm_irq_init(struct erdma_dev *dev) +{ + snprintf(dev->comm_irq.name, ERDMA_IRQNAME_SIZE, "erdma-common@pci:%s", + pci_name(dev->pdev)); + dev->comm_irq.msix_vector = + pci_irq_vector(dev->pdev, ERDMA_MSIX_VECTOR_CMDQ); + + cpumask_set_cpu(cpumask_first(cpumask_of_pcibus(dev->pdev->bus)), + &dev->comm_irq.affinity_hint_mask); + irq_set_affinity_hint(dev->comm_irq.msix_vector, + &dev->comm_irq.affinity_hint_mask); + + return request_irq(dev->comm_irq.msix_vector, erdma_comm_irq_handler, 0, + dev->comm_irq.name, dev); +} + +static void erdma_comm_irq_uninit(struct erdma_dev *dev) +{ + irq_set_affinity_hint(dev->comm_irq.msix_vector, NULL); + free_irq(dev->comm_irq.msix_vector, dev); +} + +static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) +{ + int ret; + + erdma_dwqe_resource_init(dev); + + ret = dma_set_mask_and_coherent(&pdev->dev, + DMA_BIT_MASK(ERDMA_PCI_WIDTH)); + if (ret) + return ret; + + dma_set_max_seg_size(&pdev->dev, UINT_MAX); + + return 0; +} + +static void erdma_device_uninit(struct erdma_dev *dev) +{ + u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1); +
+ erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); +} + +static const struct pci_device_id erdma_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) }, + {} +}; + +static int erdma_probe_dev(struct pci_dev *pdev) +{ + struct erdma_dev *dev; + int bars, err; + u32 version; + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "pci_enable_device failed(%d)\n", err); + return err; + } + + pci_set_master(pdev); + + dev = ib_alloc_device(erdma_dev, ibdev); + if (!dev) { + dev_err(&pdev->dev, "ib_alloc_device failed\n"); + err = -ENOMEM; + goto err_disable_device; + } + + pci_set_drvdata(pdev, dev); + dev->pdev = pdev; + dev->attrs.numa_node = dev_to_node(&pdev->dev); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (bars != ERDMA_BAR_MASK || err) { + err = err ? err : -EINVAL; + goto err_ib_device_release; + } + + dev->func_bar_addr = pci_resource_start(pdev, ERDMA_FUNC_BAR); + dev->func_bar_len = pci_resource_len(pdev, ERDMA_FUNC_BAR); + + dev->func_bar = + devm_ioremap(&pdev->dev, dev->func_bar_addr, dev->func_bar_len); + if (!dev->func_bar) { + dev_err(&pdev->dev, "devm_ioremap failed.\n"); + err = -EFAULT; + goto err_release_bars; + } + + version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); + if (version == 0) { + /* we know that it is a non-functional function.
*/ + err = -ENODEV; + goto err_iounmap_func_bar; + } + + err = erdma_device_init(dev, pdev); + if (err) + goto err_iounmap_func_bar; + + err = erdma_request_vectors(dev); + if (err) + goto err_iounmap_func_bar; + + err = erdma_comm_irq_init(dev); + if (err) + goto err_free_vectors; + + err = erdma_aeq_init(dev); + if (err) + goto err_uninit_comm_irq; + + err = erdma_cmdq_init(dev); + if (err) + goto err_uninit_aeq; + + err = erdma_ceqs_init(dev); + if (err) + goto err_uninit_cmdq; + + erdma_finish_cmdq_init(dev); + + return 0; + +err_uninit_cmdq: + erdma_device_uninit(dev); + erdma_cmdq_destroy(dev); + +err_uninit_aeq: + erdma_aeq_destroy(dev); + +err_uninit_comm_irq: + erdma_comm_irq_uninit(dev); + +err_free_vectors: + pci_free_irq_vectors(dev->pdev); + +err_iounmap_func_bar: + devm_iounmap(&pdev->dev, dev->func_bar); + +err_release_bars: + pci_release_selected_regions(pdev, bars); + +err_ib_device_release: + ib_dealloc_device(&dev->ibdev); + +err_disable_device: + pci_disable_device(pdev); + + return err; +} + +static void erdma_remove_dev(struct pci_dev *pdev) +{ + struct erdma_dev *dev = pci_get_drvdata(pdev); + + erdma_ceqs_uninit(dev); + + erdma_device_uninit(dev); + + erdma_cmdq_destroy(dev); + erdma_aeq_destroy(dev); + erdma_comm_irq_uninit(dev); + pci_free_irq_vectors(dev->pdev); + + devm_iounmap(&pdev->dev, dev->func_bar); + pci_release_selected_regions(pdev, ERDMA_BAR_MASK); + + ib_dealloc_device(&dev->ibdev); + + pci_disable_device(pdev); +} + +#define ERDMA_GET_CAP(name, cap) FIELD_GET(ERDMA_CMD_DEV_CAP_##name##_MASK, cap) + +static int erdma_dev_attrs_init(struct erdma_dev *dev) +{ + int err; + u64 req_hdr, cap0, cap1; + + erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_QUERY_DEVICE); + + err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, + &cap1); + if (err) + return err; + + dev->attrs.max_cqe = 1 << ERDMA_GET_CAP(MAX_CQE, cap0); + dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0); + 
dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1); + dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0); + dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1); + dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1); + dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); + dev->attrs.max_mr = dev->attrs.max_qp << 1; + dev->attrs.max_cq = dev->attrs.max_qp << 1; + + dev->attrs.max_send_wr = ERDMA_MAX_SEND_WR; + dev->attrs.max_ord = ERDMA_MAX_ORD; + dev->attrs.max_ird = ERDMA_MAX_IRD; + dev->attrs.max_send_sge = ERDMA_MAX_SEND_SGE; + dev->attrs.max_recv_sge = ERDMA_MAX_RECV_SGE; + dev->attrs.max_sge_rd = ERDMA_MAX_SGE_RD; + dev->attrs.max_pd = ERDMA_MAX_PD; + + dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD; + dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr; + + erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_QUERY_FW_INFO); + + err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, + &cap1); + if (!err) + dev->attrs.fw_version = + FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0); + + return err; +} + +static int erdma_res_cb_init(struct erdma_dev *dev) +{ + int i, j; + + for (i = 0; i < ERDMA_RES_CNT; i++) { + dev->res_cb[i].next_alloc_idx = 1; + spin_lock_init(&dev->res_cb[i].lock); + dev->res_cb[i].bitmap = + bitmap_zalloc(dev->res_cb[i].max_cap, GFP_KERNEL); + if (!dev->res_cb[i].bitmap) + goto err; + } + + return 0; + +err: + for (j = 0; j < i; j++) + bitmap_free(dev->res_cb[j].bitmap); + + return -ENOMEM; +} + +static void erdma_res_cb_free(struct erdma_dev *dev) +{ + int i; + + for (i = 0; i < ERDMA_RES_CNT; i++) + bitmap_free(dev->res_cb[i].bitmap); +} + +static const struct ib_device_ops erdma_device_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_ERDMA, + .uverbs_abi_ver = ERDMA_ABI_VERSION, + + .alloc_mr = erdma_ib_alloc_mr, + .alloc_pd = erdma_alloc_pd, + .alloc_ucontext = erdma_alloc_ucontext, + .create_cq = erdma_create_cq, + .create_qp = 
erdma_create_qp, + .dealloc_pd = erdma_dealloc_pd, + .dealloc_ucontext = erdma_dealloc_ucontext, + .dereg_mr = erdma_dereg_mr, + .destroy_cq = erdma_destroy_cq, + .destroy_qp = erdma_destroy_qp, + .get_dma_mr = erdma_get_dma_mr, + .get_port_immutable = erdma_get_port_immutable, + .iw_accept = erdma_accept, + .iw_add_ref = erdma_qp_get_ref, + .iw_connect = erdma_connect, + .iw_create_listen = erdma_create_listen, + .iw_destroy_listen = erdma_destroy_listen, + .iw_get_qp = erdma_get_ibqp, + .iw_reject = erdma_reject, + .iw_rem_ref = erdma_qp_put_ref, + .map_mr_sg = erdma_map_mr_sg, + .mmap = erdma_mmap, + .mmap_free = erdma_mmap_free, + .modify_qp = erdma_modify_qp, + .post_recv = erdma_post_recv, + .post_send = erdma_post_send, + .poll_cq = erdma_poll_cq, + .query_device = erdma_query_device, + .query_gid = erdma_query_gid, + .query_port = erdma_query_port, + .query_qp = erdma_query_qp, + .req_notify_cq = erdma_req_notify_cq, + .reg_user_mr = erdma_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, erdma_ucontext, ibucontext), + INIT_RDMA_OBJ_SIZE(ib_qp, erdma_qp, ibqp), +}; + +static int erdma_ib_device_add(struct pci_dev *pdev) +{ + struct erdma_dev *dev = pci_get_drvdata(pdev); + struct ib_device *ibdev = &dev->ibdev; + u64 mac; + int ret; + + ret = erdma_dev_attrs_init(dev); + if (ret) + return ret; + + ibdev->node_type = RDMA_NODE_RNIC; + memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC)); + + /* + * Current model (one-to-one device association): + * One ERDMA device per net_device or, equivalently, + * per physical port. 
+ */ + ibdev->phys_port_cnt = 1; + ibdev->num_comp_vectors = dev->attrs.irq_num - 1; + + ib_set_device_ops(ibdev, &erdma_device_ops); + + INIT_LIST_HEAD(&dev->cep_list); + + spin_lock_init(&dev->lock); + xa_init_flags(&dev->qp_xa, XA_FLAGS_ALLOC1); + xa_init_flags(&dev->cq_xa, XA_FLAGS_ALLOC1); + dev->next_alloc_cqn = 1; + dev->next_alloc_qpn = 1; + + ret = erdma_res_cb_init(dev); + if (ret) + return ret; + + spin_lock_init(&dev->db_bitmap_lock); + bitmap_zero(dev->sdb_page, ERDMA_DWQE_TYPE0_CNT); + bitmap_zero(dev->sdb_entry, ERDMA_DWQE_TYPE1_CNT); + + atomic_set(&dev->num_ctx, 0); + + mac = erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_L_REG); + mac |= (u64)erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_H_REG) << 32; + + u64_to_ether_addr(mac, dev->attrs.peer_addr); + + ret = erdma_device_register(dev); + if (ret) + goto err_out; + + return 0; + +err_out: + xa_destroy(&dev->qp_xa); + xa_destroy(&dev->cq_xa); + + erdma_res_cb_free(dev); + + return ret; +} + +static void erdma_ib_device_remove(struct pci_dev *pdev) +{ + struct erdma_dev *dev = pci_get_drvdata(pdev); + + unregister_netdevice_notifier(&dev->netdev_nb); + ib_unregister_device(&dev->ibdev); + + erdma_res_cb_free(dev); + xa_destroy(&dev->qp_xa); + xa_destroy(&dev->cq_xa); +} + +static int erdma_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret; + + ret = erdma_probe_dev(pdev); + if (ret) + return ret; + + ret = erdma_ib_device_add(pdev); + if (ret) { + erdma_remove_dev(pdev); + return ret; + } + + return 0; +} + +static void erdma_remove(struct pci_dev *pdev) +{ + erdma_ib_device_remove(pdev); + erdma_remove_dev(pdev); +} + +static struct pci_driver erdma_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = erdma_pci_tbl, + .probe = erdma_probe, + .remove = erdma_remove +}; + +MODULE_DEVICE_TABLE(pci, erdma_pci_tbl); + +static __init int erdma_init_module(void) +{ + int ret; + + ret = erdma_cm_init(); + if (ret) + return ret; + + ret = pci_register_driver(&erdma_pci_driver); + if 
(ret) + erdma_cm_exit(); + + return ret; +} + +static void __exit erdma_exit_module(void) +{ + pci_unregister_driver(&erdma_pci_driver); + + erdma_cm_exit(); +} + +module_init(erdma_init_module); +module_exit(erdma_exit_module); From 06eb746d91cf12a6fed6694c4351ef2bda883c64 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:11 +0800 Subject: [PATCH 0630/1250] RDMA/erdma: Add the ABI definitions Link: https://lore.kernel.org/r/20220713094212.30943-11-chengyou@linux.alibaba.com Reported-by: kernel test robot Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/erdma-abi.h | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 include/uapi/rdma/erdma-abi.h diff --git a/include/uapi/rdma/erdma-abi.h b/include/uapi/rdma/erdma-abi.h new file mode 100644 index 00000000000000..fcbaff1d84c3ef --- /dev/null +++ b/include/uapi/rdma/erdma-abi.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * Copyright (c) 2020-2022, Alibaba Group. 
+ */ + +#ifndef __ERDMA_USER_H__ +#define __ERDMA_USER_H__ + +#include + +#define ERDMA_ABI_VERSION 1 + +struct erdma_ureq_create_cq { + __u64 db_record_va; + __u64 qbuf_va; + __u32 qbuf_len; + __u32 rsvd0; +}; + +struct erdma_uresp_create_cq { + __u32 cq_id; + __u32 num_cqe; +}; + +struct erdma_ureq_create_qp { + __u64 db_record_va; + __u64 qbuf_va; + __u32 qbuf_len; + __u32 rsvd0; +}; + +struct erdma_uresp_create_qp { + __u32 qp_id; + __u32 num_sqe; + __u32 num_rqe; + __u32 rq_offset; +}; + +struct erdma_uresp_alloc_ctx { + __u32 dev_id; + __u32 pad; + __u32 sdb_type; + __u32 sdb_offset; + __u64 sdb; + __u64 rdb; + __u64 cdb; +}; + +#endif From bea86a8116f60c40d3d49cfef02e0e8d82c4c4d1 Mon Sep 17 00:00:00 2001 From: Cheng Xu Date: Wed, 13 Jul 2022 17:42:12 +0800 Subject: [PATCH 0631/1250] RDMA/erdma: Add driver to kernel build environment Add erdma to the kernel build environment, and sort the source order in drivers/infiniband/Kconfig. Link: https://lore.kernel.org/r/20220713094212.30943-12-chengyou@linux.alibaba.com Signed-off-by: Cheng Xu Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 8 ++++++++ drivers/infiniband/Kconfig | 15 ++++++++------- drivers/infiniband/hw/Makefile | 1 + drivers/infiniband/hw/erdma/Kconfig | 12 ++++++++++++ drivers/infiniband/hw/erdma/Makefile | 4 ++++ 5 files changed, 33 insertions(+), 7 deletions(-) create mode 100644 drivers/infiniband/hw/erdma/Kconfig create mode 100644 drivers/infiniband/hw/erdma/Makefile diff --git a/MAINTAINERS b/MAINTAINERS index a6d3bd9d2a8d0f..e034f1461eb4d8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -733,6 +733,14 @@ S: Maintained F: Documentation/i2c/busses/i2c-ali1563.rst F: drivers/i2c/busses/i2c-ali1563.c +ALIBABA ELASTIC RDMA DRIVER +M: Cheng Xu +M: Kai Shen +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/hw/erdma +F: include/uapi/rdma/erdma-abi.h + ALIENWARE WMI DRIVER L: Dell.Client.Kernel@dell.com S: Maintained diff --git a/drivers/infiniband/Kconfig 
b/drivers/infiniband/Kconfig index 33d3ce9c888ecf..aa36ac618e7291 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -78,20 +78,21 @@ config INFINIBAND_VIRT_DMA def_bool !HIGHMEM if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS -source "drivers/infiniband/hw/mthca/Kconfig" -source "drivers/infiniband/hw/qib/Kconfig" +source "drivers/infiniband/hw/bnxt_re/Kconfig" source "drivers/infiniband/hw/cxgb4/Kconfig" source "drivers/infiniband/hw/efa/Kconfig" +source "drivers/infiniband/hw/erdma/Kconfig" +source "drivers/infiniband/hw/hfi1/Kconfig" +source "drivers/infiniband/hw/hns/Kconfig" source "drivers/infiniband/hw/irdma/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" source "drivers/infiniband/hw/mlx5/Kconfig" +source "drivers/infiniband/hw/mthca/Kconfig" source "drivers/infiniband/hw/ocrdma/Kconfig" -source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" -source "drivers/infiniband/hw/usnic/Kconfig" -source "drivers/infiniband/hw/hns/Kconfig" -source "drivers/infiniband/hw/bnxt_re/Kconfig" -source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/qedr/Kconfig" +source "drivers/infiniband/hw/qib/Kconfig" +source "drivers/infiniband/hw/usnic/Kconfig" +source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" source "drivers/infiniband/sw/rxe/Kconfig" source "drivers/infiniband/sw/siw/Kconfig" diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index fba0b3be903e26..6b3a88046125ad 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ obj-$(CONFIG_INFINIBAND_HNS) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ +obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ diff --git a/drivers/infiniband/hw/erdma/Kconfig b/drivers/infiniband/hw/erdma/Kconfig new file mode 100644 index 00000000000000..169038e3ceb12f --- /dev/null +++ 
b/drivers/infiniband/hw/erdma/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_ERDMA + tristate "Alibaba Elastic RDMA Adapter (ERDMA) support" + depends on PCI_MSI && 64BIT + depends on INFINIBAND_ADDR_TRANS + depends on INFINIBAND_USER_ACCESS + help + This is a RDMA/iWarp driver for Alibaba Elastic RDMA Adapter(ERDMA), + which supports RDMA features in Alibaba cloud environment. + + To compile this driver as module, choose M here. The module will be + called erdma. diff --git a/drivers/infiniband/hw/erdma/Makefile b/drivers/infiniband/hw/erdma/Makefile new file mode 100644 index 00000000000000..51d2ef91905a8c --- /dev/null +++ b/drivers/infiniband/hw/erdma/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_ERDMA) := erdma.o + +erdma-y := erdma_cm.o erdma_main.o erdma_cmdq.o erdma_cq.o erdma_verbs.o erdma_qp.o erdma_eq.o From ab345b04433da6191f5cecfc036c9419ce05011e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Jul 2022 17:12:14 -0700 Subject: [PATCH 0632/1250] Bluetooth: hci_sync: Fix not updating privacy_mode When programming a new entry into the resolving list it shall default to network mode since the params may contain the mode programmed when the device was last added to the resolving list. 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=209745 Fixes: 853b70b506a20 ("Bluetooth: hci_sync: Set Privacy Mode when updating the resolving list") Signed-off-by: Luiz Augusto von Dentz Tested-by: Zhengping Jiang --- net/bluetooth/hci_sync.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 212b0cdb25f5ef..8da3053fa1efa1 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1636,6 +1636,9 @@ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, bacpy(&cp.bdaddr, &params->addr); memcpy(cp.peer_irk, irk->val, 16); + /* Default privacy mode is always Network */ + params->privacy_mode = HCI_NETWORK_PRIVACY; + done: if (hci_dev_test_flag(hdev, HCI_PRIVACY)) memcpy(cp.local_irk, hdev->irk, 16); From ff1688aab0d1a2947eb12bf4c839006b6143a5d1 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 13 Jul 2022 17:17:55 -0700 Subject: [PATCH 0633/1250] Bluetooth: hci_sync: Don't remove connected devices from accept list These devices are likely going to be reprogrammed when disconnected so this avoids a whole bunch of commands attempting to remove them and then add them back to the list. Signed-off-by: Luiz Augusto von Dentz Tested-by: Zhengping Jiang --- net/bluetooth/hci_sync.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 8da3053fa1efa1..464a5e2c56fb5b 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1892,12 +1892,15 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) } /* Go through the current accept list programmed into the - * controller one by one and check if that address is still - * in the list of pending connections or list of devices to + * controller one by one and check if that address is connected or is + * still in the list of pending connections or list of devices to * report. If not present in either list, then remove it from * the controller.
*/ list_for_each_entry_safe(b, t, &hdev->le_accept_list, list) { + if (hci_conn_hash_lookup_le(hdev, &b->bdaddr, b->bdaddr_type)) + continue; + pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns, &b->bdaddr, b->bdaddr_type); From 2d09caa02b0868f749567906c2362b13dd057f01 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 28 Jun 2022 17:25:39 -0400 Subject: [PATCH 0634/1250] scsi: sd: Fix discard errors during revalidate We previously switched to SD_LBP_WS16 mode by default if a device identified itself as being thinly provisioned. This was done for compatibility with older devices that predate the Logical Block Provisioning VPD page and the introduction of the separate UNMAP command. Since WS16 was originally the only option there was no way to explicitly signal support for it outside of the device reporting LBPME=1. And thus we switch it on every time we discover a thinly provisioned device in READ CAPACITY(16). Some devices, however, report different values for unmap operations performed with WRITE SAME and ones performed with the UNMAP command. For instance a device may report that it can unmap 64KB with WRITE SAME but only 32KB with UNMAP. If the device then reports a preference for UNMAP in the LBP VPD, there is a tiny window between the WS16 being enabled and the UNMAP limit being set. And during that window the block layer can issue 64KB discards which, when being prepped by the sd driver, now violate the UNMAP limit. To avoid temporarily setting WS16 during revalidate, relocate all the provisioning mode setting heuristics to sd_config_discard(). Introduce a new mode, SD_LBP_DEFAULT, which sd_revalidate() will use to trigger the heuristic to select a suitable mode based on what the device reports. SD_LBP_DEFAULT can also be triggered in sysfs via the string "default", should a user decide to change back to the kernel-chosen provisioning mode after manually overriding the default. 
Link: https://lore.kernel.org/r/20220302053559.32147-10-martin.petersen@oracle.com Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 124 +++++++++++++++++++++++++++++++++++----------- drivers/scsi/sd.h | 9 ++-- 2 files changed, 101 insertions(+), 32 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 081e39b3543b7f..9b2144e30eb694 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -99,7 +99,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 -static void sd_config_discard(struct scsi_disk *, unsigned int); +static void sd_config_discard(struct scsi_disk *, enum sd_lbp_mode); static void sd_config_write_same(struct scsi_disk *); static int sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); @@ -369,6 +369,7 @@ static DEVICE_ATTR_RO(thin_provisioning); /* sysfs_match_string() requires dense arrays */ static const char *lbp_mode[] = { + [SD_LBP_DEFAULT] = "default", [SD_LBP_FULL] = "full", [SD_LBP_UNMAP] = "unmap", [SD_LBP_WS16] = "writesame_16", @@ -409,6 +410,11 @@ provisioning_mode_store(struct device *dev, struct device_attribute *attr, if (mode < 0) return -EINVAL; + if (mode == SD_LBP_DEFAULT) + sdkp->provisioning_override = false; + else + sdkp->provisioning_override = true; + sd_config_discard(sdkp, mode); return count; @@ -780,23 +786,95 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, return protect; } -static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) +/* + * It took many iterations in T10 to develop a model for thinly provisioned + * devices. Linux was an early adopter of the concept of discards, and as a + * result of the SCSI spec being a moving target for several years, we have a + * set of heuristics in place that allow us to support a wide variety of + * devices that predate the final SBC specification. 
+ * + * These heuristics are triggered by default during device discovery, but the + * user can subsequently override what the kernel decided by writing a + * particular mode string to a scsi_disk's provisioning_mode node in sysfs. + * For devices that predate any of the provisioning knobs in the spec but rely + * on zero-detection, it is possible to enable discard through the + * "writesame_zero" override. + * + * For Linux to automatically identify a SCSI disk as being thinly + * provisioned, the device must set the LBPME bit in READ CAPACITY(16). + * + * In the ratified version of T10 SBC-4, a device must also provide a Logical + * Block Provisioning VPD page which has three fields that indicate which + * provisioning commands the device supports. The device should also implement + * the extended version of the Block Limits VPD which is used to indicate any + * limitations on the size of unmap operations as well as alignment and + * granularity used inside the device. + * + * If the device supports the Logical Block Provisioning VPD, and sets the + * LBPU flag, and reports a MAXIMUM UNMAP LBA COUNT > 0 and a MAXIMUM UNMAP + * BLOCK DESCRIPTOR count > 0 in the extended Block Limits VPD, then we will + * use UNMAP for discards. Otherwise, if the device set LBPWS in the LBP VPD, + * we will use WRITE SAME(16) with the UNMAP bit set for discards. Otherwise, + * if the device sets LBPWS10 in the LBP VPD, then we will use WRITE SAME(10) + * with the UNMAP bit set for discards. + * + * If the device does *not* support the Logical Block Provisioning VPD, we + * rely on the extended version of the Block Limits VPD. If that is supported, + * and the device reports a MAXIMUM UNMAP LBA COUNT > 0 and a MAXIMUM + * UNMAP BLOCK DESCRIPTOR count > 0, then we will use UNMAP for discards. + * Otherwise we will use WRITE SAME(16) with the UNMAP bit set for discards. 
+ * + * If a device implements the *short* version of the Block Limits VPD or does + * not have a Block Limits VPD at all, we default to using WRITE SAME(16) with + * the UNMAP bit set for discards. + * + * The possible values for provisioning_mode in sysfs are: + * + * "default" - use heuristics outlined above to decide on command + * "full" - the device does not support discard + * "unmap" - use the UNMAP command + * "writesame_16" - use the WRITE SAME(16) command with the UNMAP bit set + * "writesame_10" - use the WRITE SAME(10) command with the UNMAP bit set + * "writesame_zero" - use WRITE SAME(16) with a zeroed payload, no UNMAP bit + * "disabled" - discards disabled due to command failure + */ +static void sd_config_discard(struct scsi_disk *sdkp, enum sd_lbp_mode mode) { struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; - q->limits.discard_alignment = - sdkp->unmap_alignment * logical_block_size; - q->limits.discard_granularity = - max(sdkp->physical_block_size, - sdkp->unmap_granularity * logical_block_size); - sdkp->provisioning_mode = mode; + if (mode == SD_LBP_DEFAULT && !sdkp->provisioning_override) { + if (sdkp->lbpme) { /* Logical Block Provisioning Enabled */ + if (sdkp->lbpvpd) { /* Logical Block Provisioning VPD */ + if (sdkp->lbpu && sdkp->max_unmap_blocks) + mode = SD_LBP_UNMAP; + else if (sdkp->lbpws) + mode = SD_LBP_WS16; + else if (sdkp->lbpws10) + mode = SD_LBP_WS10; + else + mode = SD_LBP_FULL; + } else if (sdkp->lblvpd) { /* Long Block Limits VPD */ + if (sdkp->max_unmap_blocks) + mode = SD_LBP_UNMAP; + else + mode = SD_LBP_WS16; + } else /* LBPME only, no VPDs supported */ + mode = SD_LBP_WS16; + } else + mode = SD_LBP_FULL; + } switch (mode) { - + case SD_LBP_DEFAULT: case SD_LBP_FULL: case SD_LBP_DISABLE: + if (mode == SD_LBP_DISABLE) + sdkp->provisioning_override = true; + sdkp->provisioning_mode = mode; + q->limits.discard_alignment = 0; + 
q->limits.discard_granularity = 0; blk_queue_max_discard_sectors(q, 0); return; @@ -829,6 +907,12 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) break; } + sdkp->provisioning_mode = mode; + q->limits.discard_alignment = + sdkp->unmap_alignment * logical_block_size; + q->limits.discard_granularity = + max(sdkp->physical_block_size, + sdkp->unmap_granularity * logical_block_size); blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); } @@ -2334,8 +2418,6 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; - - sd_config_discard(sdkp, SD_LBP_WS16); } sdkp->capacity = lba + 1; @@ -2865,6 +2947,7 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) if (vpd->len >= 64) { unsigned int lba_count, desc_count; + sdkp->lblvpd = 1; sdkp->max_ws_blocks = (u32)get_unaligned_be64(&vpd->data[36]); if (!sdkp->lbpme) @@ -2881,24 +2964,6 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) if (vpd->data[32] & 0x80) sdkp->unmap_alignment = get_unaligned_be32(&vpd->data[32]) & ~(1 << 31); - - if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ - - if (sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); - else - sd_config_discard(sdkp, SD_LBP_WS16); - - } else { /* LBP VPD page tells us what to use */ - if (sdkp->lbpu && sdkp->max_unmap_blocks) - sd_config_discard(sdkp, SD_LBP_UNMAP); - else if (sdkp->lbpws) - sd_config_discard(sdkp, SD_LBP_WS16); - else if (sdkp->lbpws10) - sd_config_discard(sdkp, SD_LBP_WS10); - else - sd_config_discard(sdkp, SD_LBP_DISABLE); - } } out: @@ -3267,6 +3332,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); sd_config_protection(sdkp); + sd_config_discard(sdkp, SD_LBP_DEFAULT); } /* diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index b89187761d61f3..8ad33be82e5ffd 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -51,12 
+51,13 @@ enum { SD_MAX_WS16_BLOCKS = 0x7fffff, }; -enum { - SD_LBP_FULL = 0, /* Full logical block provisioning */ +enum sd_lbp_mode { + SD_LBP_DEFAULT = 0, /* Select mode based on what device reports */ + SD_LBP_FULL, /* Full logical block provisioning */ SD_LBP_UNMAP, /* Use UNMAP command */ SD_LBP_WS16, /* Use WRITE SAME(16) with UNMAP bit */ SD_LBP_WS10, /* Use WRITE SAME(10) with UNMAP bit */ - SD_LBP_ZERO, /* Use WRITE SAME(10) with zero payload */ + SD_LBP_ZERO, /* Use WRITE SAME(10) with zeroed payload */ SD_LBP_DISABLE, /* Discard disabled due to failed cmd */ }; @@ -131,6 +132,8 @@ struct scsi_disk { u8 provisioning_mode; u8 zeroing_mode; u8 nr_actuators; /* Number of actuators */ + bool lblvpd; /* Long Block Limits VPD */ + bool provisioning_override; unsigned ATO : 1; /* state of disk ATO bit */ unsigned cache_override : 1; /* temp override of WCE,RCD */ unsigned WCE : 1; /* state of disk WCE bit */ From 605a1c008d9102a5f99972c0f5044c05c2ff978b Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 3 Jan 2022 18:21:23 -0500 Subject: [PATCH 0635/1250] scsi: sd: Move WRITE_ZEROES configuration to a separate function In preparation for adding support for the WRITE SAME(16) NDOB flag, move configuration of the WRITE_ZEROES operation to a separate function. This is done to facilitate fetching all VPD pages before choosing the appropriate zeroing method for a given device. The deferred configuration also allows us to mirror the discard behavior and permit the user to revert a device to the kernel default configuration by echoing "default" to the sysfs file. Link: https://lore.kernel.org/r/20220302053559.32147-11-martin.petersen@oracle.com Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sd.c | 56 +++++++++++++++++++++++++++++++++-------------- drivers/scsi/sd.h | 7 ++++-- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 9b2144e30eb694..a9fb15442ca699 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -100,6 +100,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #define SD_MINORS 16 static void sd_config_discard(struct scsi_disk *, enum sd_lbp_mode); +static void sd_config_write_zeroes(struct scsi_disk *, enum sd_zeroing_mode); static void sd_config_write_same(struct scsi_disk *); static int sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); @@ -423,10 +424,12 @@ static DEVICE_ATTR_RW(provisioning_mode); /* sysfs_match_string() requires dense arrays */ static const char *zeroing_mode[] = { + [SD_ZERO_DEFAULT] = "default", [SD_ZERO_WRITE] = "write", [SD_ZERO_WS] = "writesame", [SD_ZERO_WS16_UNMAP] = "writesame_16_unmap", [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap", + [SD_ZERO_DISABLE] = "disabled", }; static ssize_t @@ -452,7 +455,12 @@ zeroing_mode_store(struct device *dev, struct device_attribute *attr, if (mode < 0) return -EINVAL; - sdkp->zeroing_mode = mode; + if (mode == SD_ZERO_DEFAULT) + sdkp->zeroing_override = false; + else + sdkp->zeroing_override = true; + + sd_config_write_zeroes(sdkp, mode); return count; } @@ -1015,6 +1023,31 @@ static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, return scsi_alloc_sgtables(cmd); } +static void sd_config_write_zeroes(struct scsi_disk *sdkp, + enum sd_zeroing_mode mode) +{ + struct request_queue *q = sdkp->disk->queue; + unsigned int logical_block_size = sdkp->device->sector_size; + + if (mode == SD_ZERO_DEFAULT && !sdkp->zeroing_override) { + if (sdkp->lbprz && sdkp->lbpws) + mode = SD_ZERO_WS16_UNMAP; + else if (sdkp->lbprz && sdkp->lbpws10) + mode = SD_ZERO_WS10_UNMAP; + else if (sdkp->max_ws_blocks) + mode = SD_ZERO_WS; + else + mode = SD_ZERO_WRITE; + 
} + + if (mode == SD_ZERO_DISABLE) + sdkp->zeroing_override = true; + + sdkp->zeroing_mode = mode; + blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks * + (logical_block_size >> 9)); +} + static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = scsi_cmd_to_rq(cmd); @@ -1045,12 +1078,11 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) static void sd_config_write_same(struct scsi_disk *sdkp) { - struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { sdkp->max_ws_blocks = 0; - goto out; + return; } /* Some devices can not handle block counts above 0xffff despite @@ -1069,15 +1101,6 @@ static void sd_config_write_same(struct scsi_disk *sdkp) sdkp->max_ws_blocks = 0; } - if (sdkp->lbprz && sdkp->lbpws) - sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP; - else if (sdkp->lbprz && sdkp->lbpws10) - sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP; - else if (sdkp->max_ws_blocks) - sdkp->zeroing_mode = SD_ZERO_WS; - else - sdkp->zeroing_mode = SD_ZERO_WRITE; - if (sdkp->max_ws_blocks && sdkp->physical_block_size > logical_block_size) { /* @@ -1097,10 +1120,6 @@ static void sd_config_write_same(struct scsi_disk *sdkp) bytes_to_logical(sdkp->device, sdkp->physical_block_size)); } - -out: - blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks * - (logical_block_size >> 9)); } static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd) @@ -2097,6 +2116,8 @@ static int sd_done(struct scsi_cmnd *SCpnt) case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ sd_config_discard(sdkp, SD_LBP_DISABLE); + sd_config_write_zeroes(sdkp, + SD_ZERO_DISABLE); } else { sdkp->device->no_write_same = 1; sd_config_write_same(sdkp); @@ -3332,7 +3353,9 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); sd_config_protection(sdkp); + sd_config_write_same(sdkp); sd_config_discard(sdkp, 
SD_LBP_DEFAULT); + sd_config_write_zeroes(sdkp, SD_ZERO_DEFAULT); } /* @@ -3378,7 +3401,6 @@ static int sd_revalidate_disk(struct gendisk *disk) sdkp->first_scan = 0; set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); - sd_config_write_same(sdkp); kfree(buffer); /* diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 8ad33be82e5ffd..5028fcbd2be600 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -61,11 +61,13 @@ enum sd_lbp_mode { SD_LBP_DISABLE, /* Discard disabled due to failed cmd */ }; -enum { - SD_ZERO_WRITE = 0, /* Use WRITE(10/16) command */ +enum sd_zeroing_mode { + SD_ZERO_DEFAULT = 0, /* Default mode based on what device reports */ + SD_ZERO_WRITE, /* Use WRITE(10/16) command */ SD_ZERO_WS, /* Use WRITE SAME(10/16) command */ SD_ZERO_WS16_UNMAP, /* Use WRITE SAME(16) with UNMAP */ SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */ + SD_ZERO_DISABLE, /* Write Zeroes disabled due to failed cmd */ }; /** @@ -134,6 +136,7 @@ struct scsi_disk { u8 nr_actuators; /* Number of actuators */ bool lblvpd; /* Long Block Limits VPD */ bool provisioning_override; + bool zeroing_override; unsigned ATO : 1; /* state of disk ATO bit */ unsigned cache_override : 1; /* temp override of WCE,RCD */ unsigned WCE : 1; /* state of disk WCE bit */ From 33532ca20451fe4625b7ce5a7f9052626f676b10 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 27 Feb 2019 21:19:41 -0500 Subject: [PATCH 0636/1250] scsi: sd: Implement support for NDOB flag in WRITE SAME(16) The NDOB flag removes the need for a zeroed logical block in the data-out buffer when using WRITE SAME(16) to zero block ranges. Implement support for NDOB in the SCSI disk driver to mirror WRITE ZEROES in NVMe. The only way to detect whether a device supports NDOB is through REPORT SUPPORTED OPERATION CODES. Since we can't safely send that command to all devices we only attempt this if the device implements the Block Provisioning VPD page and sets the LBPWS flag. 
If we issue a WRITE SAME(16) we check whether NDOB is set for the device in question. If so we do not allocate a zeroed page from the pool and simply issue the command with a zero-length payload. Link: https://lore.kernel.org/r/20220302053559.32147-12-martin.petersen@oracle.com Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_trace.c | 3 +- drivers/scsi/sd.c | 93 ++++++++++++++++++++++++++------------- drivers/scsi/sd.h | 4 ++ 3 files changed, 69 insertions(+), 31 deletions(-) diff --git a/drivers/scsi/scsi_trace.c b/drivers/scsi/scsi_trace.c index 41a9500759130f..1d1f25f689efee 100644 --- a/drivers/scsi/scsi_trace.c +++ b/drivers/scsi/scsi_trace.c @@ -83,7 +83,8 @@ scsi_trace_rw16(struct trace_seq *p, unsigned char *cdb, int len) cdb[1] >> 5); if (cdb[0] == WRITE_SAME_16) - trace_seq_printf(p, " unmap=%u", cdb[1] >> 3 & 1); + trace_seq_printf(p, " unmap=%u ndob=%u", cdb[1] >> 3 & 1, + cdb[1] & 1); trace_seq_putc(p, 0); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index a9fb15442ca699..981e99e3931090 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -374,6 +374,7 @@ static const char *lbp_mode[] = { [SD_LBP_FULL] = "full", [SD_LBP_UNMAP] = "unmap", [SD_LBP_WS16] = "writesame_16", + [SD_LBP_WS16_NDOB] = "writesame_16_ndob", [SD_LBP_WS10] = "writesame_10", [SD_LBP_ZERO] = "writesame_zero", [SD_LBP_DISABLE] = "disabled", @@ -424,12 +425,14 @@ static DEVICE_ATTR_RW(provisioning_mode); /* sysfs_match_string() requires dense arrays */ static const char *zeroing_mode[] = { - [SD_ZERO_DEFAULT] = "default", - [SD_ZERO_WRITE] = "write", - [SD_ZERO_WS] = "writesame", - [SD_ZERO_WS16_UNMAP] = "writesame_16_unmap", - [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap", - [SD_ZERO_DISABLE] = "disabled", + [SD_ZERO_DEFAULT] = "default", + [SD_ZERO_WRITE] = "write", + [SD_ZERO_WS] = "writesame", + [SD_ZERO_WS16_UNMAP_NDOB] = "writesame_16_unmap_ndob", + [SD_ZERO_WS16_UNMAP] = 
"writesame_16_unmap", + [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap", + [SD_ZERO_WS16_NDOB] = "writesame_16_ndob", + [SD_ZERO_DISABLE] = "disabled", }; static ssize_t @@ -838,13 +841,14 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, * * The possible values for provisioning_mode in sysfs are: * - * "default" - use heuristics outlined above to decide on command - * "full" - the device does not support discard - * "unmap" - use the UNMAP command - * "writesame_16" - use the WRITE SAME(16) command with the UNMAP bit set - * "writesame_10" - use the WRITE SAME(10) command with the UNMAP bit set - * "writesame_zero" - use WRITE SAME(16) with a zeroed payload, no UNMAP bit - * "disabled" - discards disabled due to command failure + * "default" - use heuristics outlined above to decide on command + * "full" - the device does not support discard + * "unmap" - use the UNMAP command + * "writesame_16" - use the WRITE SAME(16) command with the UNMAP bit set + * "writesame_16_ndob" - use WRITE SAME(16) with UNMAP and NDOB bits set + * "writesame_10" - use the WRITE SAME(10) command with the UNMAP bit set + * "writesame_zero" - use WRITE SAME(16) with a zeroed payload, no UNMAP bit + * "disabled" - discards disabled due to command failure */ static void sd_config_discard(struct scsi_disk *sdkp, enum sd_lbp_mode mode) { @@ -857,9 +861,12 @@ static void sd_config_discard(struct scsi_disk *sdkp, enum sd_lbp_mode mode) if (sdkp->lbpvpd) { /* Logical Block Provisioning VPD */ if (sdkp->lbpu && sdkp->max_unmap_blocks) mode = SD_LBP_UNMAP; - else if (sdkp->lbpws) - mode = SD_LBP_WS16; - else if (sdkp->lbpws10) + else if (sdkp->lbpws) { + if (sdkp->ndob) + mode = SD_LBP_WS16_NDOB; + else + mode = SD_LBP_WS16; + } else if (sdkp->lbpws10) mode = SD_LBP_WS10; else mode = SD_LBP_FULL; @@ -892,6 +899,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, enum sd_lbp_mode mode) break; case SD_LBP_WS16: + case SD_LBP_WS16_NDOB: if (sdkp->device->unmap_limit_for_ws) 
max_blocks = sdkp->max_unmap_blocks; else @@ -960,7 +968,7 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) } static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, - bool unmap) + bool unmap, bool ndob) { struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); @@ -969,23 +977,32 @@ static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); u32 data_len = sdp->sector_size; - rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); - if (!rq->special_vec.bv_page) - return BLK_STS_RESOURCE; - clear_highpage(rq->special_vec.bv_page); - rq->special_vec.bv_offset = 0; - rq->special_vec.bv_len = data_len; + if (ndob) { + rq->special_vec.bv_page = NULL; + rq->special_vec.bv_len = 0; + } else { + rq->special_vec.bv_page = + mempool_alloc(sd_page_pool, GFP_ATOMIC); + if (!rq->special_vec.bv_page) + return BLK_STS_RESOURCE; + clear_highpage(rq->special_vec.bv_page); + rq->special_vec.bv_len = data_len; + } + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->special_vec.bv_offset = 0; cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ + if (ndob) + cmd->cmnd[1] |= 0x1; /* NDOB */ put_unaligned_be64(lba, &cmd->cmnd[2]); put_unaligned_be32(nr_blocks, &cmd->cmnd[10]); cmd->allowed = sdkp->max_retries; - cmd->transfersize = data_len; + cmd->transfersize = rq->special_vec.bv_len; rq->timeout = unmap ? 
SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; return scsi_alloc_sgtables(cmd); @@ -1030,10 +1047,14 @@ static void sd_config_write_zeroes(struct scsi_disk *sdkp, unsigned int logical_block_size = sdkp->device->sector_size; if (mode == SD_ZERO_DEFAULT && !sdkp->zeroing_override) { - if (sdkp->lbprz && sdkp->lbpws) + if (sdkp->lbprz && sdkp->lbpws && sdkp->ndob) + mode = SD_ZERO_WS16_UNMAP_NDOB; + else if (sdkp->lbprz && sdkp->lbpws) mode = SD_ZERO_WS16_UNMAP; else if (sdkp->lbprz && sdkp->lbpws10) mode = SD_ZERO_WS10_UNMAP; + else if (sdkp->max_ws_blocks && sdkp->ndob) + mode = SD_ZERO_WS16_NDOB; else if (sdkp->max_ws_blocks) mode = SD_ZERO_WS; else @@ -1058,8 +1079,10 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) if (!(rq->cmd_flags & REQ_NOUNMAP)) { switch (sdkp->zeroing_mode) { + case SD_ZERO_WS16_UNMAP_NDOB: + return sd_setup_write_same16_cmnd(cmd, true, true); case SD_ZERO_WS16_UNMAP: - return sd_setup_write_same16_cmnd(cmd, true); + return sd_setup_write_same16_cmnd(cmd, true, false); case SD_ZERO_WS10_UNMAP: return sd_setup_write_same10_cmnd(cmd, true); } @@ -1070,8 +1093,12 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) return BLK_STS_TARGET; } - if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff) - return sd_setup_write_same16_cmnd(cmd, false); + if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff) { + if (sdkp->zeroing_mode == SD_ZERO_WS16_NDOB) + return sd_setup_write_same16_cmnd(cmd, false, true); + else + return sd_setup_write_same16_cmnd(cmd, false, false); + } return sd_setup_write_same10_cmnd(cmd, false); } @@ -1338,7 +1365,9 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd) case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: - return sd_setup_write_same16_cmnd(cmd, true); + return sd_setup_write_same16_cmnd(cmd, true, false); + case SD_LBP_WS16_NDOB: + return sd_setup_write_same16_cmnd(cmd, true, true); case SD_LBP_WS10: return sd_setup_write_same10_cmnd(cmd, true); 
case SD_LBP_ZERO: @@ -3101,9 +3130,13 @@ static void sd_read_write_same(struct scsi_disk *sdkp, unsigned char *buffer) rcu_read_unlock(); } - if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16) == 1) + if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16) == 1) { sdkp->ws16 = 1; + if (get_unaligned_be16(&buffer[2]) >= 2) + sdkp->ndob = buffer[5] & 1; + } + if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME) == 1) sdkp->ws10 = 1; } diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 5028fcbd2be600..b843a94ad53c52 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -56,6 +56,7 @@ enum sd_lbp_mode { SD_LBP_FULL, /* Full logical block provisioning */ SD_LBP_UNMAP, /* Use UNMAP command */ SD_LBP_WS16, /* Use WRITE SAME(16) with UNMAP bit */ + SD_LBP_WS16_NDOB, /* Use WRITE SAME(16) with UNMAP + NDOB bits */ SD_LBP_WS10, /* Use WRITE SAME(10) with UNMAP bit */ SD_LBP_ZERO, /* Use WRITE SAME(10) with zeroed payload */ SD_LBP_DISABLE, /* Discard disabled due to failed cmd */ @@ -65,8 +66,10 @@ enum sd_zeroing_mode { SD_ZERO_DEFAULT = 0, /* Default mode based on what device reports */ SD_ZERO_WRITE, /* Use WRITE(10/16) command */ SD_ZERO_WS, /* Use WRITE SAME(10/16) command */ + SD_ZERO_WS16_UNMAP_NDOB,/* Use WRITE SAME(16) with UNMAP + NDOB bits */ SD_ZERO_WS16_UNMAP, /* Use WRITE SAME(16) with UNMAP */ SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */ + SD_ZERO_WS16_NDOB, /* Use WRITE SAME(16) with NDOB */ SD_ZERO_DISABLE, /* Write Zeroes disabled due to failed cmd */ }; @@ -137,6 +140,7 @@ struct scsi_disk { bool lblvpd; /* Long Block Limits VPD */ bool provisioning_override; bool zeroing_override; + bool ndob; unsigned ATO : 1; /* state of disk ATO bit */ unsigned cache_override : 1; /* temp override of WCE,RCD */ unsigned WCE : 1; /* state of disk WCE bit */ From 60936e2bb466381a65d7b196f8df68f1d464a384 Mon Sep 17 00:00:00 2001 From: "Martin K. 
Petersen" Date: Thu, 10 Jun 2021 22:12:13 -0400 Subject: [PATCH 0637/1250] scsi: sd: Enable modern protocol features on more devices Due to legacy USB devices having a tendency to either lock up or return garbage if one attempts to query device capabilities, the USB transport disables VPD pages by default. This prevents discard from being properly configured on most modern USB-attached SSDs. Introduce two additional heuristics to determine whether VPD pages should be consulted. The first heuristic fetches VPD pages if a device reports that Logical Block Provisioning is enabled. It is very unusual for a device to support thin provisioning and not provide the associated VPDs. Consequently, if a device reports that Logical Block Provisioning is enabled (LBPME) in READ CAPACITY(16) response, the scsi_device has no VPDs attached, and the reported SPC version is larger than 3, then an attempt will be made to read the VPD pages during revalidate. The second heuristic relies on the fact that almost all modern devices return a set of version descriptors in the INQUIRY response. These descriptors outline which version of various protocol features are supported. If a device manufacturer has gone through the effort of filling out compliance descriptors, it is highly unlikely that VPD pages are not supported. So if a device provides version descriptors in the INQUIRY response, the scsi_device has no VPDs attached, and the reported SBC version is larger than 2, then an attempt will be made to read the VPD pages. In addition, READ CAPACITY(16) will be preferred over READ CAPACITY(10) to facilitate accessing the LBPME flag. The benefit to relying on INQUIRY is that it is data we already have. We do not have to blindly poke the device for additional information and risk confusing it. Extracting the SBC version is done by a new helper, sd_sbc_version(). Another helper is provided to determine whether a scsi_device has VPD pages attached or not. 
Link: https://lore.kernel.org/r/20220302053559.32147-15-martin.petersen@oracle.com Reported-by: Aman Karmani Tested-by: Aman Karmani Reported-by: David Sebek Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi.c | 1 + drivers/scsi/sd.c | 77 +++++++++++++++++++++++++++++++++++++- drivers/scsi/sd.h | 2 + include/scsi/scsi_device.h | 14 +++++++ 4 files changed, 92 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index c59eac7a32f2a0..a8b18327ed9416 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -492,6 +492,7 @@ void scsi_attach_vpd(struct scsi_device *sdev) } kfree(vpd_buf); } +EXPORT_SYMBOL_GPL(scsi_attach_vpd); /** * scsi_report_opcode - Find out if a given command opcode is supported diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 981e99e3931090..65f62c79914b6a 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2468,6 +2468,19 @@ static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; + /* + * If a device sets LBPME=1 then it should, in theory, support + * the Logical Block Provisioning VPD page. Assume that querying + * VPD pages is safe if logical block provisioning is enabled + * and the device claims conformance to a recent version of the + * spec. 
+ */ + if (!sdkp->reattach_vpds && !scsi_device_has_vpd(sdp) && + sdp->scsi_level > SCSI_SPC_3) { + sd_first_printk(KERN_NOTICE, sdkp, + "Logical Block Provisioning enabled, fetching VPDs\n"); + sdkp->reattach_vpds = true; + } } sdkp->capacity = lba + 1; @@ -2534,8 +2547,10 @@ static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp, return sector_size; } -static int sd_try_rc16_first(struct scsi_device *sdp) +static int sd_try_rc16_first(struct scsi_disk *sdkp) { + struct scsi_device *sdp = sdkp->device; + if (sdp->host->max_cmd_len < 16) return 0; if (sdp->try_rc_10_first) @@ -2556,7 +2571,7 @@ sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) int sector_size; struct scsi_device *sdp = sdkp->device; - if (sd_try_rc16_first(sdp)) { + if (sd_try_rc16_first(sdkp)) { sector_size = read_capacity_16(sdkp, sdp, buffer); if (sector_size == -EOVERFLOW) goto got_data; @@ -3370,6 +3385,12 @@ static int sd_revalidate_disk(struct gendisk *disk) blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q); + if (sdkp->reattach_vpds) { + sdp->try_vpd_pages = 1; + scsi_attach_vpd(sdp); + sdkp->reattach_vpds = false; + } + if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); sd_read_block_limits(sdkp); @@ -3514,6 +3535,42 @@ static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen) return 0; } +enum { + INQUIRY_DESC_START = 58, + INQUIRY_DESC_END = 74, + INQUIRY_DESC_SIZE = 2, +}; + +static unsigned int sd_sbc_version(struct scsi_device *sdp) +{ + unsigned int i; + unsigned int max; + + if (sdp->inquiry_len < INQUIRY_DESC_START + INQUIRY_DESC_SIZE) + return 0; + + max = min_t(unsigned int, sdp->inquiry_len, INQUIRY_DESC_END); + max = rounddown(max, INQUIRY_DESC_SIZE); + + for (i = INQUIRY_DESC_START ; i < max ; i += INQUIRY_DESC_SIZE) { + u16 desc = get_unaligned_be16(&sdp->inquiry[i]); + + switch (desc) { + case 0x0600: + return 4; + case 0x04c0: case 0x04c3: case 0x04c5: case 
0x04c8: + return 3; + case 0x0320: case 0x0322: case 0x0324: case 0x033B: + case 0x033D: case 0x033E: + return 2; + case 0x0180: case 0x019b: case 0x019c: + return 1; + } + } + + return 0; +} + /** * sd_probe - called during driver initialization and whenever a * new scsi device is attached to the system. It is called once @@ -3539,6 +3596,7 @@ static int sd_probe(struct device *dev) struct gendisk *gd; int index; int error; + unsigned int sbc_version; scsi_autopm_get_device(sdp); error = -ENODEV; @@ -3627,6 +3685,21 @@ static int sd_probe(struct device *dev) sdkp->first_scan = 1; sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS; + /* + * If the device explicitly claims support for SBC version 3 + * or later, unset the LLD flags which prevent probing for + * modern protocol features and reattach VPD pages. + */ + sbc_version = sd_sbc_version(sdp); + if (!scsi_device_has_vpd(sdp) && sbc_version >= 3) { + sdkp->reattach_vpds = true; + sdp->try_rc_10_first = 0; + sdp->no_read_capacity_16 = 0; + sd_first_printk(KERN_NOTICE, sdkp, + "Detected SBC version %u, fetching VPDs\n", + sbc_version); + } + sd_revalidate_disk(gd); if (sdp->removable) { diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index b843a94ad53c52..6e20574baf85aa 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -131,6 +131,7 @@ struct scsi_disk { unsigned int physical_block_size; unsigned int max_medium_access_timeouts; unsigned int medium_access_timed_out; + unsigned int sbc_version; u8 media_present; u8 write_prot; u8 protection_type;/* Data Integrity Field */ @@ -141,6 +142,7 @@ struct scsi_disk { bool provisioning_override; bool zeroing_override; bool ndob; + bool reattach_vpds; unsigned ATO : 1; /* state of disk ATO bit */ unsigned cache_override : 1; /* temp override of WCE,RCD */ unsigned WCE : 1; /* state of disk WCE bit */ diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 7cf5f3b7589f57..99d63a64e5ce06 100644 --- a/include/scsi/scsi_device.h +++ 
b/include/scsi/scsi_device.h @@ -606,6 +606,20 @@ static inline int scsi_device_supports_vpd(struct scsi_device *sdev) return 0; } +static inline bool scsi_device_has_vpd(struct scsi_device *sdev) +{ + struct scsi_vpd *vpd; + bool found = false; + + rcu_read_lock(); + vpd = rcu_dereference(sdev->vpd_pg0); + if (vpd) + found = true; + rcu_read_unlock(); + + return found; +} + static inline int scsi_device_busy(struct scsi_device *sdev) { return sbitmap_weight(&sdev->budget_map); From 70f58e3e07e6cd81be1e83a8010534f73a3d7d5b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 16 Jun 2022 09:36:17 +0800 Subject: [PATCH 0638/1250] scsi: sd: Support multiple LBA ranges in an UNMAP command Previously we only described a single LBA range in an UNMAP command even if the device reported it could handle multiple ranges. This restriction was due to a limitation in the block layer which is no longer present. Set max_discard_segments according to the Block Limits VPD of the device and enable unmapping multiple LBA ranges in a single UNMAP command. [mkp: Rebased on top of discard/discovery changes, minor tweaks] Link: https://lore.kernel.org/r/20220616013617.2284341-1-chao@kernel.org Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Chao Yu Signed-off-by: Martin K. 
Petersen --- drivers/scsi/sd.c | 34 +++++++++++++++++++++++++--------- drivers/scsi/sd.h | 2 ++ 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 65f62c79914b6a..e2f76803b2e6c2 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -855,6 +855,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, enum sd_lbp_mode mode) struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; + unsigned int max_segments = 1; if (mode == SD_LBP_DEFAULT && !sdkp->provisioning_override) { if (sdkp->lbpme) { /* Logical Block Provisioning Enabled */ @@ -891,11 +892,14 @@ static void sd_config_discard(struct scsi_disk *sdkp, enum sd_lbp_mode mode) q->limits.discard_alignment = 0; q->limits.discard_granularity = 0; blk_queue_max_discard_sectors(q, 0); + blk_queue_max_discard_segments(q, 0); return; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, (u32)SD_MAX_WS16_BLOCKS); + max_segments = clamp(sdkp->max_unmap_descriptors, 1U, + (u32)SD_MAX_UNMAP_DESCS); break; case SD_LBP_WS16: @@ -930,6 +934,8 @@ static void sd_config_discard(struct scsi_disk *sdkp, enum sd_lbp_mode mode) max(sdkp->physical_block_size, sdkp->unmap_granularity * logical_block_size); blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); + blk_queue_max_discard_segments(q, max_segments); + } static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) @@ -937,9 +943,10 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) struct scsi_device *sdp = cmd->device; struct request *rq = scsi_cmd_to_rq(cmd); struct scsi_disk *sdkp = scsi_disk(rq->q->disk); - u64 lba = sectors_to_logical(sdp, blk_rq_pos(rq)); - u32 nr_blocks = sectors_to_logical(sdp, blk_rq_sectors(rq)); - unsigned int data_len = 24; + unsigned short segments = blk_rq_nr_discard_segments(rq); + unsigned int data_len = 8 + 16 * segments; + unsigned int descriptor_offset = 
8; + struct bio *bio; char *buf; rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); @@ -952,13 +959,20 @@ static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) cmd->cmd_len = 10; cmd->cmnd[0] = UNMAP; - cmd->cmnd[8] = 24; + cmd->cmnd[8] = data_len; buf = bvec_virt(&rq->special_vec); - put_unaligned_be16(6 + 16, &buf[0]); - put_unaligned_be16(16, &buf[2]); - put_unaligned_be64(lba, &buf[8]); - put_unaligned_be32(nr_blocks, &buf[16]); + put_unaligned_be16(6 + 16 * segments, &buf[0]); + put_unaligned_be16(16 * segments, &buf[2]); + + __rq_for_each_bio(bio, rq) { + u64 lba = sectors_to_logical(sdp, bio->bi_iter.bi_sector); + u32 nr_blocks = sectors_to_logical(sdp, bio_sectors(bio)); + + put_unaligned_be64(lba, &buf[descriptor_offset]); + put_unaligned_be32(nr_blocks, &buf[descriptor_offset + 8]); + descriptor_offset += 16; + } cmd->allowed = sdkp->max_retries; cmd->transfersize = data_len; @@ -3021,8 +3035,10 @@ static void sd_read_block_limits(struct scsi_disk *sdkp) lba_count = get_unaligned_be32(&vpd->data[20]); desc_count = get_unaligned_be32(&vpd->data[24]); - if (lba_count && desc_count) + if (lba_count && desc_count) { sdkp->max_unmap_blocks = lba_count; + sdkp->max_unmap_descriptors = desc_count; + } sdkp->unmap_granularity = get_unaligned_be32(&vpd->data[28]); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 6e20574baf85aa..3b145a2d2eaa0b 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -49,6 +49,7 @@ enum { SD_MAX_XFER_BLOCKS = 0xffffffff, SD_MAX_WS10_BLOCKS = 0xffff, SD_MAX_WS16_BLOCKS = 0x7fffff, + SD_MAX_UNMAP_DESCS = 0xffff, }; enum sd_lbp_mode { @@ -125,6 +126,7 @@ struct scsi_disk { u32 opt_xfer_blocks; u32 max_ws_blocks; u32 max_unmap_blocks; + u32 max_unmap_descriptors; u32 unmap_granularity; u32 unmap_alignment; u32 index; From e818884c382e1baea28937f31d649d5d306d9285 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 19 Jul 2022 06:19:41 +0200 Subject: [PATCH 0639/1250] parisc: Drop pa_swapper_pg_lock 
spinlock This spinlock isn't used any longer. Remove it to silence a sparse warning. Signed-off-by: Helge Deller Reported-by: kernel test robot --- arch/parisc/kernel/cache.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index a9bc578e4c52e5..af3d7cdc1541bd 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -50,9 +50,6 @@ void flush_instruction_cache_local(void); /* flushes local code-cache only */ */ DEFINE_SPINLOCK(pa_tlb_flush_lock); -/* Swapper page setup lock. */ -DEFINE_SPINLOCK(pa_swapper_pg_lock); - #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) int pa_serialize_tlb_flushes __ro_after_init; #endif From 2d68c320aa14e5b494de698ce70ca6e6db40bc4b Mon Sep 17 00:00:00 2001 From: Liang He Date: Sat, 16 Jul 2022 11:13:24 +0800 Subject: [PATCH 0640/1250] memory: pl353-smc: Fix refcount leak bug in pl353_smc_probe() The break of for_each_available_child_of_node() needs a corresponding of_node_put() when the reference 'child' is not used anymore. Here we do not need to call of_node_put() in fail path as '!match' means no break. Although of_platform_device_create() will create a new reference to 'child', it already takes care of that refcounting itself.
Fixes: fee10bd22678 ("memory: pl353: Add driver for arm pl353 static memory controller") Signed-off-by: Liang He Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220716031324.447680-1-windhl@126.com --- drivers/memory/pl353-smc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/memory/pl353-smc.c b/drivers/memory/pl353-smc.c index f84b98278745c8..d39ee7d06665bd 100644 --- a/drivers/memory/pl353-smc.c +++ b/drivers/memory/pl353-smc.c @@ -122,6 +122,7 @@ static int pl353_smc_probe(struct amba_device *adev, const struct amba_id *id) } of_platform_device_create(child, NULL, &adev->dev); + of_node_put(child); return 0; From d16232de82d64ce02fe6d43aa51de004755de5e5 Mon Sep 17 00:00:00 2001 From: Debarati Biswas Date: Wed, 13 Jul 2022 09:03:55 -0400 Subject: [PATCH 0641/1250] memory: dfl-emif: Update the dfl emif driver support revision 1 The next generation (revision 1) of the DFL EMIF feature device requires support for more than 4 memory banks. It does not support the selective clearing of memory banks. A capability register replaces the previous control register, and contains a bitmask to indicate the presence of each memory bank. This bitmask aligns with the previous control register bitmask that served the same purpose. The control and capability registers are treated like a C Union structure in order to support both the new and old revisions of the EMIF device. 
Signed-off-by: Debarati Biswas Signed-off-by: Russ Weight Signed-off-by: Tianfei Zhang Reviewed-by: Matthew Gerlach Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220713130355.196115-1-tianfei.zhang@intel.com --- drivers/memory/dfl-emif.c | 62 +++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/drivers/memory/dfl-emif.c b/drivers/memory/dfl-emif.c index 3f719816771d4f..da06cd30a01617 100644 --- a/drivers/memory/dfl-emif.c +++ b/drivers/memory/dfl-emif.c @@ -24,11 +24,24 @@ #define EMIF_STAT_CLEAR_BUSY_SFT 16 #define EMIF_CTRL 0x10 #define EMIF_CTRL_CLEAR_EN_SFT 0 -#define EMIF_CTRL_CLEAR_EN_MSK GENMASK_ULL(3, 0) +#define EMIF_CTRL_CLEAR_EN_MSK GENMASK_ULL(7, 0) #define EMIF_POLL_INVL 10000 /* us */ #define EMIF_POLL_TIMEOUT 5000000 /* us */ +/* + * The Capability Register replaces the Control Register (at the same + * offset) for EMIF feature revisions > 0. The bitmask that indicates + * the presence of memory channels exists in both the Capability Register + * and Control Register definitions. These can be thought of as a C union. + * The Capability Register definitions are used to check for the existence + * of a memory channel, and the Control Register definitions are used for + * managing the memory-clear functionality in revision 0. 
+ */ +#define EMIF_CAPABILITY_BASE 0x10 +#define EMIF_CAPABILITY_CHN_MSK_V0 GENMASK_ULL(3, 0) +#define EMIF_CAPABILITY_CHN_MSK GENMASK_ULL(7, 0) + struct dfl_emif { struct device *dev; void __iomem *base; @@ -106,16 +119,30 @@ emif_state_attr(init_done, EMIF_STAT_INIT_DONE_SFT, 0); emif_state_attr(init_done, EMIF_STAT_INIT_DONE_SFT, 1); emif_state_attr(init_done, EMIF_STAT_INIT_DONE_SFT, 2); emif_state_attr(init_done, EMIF_STAT_INIT_DONE_SFT, 3); +emif_state_attr(init_done, EMIF_STAT_INIT_DONE_SFT, 4); +emif_state_attr(init_done, EMIF_STAT_INIT_DONE_SFT, 5); +emif_state_attr(init_done, EMIF_STAT_INIT_DONE_SFT, 6); +emif_state_attr(init_done, EMIF_STAT_INIT_DONE_SFT, 7); emif_state_attr(cal_fail, EMIF_STAT_CALC_FAIL_SFT, 0); emif_state_attr(cal_fail, EMIF_STAT_CALC_FAIL_SFT, 1); emif_state_attr(cal_fail, EMIF_STAT_CALC_FAIL_SFT, 2); emif_state_attr(cal_fail, EMIF_STAT_CALC_FAIL_SFT, 3); +emif_state_attr(cal_fail, EMIF_STAT_CALC_FAIL_SFT, 4); +emif_state_attr(cal_fail, EMIF_STAT_CALC_FAIL_SFT, 5); +emif_state_attr(cal_fail, EMIF_STAT_CALC_FAIL_SFT, 6); +emif_state_attr(cal_fail, EMIF_STAT_CALC_FAIL_SFT, 7); + emif_clear_attr(0); emif_clear_attr(1); emif_clear_attr(2); emif_clear_attr(3); +emif_clear_attr(4); +emif_clear_attr(5); +emif_clear_attr(6); +emif_clear_attr(7); + static struct attribute *dfl_emif_attrs[] = { &emif_attr_inf0_init_done.attr.attr, @@ -134,6 +161,22 @@ static struct attribute *dfl_emif_attrs[] = { &emif_attr_inf3_cal_fail.attr.attr, &emif_attr_inf3_clear.attr.attr, + &emif_attr_inf4_init_done.attr.attr, + &emif_attr_inf4_cal_fail.attr.attr, + &emif_attr_inf4_clear.attr.attr, + + &emif_attr_inf5_init_done.attr.attr, + &emif_attr_inf5_cal_fail.attr.attr, + &emif_attr_inf5_clear.attr.attr, + + &emif_attr_inf6_init_done.attr.attr, + &emif_attr_inf6_cal_fail.attr.attr, + &emif_attr_inf6_clear.attr.attr, + + &emif_attr_inf7_init_done.attr.attr, + &emif_attr_inf7_cal_fail.attr.attr, + &emif_attr_inf7_clear.attr.attr, + NULL, }; @@ -143,15 +186,24 @@ 
static umode_t dfl_emif_visible(struct kobject *kobj, struct dfl_emif *de = dev_get_drvdata(kobj_to_dev(kobj)); struct emif_attr *eattr = container_of(attr, struct emif_attr, attr.attr); + struct dfl_device *ddev = to_dfl_dev(de->dev); u64 val; /* - * This device supports upto 4 memory interfaces, but not all + * This device supports up to 8 memory interfaces, but not all * interfaces are used on different platforms. The read out value of - * CLEAN_EN field (which is a bitmap) could tell how many interfaces - * are available. + * CAPABILITY_CHN_MSK field (which is a bitmap) indicates which + * interfaces are available. */ - val = FIELD_GET(EMIF_CTRL_CLEAR_EN_MSK, readq(de->base + EMIF_CTRL)); + if (ddev->revision > 0 && strstr(attr->name, "_clear")) + return 0; + + if (ddev->revision == 0) + val = FIELD_GET(EMIF_CAPABILITY_CHN_MSK_V0, + readq(de->base + EMIF_CAPABILITY_BASE)); + else + val = FIELD_GET(EMIF_CAPABILITY_CHN_MSK, + readq(de->base + EMIF_CAPABILITY_BASE)); return (val & BIT_ULL(eattr->index)) ? attr->mode : 0; } From b54af20531018c2bb7181ba2f511327b3c9f1cef Mon Sep 17 00:00:00 2001 From: Liang He Date: Tue, 19 Jul 2022 16:56:39 +0800 Subject: [PATCH 0642/1250] memory: of: Fix refcount leak bug in of_get_ddr_timings() We should add the of_node_put() when breaking out of for_each_child_of_node() as it will automatically increase and decrease the refcount. 
Fixes: e6b42eb6a66c ("memory: emif: add device tree support to emif driver") Signed-off-by: Liang He Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220719085640.1210583-1-windhl@126.com --- drivers/memory/of_memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/memory/of_memory.c b/drivers/memory/of_memory.c index dbdf87bc0b78ef..8e2ef4bf6b17b8 100644 --- a/drivers/memory/of_memory.c +++ b/drivers/memory/of_memory.c @@ -134,6 +134,7 @@ const struct lpddr2_timings *of_get_ddr_timings(struct device_node *np_ddr, for_each_child_of_node(np_ddr, np_tim) { if (of_device_is_compatible(np_tim, tim_compat)) { if (of_do_get_timings(np_tim, &timings[i])) { + of_node_put(np_tim); devm_kfree(dev, timings); goto default_timings; } From 2f1b3550a152baa8287ee95586f0385410a5296b Mon Sep 17 00:00:00 2001 From: Liang He Date: Tue, 19 Jul 2022 16:56:40 +0800 Subject: [PATCH 0643/1250] memory: of: Fix refcount leak bug in of_lpddr3_get_ddr_timings() We should add the of_node_put() when breaking out of for_each_child_of_node() as it will automatically increase and decrease the refcount. 
Fixes: 976897dd96db ("memory: Extend of_memory with LPDDR3 support") Signed-off-by: Liang He Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20220719085640.1210583-2-windhl@126.com --- drivers/memory/of_memory.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/memory/of_memory.c b/drivers/memory/of_memory.c index 8e2ef4bf6b17b8..fcd20d85d38574 100644 --- a/drivers/memory/of_memory.c +++ b/drivers/memory/of_memory.c @@ -285,6 +285,7 @@ const struct lpddr3_timings if (of_device_is_compatible(np_tim, tim_compat)) { if (of_lpddr3_do_get_timings(np_tim, &timings[i])) { devm_kfree(dev, timings); + of_node_put(np_tim); goto default_timings; } i++; From edee0ff54e847b4af58ae062624f915eabc4a7ac Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 17 Jul 2022 22:05:05 +0100 Subject: [PATCH 0644/1250] btrfs: join running log transaction when logging new name When logging a new name, in case of a rename, we pin the log before changing it. We then either delete a directory entry from the log or insert a key range item to mark the old name for deletion on log replay. However when doing one of those log changes we may have another task that started writing out the log (at btrfs_sync_log()) and it started before we pinned the log root. So we may end up changing a log tree while its writeback is being started by another task syncing the log. This can lead to inconsistencies in a log tree and other unexpected results during log replay, because we can get some committed node pointing to a node/leaf that ends up not getting written to disk before the next log commit. The problem, conceptually, started to happen in commit 88d2beec7e53fc ("btrfs: avoid logging all directory changes during renames"), because there we started to update the log without joining its current transaction first. 
However the problem only became visible with commit 259c4b96d78dda ("btrfs: stop doing unnecessary log updates during a rename"), and that is because we used to pin the log at btrfs_rename() and then before entering btrfs_log_new_name(), when unlinking the old dentry, we ended up at btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log(). Both of them join the current log transaction, effectively waiting for any log transaction writeout (due to acquiring the root's log_mutex). This made it safe even after leaving the current log transaction, because we remained with the log pinned when we called btrfs_log_new_name(). Then in commit 259c4b96d78dda ("btrfs: stop doing unnecessary log updates during a rename"), we removed the log pinning from btrfs_rename() and stopped calling btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log() during the rename, and started to do all the needed work at btrfs_log_new_name(), but without joining the current log transaction, only pinning the log, which is racy because another task may have started writeout of the log tree right before we pinned the log. Both commits landed in kernel 5.18, so it doesn't make any practical difference which should be blamed, but I'm blaming the second commit only because with the first one, by chance, the problem did not happen due to the fact we joined the log transaction after pinning the log and unpinned it only after calling btrfs_log_new_name(). So make btrfs_log_new_name() join the current log transaction instead of pinning it, so that we never do log updates if its writeout is starting.
Fixes: 259c4b96d78dda ("btrfs: stop doing unnecessary log updates during a rename") CC: stable@vger.kernel.org # 5.18+ Reported-by: Zygo Blaxell Tested-by: Zygo Blaxell Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 370388fadf960a..ed3ec237f5147a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7029,8 +7029,15 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * anyone from syncing the log until we have updated both inodes * in the log. */ + ret = join_running_log_trans(root); + /* + * At least one of the inodes was logged before, so this should + * not fail, but if it does, it's not serious, just bail out and + * mark the log for a full commit. + */ + if (WARN_ON_ONCE(ret < 0)) + goto out; log_pinned = true; - btrfs_pin_log_trans(root); path = btrfs_alloc_path(); if (!path) { From 9d08a6c65b5d35ad5a8035c7ff69b72d728fec71 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Jul 2022 16:07:06 +0200 Subject: [PATCH 0645/1250] soc: document merges Signed-off-by: Arnd Bergmann --- arch/arm/arm-soc-for-next-contents.txt | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/arm/arm-soc-for-next-contents.txt b/arch/arm/arm-soc-for-next-contents.txt index f5d1de17c91f75..81efe0bb17072f 100644 --- a/arch/arm/arm-soc-for-next-contents.txt +++ b/arch/arm/arm-soc-for-next-contents.txt @@ -19,6 +19,8 @@ arm/soc git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-gpio tags/davinci-boards-delete-v5.20 broadcom/soc-2 https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/soc-part2 + mvebu/soc + git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu tags/mvebu-arm-5.20-1 arm/dt samsung/dt @@ -28,7 +30,6 @@ arm/dt renesas/dt git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-arm-dt-for-v5.20-tag1 
renesas/dt-bindings -renesas/dt-fixes git://git.kernel.org/pub/scm/linux/kernel/git/geert/renesas-devel tags/renesas-dt-bindings-for-v5.20-tag1 socfpga/dt git://git.kernel.org/pub/scm/linux/kernel/git/dinguyen/linux tags/socfpga_dts_updates_for_v5.20 @@ -108,6 +109,10 @@ renesas/dt-fixes https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/devicetree-part2 broadcom/dt64-2 https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/devicetree-arm64-part2 + mvebu/dt + git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu tags/mvebu-dt-5.20-1 + mvebu/dt64 + git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu tags/mvebu-dt64-5.20-1 arm/drivers renesas/drivers @@ -134,6 +139,8 @@ arm/drivers https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/drivers qcom/drivers git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-drivers-for-5.20 + patch + soc: fujitsu: Add A64FX diagnostic interrupt driver arm/defconfig renesas/defconfig @@ -155,3 +162,9 @@ arm/late arm/fixes +arm/newsoc + sunplus/newsoc + Merge branch 'sunplus/newsoc' into arm/newsoc + nuvoton/newsoc + Merge branch 'nuvoton/newsoc' into arm/newsoc + From bbeefc0ad610d11184515485f2ca1e66cd5fa1c0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 19 Jul 2022 09:18:35 -0400 Subject: [PATCH 0646/1250] SUNRPC: Fix xdr_encode_bool() I discovered that xdr_encode_bool() was returning the same address that was passed in the @p parameter. The documenting comment states that the intent is to return the address of the next buffer location, just like the other "xdr_encode_*" helpers. The result was the encoded results of NFSv3 PATHCONF operations were not formed correctly. 
Fixes: ded04a587f6c ("NFSD: Update the NFSv3 PATHCONF3res encoder to use struct xdr_stream") Signed-off-by: Chuck Lever --- include/linux/sunrpc/xdr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 5860f32e395803..986c8a17ca5e74 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -419,8 +419,8 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr) */ static inline __be32 *xdr_encode_bool(__be32 *p, u32 n) { - *p = n ? xdr_one : xdr_zero; - return p++; + *p++ = n ? xdr_one : xdr_zero; + return p; } /** From 013cfbccb0cb3bbac478666aed78e4f5f7e39dd6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 5 Jul 2022 18:45:05 -0400 Subject: [PATCH 0647/1250] batman-adv: tracing: Use the new __vstring() helper Instead of open coding a __dynamic_array() with a fixed length (which defeats the purpose of the dynamic array in the first place). Use the new __vstring() helper that will use a va_list and only write enough of the string into the ring buffer that is needed. Link: https://lkml.kernel.org/r/20220705224751.080390002@goodmis.org Cc: Marek Lindner Cc: Ingo Molnar Cc: Andrew Morton Cc: Simon Wunderlich Cc: Antonio Quartulli Cc: Sven Eckelmann Cc: "David S. 
Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: b.a.t.m.a.n@lists.open-mesh.org Cc: netdev@vger.kernel.org Signed-off-by: Steven Rostedt (Google) --- net/batman-adv/trace.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index d673ebdd042673..67d2a8a0196cdc 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -40,16 +40,13 @@ TRACE_EVENT(batadv_dbg, TP_STRUCT__entry( __string(device, bat_priv->soft_iface->name) __string(driver, KBUILD_MODNAME) - __dynamic_array(char, msg, BATADV_MAX_MSG_LEN) + __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( __assign_str(device, bat_priv->soft_iface->name); __assign_str(driver, KBUILD_MODNAME); - WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), - BATADV_MAX_MSG_LEN, - vaf->fmt, - *vaf->va) >= BATADV_MAX_MSG_LEN); + __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk( From a9350b9454ea22c9aba99c207d3dbb0a03941e9f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 5 Jul 2022 18:45:06 -0400 Subject: [PATCH 0648/1250] mac80211: tracing: Use the new __vstring() helper Instead of open coding a __dynamic_array() with a fixed length (which defeats the purpose of the dynamic array in the first place). Use the new __vstring() helper that will use a va_list and only write enough of the string into the ring buffer that is needed. Link: https://lkml.kernel.org/r/20220705224751.271015450@goodmis.org Cc: Johannes Berg Cc: Ingo Molnar Cc: Andrew Morton Cc: "David S. 
Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Steven Rostedt (Google) --- net/mac80211/trace_msg.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/mac80211/trace_msg.h b/net/mac80211/trace_msg.h index 40141df09f255f..c9dbe9aab7bdca 100644 --- a/net/mac80211/trace_msg.h +++ b/net/mac80211/trace_msg.h @@ -24,13 +24,11 @@ DECLARE_EVENT_CLASS(mac80211_msg_event, TP_ARGS(vaf), TP_STRUCT__entry( - __dynamic_array(char, msg, MAX_MSG_LEN) + __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign( - WARN_ON_ONCE(vsnprintf(__get_dynamic_array(msg), - MAX_MSG_LEN, vaf->fmt, - *vaf->va) >= MAX_MSG_LEN); + __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("%s", __get_str(msg)) From c6422b4c27b6b82e57cf7aa021bb8f8b12045693 Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:05 +0800 Subject: [PATCH 0649/1250] tracing: eprobe: Add missing log index Add trace_probe_log_set_index(1) to allow report correct error if user input wrong SYSTEM.EVENT format. 
Link: https://lore.kernel.org/all/1656296348-16111-2-git-send-email-quic_linyyuan@quicinc.com/ Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 7d4478525c6696..b805b570305fc5 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -881,6 +881,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (!is_good_name(event) || !is_good_name(group)) goto parse_error; + trace_probe_log_set_index(1); sys_event = argv[1]; ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, sys_event - argv[1]); From ca836ffa4c93fc37e6232d50883de2be55bdf33d Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:06 +0800 Subject: [PATCH 0650/1250] tracing: eprobe: Remove duplicate is_good_name() operation traceprobe_parse_event_name() already validates the SYSTEM and EVENT names, so there is no need to call is_good_name() after it.
Link: https://lore.kernel.org/all/1656296348-16111-3-git-send-email-quic_linyyuan@quicinc.com/ Acked-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_eprobe.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index b805b570305fc5..8979cb9ec37a55 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -887,8 +887,6 @@ static int __trace_eprobe_create(int argc, const char *argv[]) sys_event - argv[1]); if (ret || !sys_name) goto parse_error; - if (!is_good_name(sys_event) || !is_good_name(sys_name)) - goto parse_error; mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); From 05ebd4a6329863a14c415fcc7fd7e55e36823bdf Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:07 +0800 Subject: [PATCH 0651/1250] tracing: Auto generate event name when creating a group of events Currently when creating a specific group of trace events, take kprobe event as example, the user must use the following format: p:GRP/EVENT [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS], which means user must enter EVENT name, one example is: echo 'p:usb_gadget/config_usb_cfg_link config_usb_cfg_link $arg1' >> kprobe_events It is not simple if there are too many entries because the event name is the same as symbol name. This change allows user to specify no EVENT name, format changed as: p:GRP/ [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] It will generate event name automatically and one example is: echo 'p:usb_gadget/ config_usb_cfg_link $arg1' >> kprobe_events. 
Link: https://lore.kernel.org/all/1656296348-16111-4-git-send-email-quic_linyyuan@quicinc.com/ Acked-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- Documentation/trace/kprobetrace.rst | 8 ++++---- Documentation/trace/uprobetracer.rst | 8 ++++---- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_eprobe.c | 25 +++++++++++++------------ kernel/trace/trace_kprobe.c | 16 ++++++++++------ kernel/trace/trace_probe.c | 4 ++++ kernel/trace/trace_uprobe.c | 12 ++++++++---- 8 files changed, 48 insertions(+), 35 deletions(-) diff --git a/Documentation/trace/kprobetrace.rst b/Documentation/trace/kprobetrace.rst index b175d88f31ebb8..4274cc6a2f94f1 100644 --- a/Documentation/trace/kprobetrace.rst +++ b/Documentation/trace/kprobetrace.rst @@ -28,10 +28,10 @@ Synopsis of kprobe_events ------------------------- :: - p[:[GRP/]EVENT] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe - r[MAXACTIVE][:[GRP/]EVENT] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe - p:[GRP/]EVENT] [MOD:]SYM[+0]%return [FETCHARGS] : Set a return probe - -:[GRP/]EVENT : Clear a probe + p[:[GRP/][EVENT]] [MOD:]SYM[+offs]|MEMADDR [FETCHARGS] : Set a probe + r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]SYM[+0] [FETCHARGS] : Set a return probe + p[:[GRP/][EVENT]] [MOD:]SYM[+0]%return [FETCHARGS] : Set a return probe + -:[GRP/][EVENT] : Clear a probe GRP : Group name. If omitted, use "kprobes" for it. EVENT : Event name. 
If omitted, the event name is generated diff --git a/Documentation/trace/uprobetracer.rst b/Documentation/trace/uprobetracer.rst index a8e5938f609e29..3a1797d707f4cf 100644 --- a/Documentation/trace/uprobetracer.rst +++ b/Documentation/trace/uprobetracer.rst @@ -26,10 +26,10 @@ Synopsis of uprobe_tracer ------------------------- :: - p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a uprobe - r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return uprobe (uretprobe) - p[:[GRP/]EVENT] PATH:OFFSET%return [FETCHARGS] : Set a return uprobe (uretprobe) - -:[GRP/]EVENT : Clear uprobe or uretprobe event + p[:[GRP/][EVENT]] PATH:OFFSET [FETCHARGS] : Set a uprobe + r[:[GRP/][EVENT]] PATH:OFFSET [FETCHARGS] : Set a return uprobe (uretprobe) + p[:[GRP/][EVENT]] PATH:OFFSET%return [FETCHARGS] : Set a return uprobe (uretprobe) + -:[GRP/][EVENT] : Clear uprobe or uretprobe event GRP : Group name. If omitted, "uprobes" is the default value. EVENT : Event name. If omitted, the event name is generated based diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b8dd546270750e..7eb5bce625006d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5569,13 +5569,13 @@ static const char readme_msg[] = #endif #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS) "\t accepts: event-definitions (one definition per line)\n" - "\t Format: p[:[/]] []\n" - "\t r[maxactive][:[/]] []\n" + "\t Format: p[:[/][]] []\n" + "\t r[maxactive][:[/][]] []\n" #ifdef CONFIG_HIST_TRIGGERS "\t s:[synthetic/] []\n" #endif - "\t e[:[/]] . []\n" - "\t -:[/]\n" + "\t e[:[/][]] . 
[]\n" + "\t -:[/][]\n" #ifdef CONFIG_KPROBE_EVENTS "\t place: [:][+]|\n" "place (kretprobe): [:][+]%return|\n" diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 076b447a1b8894..154996684fb548 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -101,7 +101,7 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type event = p + 1; *p = '\0'; } - if (event[0] == '\0') { + if (!system && event[0] == '\0') { ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 8979cb9ec37a55..a30f21499e8120 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -125,6 +125,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, * We match the following: * event only - match all eprobes with event name * system and event only - match all system/event probes + * system only - match all system probes * * The below has the above satisfied with more arguments: * @@ -143,7 +144,7 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, return false; /* Must match the event name */ - if (strcmp(trace_probe_name(&ep->tp), event) != 0) + if (event[0] != '\0' && strcmp(trace_probe_name(&ep->tp), event) != 0) return false; /* No arguments match all */ @@ -848,7 +849,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) { /* * Argument syntax: - * e[:[GRP/]ENAME] SYSTEM.EVENT [FETCHARGS] + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] * Fetch args: * =$[:TYPE] */ @@ -858,6 +859,7 @@ static int __trace_eprobe_create(int argc, const char *argv[]) struct trace_eprobe *ep = NULL; char buf1[MAX_EVENT_NAME_LEN]; char buf2[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; int ret = 0; int i; @@ -869,25 +871,24 @@ static int __trace_eprobe_create(int argc, const char *argv[]) event = strchr(&argv[0][1], ':'); if (event) { event++; - ret = traceprobe_parse_event_name(&event, &group, buf1, + 
ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { - strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); - sanitize_event_name(buf1); - event = buf1; } - if (!is_good_name(event) || !is_good_name(group)) - goto parse_error; trace_probe_log_set_index(1); sys_event = argv[1]; - ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, - sys_event - argv[1]); - if (ret || !sys_name) + ret = traceprobe_parse_event_name(&sys_event, &sys_name, buf2, 0); + if (!sys_event || !sys_name) goto parse_error; + if (!event) { + strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); + sanitize_event_name(buf1); + event = buf1; + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a245ea673715d2..23f7f0ec4f4cf3 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -163,7 +163,8 @@ static bool trace_kprobe_match(const char *system, const char *event, { struct trace_kprobe *tk = to_trace_kprobe(ev); - return strcmp(trace_probe_name(&tk->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tk->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tk->tp), system) == 0) && trace_kprobe_match_command_head(tk, argc, argv); } @@ -708,11 +709,11 @@ static int __trace_kprobe_create(int argc, const char *argv[]) /* * Argument syntax: * - Add kprobe: - * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] * - Add kretprobe: - * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * r[MAXACTIVE][:[GRP/][EVENT]] [MOD:]KSYM[+0] [FETCHARGS] * Or - * p:[GRP/]EVENT] [MOD:]KSYM[+0]%return [FETCHARGS] + * p[:[GRP/][EVENT]] [MOD:]KSYM[+0]%return [FETCHARGS] * * Fetch args: * $retval : fetch return value @@ -739,6 +740,7 @@ static int __trace_kprobe_create(int argc, const 
char *argv[]) long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; unsigned int flags = TPARG_FL_KERNEL; switch (argv[0][0]) { @@ -833,11 +835,13 @@ static int __trace_kprobe_create(int argc, const char *argv[]) trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto parse_error; - } else { + } + + if (!event) { /* Make a new event name */ if (symbol) snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld", diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 80863c6508e5e9..850a88abd33ba2 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -257,6 +257,10 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, } len = strlen(event); if (len == 0) { + if (slash) { + *pevent = NULL; + return 0; + } trace_probe_log_err(offset, NO_EVENT_NAME); return -EINVAL; } else if (len > MAX_EVENT_NAME_LEN) { diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index c3dc4f859a6bcb..a3fec28961d62a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -312,7 +312,8 @@ static bool trace_uprobe_match(const char *system, const char *event, { struct trace_uprobe *tu = to_trace_uprobe(ev); - return strcmp(trace_probe_name(&tu->tp), event) == 0 && + return (event[0] == '\0' || + strcmp(trace_probe_name(&tu->tp), event) == 0) && (!system || strcmp(trace_probe_group_name(&tu->tp), system) == 0) && trace_uprobe_match_command_head(tu, argc, argv); } @@ -532,7 +533,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) /* * Argument syntax: - * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET[%return][(REF)] [FETCHARGS] + * - Add uprobe: p|r[:[GRP/][EVENT]] PATH:OFFSET[%return][(REF)] [FETCHARGS] */ static int __trace_uprobe_create(int argc, const char **argv) { @@ -540,6 +541,7 @@ static int 
__trace_uprobe_create(int argc, const char **argv) const char *event = NULL, *group = UPROBE_EVENT_SYSTEM; char *arg, *filename, *rctr, *rctr_end, *tmp; char buf[MAX_EVENT_NAME_LEN]; + char gbuf[MAX_EVENT_NAME_LEN]; enum probe_print_type ptype; struct path path; unsigned long offset, ref_ctr_offset; @@ -644,11 +646,13 @@ static int __trace_uprobe_create(int argc, const char **argv) /* setup a probe */ trace_probe_log_set_index(0); if (event) { - ret = traceprobe_parse_event_name(&event, &group, buf, + ret = traceprobe_parse_event_name(&event, &group, gbuf, event - argv[0]); if (ret) goto fail_address_parse; - } else { + } + + if (!event) { char *tail; char *ptr; From 9a5f84cf1296762a4d8a762610a9e0a44f499871 Mon Sep 17 00:00:00 2001 From: Linyu Yuan Date: Mon, 27 Jun 2022 10:19:08 +0800 Subject: [PATCH 0652/1250] selftests/ftrace: Add test case for GRP/ only input Add kprobe and eprobe event test for new GRP/ only format. Link: https://lore.kernel.org/all/1656296348-16111-5-git-send-email-quic_linyyuan@quicinc.com/ Acked-by: Masami Hiramatsu (Google) Reviewed-by: Tom Zanussi Signed-off-by: Linyu Yuan Signed-off-by: Steven Rostedt (Google) --- .../ftrace/test.d/dynevent/add_remove_eprobe.tc | 9 ++++++++- .../ftrace/test.d/dynevent/add_remove_kprobe.tc | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc index 60c02b482be834..c300eb0202620c 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_eprobe.tc @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 # description: Generic dynamic event - add/remove eprobe events -# requires: dynamic_events events/syscalls/sys_enter_openat "e[:[/]] . []":README +# requires: dynamic_events events/syscalls/sys_enter_openat ". 
[]":README echo 0 > events/enable @@ -87,4 +87,11 @@ echo "-:eprobes/$EPROBE $SYSTEM/$EVENT $OPTIONS" >> dynamic_events ! grep -q "$EPROBE" dynamic_events ! test -d events/eprobes/$EPROBE +if grep -q "e\[:\[/]\[]]" README; then + echo "e:mygroup/ $SYSTEM/$EVENT $OPTIONS" >> dynamic_events + test -d events/mygroup + echo "-:mygroup/" >> dynamic_events + ! test -d events/mygroup +fi + clear_trace diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc index b4da41d126d583..13d43f40a6fc6d 100644 --- a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc +++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc @@ -23,4 +23,11 @@ grep -q myevent1 dynamic_events echo > dynamic_events +if grep -q "p\[:\[/]\[]]" README; then + echo "p:mygroup/ $PLACE" >> dynamic_events + test -d events/mygroup + echo "-:mygroup/" >> dynamic_events + ! test -d events/mygroup +fi + clear_trace From 3b0043f4cae57f6a83eb59996fcb2a9f558ba9f8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Tue, 12 Jul 2022 16:17:07 -0400 Subject: [PATCH 0653/1250] selftests/kprobe: Do not test for GRP/ without event failures A new feature is added where kprobes (and other probes) do not need to explicitly state the event name when creating a probe. The event name will come from what is being attached. That is: # echo 'p:foo/ vfs_read' > kprobe_events Will no longer error, but instead create an event: # cat kprobe_events p:foo/p_vfs_read_0 vfs_read This should not be tested as an error case anymore. Remove it from the selftest as now this feature "breaks" the selftest as it no longer fails as expected. 
Link: https://lore.kernel.org/all/1656296348-16111-1-git-send-email-quic_linyyuan@quicinc.com/ Link: https://lkml.kernel.org/r/20220712161707.6dc08a14@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- .../selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index fa928b431555ca..7c02509c71d0a0 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -21,7 +21,6 @@ check_error 'p:^/bar vfs_read' # NO_GROUP_NAME check_error 'p:^12345678901234567890123456789012345678901234567890123456789012345/bar vfs_read' # GROUP_TOO_LONG check_error 'p:^foo.1/bar vfs_read' # BAD_GROUP_NAME -check_error 'p:foo/^ vfs_read' # NO_EVENT_NAME check_error 'p:foo/^12345678901234567890123456789012345678901234567890123456789012345 vfs_read' # EVENT_TOO_LONG check_error 'p:foo/^bar.1 vfs_read' # BAD_EVENT_NAME From 5f7ce3a02ae6667390e717d1afd6fedff0aaa281 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Google)" Date: Fri, 15 Jul 2022 17:55:55 -0400 Subject: [PATCH 0654/1250] tracing: Add example and documentation for new __vstring() macro Update the sample trace events to include an example that uses the new __vstring() helpers for TRACE_EVENTS. 
Link: https://lkml.kernel.org/r/20220715175555.16375a3b@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- samples/trace_events/trace-events-sample.c | 14 ++++++++-- samples/trace_events/trace-events-sample.h | 32 +++++++++++++++++++--- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 4d34dc0b0fee7d..608c4ae3b08a3a 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -19,9 +19,10 @@ static const char *random_strings[] = { "One ring to rule them all" }; -static void simple_thread_func(int cnt) +static void do_simple_thread_func(int cnt, const char *fmt, ...) { unsigned long bitmask[1] = {0xdeadbeefUL}; + va_list va; int array[6]; int len = cnt % 5; int i; @@ -33,9 +34,13 @@ static void simple_thread_func(int cnt) array[i] = i + 1; array[i] = 0; + va_start(va, fmt); + /* Silly tracepoints */ trace_foo_bar("hello", cnt, array, random_strings[len], - current->cpus_ptr); + current->cpus_ptr, fmt, &va); + + va_end(va); trace_foo_with_template_simple("HELLO", cnt); @@ -48,6 +53,11 @@ static void simple_thread_func(int cnt) trace_foo_rel_loc("Hello __rel_loc", cnt, bitmask); } +static void simple_thread_func(int cnt) +{ + do_simple_thread_func(cnt, "iter=%d", cnt); +} + static int simple_thread(void *arg) { int cnt = 0; diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index cbbbb83beced09..1a92226202fc5d 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -141,6 +141,27 @@ * In most cases, the __assign_str() macro will take the same * parameters as the __string() macro had to declare the string. * + * __vstring: This is similar to __string() but instead of taking a + * dynamic length, it takes a variable list va_list 'va' variable. 
+ * Some event callers already have a message from parameters saved + * in a va_list. Passing in the format and the va_list variable + * will save just enough on the ring buffer for that string. + * Note, the va variable used is a pointer to a va_list, not + * to the va_list directly. + * + * (va_list *va) + * + * __vstring(foo, fmt, va) is similar to: vsnprintf(foo, fmt, va) + * + * To assign the string, use the helper macro __assign_vstr(). + * + * __assign_vstr(foo, fmt, va); + * + * In most cases, the __assign_vstr() macro will take the same + * parameters as the __vstring() macro had to declare the string. + * Use __get_str() to retrieve the __vstring() just like it would for + * __string(). + * * __string_len: This is a helper to a __dynamic_array, but it understands * that the array has characters in it, and with the combined * use of __assign_str_len(), it will allocate 'len' + 1 bytes @@ -256,9 +277,10 @@ TRACE_DEFINE_ENUM(TRACE_SAMPLE_ZOO); TRACE_EVENT(foo_bar, TP_PROTO(const char *foo, int bar, const int *lst, - const char *string, const struct cpumask *mask), + const char *string, const struct cpumask *mask, + const char *fmt, va_list *va), - TP_ARGS(foo, bar, lst, string, mask), + TP_ARGS(foo, bar, lst, string, mask, fmt, va), TP_STRUCT__entry( __array( char, foo, 10 ) @@ -266,6 +288,7 @@ TRACE_EVENT(foo_bar, __dynamic_array(int, list, __length_of(lst)) __string( str, string ) __bitmask( cpus, num_possible_cpus() ) + __vstring( vstr, fmt, va ) ), TP_fast_assign( @@ -274,10 +297,11 @@ TRACE_EVENT(foo_bar, memcpy(__get_dynamic_array(list), lst, __length_of(lst) * sizeof(int)); __assign_str(str, string); + __assign_vstr(vstr, fmt, va); __assign_bitmask(cpus, cpumask_bits(mask), num_possible_cpus()); ), - TP_printk("foo %s %d %s %s %s %s (%s)", __entry->foo, __entry->bar, + TP_printk("foo %s %d %s %s %s %s (%s) %s", __entry->foo, __entry->bar, /* * Notice here the use of some helper functions. 
This includes: @@ -321,7 +345,7 @@ TRACE_EVENT(foo_bar, __print_array(__get_dynamic_array(list), __get_dynamic_array_len(list) / sizeof(int), sizeof(int)), - __get_str(str), __get_bitmask(cpus)) + __get_str(str), __get_bitmask(cpus), __get_str(vstr)) ); /* From 41065bf8138605d15393cce052c77238165392de Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Mon, 18 Jul 2022 16:05:10 +0900 Subject: [PATCH 0655/1250] selftests/kprobe: Update test for no event name syntax error Commit 208003254c32 ("selftests/kprobe: Do not test for GRP/ without event failures") removed the test for a syntax that no longer causes a syntax error (NO_EVENT_NAME error with GRP/). However, there is another case (NO_EVENT_NAME error without GRP/) that still causes the same error. This adds a test for that case. Link: https://lkml.kernel.org/r/165812790993.1377963.9762767354560397298.stgit@devnote2 Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- .../selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc index 7c02509c71d0a0..9e85d3019ff0c6 100644 --- a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc +++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc @@ -21,6 +21,7 @@ check_error 'p:^/bar vfs_read' # NO_GROUP_NAME check_error 'p:^12345678901234567890123456789012345678901234567890123456789012345/bar vfs_read' # GROUP_TOO_LONG check_error 'p:^foo.1/bar vfs_read' # BAD_GROUP_NAME +check_error 'p:^ vfs_read' # NO_EVENT_NAME check_error 'p:foo/^12345678901234567890123456789012345678901234567890123456789012345 vfs_read' # EVENT_TOO_LONG check_error 'p:foo/^bar.1 vfs_read' # BAD_EVENT_NAME From d5166a3d3786a30598facae0dc155a7a98658ad3 Mon Sep 17 00:00:00 2001 From: "Paul E.
McKenney" Date: Tue, 26 Apr 2022 17:05:39 -0700 Subject: [PATCH 0656/1250] memory-model: Prohibit nested SRCU read-side critical sections This commit prohibits nested SRCU read-side critical sections of the same srcu_struct structure. The memory model does not currently handle these correctly because it ignores the required connection between srcu_read_lock() and srcu_read_unlock() provided by the value returned from the former and passed into the latter. Signed-off-by: Paul E. McKenney --- tools/memory-model/linux-kernel.bell | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tools/memory-model/linux-kernel.bell b/tools/memory-model/linux-kernel.bell index 5be86b1025e8d6..65c32ca9d5ea2c 100644 --- a/tools/memory-model/linux-kernel.bell +++ b/tools/memory-model/linux-kernel.bell @@ -56,17 +56,11 @@ let rcu-rscs = let rec flag ~empty Rcu-lock \ domain(rcu-rscs) as unbalanced-rcu-locking flag ~empty Rcu-unlock \ range(rcu-rscs) as unbalanced-rcu-locking -(* Compute matching pairs of nested Srcu-lock and Srcu-unlock *) -let srcu-rscs = let rec - unmatched-locks = Srcu-lock \ domain(matched) - and unmatched-unlocks = Srcu-unlock \ range(matched) - and unmatched = unmatched-locks | unmatched-unlocks - and unmatched-po = ([unmatched] ; po ; [unmatched]) & loc - and unmatched-locks-to-unlocks = - ([unmatched-locks] ; po ; [unmatched-unlocks]) & loc - and matched = matched | (unmatched-locks-to-unlocks \ - (unmatched-po ; unmatched-po)) - in matched +(* Compute matching pairs of Srcu-lock and Srcu-unlock, but prohibit nesting *) +let srcu-unmatched = Srcu-lock | Srcu-unlock +let srcu-unmatched-po = ([srcu-unmatched] ; po ; [srcu-unmatched]) & loc +let srcu-unmatched-locks-to-unlock = ([Srcu-lock] ; po ; [Srcu-unlock]) & loc +let srcu-rscs = srcu-unmatched-locks-to-unlock \ (srcu-unmatched-po ; srcu-unmatched-po) (* Validate nesting *) flag ~empty Srcu-lock \ domain(srcu-rscs) as unbalanced-srcu-locking From 
3cbedec150c49afc8f24a582c683767b38acdcbc Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 16 Jun 2022 21:53:47 +0800 Subject: [PATCH 0657/1250] rcu: Fix rcu_read_unlock_strict() strict QS reporting Kernels built with CONFIG_PREEMPT=n and CONFIG_RCU_STRICT_GRACE_PERIOD=y report the quiescent state directly from the outermost rcu_read_unlock(). However, the current CPU's rcu_data structure's ->cpu_no_qs.b.norm might still be set, in which case rcu_report_qs_rdp() will exit early, thus failing to report quiescent state. This commit therefore causes rcu_read_unlock_strict() to clear CPU's rcu_data structure's ->cpu_no_qs.b.norm field before invoking rcu_report_qs_rdp(). Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 438ecae6bd7e7a..86772c95ed0aed 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -824,6 +824,7 @@ void rcu_read_unlock_strict(void) if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) return; rdp = this_cpu_ptr(&rcu_data); + rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } From 578b6339dbe5fd146b20b991c12e9db64b1f0054 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 17 Jun 2022 22:15:19 +0800 Subject: [PATCH 0658/1250] rcu/nocb: Choose the right rcuog/rcuop kthreads to output The show_rcu_nocb_gp_state() function is supposed to dump out the rcuog kthread and the show_rcu_nocb_state() function is supposed to dump out the rcuo[ps] kthread. Currently, both do a mixture, which is not optimal for debugging, even though it does not affect functionality. This commit therefore adjusts these two functions to focus on their respective kthreads. Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_nocb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index a8f574d8850d22..f20aec4f4394bd 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1452,8 +1452,8 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) (long)rdp->nocb_gp_seq, rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, - show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); + rdp->nocb_gp_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_gp_kthread)); } /* Dump out nocb kthread state for the specified rcu_data structure. */ @@ -1497,7 +1497,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], rcu_segcblist_n_cbs(&rdp->cblist), rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.', - rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1, show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); /* It is OK for GP kthreads to have GP state. */ From 3160a8db94797478a86afbcddbad4c773eb483df Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 21 Jun 2022 11:02:25 -0700 Subject: [PATCH 0659/1250] torture: Optionally flush printk() buffers before powering off The rcutorture test suite produces quite a bit of console output at the end of a test. This means that the new-in-2022 printk() kthreads are likely to be in the process of flushing output at the time of the torture_shutdown() function's call to kernel_power_off(). Normally, rcutorture relies on printk() to flush any pending output upon shutdown, the better to detect bugs in this area, for example, the one introduced by 8e274732115f ("printk: extend console_lock for per-console locking"). 
However, once such a bug is detected and reported, it is necessary to test the rest of the system, without noise from the already-reported bug. This commit therefore adds a torture.printk_shutdown_bug_workaround kernel parameter, which causes torture_shutdown() to invoke pr_flush(), and print an informative message on the console, immediately before invoking kernel_power_off(). When this kernel parameter is not specified, it is up to printk() to flush its own buffers. Suggested-by: John Ogness Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ kernel/torture.c | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4cd3ca5d09a8a1..7e34086c64f5bb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6063,6 +6063,12 @@ are running concurrently, especially on systems with rotating-rust storage. + torture.printk_shutdown_bug_workaround= [KNL] + Execute pr_flush(1000, true) just before invoking + kernel_power_off() to work around any bugs that + might prevent printk() from flushing its buffers + at shutdown time. + torture.verbose_sleep_frequency= [KNL] Specifies how many verbose printk()s should be emitted between each sleep. 
The default of zero diff --git a/kernel/torture.c b/kernel/torture.c index 789aeb0e1159c6..7cd2016b020764 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -48,6 +48,9 @@ module_param(disable_onoff_at_boot, bool, 0444); static bool ftrace_dump_at_shutdown; module_param(ftrace_dump_at_shutdown, bool, 0444); +static bool printk_shutdown_bug_workaround; +module_param(printk_shutdown_bug_workaround, bool, 0444); + static int verbose_sleep_frequency; module_param(verbose_sleep_frequency, int, 0444); @@ -651,6 +654,10 @@ static int torture_shutdown(void *arg) VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); if (ftrace_dump_at_shutdown) rcu_ftrace_dump(DUMP_ALL); + if (printk_shutdown_bug_workaround) { + pr_info("%s: Flushing printk() buffers at power-down time.\n", __func__); + pr_flush(1000, true); + } kernel_power_off(); /* Shut down the system. */ return 0; } From fdd717cddc002c743a46e7416f97e729e42fc5c3 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Mon, 20 Jun 2022 17:17:49 +0900 Subject: [PATCH 0660/1250] docs/memory-barriers.txt: Fix confusing name of 'data dependency barrier' The term "data dependency barrier", which has been in memory-barriers.txt ever since it was first authored by David Howells, has become confusing due to the fact that in LKMM's explanations.txt and elsewhere, "data dependency" is used mostly for load-to-store data dependency. To prevent further confusions, do the changes listed below: - substitute "data dependency barrier" with "address-dependency barrier"; - add note on the removal of kernel APIs for explicit address- dependency barriers in kernel release v5.9; - note that address-dependency barriers are not necessary for load-to-store situations; - use READ_ONCE_OLD() for pre-4.15 READ_ONCE() (no implicit address- dependency barrier); - fix count of kernel memory barrier APIs; - and a few more context adjustments. Note: Cleanups of long lines are deferred to a followup patch. Reported-by: "Michael S. 
Tsirkin" Link: https://lore.kernel.org/r/20211011064233-mutt-send-email-mst@kernel.org/ Signed-off-by: Akira Yokosawa Cc: "Paul E. McKenney" Cc: Alan Stern Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: Andrea Parri Cc: Nicholas Piggin Cc: David Howells Cc: Daniel Lustig Cc: Joel Fernandes Cc: Jonathan Corbet Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 116 ++++++++++++++++-------------- 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index b12df9137e1c17..bdbea3cc66a3ee 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -52,7 +52,7 @@ CONTENTS - Varieties of memory barrier. - What may not be assumed about memory barriers? - - Data dependency barriers (historical). + - Address-dependency barriers (historical). - Control dependencies. - SMP barrier pairing. - Examples of memory barrier sequences. @@ -187,7 +187,7 @@ As a further example, consider this sequence of events: B = 4; Q = P; P = &B; D = *Q; -There is an obvious data dependency here, as the value loaded into D depends on +There is an obvious address dependency here, as the value loaded into D depends on the address retrieved from P by CPU 2. At the end of the sequence, any of the following results are possible: @@ -391,49 +391,53 @@ Memory barriers come in four basic varieties: memory system as time progresses. All stores _before_ a write barrier will occur _before_ all the stores after the write barrier. - [!] Note that write barriers should normally be paired with read or data - dependency barriers; see the "SMP barrier pairing" subsection. + [!] Note that write barriers should normally be paired with read or + address-dependency barriers; see the "SMP barrier pairing" subsection. - (2) Data dependency barriers. + (2) Address-dependency barriers (historical). - A data dependency barrier is a weaker form of read barrier. 
In the case + An address-dependency barrier is a weaker form of read barrier. In the case where two loads are performed such that the second depends on the result of the first (eg: the first load retrieves the address to which the second - load will be directed), a data dependency barrier would be required to + load will be directed), an address-dependency barrier would be required to make sure that the target of the second load is updated after the address obtained by the first load is accessed. - A data dependency barrier is a partial ordering on interdependent loads + An address-dependency barrier is a partial ordering on interdependent loads only; it is not required to have any effect on stores, independent loads or overlapping loads. As mentioned in (1), the other CPUs in the system can be viewed as committing sequences of stores to the memory system that the CPU being - considered can then perceive. A data dependency barrier issued by the CPU + considered can then perceive. An address-dependency barrier issued by the CPU under consideration guarantees that for any load preceding it, if that load touches one of a sequence of stores from another CPU, then by the time the barrier completes, the effects of all the stores prior to that - touched by the load will be perceptible to any loads issued after the data + touched by the load will be perceptible to any loads issued after the address- dependency barrier. See the "Examples of memory barrier sequences" subsection for diagrams showing the ordering constraints. - [!] Note that the first load really has to have a _data_ dependency and + [!] Note that the first load really has to have an _address_ dependency and not a control dependency. If the address for the second load is dependent on the first load, but the dependency is through a conditional rather than actually loading the address itself, then it's a _control_ dependency and a full read barrier or better is required. 
See the "Control dependencies" subsection for more information. - [!] Note that data dependency barriers should normally be paired with + [!] Note that address-dependency barriers should normally be paired with write barriers; see the "SMP barrier pairing" subsection. + [!] Kernel release v5.9 removed kernel APIs for explicit address- + dependency barriers. Nowadays, APIs for marking loads from shared + variables such as READ_ONCE() and rcu_dereference() provide implicit + address-dependency barriers. (3) Read (or load) memory barriers. - A read barrier is a data dependency barrier plus a guarantee that all the + A read barrier is an address-dependency barrier plus a guarantee that all the LOAD operations specified before the barrier will appear to happen before all the LOAD operations specified after the barrier with respect to the other components of the system. @@ -441,7 +445,7 @@ Memory barriers come in four basic varieties: A read barrier is a partial ordering on loads only; it is not required to have any effect on stores. - Read memory barriers imply data dependency barriers, and so can substitute + Read memory barriers imply address-dependency barriers, and so can substitute for them. [!] Note that read barriers should normally be paired with write barriers; @@ -550,17 +554,21 @@ There are certain things that the Linux kernel memory barriers do not guarantee: Documentation/core-api/dma-api.rst -DATA DEPENDENCY BARRIERS (HISTORICAL) -------------------------------------- +ADDRESS-DEPENDENCY BARRIERS (HISTORICAL) +---------------------------------------- As of v4.15 of the Linux kernel, an smp_mb() was added to READ_ONCE() for DEC Alpha, which means that about the only people who need to pay attention to this section are those working on DEC Alpha architecture-specific code and those working on READ_ONCE() itself. For those who need it, and for those who are interested in the history, here is the story of -data-dependency barriers. 
+address-dependency barriers. + +[!] While address dependencies are observed in both load-to-load and +load-to-store relations, address-dependency barriers are not necessary +for load-to-store situations. -The usage requirements of data dependency barriers are a little subtle, and +The requirement of address-dependency barriers is a little subtle, and it's not always obvious that they're needed. To illustrate, consider the following sequence of events: @@ -570,10 +578,13 @@ following sequence of events: B = 4; WRITE_ONCE(P, &B); - Q = READ_ONCE(P); + Q = READ_ONCE_OLD(P); D = *Q; -There's a clear data dependency here, and it would seem that by the end of the +[!] READ_ONCE_OLD() corresponds to READ_ONCE() of pre-4.15 kernel, which +doesn't imply an address-dependency barrier. + +There's a clear address dependency here, and it would seem that by the end of the sequence, Q must be either &A or &B, and that: (Q == &A) implies (D == 1) @@ -588,8 +599,8 @@ While this may seem like a failure of coherency or causality maintenance, it isn't, and this behaviour can be observed on certain real CPUs (such as the DEC Alpha). -To deal with this, a data dependency barrier or better must be inserted -between the address load and the data load: +To deal with this, READ_ONCE() provides an implicit address-dependency +barrier since kernel release v4.15: CPU 1 CPU 2 =============== =============== @@ -598,7 +609,7 @@ between the address load and the data load: WRITE_ONCE(P, &B); Q = READ_ONCE(P); - + D = *Q; This enforces the occurrence of one of the two implications, and prevents the @@ -615,7 +626,7 @@ odd-numbered bank is idle, one can see the new value of the pointer P (&B), but the old value of the variable B (2). 
-A data-dependency barrier is not required to order dependent writes +An address-dependency barrier is not required to order dependent writes because the CPUs that the Linux kernel supports don't do writes until they are certain (1) that the write will actually happen, (2) of the location of the write, and (3) of the value to be written. @@ -629,12 +640,12 @@ break dependencies in a great many highly creative ways. B = 4; WRITE_ONCE(P, &B); - Q = READ_ONCE(P); + Q = READ_ONCE_OLD(P); WRITE_ONCE(*Q, 5); -Therefore, no data-dependency barrier is required to order the read into +Therefore, no address-dependency barrier is required to order the read into Q with the store into *Q. In other words, this outcome is prohibited, -even without a data-dependency barrier: +even without an implicit address-dependency barrier of modern READ_ONCE(): (Q == &B) && (B == 4) @@ -645,12 +656,12 @@ can be used to record rare error conditions and the like, and the CPUs' naturally occurring ordering prevents such records from being lost. -Note well that the ordering provided by a data dependency is local to +Note well that the ordering provided by an address dependency is local to the CPU containing it. See the section on "Multicopy atomicity" for more information. -The data dependency barrier is very important to the RCU system, +The address-dependency barrier is very important to the RCU system, for example. See rcu_assign_pointer() and rcu_dereference() in include/linux/rcupdate.h. This permits the current target of an RCU'd pointer to be replaced with a new modified target, without the replacement @@ -667,16 +678,17 @@ not understand them. The purpose of this section is to help you prevent the compiler's ignorance from breaking your code. A load-load control dependency requires a full read memory barrier, not -simply a data dependency barrier to make it work correctly. Consider the +simply an (implicit) address-dependency barrier to make it work correctly. 
Consider the following bit of code: q = READ_ONCE(a); + if (q) { - /* BUG: No data dependency!!! */ + /* BUG: No address dependency!!! */ p = READ_ONCE(b); } -This will not have the desired effect because there is no actual data +This will not have the desired effect because there is no actual address dependency, but rather a control dependency that the CPU may short-circuit by attempting to predict the outcome in advance, so that other CPUs see the load from b as having happened before the load from a. In such a @@ -927,9 +939,9 @@ General barriers pair with each other, though they also pair with most other types of barriers, albeit without multicopy atomicity. An acquire barrier pairs with a release barrier, but both may also pair with other barriers, including of course general barriers. A write barrier pairs -with a data dependency barrier, a control dependency, an acquire barrier, +with an address-dependency barrier, a control dependency, an acquire barrier, a release barrier, a read barrier, or a general barrier. Similarly a -read barrier, control dependency, or a data dependency barrier pairs +read barrier, control dependency, or an address-dependency barrier pairs with a write barrier, an acquire barrier, a release barrier, or a general barrier: @@ -948,7 +960,7 @@ Or: a = 1; WRITE_ONCE(b, &a); x = READ_ONCE(b); - + y = *x; Or even: @@ -968,7 +980,7 @@ Basically, the read barrier always has to be there, even though it can be of the "weaker" type. [!] 
Note that the stores before the write barrier would normally be expected to -match the loads after the read barrier or the data dependency barrier, and vice +match the loads after the read barrier or the address-dependency barrier, and vice versa: CPU 1 CPU 2 @@ -1021,7 +1033,7 @@ STORE B, STORE C } all occurring before the unordered set of { STORE D, STORE E V -Secondly, data dependency barriers act as partial orderings on data-dependent +Secondly, address-dependency barriers act as partial orderings on address-dependent loads. Consider the following sequence of events: CPU 1 CPU 2 @@ -1067,7 +1079,7 @@ effectively random order, despite the write barrier issued by CPU 1: In the above example, CPU 2 perceives that B is 7, despite the load of *C (which would be B) coming after the LOAD of C. -If, however, a data dependency barrier were to be placed between the load of C +If, however, an address-dependency barrier were to be placed between the load of C and the load of *C (ie: B) on CPU 2: CPU 1 CPU 2 @@ -1078,7 +1090,7 @@ and the load of *C (ie: B) on CPU 2: STORE C = &B LOAD X STORE D = 4 LOAD C (gets &B) - + LOAD *C (reads B) then the following will occur: @@ -1101,7 +1113,7 @@ then the following will occur: | +-------+ | | | | X->9 |------>| | | +-------+ | | - Makes sure all effects ---> \ ddddddddddddddddd | | + Makes sure all effects ---> \ aaaaaaaaaaaaaaaaa | | prior to the store of C \ +-------+ | | are perceptible to ----->| B->2 |------>| | subsequent loads +-------+ | | @@ -1292,7 +1304,7 @@ Which might appear as this: LOAD with immediate effect : : +-------+ -Placing a read barrier or a data dependency barrier just before the second +Placing a read barrier or an address-dependency barrier just before the second load: CPU 1 CPU 2 @@ -1816,20 +1828,20 @@ which may then reorder things however it wishes. 
CPU MEMORY BARRIERS ------------------- -The Linux kernel has eight basic CPU memory barriers: +The Linux kernel has seven basic CPU memory barriers: - TYPE MANDATORY SMP CONDITIONAL - =============== ======================= =========================== - GENERAL mb() smp_mb() - WRITE wmb() smp_wmb() - READ rmb() smp_rmb() - DATA DEPENDENCY READ_ONCE() + TYPE MANDATORY SMP CONDITIONAL + ======================= =============== =============== + GENERAL mb() smp_mb() + WRITE wmb() smp_wmb() + READ rmb() smp_rmb() + ADDRESS DEPENDENCY READ_ONCE() -All memory barriers except the data dependency barriers imply a compiler -barrier. Data dependencies do not impose any additional compiler ordering. +All memory barriers except the address-dependency barriers imply a compiler +barrier. Address dependencies do not impose any additional compiler ordering. -Aside: In the case of data dependencies, the compiler would be expected +Aside: In the case of address dependencies, the compiler would be expected to issue the loads in the correct order (eg. `a[b]` would have to load the value of b before loading a[b]), however there is no guarantee in the C specification that the compiler may not speculate the value of b @@ -2888,7 +2900,7 @@ AND THEN THERE'S THE ALPHA The DEC Alpha CPU is one of the most relaxed CPUs there is. Not only that, some versions of the Alpha CPU have a split data cache, permitting them to have two semantically-related cache lines updated at separate times. This is where -the data dependency barrier really becomes necessary as this synchronises both +the address-dependency barrier really becomes necessary as this synchronises both caches with the memory coherence system, thus making it seem like pointer changes vs new data occur in the right order. 
From e7daf3957cecb6009b092e2e15d23acffd631e98 Mon Sep 17 00:00:00 2001 From: Akira Yokosawa Date: Mon, 20 Jun 2022 17:19:35 +0900 Subject: [PATCH 0661/1250] docs/memory-barriers.txt: Fixup long lines Substitution of "data dependency barrier" with "address-dependency barrier" left quite a lot of lines exceeding 80 columns. Reflow those lines as well as a few short ones not related to the substitution. No changes in documentation text. Signed-off-by: Akira Yokosawa Cc: "Paul E. McKenney" Cc: Alan Stern Cc: Will Deacon Cc: Peter Zijlstra Cc: Boqun Feng Cc: Andrea Parri Cc: Nicholas Piggin Cc: David Howells Cc: Daniel Lustig Cc: Joel Fernandes Cc: "Michael S. Tsirkin" Cc: Jonathan Corbet Signed-off-by: Paul E. McKenney --- Documentation/memory-barriers.txt | 93 ++++++++++++++++--------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index bdbea3cc66a3ee..334b3768912711 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -187,9 +187,9 @@ As a further example, consider this sequence of events: B = 4; Q = P; P = &B; D = *Q; -There is an obvious address dependency here, as the value loaded into D depends on -the address retrieved from P by CPU 2. At the end of the sequence, any of the -following results are possible: +There is an obvious address dependency here, as the value loaded into D depends +on the address retrieved from P by CPU 2. At the end of the sequence, any of +the following results are possible: (Q == &A) and (D == 1) (Q == &B) and (D == 2) @@ -397,25 +397,25 @@ Memory barriers come in four basic varieties: (2) Address-dependency barriers (historical). - An address-dependency barrier is a weaker form of read barrier. 
In the case - where two loads are performed such that the second depends on the result - of the first (eg: the first load retrieves the address to which the second - load will be directed), an address-dependency barrier would be required to - make sure that the target of the second load is updated after the address - obtained by the first load is accessed. + An address-dependency barrier is a weaker form of read barrier. In the + case where two loads are performed such that the second depends on the + result of the first (eg: the first load retrieves the address to which + the second load will be directed), an address-dependency barrier would + be required to make sure that the target of the second load is updated + after the address obtained by the first load is accessed. - An address-dependency barrier is a partial ordering on interdependent loads - only; it is not required to have any effect on stores, independent loads - or overlapping loads. + An address-dependency barrier is a partial ordering on interdependent + loads only; it is not required to have any effect on stores, independent + loads or overlapping loads. As mentioned in (1), the other CPUs in the system can be viewed as committing sequences of stores to the memory system that the CPU being - considered can then perceive. An address-dependency barrier issued by the CPU - under consideration guarantees that for any load preceding it, if that - load touches one of a sequence of stores from another CPU, then by the - time the barrier completes, the effects of all the stores prior to that - touched by the load will be perceptible to any loads issued after the address- - dependency barrier. + considered can then perceive. 
An address-dependency barrier issued by + the CPU under consideration guarantees that for any load preceding it, + if that load touches one of a sequence of stores from another CPU, then + by the time the barrier completes, the effects of all the stores prior to + that touched by the load will be perceptible to any loads issued after + the address-dependency barrier. See the "Examples of memory barrier sequences" subsection for diagrams showing the ordering constraints. @@ -437,16 +437,16 @@ Memory barriers come in four basic varieties: (3) Read (or load) memory barriers. - A read barrier is an address-dependency barrier plus a guarantee that all the - LOAD operations specified before the barrier will appear to happen before - all the LOAD operations specified after the barrier with respect to the - other components of the system. + A read barrier is an address-dependency barrier plus a guarantee that all + the LOAD operations specified before the barrier will appear to happen + before all the LOAD operations specified after the barrier with respect to + the other components of the system. A read barrier is a partial ordering on loads only; it is not required to have any effect on stores. - Read memory barriers imply address-dependency barriers, and so can substitute - for them. + Read memory barriers imply address-dependency barriers, and so can + substitute for them. [!] Note that read barriers should normally be paired with write barriers; see the "SMP barrier pairing" subsection. @@ -584,8 +584,8 @@ following sequence of events: [!] READ_ONCE_OLD() corresponds to READ_ONCE() of pre-4.15 kernel, which doesn't imply an address-dependency barrier. 
-There's a clear address dependency here, and it would seem that by the end of the -sequence, Q must be either &A or &B, and that: +There's a clear address dependency here, and it would seem that by the end of +the sequence, Q must be either &A or &B, and that: (Q == &A) implies (D == 1) (Q == &B) implies (D == 4) @@ -599,8 +599,8 @@ While this may seem like a failure of coherency or causality maintenance, it isn't, and this behaviour can be observed on certain real CPUs (such as the DEC Alpha). -To deal with this, READ_ONCE() provides an implicit address-dependency -barrier since kernel release v4.15: +To deal with this, READ_ONCE() provides an implicit address-dependency barrier +since kernel release v4.15: CPU 1 CPU 2 =============== =============== @@ -627,12 +627,12 @@ but the old value of the variable B (2). An address-dependency barrier is not required to order dependent writes -because the CPUs that the Linux kernel supports don't do writes -until they are certain (1) that the write will actually happen, (2) -of the location of the write, and (3) of the value to be written. +because the CPUs that the Linux kernel supports don't do writes until they +are certain (1) that the write will actually happen, (2) of the location of +the write, and (3) of the value to be written. But please carefully read the "CONTROL DEPENDENCIES" section and the -Documentation/RCU/rcu_dereference.rst file: The compiler can and does -break dependencies in a great many highly creative ways. +Documentation/RCU/rcu_dereference.rst file: The compiler can and does break +dependencies in a great many highly creative ways. CPU 1 CPU 2 =============== =============== @@ -678,8 +678,8 @@ not understand them. The purpose of this section is to help you prevent the compiler's ignorance from breaking your code. A load-load control dependency requires a full read memory barrier, not -simply an (implicit) address-dependency barrier to make it work correctly. 
Consider the -following bit of code: +simply an (implicit) address-dependency barrier to make it work correctly. +Consider the following bit of code: q = READ_ONCE(a); @@ -691,8 +691,8 @@ following bit of code: This will not have the desired effect because there is no actual address dependency, but rather a control dependency that the CPU may short-circuit by attempting to predict the outcome in advance, so that other CPUs see -the load from b as having happened before the load from a. In such a -case what's actually required is: +the load from b as having happened before the load from a. In such a case +what's actually required is: q = READ_ONCE(a); if (q) { @@ -980,8 +980,8 @@ Basically, the read barrier always has to be there, even though it can be of the "weaker" type. [!] Note that the stores before the write barrier would normally be expected to -match the loads after the read barrier or the address-dependency barrier, and vice -versa: +match the loads after the read barrier or the address-dependency barrier, and +vice versa: CPU 1 CPU 2 =================== =================== @@ -1033,8 +1033,8 @@ STORE B, STORE C } all occurring before the unordered set of { STORE D, STORE E V -Secondly, address-dependency barriers act as partial orderings on address-dependent -loads. Consider the following sequence of events: +Secondly, address-dependency barriers act as partial orderings on address- +dependent loads. Consider the following sequence of events: CPU 1 CPU 2 ======================= ======================= @@ -1079,8 +1079,8 @@ effectively random order, despite the write barrier issued by CPU 1: In the above example, CPU 2 perceives that B is 7, despite the load of *C (which would be B) coming after the LOAD of C. 
-If, however, an address-dependency barrier were to be placed between the load of C -and the load of *C (ie: B) on CPU 2: +If, however, an address-dependency barrier were to be placed between the load +of C and the load of *C (ie: B) on CPU 2: CPU 1 CPU 2 ======================= ======================= @@ -2760,7 +2760,8 @@ is discarded from the CPU's cache and reloaded. To deal with this, the appropriate part of the kernel must invalidate the overlapping bits of the cache on each CPU. -See Documentation/core-api/cachetlb.rst for more information on cache management. +See Documentation/core-api/cachetlb.rst for more information on cache +management. CACHE COHERENCY VS MMIO @@ -2900,8 +2901,8 @@ AND THEN THERE'S THE ALPHA The DEC Alpha CPU is one of the most relaxed CPUs there is. Not only that, some versions of the Alpha CPU have a split data cache, permitting them to have two semantically-related cache lines updated at separate times. This is where -the address-dependency barrier really becomes necessary as this synchronises both -caches with the memory coherence system, thus making it seem like pointer +the address-dependency barrier really becomes necessary as this synchronises +both caches with the memory coherence system, thus making it seem like pointer changes vs new data occur in the right order. The Alpha defines the Linux kernel's memory model, although as of v4.15 From 279fe0d1bdaade2745f1ac0f177738c44aa20181 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 22 Jun 2022 13:47:11 +0200 Subject: [PATCH 0662/1250] rcu: Back off upon fill_page_cache_func() allocation failure The fill_page_cache_func() function allocates couple of pages to store kvfree_rcu_bulk_data structures. This is a lightweight (GFP_NORETRY) allocation which can fail under memory pressure. The function will, however keep retrying even when the previous attempt has failed. 
This retrying is in theory correct, but in practice the allocation is invoked from workqueue context, which means that if the memory reclaim gets stuck, these retries can hog the worker for quite some time. Although the workqueues subsystem automatically adjusts concurrency, such adjustment is not guaranteed to happen until the worker context sleeps. And the fill_page_cache_func() function's retry loop is not guaranteed to sleep (see the should_reclaim_retry() function). And we have seen this function cause workqueue lockups: kernel: BUG: workqueue lockup - pool cpus=93 node=1 flags=0x1 nice=0 stuck for 32s! [...] kernel: pool 74: cpus=37 node=0 flags=0x1 nice=0 hung=32s workers=2 manager: 2146 kernel: pwq 498: cpus=249 node=1 flags=0x1 nice=0 active=4/256 refcnt=5 kernel: in-flight: 1917:fill_page_cache_func kernel: pending: dbs_work_handler, free_work, kfree_rcu_monitor Originally, we thought that the root cause of this lockup was several retries with direct reclaim, but this is not yet confirmed. Furthermore, we have seen similar lockups without any heavy memory pressure. This suggests that there are other factors contributing to these lockups. However, it is not really clear that endless retries are desirable. So let's make the fill_page_cache_func() function back off after allocation failure. Cc: Uladzislau Rezki (Sony) Cc: "Paul E. McKenney" Cc: Frederic Weisbecker Cc: Neeraj Upadhyay Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Signed-off-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8240c4d631eef1..89cb173a1a9522 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3183,15 +3183,16 @@ static void fill_page_cache_func(struct work_struct *work) bnode = (struct kvfree_rcu_bulk_data *) __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); - if (bnode) { - raw_spin_lock_irqsave(&krcp->lock, flags); - pushed = put_cached_bnode(krcp, bnode); - raw_spin_unlock_irqrestore(&krcp->lock, flags); + if (!bnode) + break; - if (!pushed) { - free_page((unsigned long) bnode); - break; - } + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; } } From 173cdcd7c66c093bf594abc6154d9898bde2833c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 22 Jun 2022 22:51:02 +0000 Subject: [PATCH 0663/1250] rcu/kfree: Fix kfree_rcu_shrink_count() return value As per the comments in include/linux/shrinker.h, .count_objects callback should return the number of freeable items, but if there are no objects to free, SHRINK_EMPTY should be returned. The only time 0 is returned should be when we are unable to determine the number of objects, or the cache should be skipped for another reason. Signed-off-by: Joel Fernandes (Google) Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 89cb173a1a9522..b2c499f89e9ffa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3372,7 +3372,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) atomic_set(&krcp->backoff_page_cache_fill, 1); } - return count; + return count == 0 ? 
SHRINK_EMPTY : count; } static unsigned long From b575b66c0ebb418738f5816388155b5701df50d0 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 20 Jun 2022 14:42:24 +0800 Subject: [PATCH 0664/1250] rcu: Update rcu_preempt_deferred_qs() comments for !PREEMPT kernels In non-preemptible kernels, tasks never do context switches within RCU read-side critical sections. Therefore, in such kernels, each leaf rcu_node structure's ->blkd_tasks list will always be empty. The comment on the non-preemptible version of rcu_preempt_deferred_qs() confuses this point, so this commit fixes it. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 86772c95ed0aed..4152816dd29f63 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -932,10 +932,13 @@ static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t) return false; } -// Except that we do need to respond to a request by an expedited grace -// period for a quiescent state from this CPU. Note that requests from -// tasks are handled when removing the task from the blocked-tasks list -// below. +// Except that we do need to respond to a request by an expedited +// grace period for a quiescent state from this CPU. Note that in +// non-preemptible kernels, there can be no context switches within RCU +// read-side critical sections, which in turn means that the leaf rcu_node +// structure's blocked-tasks list is always empty. There is therefore no need to +// actually check it. Instead, a quiescent state from this CPU suffices, +// and this function is only called from such a quiescent state.
notrace void rcu_preempt_deferred_qs(struct task_struct *t) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); From 9ecc2c92b9ab1fefef3fedf5128fc11b295a21e7 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 30 Jun 2022 18:33:35 +0200 Subject: [PATCH 0665/1250] rcu/kvfree: Update KFREE_DRAIN_JIFFIES interval Currently the monitor work is scheduled with a fixed interval of HZ/50, which is roughly 20 milliseconds. The drawback of this approach is low utilization of the 512 page slots in scenarios with infrequent kvfree_rcu() calls. For example on an Android system: kworker/3:3-507 [003] .... 470.286305: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000d0f0dde5 nr_records=6 kworker/6:1-76 [006] .... 470.416613: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000ea0d6556 nr_records=1 kworker/6:1-76 [006] .... 470.416625: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000003e025849 nr_records=9 kworker/3:3-507 [003] .... 471.390000: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000815a8713 nr_records=48 kworker/1:1-73 [001] .... 471.725785: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000fda9bf20 nr_records=3 kworker/1:1-73 [001] .... 471.725833: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000a425b67b nr_records=76 kworker/0:4-1411 [000] .... 472.085673: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000007996be9d nr_records=1 kworker/0:4-1411 [000] .... 472.085728: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000d0f0dde5 nr_records=5 kworker/6:1-76 [006] .... 472.260340: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x0000000065630ee4 nr_records=102 In many cases, out of 512 slots, fewer than 10 were actually used. In order to improve batching and make utilization more efficient this commit sets the drain interval to a fixed 5-second interval.
Floods are detected when a page fills quickly, and in that case, the reclaim work is re-scheduled for the next scheduling-clock tick (jiffy). After this change: kworker/7:1-371 [007] .... 5630.725708: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000005ab0ffb3 nr_records=121 kworker/7:1-371 [007] .... 5630.989702: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x0000000060c84761 nr_records=47 kworker/7:1-371 [007] .... 5630.989714: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000000babf308 nr_records=510 kworker/7:1-371 [007] .... 5631.553790: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000bb7bd0ef nr_records=169 kworker/7:1-371 [007] .... 5631.553808: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x0000000044c78753 nr_records=510 kworker/5:6-9428 [005] .... 5631.746102: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000d98519aa nr_records=123 kworker/4:7-9434 [004] .... 5632.001758: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x00000000526c9d44 nr_records=322 kworker/4:7-9434 [004] .... 5632.002073: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000002c6a8afa nr_records=185 kworker/7:1-371 [007] .... 5632.277515: rcu_invoke_kfree_bulk_callback: rcu_preempt bulk=0x000000007f4a962f nr_records=510 Here, in all but one of the cases, more than one hundred slots were used, representing an order-of-magnitude improvement. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b2c499f89e9ffa..84d2817766888f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2832,7 +2832,7 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch.
*/ -#define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_DRAIN_JIFFIES (5 * HZ) #define KFREE_N_BATCHES 2 #define FREE_N_CHANNELS 2 @@ -3093,6 +3093,21 @@ need_offload_krc(struct kfree_rcu_cpu *krcp) return !!krcp->head; } +static void +schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) +{ + long delay, delay_left; + + delay = READ_ONCE(krcp->count) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES; + if (delayed_work_pending(&krcp->monitor_work)) { + delay_left = krcp->monitor_work.timer.expires - jiffies; + if (delay < delay_left) + mod_delayed_work(system_wq, &krcp->monitor_work, delay); + return; + } + queue_delayed_work(system_wq, &krcp->monitor_work, delay); +} + /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ @@ -3150,7 +3165,7 @@ static void kfree_rcu_monitor(struct work_struct *work) // work to repeat an attempt. Because previous batches are // still in progress. if (need_offload_krc(krcp)) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_monitor_work(krcp); raw_spin_unlock_irqrestore(&krcp->lock, flags); } @@ -3339,7 +3354,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) // Set timer to drain after KFREE_DRAIN_JIFFIES. 
if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_monitor_work(krcp); unlock_return: krc_this_cpu_unlock(krcp, flags); @@ -3415,7 +3430,7 @@ void __init kfree_rcu_scheduler_running(void) raw_spin_lock_irqsave(&krcp->lock, flags); if (need_offload_krc(krcp)) - schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_monitor_work(krcp); raw_spin_unlock_irqrestore(&krcp->lock, flags); } } From 3fa1f17d5ca97527bd18da012f1e2072ec1fbbff Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 5 Jul 2022 12:09:51 -0700 Subject: [PATCH 0666/1250] rcu: Add QS check in rcu_exp_handler() for non-preemptible kernels Kernels built with CONFIG_PREEMPTION=n and CONFIG_PREEMPT_COUNT=y maintain preempt_count() state. Because such kernels map __rcu_read_lock() and __rcu_read_unlock() to preempt_disable() and preempt_enable(), respectively, this allows the expedited grace period's !CONFIG_PREEMPT_RCU version of the rcu_exp_handler() IPI handler function to use preempt_count() to detect quiescent states. This preempt_count() usage might seem to risk failures due to use of implicit RCU readers in portions of the kernel under #ifndef CONFIG_PREEMPTION, except that rcu_core() already disallows such implicit RCU readers. The moral of this story is that you must use explicit read-side markings such as rcu_read_lock() or preempt_disable() even if the code knows that this kernel does not support preemption. This commit therefore adds a preempt_count()-based check for a quiescent state in the !CONFIG_PREEMPT_RCU version of the rcu_exp_handler() function for kernels built with CONFIG_PREEMPT_COUNT=y, reporting an immediate quiescent state when the interrupted code had both preemption and softirqs enabled. This change results in about a 2% reduction in expedited grace-period latency in kernels built with both CONFIG_PREEMPT_RCU=n and CONFIG_PREEMPT_COUNT=y. 
Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Link: https://lore.kernel.org/all/20220622103549.2840087-1-qiang1.zhang@intel.com/ --- kernel/rcu/tree_exp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index be667583a5547e..b07998159d1fa3 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -828,11 +828,13 @@ static void rcu_exp_handler(void *unused) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)); if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; - if (rcu_is_cpu_rrupt_from_idle()) { + if (rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && preempt_bh_enabled)) { rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); return; } From cd3cfb271c997980df9fd0aeecd79a74362f6b39 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 5 Jul 2022 12:15:35 -0700 Subject: [PATCH 0667/1250] doc: Emphasize the need for explicit RCU read-side markers This commit updates checklist.rst to emphasize the need for explicit markers for RCU read-side critical sections. Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 42cc5d891bd26e..a1cff2ce940666 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -66,8 +66,13 @@ over a rather long period of time, but improvements are always welcome! As a rough rule of thumb, any dereference of an RCU-protected pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(), rcu_read_lock_sched(), or by the appropriate update-side lock. - Disabling of preemption can serve as rcu_read_lock_sched(), but - is less readable and prevents lockdep from detecting locking issues. 
+ Explicit disabling of preemption (preempt_disable(), for example) + can serve as rcu_read_lock_sched(), but is less readable and + prevents lockdep from detecting locking issues. + + Please note that you *cannot* rely on code known to be built + only in non-preemptible kernels. Such code can and will break, + especially in kernels built with CONFIG_PREEMPT_COUNT=y. Letting RCU-protected pointers "leak" out of an RCU read-side critical section is every bit as bad as letting them leak out From 5f13c9cc5a326992d753a004cbdf1e3d3d7afb33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20Heidekr=C3=BCger?= Date: Tue, 14 Jun 2022 15:48:11 +0000 Subject: [PATCH 0668/1250] tools/memory-model: Clarify LKMM's limitations in litmus-tests.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As discussed, clarify LKMM not recognizing certain kinds of orderings. In particular, highlight the fact that LKMM might deliberately make weaker guarantees than compilers and architectures. Link: https://lore.kernel.org/all/YpoW1deb%2FQeeszO1@ethstick13.dse.in.tum.de/T/#u Co-developed-by: Alan Stern Signed-off-by: Alan Stern Signed-off-by: Paul Heidekrüger Reviewed-by: Marco Elver Reviewed-by: Joel Fernandes (Google) Cc: Charalampos Mainas Cc: Pramod Bhatotia Cc: Soham Chakraborty Cc: Martin Fink Signed-off-by: Paul E. McKenney --- .../Documentation/litmus-tests.txt | 37 ++++++++++++++----- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/tools/memory-model/Documentation/litmus-tests.txt b/tools/memory-model/Documentation/litmus-tests.txt index 8a9d5d2787f9e9..cc355999815cb7 100644 --- a/tools/memory-model/Documentation/litmus-tests.txt +++ b/tools/memory-model/Documentation/litmus-tests.txt @@ -946,22 +946,39 @@ Limitations of the Linux-kernel memory model (LKMM) include: carrying a dependency, then the compiler can break that dependency by substituting a constant of that value.
- Conversely, LKMM sometimes doesn't recognize that a particular - optimization is not allowed, and as a result, thinks that a - dependency is not present (because the optimization would break it). - The memory model misses some pretty obvious control dependencies - because of this limitation. A simple example is: + Conversely, LKMM will sometimes overestimate the amount of + reordering compilers and CPUs can carry out, leading it to miss + some pretty obvious cases of ordering. A simple example is: r1 = READ_ONCE(x); if (r1 == 0) smp_mb(); WRITE_ONCE(y, 1); - There is a control dependency from the READ_ONCE to the WRITE_ONCE, - even when r1 is nonzero, but LKMM doesn't realize this and thinks - that the write may execute before the read if r1 != 0. (Yes, that - doesn't make sense if you think about it, but the memory model's - intelligence is limited.) + The WRITE_ONCE() does not depend on the READ_ONCE(), and as a + result, LKMM does not claim ordering. However, even though no + dependency is present, the WRITE_ONCE() will not be executed before + the READ_ONCE(). There are two reasons for this: + + The presence of the smp_mb() in one of the branches + prevents the compiler from moving the WRITE_ONCE() + up before the "if" statement, since the compiler has + to assume that r1 will sometimes be 0 (but see the + comment below); + + CPUs do not execute stores before po-earlier conditional + branches, even in cases where the store occurs after the + two arms of the branch have recombined. + + It is clear that it is not dangerous in the slightest for LKMM to + make weaker guarantees than architectures. In fact, it is + desirable, as it gives compilers room for making optimizations. + For instance, suppose that a 0 value in r1 would trigger undefined + behavior elsewhere. Then a clever compiler might deduce that r1 + can never be 0 in the if condition. 
As a result, said clever + compiler might deem it safe to optimize away the smp_mb(), + eliminating the branch and any ordering an architecture would + guarantee otherwise. 2. Multiple access sizes for a single variable are not supported, and neither are misaligned or partially overlapping accesses. From 3ed58981942da9f032f878d0ad33bde2d2dbec34 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 1 Jul 2022 10:44:04 +0800 Subject: [PATCH 0669/1250] rcu: Make tiny RCU support leak callbacks for debug-object errors Currently, only Tree RCU leaks callbacks setting when it detects a duplicate call_rcu(). This commit causes Tiny RCU to also leak callbacks in this situation. Because this is Tiny RCU, kernel size is important: 1. CONFIG_TINY_RCU=y and CONFIG_DEBUG_OBJECTS_RCU_HEAD=n (Production kernel) Original: text data bss dec hex filename 26290663 20159823 15212544 61663030 3ace736 vmlinux With this commit: text data bss dec hex filename 26290663 20159823 15212544 61663030 3ace736 vmlinux 2. CONFIG_TINY_RCU=y and CONFIG_DEBUG_OBJECTS_RCU_HEAD=y (Debugging kernel) Original: text data bss dec hex filename 26291319 20160143 15212544 61664006 3aceb06 vmlinux With this commit: text data bss dec hex filename 26291319 20160431 15212544 61664294 3acec26 vmlinux These results show that the kernel size is unchanged for production kernels, as desired. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index f0561ee16b9c25..943d431b908f6b 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -158,6 +158,10 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); +static void tiny_rcu_leak_callback(struct rcu_head *rhp) +{ +} + /* * Post an RCU callback to be invoked after the end of an RCU grace * period. 
But since we have but one CPU, that would be after any @@ -165,9 +169,20 @@ EXPORT_SYMBOL_GPL(synchronize_rcu); */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { + static atomic_t doublefrees; unsigned long flags; - debug_rcu_head_queue(head); + if (debug_rcu_head_queue(head)) { + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } + + if (!__is_kvfree_rcu_offset((unsigned long)head->func)) + WRITE_ONCE(head->func, tiny_rcu_leak_callback); + return; + } + head->func = func; head->next = NULL; From 65db52482d5fa50136e90a5aba95b926dd01032e Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 12 Jul 2022 16:26:05 +0800 Subject: [PATCH 0670/1250] rcu-tasks: Convert RCU_LOCKDEP_WARN() to WARN_ONCE() Kernels built with CONFIG_PROVE_RCU=y and CONFIG_DEBUG_LOCK_ALLOC=y attempt to emit a warning when the synchronize_rcu_tasks_generic() function is called during early boot while the rcu_scheduler_active variable is RCU_SCHEDULER_INACTIVE. However, the warning is not actually printed because debug_lockdep_rcu_enabled() returns false, exactly because the rcu_scheduler_active variable is still equal to RCU_SCHEDULER_INACTIVE. This commit therefore replaces RCU_LOCKDEP_WARN() with WARN_ONCE() to force these warnings to actually be printed. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 83c7e6620d4031..469bf2a3b505e2 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -560,7 +560,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) { /* Complain if the scheduler has not started.
*/ - RCU_LOCKDEP_WARN(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, + WARN_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INACTIVE, "synchronize_rcu_tasks called too soon"); // If the grace-period kthread is running, use it. From 0f07a845a2c9e12c8e7a59d489ce98253fa0752f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 18 Jul 2022 10:57:26 -0700 Subject: [PATCH 0671/1250] rcu-tasks: Ensure RCU Tasks Trace loops have quiescent states The RCU Tasks Trace grace-period kthread loops across all CPUs, and there can be quite a few CPUs, with some commercially available systems sporting well over a thousand of them. Some of these loops can feature IPIs, which can take some time. This commit therefore places a call to cond_resched_tasks_rcu_qs() in each such loop. Link: https://docs.google.com/document/d/1V0YnG1HTWMt9WHJjroiJL9lf-hMrud4v8Fn3fhyY0cI/edit?usp=sharing Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 469bf2a3b505e2..f5bf6fb430dabf 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1500,6 +1500,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) if (rcu_tasks_trace_pertask_prep(t, true)) trc_add_holdout(t, hop); rcu_read_unlock(); + cond_resched_tasks_rcu_qs(); } // Only after all running tasks have been accounted for is it @@ -1520,6 +1521,7 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop) raw_spin_lock_irqsave_rcu_node(rtpcp, flags); } raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + cond_resched_tasks_rcu_qs(); } // Re-enable CPU hotplug now that the holdout list is populated. @@ -1619,6 +1621,7 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); + cond_resched_tasks_rcu_qs(); } // Re-enable CPU hotplug now that the holdout list scan has completed. 
From 31c47ba2539f28d941bfcd33e57a6350b2d55db3 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Tue, 19 Jul 2022 12:39:00 +0800 Subject: [PATCH 0672/1250] rcu-tasks: Make RCU Tasks Trace check for userspace execution Userspace execution is a valid quiescent state for RCU Tasks Trace, but the scheduling-clock interrupt does not currently report such quiescent states. Of course, the scheduling-clock interrupt is not strictly speaking userspace execution. However, the only way that this code is not in a quiescent state is if something invoked rcu_read_lock_trace(), and that would be reflected in the ->trc_reader_nesting field in the task_struct structure. Furthermore, this field is checked by rcu_tasks_trace_qs(), which is invoked by rcu_tasks_qs() which is in turn invoked by rcu_note_voluntary_context_switch() in kernels building at least one of the RCU Tasks flavors. It is therefore safe to invoke rcu_tasks_trace_qs() from rcu_sched_clock_irq(). But rcu_tasks_qs() also invokes rcu_tasks_classic_qs() for RCU Tasks, which lacks the read-side markers provided by RCU Tasks Trace. This raises the possibility that an RCU Tasks grace period could start after the interrupt from userspace execution, but before the call to rcu_sched_clock_irq(). However, it turns out that this is safe because the RCU Tasks grace period waits for an RCU grace period, which will wait for the entire scheduling-clock interrupt handler, including any RCU Tasks read-side critical section that this handler might contain. This commit therefore updates the rcu_sched_clock_irq() function's check for usermode execution and its call to rcu_tasks_classic_qs() to instead check for both usermode execution and interrupt from idle, and to instead call rcu_note_voluntary_context_switch(). This consolidates code and provides faster RCU Tasks Trace reporting of quiescent states in kernels that do scheduling-clock interrupts for userspace execution. [ paulmck: Consolidate checks into rcu_sched_clock_irq().
] Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_plugin.h | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 84d2817766888f..2122359f0c8621 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2341,8 +2341,8 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); - if (user) - rcu_tasks_classic_qs(current, false); + if (user || rcu_is_cpu_rrupt_from_idle()) + rcu_note_voluntary_context_switch(current); lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4152816dd29f63..b2219577fbe2d1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -718,9 +718,6 @@ static void rcu_flavor_sched_clock_irq(int user) struct task_struct *t = current; lockdep_assert_irqs_disabled(); - if (user || rcu_is_cpu_rrupt_from_idle()) { - rcu_note_voluntary_context_switch(current); - } if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ @@ -976,7 +973,6 @@ static void rcu_flavor_sched_clock_irq(int user) * neither access nor modify, at least not while the * corresponding CPU is online. */ - rcu_qs(); } } From 8d58e1d940cae580387c7f55c39c99918fa914ca Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:32 +0200 Subject: [PATCH 0673/1250] tools/nolibc: make argc 32-bit in riscv startup code The "ld a0, 0(sp)" instruction doesn't build on RISCV32 because that would load a 64-bit value into a 32-bit register. But argc is 32-bit, not 64, so we ought to use "lw" here. Tested on both RISCV32 and RISCV64. Cc: Pranith Kumar Signed-off-by: Willy Tarreau Signed-off-by: Paul E.
McKenney --- tools/include/nolibc/arch-riscv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/nolibc/arch-riscv.h b/tools/include/nolibc/arch-riscv.h index 95e2b792492572..ba04771cb3a341 100644 --- a/tools/include/nolibc/arch-riscv.h +++ b/tools/include/nolibc/arch-riscv.h @@ -190,7 +190,7 @@ __asm__ (".section .text\n" ".option norelax\n" "lla gp, __global_pointer$\n" ".option pop\n" - "ld a0, 0(sp)\n" // argc (a0) was in the stack + "lw a0, 0(sp)\n" // argc (a0) was in the stack "add a1, sp, "SZREG"\n" // argv (a1) = sp "slli a2, a0, "PTRLOG"\n" // envp (a2) = SZREG*argc ... "add a2, a2, "SZREG"\n" // + SZREG (skip null) From b9dcf40b9f994718a4abddd2f505435dab21b011 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:33 +0200 Subject: [PATCH 0674/1250] tools/nolibc: fix build warning in sys_mmap() when my_syscall6 is not defined We return -ENOSYS when there's no syscall6() operation, but we must cast it to void* to avoid a warning. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/include/nolibc/sys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index 08491070387bc0..b8c96878c9ce0b 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -692,7 +692,7 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd, { #ifndef my_syscall6 /* Function not implemented. */ - return -ENOSYS; + return (void *)-ENOSYS; #else int n; From b038c350472e7c5d094ba138e57c9e8883670c34 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:34 +0200 Subject: [PATCH 0675/1250] tools/nolibc: make sys_mmap() automatically use the right __NR_mmap definition __NR_mmap2 was used for i386 but it's also needed for other archs such as RISCV32 or ARM. Let's decide to use it based on the __NR_mmap2 definition as it's not defined on other archs. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. 
McKenney --- tools/include/nolibc/sys.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/include/nolibc/sys.h b/tools/include/nolibc/sys.h index b8c96878c9ce0b..ce3ee03aa6794b 100644 --- a/tools/include/nolibc/sys.h +++ b/tools/include/nolibc/sys.h @@ -697,7 +697,7 @@ void *sys_mmap(void *addr, size_t length, int prot, int flags, int fd, int n; -#if defined(__i386__) +#if defined(__NR_mmap2) n = __NR_mmap2; offset >>= 12; #else From 577b56bc8f6dbc92598872492f91ec0e0867f650 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:35 +0200 Subject: [PATCH 0676/1250] selftests/nolibc: add basic infrastructure to ease creation of nolibc tests This creates a "nolibc" selftest that intends to test various parts of the nolibc component, both in terms of build and execution for a given architecture. The aim is for it to be as simple to run as a kernel build, by just passing the compiler (for the build) and the ARCH (for kernel and execution). It brings a basic skeleton made of a single C file that will ease testing and error reporting. The code will be arranged so that it remains easy to add basic tests for syscalls or library calls that may rely on a condition to be executed, and whose result is compared to a value or to an error with a specific errno value. Tests will just use a relative line number in switch/case statements as an index, saving the user from having to maintain arrays and complicated functions which can often just be one-liners. MAINTAINERS was updated. Signed-off-by: Willy Tarreau Signed-off-by: Paul E.
McKenney --- MAINTAINERS | 1 + tools/testing/selftests/nolibc/Makefile | 43 ++ tools/testing/selftests/nolibc/nolibc-test.c | 395 +++++++++++++++++++ 3 files changed, 439 insertions(+) create mode 100644 tools/testing/selftests/nolibc/Makefile create mode 100644 tools/testing/selftests/nolibc/nolibc-test.c diff --git a/MAINTAINERS b/MAINTAINERS index 4e38d7533cbe9a..6ab10b235ed0be 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14114,6 +14114,7 @@ M: Willy Tarreau S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/wtarreau/nolibc.git F: tools/include/nolibc/ +F: tools/testing/selftests/nolibc/ NSDEPS M: Matthias Maennich diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile new file mode 100644 index 00000000000000..fd0a670823340b --- /dev/null +++ b/tools/testing/selftests/nolibc/Makefile @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for nolibc tests +include ../../../scripts/Makefile.include + +# we're in ".../tools/testing/selftests/nolibc" +ifeq ($(srctree),) +srctree := $(patsubst %/tools/testing/selftests/,%,$(dir $(CURDIR))) +endif + +ifeq ($(ARCH),) +include $(srctree)/scripts/subarch.include +ARCH = $(SUBARCH) +endif + +# OUTPUT is only set when run from the main makefile, otherwise +# it defaults to this nolibc directory. 
+OUTPUT ?= $(CURDIR)/ + +ifeq ($(V),1) +Q= +else +Q=@ +endif + +CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables +LDFLAGS := -s + +all: nolibc-test + +nolibc-test: nolibc-test.c + $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ + -nostdlib -static -include ../../../include/nolibc/nolibc.h $^ -lgcc + +initramfs: nolibc-test + $(QUIET_MKDIR)mkdir -p initramfs + $(call QUIET_INSTALL, initramfs/init) + $(Q)cp nolibc-test initramfs/init + +clean: + $(call QUIET_CLEAN, nolibc-test) + $(Q)rm -f nolibc-test + $(call QUIET_CLEAN, initramfs) + $(Q)rm -rf initramfs diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c new file mode 100644 index 00000000000000..6c050d4381fecf --- /dev/null +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* platform-specific include files coming from the compiler */ +#include + +/* libc-specific include files + * The program may be built in 2 ways: + * $(CC) -nostdlib -include /path/to/nolibc.h => NOLIBC already defined + * $(CC) -nostdlib -I/path/to/nolibc/sysroot + */ +#ifndef NOLIBC +#include +#include +#include +#endif + +/* will be used by nolibc by getenv() */ +char **environ; + +#define CASE_ERR(err) \ + case err: return #err + +/* returns the error name (e.g. "ENOENT") for common errors, "SUCCESS" for 0, + * or the decimal value for less common ones. 
+ */ +const char *errorname(int err) +{ + switch (err) { + case 0: return "SUCCESS"; + CASE_ERR(EPERM); + CASE_ERR(ENOENT); + CASE_ERR(ESRCH); + CASE_ERR(EINTR); + CASE_ERR(EIO); + CASE_ERR(ENXIO); + CASE_ERR(E2BIG); + CASE_ERR(ENOEXEC); + CASE_ERR(EBADF); + CASE_ERR(ECHILD); + CASE_ERR(EAGAIN); + CASE_ERR(ENOMEM); + CASE_ERR(EACCES); + CASE_ERR(EFAULT); + CASE_ERR(ENOTBLK); + CASE_ERR(EBUSY); + CASE_ERR(EEXIST); + CASE_ERR(EXDEV); + CASE_ERR(ENODEV); + CASE_ERR(ENOTDIR); + CASE_ERR(EISDIR); + CASE_ERR(EINVAL); + CASE_ERR(ENFILE); + CASE_ERR(EMFILE); + CASE_ERR(ENOTTY); + CASE_ERR(ETXTBSY); + CASE_ERR(EFBIG); + CASE_ERR(ENOSPC); + CASE_ERR(ESPIPE); + CASE_ERR(EROFS); + CASE_ERR(EMLINK); + CASE_ERR(EPIPE); + CASE_ERR(EDOM); + CASE_ERR(ERANGE); + CASE_ERR(ENOSYS); + default: + return itoa(err); + } +} + +static int pad_spc(int llen, int cnt, const char *fmt, ...) +{ + va_list args; + int len; + int ret; + + for (len = 0; len < cnt - llen; len++) + putchar(' '); + + va_start(args, fmt); + ret = vfprintf(stdout, fmt, args); + va_end(args); + return ret < 0 ? ret : ret + len; +} + +/* The tests below are intended to be used by the macroes, which evaluate + * expression , print the status to stdout, and update the "ret" + * variable to count failures. The functions themselves return the number + * of failures, thus either 0 or 1. + */ + +#define EXPECT_ZR(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_zr(expr, llen); } while (0) + +static int expect_zr(int expr, int llen) +{ + int ret = !(expr == 0); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_NZ(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_nz(expr, llen; } while (0) + +static int expect_nz(int expr, int llen) +{ + int ret = !(expr != 0); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? 
"[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_EQ(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_eq(expr, llen, val); } while (0) + +static int expect_eq(int expr, int llen, int val) +{ + int ret = !(expr == val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_NE(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ne(expr, llen, val); } while (0) + +static int expect_ne(int expr, int llen, int val) +{ + int ret = !(expr != val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_GE(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ge(expr, llen, val); } while (0) + +static int expect_ge(int expr, int llen, int val) +{ + int ret = !(expr >= val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_GT(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_gt(expr, llen, val); } while (0) + +static int expect_gt(int expr, int llen, int val) +{ + int ret = !(expr > val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_LE(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_le(expr, llen, val); } while (0) + +static int expect_le(int expr, int llen, int val) +{ + int ret = !(expr <= val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? 
"[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_LT(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_lt(expr, llen, val); } while (0) + +static int expect_lt(int expr, int llen, int val) +{ + int ret = !(expr < val); + + llen += printf(" = %d ", expr); + pad_spc(llen, 40, ret ? "[FAIL]\n" : " [OK]\n"); + return ret; +} + + +#define EXPECT_SYSZR(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syszr(expr, llen); } while (0) + +static int expect_syszr(int expr, int llen) +{ + int ret = 0; + + if (expr) { + ret = 1; + llen += printf(" = %d %s ", expr, errorname(errno)); + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += printf(" = %d ", expr); + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_SYSEQ(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syseq(expr, llen, val); } while (0) + +static int expect_syseq(int expr, int llen, int val) +{ + int ret = 0; + + if (expr != val) { + ret = 1; + llen += printf(" = %d %s ", expr, errorname(errno)); + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += printf(" = %d ", expr); + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_SYSNE(cond, expr, val) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_sysne(expr, llen, val); } while (0) + +static int expect_sysne(int expr, int llen, int val) +{ + int ret = 0; + + if (expr == val) { + ret = 1; + llen += printf(" = %d %s ", expr, errorname(errno)); + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += printf(" = %d ", expr); + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_SYSER(cond, expr, expret, experr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_syserr(expr, expret, experr, llen); } while (0) + +static int expect_syserr(int expr, int expret, int experr, int llen) +{ + int ret = 0; 
+ int _errno = errno; + + llen += printf(" = %d %s ", expr, errorname(_errno)); + if (expr != expret || _errno != experr) { + ret = 1; + llen += printf(" != (%d %s) ", expret, errorname(experr)); + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_PTRZR(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ptrzr(expr, llen); } while (0) + +static int expect_ptrzr(const void *expr, int llen) +{ + int ret = 0; + + llen += printf(" = <%p> ", expr); + if (expr) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_PTRNZ(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_ptrnz(expr, llen); } while (0) + +static int expect_ptrnz(const void *expr, int llen) +{ + int ret = 0; + + llen += printf(" = <%p> ", expr); + if (!expr) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_STRZR(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strzr(expr, llen); } while (0) + +static int expect_strzr(const char *expr, int llen) +{ + int ret = 0; + + llen += printf(" = <%s> ", expr); + if (expr) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_STRNZ(cond, expr) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strnz(expr, llen); } while (0) + +static int expect_strnz(const char *expr, int llen) +{ + int ret = 0; + + llen += printf(" = <%s> ", expr); + if (!expr) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_STREQ(cond, expr, cmp) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += 
expect_streq(expr, llen, cmp); } while (0) + +static int expect_streq(const char *expr, int llen, const char *cmp) +{ + int ret = 0; + + llen += printf(" = <%s> ", expr); + if (strcmp(expr, cmp) != 0) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + + +#define EXPECT_STRNE(cond, expr, cmp) \ + do { if (!cond) pad_spc(llen, 40, "[SKIPPED]\n"); else ret += expect_strne(expr, llen, cmp); } while (0) + +static int expect_strne(const char *expr, int llen, const char *cmp) +{ + int ret = 0; + + llen += printf(" = <%s> ", expr); + if (strcmp(expr, cmp) == 0) { + ret = 1; + llen += pad_spc(llen, 40, "[FAIL]\n"); + } else { + llen += pad_spc(llen, 40, " [OK]\n"); + } + return ret; +} + +/* declare tests based on line numbers. There must be exactly one test per line. */ +#define CASE_TEST(name) \ + case __LINE__: llen += printf("%d %s", test, #name); + + +int main(int argc, char **argv, char **envp) +{ + int min = 0; + int max = __INT_MAX__; + int ret = 0; + + environ = envp; + + printf("Total number of errors: %d\n", ret); + printf("Exiting with status %d\n", !!ret); + return !!ret; +} From e8e30d00798a6a172007a03a237ca1a2643602b5 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:36 +0200 Subject: [PATCH 0677/1250] selftests/nolibc: support a test definition format It now becomes possible to pass a string either in argv[1] or in the NOLIBC_TEST environment variable (the former having precedence), to specify which tests to run. The format is: testname[:range]*[,testname...] Where a range is either a single value or the min and max numbers of the test IDs in a sequence, delimited by a dash. Multiple ranges are possible. This should provide enough flexibility to focus on certain failing parts just by playing with the boot command line in a boot loader or in qemu depending on what is accessible. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 91 ++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 6c050d4381fecf..49177ea9943cca 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -17,6 +17,12 @@ /* will be used by nolibc by getenv() */ char **environ; +/* definition of a series of tests */ +struct test { + const char *name; // test name + int (*func)(int min, int max); // handler +}; + #define CASE_ERR(err) \ case err: return #err @@ -376,19 +382,104 @@ static int expect_strne(const char *expr, int llen, const char *cmp) return ret; } + /* declare tests based on line numbers. There must be exactly one test per line. */ #define CASE_TEST(name) \ case __LINE__: llen += printf("%d %s", test, #name); +/* This is the definition of known test names, with their functions */ +static struct test test_names[] = { + /* add new tests here */ + { 0 } +}; + int main(int argc, char **argv, char **envp) { int min = 0; int max = __INT_MAX__; int ret = 0; + int err; + int idx; + char *test; environ = envp; + /* the definition of a series of tests comes from either argv[1] or the + * "NOLIBC_TEST" environment variable. It's made of a comma-delimited + * series of test names and optional ranges: + * syscall:5-15[:.*],stdlib:8-10 + */ + test = argv[1]; + if (!test) + test = getenv("NOLIBC_TEST"); + + if (test) { + char *comma, *colon, *dash, *value; + + do { + comma = strchr(test, ','); + if (comma) + *(comma++) = '\0'; + + colon = strchr(test, ':'); + if (colon) + *(colon++) = '\0'; + + for (idx = 0; test_names[idx].name; idx++) { + if (strcmp(test, test_names[idx].name) == 0) + break; + } + + if (test_names[idx].name) { + /* The test was named, it will be called at least + * once. We may have an optional range at + * here, which defaults to the full range. 
+ */ + do { + min = 0; max = __INT_MAX__; + value = colon; + if (value && *value) { + colon = strchr(value, ':'); + if (colon) + *(colon++) = '\0'; + + dash = strchr(value, '-'); + if (dash) + *(dash++) = '\0'; + + /* support :val: :min-max: :min-: :-max: */ + if (*value) + min = atoi(value); + if (!dash) + max = min; + else if (*dash) + max = atoi(dash); + + value = colon; + } + + /* now's time to call the test */ + printf("Running test '%s'\n", test_names[idx].name); + err = test_names[idx].func(min, max); + ret += err; + printf("Errors during this test: %d\n\n", err); + } while (colon && *colon); + } else + printf("Ignoring unknown test name '%s'\n", test); + + test = comma; + } while (test && *test); + } else { + /* no test mentioned, run everything */ + for (idx = 0; test_names[idx].name; idx++) { + printf("Running test '%s'\n", test_names[idx].name); + err = test_names[idx].func(min, max); + ret += err; + printf("Errors during this test: %d\n\n", err); + } + } + printf("Total number of errors: %d\n", ret); printf("Exiting with status %d\n", !!ret); return !!ret; From b5bc3d2fbbf3c32f076e2af8e201cb3c898588a4 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:37 +0200 Subject: [PATCH 0678/1250] selftests/nolibc: implement a few tests for various syscalls This adds 63 tests covering about 34 syscalls. Both successes and failures are tested. Two tests fail when run as unprivileged user (link_dir which returns EACCES instead of EPERM, and chroot which returns EPERM). One test (execve("/")) expects to fail on EACCES, but needs to have valid arguments otherwise the kernel will log a message. And a few tests require /proc to be mounted. The code is not pretty since all tests are one-liners, sometimes resulting in long lines, especially when using compound statements to preset a line, but it's convenient and doesn't obfuscate the code, which is important to understand what failed. Signed-off-by: Willy Tarreau Signed-off-by: Paul E.
McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 110 +++++++++++++++++++ 1 file changed, 110 insertions(+) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 49177ea9943cca..dc87832912ce44 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -388,9 +388,119 @@ static int expect_strne(const char *expr, int llen, const char *cmp) case __LINE__: llen += printf("%d %s", test, #name); +/* used by some syscall tests below */ +int test_getdents64(const char *dir) +{ + char buffer[4096]; + int fd, ret; + int err; + + ret = fd = open(dir, O_RDONLY | O_DIRECTORY, 0); + if (ret < 0) + return ret; + + ret = getdents64(fd, (void *)buffer, sizeof(buffer)); + err = errno; + close(fd); + + errno = err; + return ret; +} + +/* Run syscall tests between IDs and . + * Return 0 on success, non-zero on failure. + */ +int run_syscall(int min, int max) +{ + struct stat stat_buf; + int test; + int tmp; + int ret = 0; + void *p1, *p2; + + for (test = min; test >= 0 && test <= max; test++) { + int llen = 0; // line length + + /* avoid leaving empty lines below, this will insert holes into + * test numbers. 
+ */ + switch (test + __LINE__ + 1) { + CASE_TEST(getpid); EXPECT_SYSNE(1, getpid(), -1); break; + CASE_TEST(getppid); EXPECT_SYSNE(1, getppid(), -1); break; + CASE_TEST(gettid); EXPECT_SYSNE(1, gettid(), -1); break; + CASE_TEST(getpgid_self); EXPECT_SYSNE(1, getpgid(0), -1); break; + CASE_TEST(getpgid_bad); EXPECT_SYSER(1, getpgid(-1), -1, ESRCH); break; + CASE_TEST(kill_0); EXPECT_SYSZR(1, kill(getpid(), 0)); break; + CASE_TEST(kill_CONT); EXPECT_SYSZR(1, kill(getpid(), 0)); break; + CASE_TEST(kill_BADPID); EXPECT_SYSER(1, kill(INT_MAX, 0), -1, ESRCH); break; + CASE_TEST(sbrk); if ((p1 = p2 = sbrk(4096)) != (void *)-1) p2 = sbrk(-4096); EXPECT_SYSZR(1, (p2 == (void *)-1) || p2 == p1); break; + CASE_TEST(brk); EXPECT_SYSZR(1, brk(sbrk(0))); break; + CASE_TEST(chdir_root); EXPECT_SYSZR(1, chdir("/")); break; + CASE_TEST(chdir_dot); EXPECT_SYSZR(1, chdir(".")); break; + CASE_TEST(chdir_blah); EXPECT_SYSER(1, chdir("/blah"), -1, ENOENT); break; + CASE_TEST(chmod_net); EXPECT_SYSZR(1, chmod("/proc/self/net", 0555)); break; + CASE_TEST(chmod_self); EXPECT_SYSER(1, chmod("/proc/self", 0555), -1, EPERM); break; + CASE_TEST(chown_self); EXPECT_SYSER(1, chown("/proc/self", 0, 0), -1, EPERM); break; + CASE_TEST(chroot_root); EXPECT_SYSZR(1, chroot("/")); break; + CASE_TEST(chroot_blah); EXPECT_SYSER(1, chroot("/proc/self/blah"), -1, ENOENT); break; + CASE_TEST(chroot_exe); EXPECT_SYSER(1, chroot("/proc/self/exe"), -1, ENOTDIR); break; + CASE_TEST(close_m1); EXPECT_SYSER(1, close(-1), -1, EBADF); break; + CASE_TEST(close_dup); EXPECT_SYSZR(1, close(dup(0))); break; + CASE_TEST(dup_0); tmp = dup(0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; + CASE_TEST(dup_m1); tmp = dup(-1); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break; + CASE_TEST(dup2_0); tmp = dup2(0, 100); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; + CASE_TEST(dup2_m1); tmp = dup2(-1, 100); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break; + CASE_TEST(dup3_0); tmp = dup3(0, 100, 
0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; + CASE_TEST(dup3_m1); tmp = dup3(-1, 100, 0); EXPECT_SYSER(1, tmp, -1, EBADF); if (tmp != -1) close(tmp); break; + CASE_TEST(execve_root); EXPECT_SYSER(1, execve("/", (char*[]){ [0] = "/", [1] = NULL }, NULL), -1, EACCES); break; + CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; + CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; + CASE_TEST(gettimeofday_null); EXPECT_SYSZR(1, gettimeofday(NULL, NULL)); break; + CASE_TEST(gettimeofday_bad1); EXPECT_SYSER(1, gettimeofday((void *)1, NULL), -1, EFAULT); break; + CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break; + CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break; + CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; + CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; + CASE_TEST(link_root1); EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break; + CASE_TEST(link_blah); EXPECT_SYSER(1, link("/proc/self/blah", "/blah"), -1, ENOENT); break; + CASE_TEST(link_dir); EXPECT_SYSER(1, link("/", "/blah"), -1, EPERM); break; + CASE_TEST(link_cross); EXPECT_SYSER(1, link("/proc/self/net", "/blah"), -1, EXDEV); break; + CASE_TEST(lseek_m1); EXPECT_SYSER(1, lseek(-1, 0, SEEK_SET), -1, EBADF); break; + CASE_TEST(lseek_0); EXPECT_SYSER(1, lseek(0, 0, SEEK_SET), -1, ESPIPE); break; + CASE_TEST(mkdir_root); EXPECT_SYSER(1, mkdir("/", 0755), -1, EEXIST); break; + CASE_TEST(open_tty); EXPECT_SYSNE(1, tmp = open("/dev/null", 0), -1); if (tmp != -1) close(tmp); break; + CASE_TEST(open_blah); EXPECT_SYSER(1, tmp = open("/proc/self/blah", 0), -1, ENOENT); if (tmp != -1) close(tmp); break; + CASE_TEST(poll_null); EXPECT_SYSZR(1, poll(NULL, 0, 0)); break; + CASE_TEST(poll_stdout); EXPECT_SYSNE(1, ({ struct pollfd fds = { 1, POLLOUT, 0}; poll(&fds, 1, 0); }), -1); break; + 
CASE_TEST(poll_fault); EXPECT_SYSER(1, poll((void *)1, 1, 0), -1, EFAULT); break; + CASE_TEST(read_badf); EXPECT_SYSER(1, read(-1, &tmp, 1), -1, EBADF); break; + CASE_TEST(sched_yield); EXPECT_SYSZR(1, sched_yield()); break; + CASE_TEST(select_null); EXPECT_SYSZR(1, ({ struct timeval tv = { 0 }; select(0, NULL, NULL, NULL, &tv); })); break; + CASE_TEST(select_stdout); EXPECT_SYSNE(1, ({ fd_set fds; FD_ZERO(&fds); FD_SET(1, &fds); select(2, NULL, &fds, NULL, NULL); }), -1); break; + CASE_TEST(select_fault); EXPECT_SYSER(1, select(1, (void *)1, NULL, NULL, 0), -1, EFAULT); break; + CASE_TEST(stat_blah); EXPECT_SYSER(1, stat("/proc/self/blah", &stat_buf), -1, ENOENT); break; + CASE_TEST(stat_fault); EXPECT_SYSER(1, stat(NULL, &stat_buf), -1, EFAULT); break; + CASE_TEST(symlink_root); EXPECT_SYSER(1, symlink("/", "/"), -1, EEXIST); break; + CASE_TEST(unlink_root); EXPECT_SYSER(1, unlink("/"), -1, EISDIR); break; + CASE_TEST(unlink_blah); EXPECT_SYSER(1, unlink("/proc/self/blah"), -1, ENOENT); break; + CASE_TEST(wait_child); EXPECT_SYSER(1, wait(&tmp), -1, ECHILD); break; + CASE_TEST(waitpid_min); EXPECT_SYSER(1, waitpid(INT_MIN, &tmp, WNOHANG), -1, ESRCH); break; + CASE_TEST(waitpid_child); EXPECT_SYSER(1, waitpid(getpid(), &tmp, WNOHANG), -1, ECHILD); break; + CASE_TEST(write_badf); EXPECT_SYSER(1, write(-1, &tmp, 1), -1, EBADF); break; + CASE_TEST(write_zero); EXPECT_SYSZR(1, write(1, &tmp, 0)); break; + case __LINE__: + return ret; /* must be last */ + /* note: do not set any defaults so as to permit holes above */ + } + } + return ret; +} + + /* This is the definition of known test names, with their functions */ static struct test test_names[] = { /* add new tests here */ + { .name = "syscall", .func = run_syscall }, { 0 } }; From ca4872233da96436e5454ab1f037e06de2b19238 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:38 +0200 Subject: [PATCH 0679/1250] selftests/nolibc: add a few tests for some libc functions The test series called 
"stdlib" covers some libc functions (string, stdlib etc). By default they are automatically run after "syscall" but may be requested in argument or in variable NOLIBC_TEST. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 35 ++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index dc87832912ce44..b928f099431f78 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -496,11 +496,46 @@ int run_syscall(int min, int max) return ret; } +int run_stdlib(int min, int max) +{ + int test; + int tmp; + int ret = 0; + void *p1, *p2; + + for (test = min; test >= 0 && test <= max; test++) { + int llen = 0; // line length + + /* avoid leaving empty lines below, this will insert holes into + * test numbers. + */ + switch (test + __LINE__ + 1) { + CASE_TEST(getenv_TERM); EXPECT_STRNZ(1, getenv("TERM")); break; + CASE_TEST(getenv_blah); EXPECT_STRZR(1, getenv("blah")); break; + CASE_TEST(setcmp_blah_blah); EXPECT_EQ(1, strcmp("blah", "blah"), 0); break; + CASE_TEST(setcmp_blah_blah2); EXPECT_NE(1, strcmp("blah", "blah2"), 0); break; + CASE_TEST(setncmp_blah_blah); EXPECT_EQ(1, strncmp("blah", "blah", 10), 0); break; + CASE_TEST(setncmp_blah_blah4); EXPECT_EQ(1, strncmp("blah", "blah4", 4), 0); break; + CASE_TEST(setncmp_blah_blah5); EXPECT_NE(1, strncmp("blah", "blah5", 5), 0); break; + CASE_TEST(setncmp_blah_blah6); EXPECT_NE(1, strncmp("blah", "blah6", 6), 0); break; + CASE_TEST(strchr_foobar_o); EXPECT_STREQ(1, strchr("foobar", 'o'), "oobar"); break; + CASE_TEST(strchr_foobar_z); EXPECT_STRZR(1, strchr("foobar", 'z')); break; + CASE_TEST(strrchr_foobar_o); EXPECT_STREQ(1, strrchr("foobar", 'o'), "obar"); break; + CASE_TEST(strrchr_foobar_z); EXPECT_STRZR(1, strrchr("foobar", 'z')); break; + case __LINE__: + return ret; /* must be last */ + /* note: do not 
set any defaults so as to permit holes above */ + } + } + return ret; +} + /* This is the definition of known test names, with their functions */ static struct test test_names[] = { /* add new tests here */ { .name = "syscall", .func = run_syscall }, + { .name = "stdlib", .func = run_stdlib }, { 0 } }; From 5bd0a48ae13a9df08005d6527dc8f327315adff7 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:40 +0200 Subject: [PATCH 0680/1250] selftests/nolibc: exit with poweroff on success when getpid() == 1 The idea is to ease automated testing under qemu. If the test succeeds while running as PID 1, indicating the system was booted with init=/test, let's just power off so that qemu can exit with a successful code. In other situations it will exit and provoke a panic, which may be caught for example with CONFIG_PVPANIC. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index b928f099431f78..291d96bfd7c120 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -626,6 +626,20 @@ int main(int argc, char **argv, char **envp) } printf("Total number of errors: %d\n", ret); + + if (getpid() == 1) { + /* we're running as init, there's no other process on the + * system, thus likely started from a VM for a quick check. + * Exiting will provoke a kernel panic that may be reported + * as an error by Qemu or the hypervisor, while stopping + * cleanly will often be reported as a success. This allows + * to use the output of this program for bisecting kernels. 
+ */ + printf("Leaving init with final status: %d\n", !!ret); + if (ret == 0) + reboot(LINUX_REBOOT_CMD_POWER_OFF); + } + printf("Exiting with status %d\n", !!ret); return !!ret; } From ecd82dc71fdfdec2efb0262847d1e015c0d66a84 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:41 +0200 Subject: [PATCH 0681/1250] selftests/nolibc: on x86, support exiting with isa-debug-exit QEMU, when started with "-device isa-debug-exit -no-reboot" will exit with status code 2N+1 when N is written to 0x501. This is particularly convenient for automated tests but this is not portable. As such we only enable this on x86_64 when pid==1. In addition, this requires an ioperm() call but in order not to have to define arch-specific syscalls we just perform the syscall by hand there. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 291d96bfd7c120..eeb254749239ca 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -638,6 +638,15 @@ int main(int argc, char **argv, char **envp) printf("Leaving init with final status: %d\n", !!ret); if (ret == 0) reboot(LINUX_REBOOT_CMD_POWER_OFF); +#if defined(__x86_64__) + /* QEMU started with "-device isa-debug-exit -no-reboot" will + * exit with status code 2N+1 when N is written to 0x501. We + * hard-code the syscall here as it's arch-dependent. 
+ */ + else if (my_syscall3(__NR_ioperm, 0x501, 1, 1) == 0) + asm volatile ("outb %%al, %%dx" :: "d"(0x501), "a"(0)); + /* if it does nothing, fall back to the regular panic */ +#endif } printf("Exiting with status %d\n", !!ret); From 0a7373809ed4ec99681953e27a59a250d460f880 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:42 +0200 Subject: [PATCH 0682/1250] selftests/nolibc: recreate and populate /dev and /proc if missing Most of the time the program will be run alone in an initramfs. There is no value in requiring the user to populate /dev and /proc for such tests, we can do it ourselves, and it participates to the tests at the same time. What's done here is that when called as init (getpid()==1) we check if /dev exists or create it, if /dev/console and /dev/null exists, otherwise we try to mount a devtmpfs there, and if it fails we fall back to mknod. The console is reopened if stdout was closed. Finally /proc is created and mounted if /proc/self cannot be found. This is sufficient for most tests. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 56 ++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index eeb254749239ca..a697182c87f577 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -530,6 +530,54 @@ int run_stdlib(int min, int max) return ret; } +/* prepare what needs to be prepared for pid 1 (stdio, /dev, /proc, etc) */ +int prepare(void) +{ + struct stat stat_buf; + + /* It's possible that /dev doesn't even exist or was not mounted, so + * we'll try to create it, mount it, or create minimal entries into it. + * We want at least /dev/null and /dev/console. 
+ */ + if (stat("/dev/.", &stat_buf) == 0 || mkdir("/dev", 0755) == 0) { + if (stat("/dev/console", &stat_buf) != 0 || + stat("/dev/null", &stat_buf) != 0) { + /* try devtmpfs first, otherwise fall back to manual creation */ + if (mount("/dev", "/dev", "devtmpfs", 0, 0) != 0) { + mknod("/dev/console", 0600 | S_IFCHR, makedev(5, 1)); + mknod("/dev/null", 0666 | S_IFCHR, makedev(1, 3)); + } + } + } + + /* If no /dev/console was found before calling init, stdio is closed so + * we need to reopen it from /dev/console. If it failed above, it will + * still fail here and we cannot emit a message anyway. + */ + if (close(dup(1)) == -1) { + int fd = open("/dev/console", O_RDWR); + + if (fd >= 0) { + if (fd != 0) + dup2(fd, 0); + if (fd != 1) + dup2(fd, 1); + if (fd != 2) + dup2(fd, 2); + if (fd > 2) + close(fd); + puts("\nSuccessfully reopened /dev/console."); + } + } + + /* try to mount /proc if not mounted. Silently fail otherwise */ + if (stat("/proc/.", &stat_buf) == 0 || mkdir("/proc", 0755) == 0) { + if (stat("/proc/self", &stat_buf) != 0) + mount("/proc", "/proc", "proc", 0, 0); + } + + return 0; +} /* This is the definition of known test names, with their functions */ static struct test test_names[] = { @@ -550,6 +598,14 @@ int main(int argc, char **argv, char **envp) environ = envp; + /* when called as init, it's possible that no console was opened, for + * example if no /dev file system was provided. We'll check that fd#1 + * was opened, and if not we'll attempt to create and open /dev/console + * and /dev/null that we'll use for later tests. + */ + if (getpid() == 1) + prepare(); + /* the definition of a series of tests comes from either argv[1] or the * "NOLIBC_TEST" environment variable. 
It's made of a comma-delimited * series of test names and optional ranges: From 67498b99dd601ec48f26b1035514aea016b4a7b4 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:43 +0200 Subject: [PATCH 0683/1250] selftests/nolibc: condition some tests on /proc existence If /proc is not available (program run inside a chroot or without sufficient permissions), it's better to disable the associated tests. Some will be preserved like the ones which check for a failure to create some entries there since they're still supposed to fail. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index a697182c87f577..662dea691749c7 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -413,11 +413,15 @@ int test_getdents64(const char *dir) int run_syscall(int min, int max) { struct stat stat_buf; + int proc; int test; int tmp; int ret = 0; void *p1, *p2; + /* indicates whether or not /proc is mounted */ + proc = stat("/proc", &stat_buf) == 0; + for (test = min; test >= 0 && test <= max; test++) { int llen = 0; // line length @@ -438,12 +442,12 @@ int run_syscall(int min, int max) CASE_TEST(chdir_root); EXPECT_SYSZR(1, chdir("/")); break; CASE_TEST(chdir_dot); EXPECT_SYSZR(1, chdir(".")); break; CASE_TEST(chdir_blah); EXPECT_SYSER(1, chdir("/blah"), -1, ENOENT); break; - CASE_TEST(chmod_net); EXPECT_SYSZR(1, chmod("/proc/self/net", 0555)); break; - CASE_TEST(chmod_self); EXPECT_SYSER(1, chmod("/proc/self", 0555), -1, EPERM); break; - CASE_TEST(chown_self); EXPECT_SYSER(1, chown("/proc/self", 0, 0), -1, EPERM); break; + CASE_TEST(chmod_net); EXPECT_SYSZR(proc, chmod("/proc/self/net", 0555)); break; + CASE_TEST(chmod_self); EXPECT_SYSER(proc, chmod("/proc/self", 0555), -1, EPERM); 
break; + CASE_TEST(chown_self); EXPECT_SYSER(proc, chown("/proc/self", 0, 0), -1, EPERM); break; CASE_TEST(chroot_root); EXPECT_SYSZR(1, chroot("/")); break; CASE_TEST(chroot_blah); EXPECT_SYSER(1, chroot("/proc/self/blah"), -1, ENOENT); break; - CASE_TEST(chroot_exe); EXPECT_SYSER(1, chroot("/proc/self/exe"), -1, ENOTDIR); break; + CASE_TEST(chroot_exe); EXPECT_SYSER(proc, chroot("/proc/self/exe"), -1, ENOTDIR); break; CASE_TEST(close_m1); EXPECT_SYSER(1, close(-1), -1, EBADF); break; CASE_TEST(close_dup); EXPECT_SYSZR(1, close(dup(0))); break; CASE_TEST(dup_0); tmp = dup(0); EXPECT_SYSNE(1, tmp, -1); close(tmp); break; @@ -464,7 +468,7 @@ int run_syscall(int min, int max) CASE_TEST(link_root1); EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break; CASE_TEST(link_blah); EXPECT_SYSER(1, link("/proc/self/blah", "/blah"), -1, ENOENT); break; CASE_TEST(link_dir); EXPECT_SYSER(1, link("/", "/blah"), -1, EPERM); break; - CASE_TEST(link_cross); EXPECT_SYSER(1, link("/proc/self/net", "/blah"), -1, EXDEV); break; + CASE_TEST(link_cross); EXPECT_SYSER(proc, link("/proc/self/net", "/blah"), -1, EXDEV); break; CASE_TEST(lseek_m1); EXPECT_SYSER(1, lseek(-1, 0, SEEK_SET), -1, EBADF); break; CASE_TEST(lseek_0); EXPECT_SYSER(1, lseek(0, 0, SEEK_SET), -1, ESPIPE); break; CASE_TEST(mkdir_root); EXPECT_SYSER(1, mkdir("/", 0755), -1, EEXIST); break; From c97d33d44e3cc22344bb6362252047c87013e23c Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:44 +0200 Subject: [PATCH 0684/1250] selftests/nolibc: support glibc as well Adding support for glibc can be useful to distinguish between bugs in nolibc and bugs in the kernel when a syscall reports an unusual value. It's not that much work and should not affect the long term maintainability of the tests. 
The necessary changes can essentially be summed up like this: - set _GNU_SOURCE at the top to access some definitions - many includes added when we know we don't come from nolibc (missing the stdio include guard) - disable gettid() which is not exposed by glibc - disable gettimeofday's support of bad pointers since these crash in glibc - add a simple itoa() for errorname(); strerror() is too verbose (no way to get short messages). strerrorname_np() was added in modern glibc (2.32) to do exactly this but that's too recent to be usable as the default fallback. - use the standard ioperm() definition. Maybe we need to implement ioperm() in nolibc if that's useful. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/nolibc-test.c | 47 +++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/nolibc/nolibc-test.c b/tools/testing/selftests/nolibc/nolibc-test.c index 662dea691749c7..78bced95ac6309 100644 --- a/tools/testing/selftests/nolibc/nolibc-test.c +++ b/tools/testing/selftests/nolibc/nolibc-test.c @@ -1,17 +1,41 @@ // SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + /* platform-specific include files coming from the compiler */ #include /* libc-specific include files - * The program may be built in 2 ways: + * The program may be built in 3 ways: * $(CC) -nostdlib -include /path/to/nolibc.h => NOLIBC already defined - * $(CC) -nostdlib -I/path/to/nolibc/sysroot + * $(CC) -nostdlib -I/path/to/nolibc/sysroot => _NOLIBC_* guards are present + * $(CC) with default libc => NOLIBC* never defined */ #ifndef NOLIBC #include #include #include +#ifndef _NOLIBC_STDIO_H +/* standard libcs need more includes */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif #endif /* will be used by nolibc by getenv() */ @@ -23,6 +47,17 @@ struct test { int
(*func)(int min, int max); // handler }; +#ifndef _NOLIBC_STDLIB_H +char *itoa(int i) +{ + static char buf[12]; + int ret; + + ret = snprintf(buf, sizeof(buf), "%d", i); + return (ret >= 0 && ret < sizeof(buf)) ? buf : "#err"; +} +#endif + #define CASE_ERR(err) \ case err: return #err @@ -431,7 +466,9 @@ int run_syscall(int min, int max) switch (test + __LINE__ + 1) { CASE_TEST(getpid); EXPECT_SYSNE(1, getpid(), -1); break; CASE_TEST(getppid); EXPECT_SYSNE(1, getppid(), -1); break; +#ifdef NOLIBC CASE_TEST(gettid); EXPECT_SYSNE(1, gettid(), -1); break; +#endif CASE_TEST(getpgid_self); EXPECT_SYSNE(1, getpgid(0), -1); break; CASE_TEST(getpgid_bad); EXPECT_SYSER(1, getpgid(-1), -1, ESRCH); break; CASE_TEST(kill_0); EXPECT_SYSZR(1, kill(getpid(), 0)); break; @@ -460,9 +497,11 @@ int run_syscall(int min, int max) CASE_TEST(getdents64_root); EXPECT_SYSNE(1, test_getdents64("/"), -1); break; CASE_TEST(getdents64_null); EXPECT_SYSER(1, test_getdents64("/dev/null"), -1, ENOTDIR); break; CASE_TEST(gettimeofday_null); EXPECT_SYSZR(1, gettimeofday(NULL, NULL)); break; +#ifdef NOLIBC CASE_TEST(gettimeofday_bad1); EXPECT_SYSER(1, gettimeofday((void *)1, NULL), -1, EFAULT); break; CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break; CASE_TEST(gettimeofday_bad2); EXPECT_SYSER(1, gettimeofday(NULL, (void *)1), -1, EFAULT); break; +#endif CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; CASE_TEST(ioctl_tiocinq); EXPECT_SYSZR(1, ioctl(0, TIOCINQ, &tmp)); break; CASE_TEST(link_root1); EXPECT_SYSER(1, link("/", "/"), -1, EEXIST); break; @@ -703,7 +742,11 @@ int main(int argc, char **argv, char **envp) * exit with status code 2N+1 when N is written to 0x501. We * hard-code the syscall here as it's arch-dependent. 
*/ +#if defined(_NOLIBC_SYS_H) else if (my_syscall3(__NR_ioperm, 0x501, 1, 1) == 0) +#else + else if (ioperm(0x501, 1, 1) == 0) +#endif asm volatile ("outb %%al, %%dx" :: "d"(0x501), "a"(0)); /* if it does nothing, fall back to the regular panic */ #endif From bb6dfd968d8719408c992fd8da08978b84f1b23f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:45 +0200 Subject: [PATCH 0685/1250] selftests/nolibc: add a "kernel" target to build the kernel with the initramfs The "kernel" target rebuilds the kernel with the current config for the selected arch, with an initramfs containing the nolibc-test utility. Since image names depend on the architecture, the currently supported ones are referenced and resolved based on the architecture. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index fd0a670823340b..4a2ab0e73ce2e3 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -12,6 +12,16 @@ include $(srctree)/scripts/subarch.include ARCH = $(SUBARCH) endif +# kernel image names by architecture +IMAGE_i386 = arch/x86/boot/bzImage +IMAGE_x86 = arch/x86/boot/bzImage +IMAGE_arm64 = arch/arm64/boot/Image +IMAGE_arm = arch/arm/boot/zImage +IMAGE_mips = vmlinuz +IMAGE_riscv = arch/riscv/boot/Image +IMAGE = $(IMAGE_$(ARCH)) +IMAGE_NAME = $(notdir $(IMAGE)) + # OUTPUT is only set when run from the main makefile, otherwise # it defaults to this nolibc directory. 
OUTPUT ?= $(CURDIR)/ @@ -36,6 +46,9 @@ initramfs: nolibc-test $(call QUIET_INSTALL, initramfs/init) $(Q)cp nolibc-test initramfs/init +kernel: initramfs + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs + clean: $(call QUIET_CLEAN, nolibc-test) $(Q)rm -f nolibc-test From dfbdcc6c971406b15be64a5be25c9d8caa5ba94e Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:46 +0200 Subject: [PATCH 0686/1250] selftests/nolibc: add a "defconfig" target While most archs will work fine with "make defconfig", not all will do, and it's not always easy to remember the most suitable choice to use for a specific architecture. This adds a "defconfig" target to the Makefile so that one may easily run "make -C ... defconfig" and make sure to clean and rebuild a fresh config. This is *not* used by default because we want to preserve the user's config by default. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/Makefile | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 4a2ab0e73ce2e3..c104719eae8b8a 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -22,6 +22,15 @@ IMAGE_riscv = arch/riscv/boot/Image IMAGE = $(IMAGE_$(ARCH)) IMAGE_NAME = $(notdir $(IMAGE)) +# default kernel configurations that appear to be usable +DEFCONFIG_i386 = defconfig +DEFCONFIG_x86 = defconfig +DEFCONFIG_arm64 = defconfig +DEFCONFIG_arm = multi_v7_defconfig +DEFCONFIG_mips = malta_defconfig +DEFCONFIG_riscv = defconfig +DEFCONFIG = $(DEFCONFIG_$(ARCH)) + # OUTPUT is only set when run from the main makefile, otherwise # it defaults to this nolibc directory. 
OUTPUT ?= $(CURDIR)/ @@ -46,6 +55,9 @@ initramfs: nolibc-test $(call QUIET_INSTALL, initramfs/init) $(Q)cp nolibc-test initramfs/init +defconfig: + $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) mrproper $(DEFCONFIG) prepare + kernel: initramfs $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs From 4c95c27c71a306c92b4853657bbd13a3893f2ac5 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:47 +0200 Subject: [PATCH 0687/1250] selftests/nolibc: add a "run" target to start the kernel in QEMU The "run" target will build the kernel and start it in QEMU. The "rerun" target will not have the kernel dependency and will just try to start QEMU. The QEMU architecture used to start the kernel is derived from the configured ARCH. This might need to be improved for archs which include different variants under the same name (mips vs mipsel, +/-64, riscv32 vs riscv64). This could be tested for i386, x86, arm, arm64, mips and riscv (the latter two reporting issues on some tests). It is possible to pass a test specification for nolibc-test in the TEST variable, which will be passed as-is as NOLIBC_TEST. On success, the number of successful tests is printed. On failure, failed lines are individually printed. Signed-off-by: Willy Tarreau Signed-off-by: Paul E.
McKenney --- tools/testing/selftests/nolibc/Makefile | 33 +++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index c104719eae8b8a..7c1f5360f454ef 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -31,6 +31,27 @@ DEFCONFIG_mips = malta_defconfig DEFCONFIG_riscv = defconfig DEFCONFIG = $(DEFCONFIG_$(ARCH)) +# optional tests to run (default = all) +TEST = + +# QEMU_ARCH: arch names used by qemu +QEMU_ARCH_i386 = i386 +QEMU_ARCH_x86 = x86_64 +QEMU_ARCH_arm64 = aarch64 +QEMU_ARCH_arm = arm +QEMU_ARCH_mips = mipsel # works with malta_defconfig +QEMU_ARCH_riscv = riscv64 +QEMU_ARCH = $(QEMU_ARCH_$(ARCH)) + +# QEMU_ARGS : some arch-specific args to pass to qemu +QEMU_ARGS_i386 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_x86 = -M pc -append "console=ttyS0,9600 i8042.noaux panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_arm64 = -M virt -cpu cortex-a53 -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_arm = -M virt -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_mips = -M malta -append "panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS_riscv = -M virt -append "console=ttyS0 panic=-1 $(TEST:%=NOLIBC_TEST=%)" +QEMU_ARGS = $(QEMU_ARGS_$(ARCH)) + # OUTPUT is only set when run from the main makefile, otherwise # it defaults to this nolibc directory. 
OUTPUT ?= $(CURDIR)/ @@ -61,8 +82,20 @@ defconfig: kernel: initramfs $(Q)$(MAKE) -C $(srctree) ARCH=$(ARCH) CC=$(CC) CROSS_COMPILE=$(CROSS_COMPILE) $(IMAGE_NAME) CONFIG_INITRAMFS_SOURCE=$(CURDIR)/initramfs +# run the tests after building the kernel +run: kernel + $(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out" + $(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed." + +# re-run the tests from an existing kernel +rerun: + $(Q)qemu-system-$(QEMU_ARCH) -display none -no-reboot -kernel "$(srctree)/$(IMAGE)" -serial stdio $(QEMU_ARGS) > "$(CURDIR)/run.out" + $(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed." + clean: $(call QUIET_CLEAN, nolibc-test) $(Q)rm -f nolibc-test $(call QUIET_CLEAN, initramfs) $(Q)rm -rf initramfs + $(call QUIET_CLEAN, run.out) + $(Q)rm -rf run.out From 85114fb5629f72a5030536c59ebbd13a655ef44f Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:48 +0200 Subject: [PATCH 0688/1250] selftests/nolibc: "sysroot" target installs a local copy of the sysroot It's not convenient to rely on a sysroot built in another directory, especially when running cross-compilation tests, where one has to switch back and forth between directories. Let's make it possible to install the sysroot directly in the test directory. It's not big and even benefits from being copied by arch so that it's easier to switch between archs if needed. The new "sysroot" target does this, it just calls "headers_standalone" from nolibc to install the sysroot right here. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/nolibc/Makefile | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 7c1f5360f454ef..210f5369fdfc4e 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -67,9 +67,16 @@ LDFLAGS := -s all: nolibc-test -nolibc-test: nolibc-test.c +sysroot: sysroot/$(ARCH)/include + +sysroot/$(ARCH)/include: + $(QUIET_MKDIR)mkdir -p sysroot + $(Q)$(MAKE) -C ../../../include/nolibc ARCH=$(ARCH) OUTPUT=$(CURDIR)/sysroot/ headers_standalone + $(Q)mv sysroot/sysroot sysroot/$(ARCH) + +nolibc-test: nolibc-test.c sysroot/$(ARCH)/include $(QUIET_CC)$(CC) $(CFLAGS) $(LDFLAGS) -o $@ \ - -nostdlib -static -include ../../../include/nolibc/nolibc.h $^ -lgcc + -nostdlib -static -Isysroot/$(ARCH)/include $< -lgcc initramfs: nolibc-test $(QUIET_MKDIR)mkdir -p initramfs @@ -93,6 +100,8 @@ rerun: $(Q)grep -w FAIL "$(CURDIR)/run.out" && echo "See all results in $(CURDIR)/run.out" || echo "$$(grep -c ^[0-9].*OK $(CURDIR)/run.out) test(s) passed." clean: + $(call QUIET_CLEAN, sysroot) + $(Q)rm -rf sysroot $(call QUIET_CLEAN, nolibc-test) $(Q)rm -f nolibc-test $(call QUIET_CLEAN, initramfs) From 0717e927c47fec7474868ac550ad675dbf837a15 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 19 Jul 2022 23:44:49 +0200 Subject: [PATCH 0689/1250] selftests/nolibc: add a "help" target It presents the supported targets, and becomes the default target to save the user from having to read the makefile. The "all" target was placed after it and now points to "run" to do everything since it's no longer the default one. Signed-off-by: Willy Tarreau Signed-off-by: Paul E. 
McKenney --- tools/testing/selftests/nolibc/Makefile | 27 ++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/nolibc/Makefile b/tools/testing/selftests/nolibc/Makefile index 210f5369fdfc4e..69ea659caca98d 100644 --- a/tools/testing/selftests/nolibc/Makefile +++ b/tools/testing/selftests/nolibc/Makefile @@ -65,7 +65,32 @@ endif CFLAGS ?= -Os -fno-ident -fno-asynchronous-unwind-tables LDFLAGS := -s -all: nolibc-test +help: + @echo "Supported targets under selftests/nolibc:" + @echo " all call the \"run\" target below" + @echo " help this help" + @echo " sysroot create the nolibc sysroot here (uses \$$ARCH)" + @echo " nolibc-test build the executable (uses \$$CC and \$$CROSS_COMPILE)" + @echo " initramfs prepare the initramfs with nolibc-test" + @echo " defconfig create a fresh new default config (uses \$$ARCH)" + @echo " kernel (re)build the kernel with the initramfs (uses \$$ARCH)" + @echo " run runs the kernel in QEMU after building it (uses \$$ARCH, \$$TEST)" + @echo " rerun runs a previously prebuilt kernel in QEMU (uses \$$ARCH, \$$TEST)" + @echo " clean clean the sysroot, initramfs, build and output files" + @echo "" + @echo "The output file is \"run.out\". Test ranges may be passed using \$$TEST." 
+ @echo "" + @echo "Currently using the following variables:" + @echo " ARCH = $(ARCH)" + @echo " CROSS_COMPILE = $(CROSS_COMPILE)" + @echo " CC = $(CC)" + @echo " OUTPUT = $(OUTPUT)" + @echo " TEST = $(TEST)" + @echo " QEMU_ARCH = $(if $(QEMU_ARCH),$(QEMU_ARCH),UNKNOWN_ARCH) [determined from \$$ARCH]" + @echo " IMAGE_NAME = $(if $(IMAGE_NAME),$(IMAGE_NAME),UNKNOWN_ARCH) [determined from \$$ARCH]" + @echo "" + +all: run sysroot: sysroot/$(ARCH)/include From 4dfb06442fd276aaff9ed4f38a623f052c9c170d Mon Sep 17 00:00:00 2001 From: Fernanda Ma'rouf Date: Wed, 20 Jul 2022 05:37:45 +0200 Subject: [PATCH 0690/1250] selftests/nolibc: Avoid generated files being committed After running the nolibc tests, the "git status" is not clean because the generated files are not ignored. Create a `.gitignore` inside the selftests/nolibc directory to ignore them. Cc: Ammar Faizi Cc: Fernanda Ma'rouf Signed-off-by: Fernanda Ma'rouf Signed-off-by: Willy Tarreau Signed-off-by: Paul E. McKenney --- tools/testing/selftests/nolibc/.gitignore | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 tools/testing/selftests/nolibc/.gitignore diff --git a/tools/testing/selftests/nolibc/.gitignore b/tools/testing/selftests/nolibc/.gitignore new file mode 100644 index 00000000000000..4696df589d68e5 --- /dev/null +++ b/tools/testing/selftests/nolibc/.gitignore @@ -0,0 +1,4 @@ +/initramfs/ +/nolibc-test +/run.out +/sysroot/ From c451eedea5223b40116ec48505bd2354da800c31 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 7 Jul 2022 13:15:32 -0600 Subject: [PATCH 0691/1250] md/raid5: Fix sectors_to_do bitmap overflow in raid5_make_request() For unaligned IO that have nearly maximum sectors, the number of stripes will end up being one greater than the size of the bitmap. When this happens, the last stripe in the IO will not be processed as it should be, resulting in data corruption. 
However, this is not normally seen when the backing block devices have 4K physical block sizes since the block layer will split the request before that happens. To fix this increase the bitmap size by one bit and ensure the full number of stripes are checked when calling find_first_bit(). Reported-by: David Sloan Fixes: a5b9c6a653fb ("md/raid5: Pivot raid5_make_request()") Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f28efce53ad9ff..0936f4f18d78f4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5873,8 +5873,11 @@ struct stripe_request_ctx { /* last sector in the request */ sector_t last_sector; - /* bitmap to track stripe sectors that have been added to stripes */ - DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES); + /* + * bitmap to track stripe sectors that have been added to stripes + * add one to account for unaligned requests + */ + DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1); /* the request had REQ_PREFLUSH, cleared after the first stripe_head */ bool do_flush; @@ -6047,7 +6050,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) const int rw = bio_data_dir(bi); enum stripe_result res; DEFINE_WAIT(w); - int s; + int s, stripe_cnt; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -6091,9 +6094,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) ctx.last_sector = bio_end_sector(bi); bi->bi_next = NULL; - bitmap_set(ctx.sectors_to_do, 0, - DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, - RAID5_STRIPE_SECTORS(conf))); + stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, + RAID5_STRIPE_SECTORS(conf)); + bitmap_set(ctx.sectors_to_do, 0, stripe_cnt); pr_debug("raid456: %s, logical %llu to %llu\n", __func__, bi->bi_iter.bi_sector, ctx.last_sector); @@ -6138,8 
+6141,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) continue; } - s = find_first_bit(ctx.sectors_to_do, RAID5_MAX_REQ_STRIPES); - if (s == RAID5_MAX_REQ_STRIPES) + s = find_first_bit(ctx.sectors_to_do, stripe_cnt); + if (s == stripe_cnt) break; logical_sector = ctx.first_sector + From f68c68c0a6d522ad2509a87e28b6ef3e46992f12 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Thu, 7 Jul 2022 13:15:33 -0600 Subject: [PATCH 0692/1250] md/raid5: Convert prepare_to_wait() to wait_woken() api raid5_get_active_stripe() can sleep in various situations and it is called by make_stripe_request() while inside the prepare_to_wait()/finish_wait() section. Nested waits like this are not supported. This was noticed while making other changes that add different sleeps to raid5_get_active_stripe() that caused a WARNING with CONFIG_DEBUG_ATOMIC_SLEEP. No ill effects have been noticed with the code as is, but theoretically a nested wait here could cause a deadlock so it should be fixed. To fix this, convert the prepare_to_wait() call to use wait_woken() which supports nested sleeps.
Link: https://lwn.net/Articles/628628/ Signed-off-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/raid5.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0936f4f18d78f4..09df30237a653c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6044,12 +6044,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { + DEFINE_WAIT_FUNC(wait, woken_wake_function); struct r5conf *conf = mddev->private; sector_t logical_sector; struct stripe_request_ctx ctx = {}; const int rw = bio_data_dir(bi); enum stripe_result res; - DEFINE_WAIT(w); int s, stripe_cnt; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { @@ -6112,7 +6112,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) return true; } md_account_bio(mddev, &bi); - prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); + + add_wait_queue(&conf->wait_for_overlap, &wait); while (1) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi); @@ -6135,9 +6136,8 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) ctx.batch_last = NULL; } - schedule(); - prepare_to_wait(&conf->wait_for_overlap, &w, - TASK_UNINTERRUPTIBLE); + wait_woken(&wait, TASK_UNINTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); continue; } @@ -6148,8 +6148,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) logical_sector = ctx.first_sector + (s << RAID5_STRIPE_SHIFT(conf)); } - - finish_wait(&conf->wait_for_overlap, &w); + remove_wait_queue(&conf->wait_for_overlap, &wait); if (ctx.batch_last) raid5_release_stripe(ctx.batch_last); From 069cce613a4a06bf76cd6c1f1158f24c44c3426c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:15 +0200 Subject: [PATCH 0693/1250] md: fix mddev->kobj lifetime Once a kobject is initialized, the containing object should not be directly freed. 
So delay initialization until it is added. Also remove the kobject_del call as the last put will remove the kobject as well. The explicit delete isn't needed here, and dropping it will simplify further fixes. With this, md_free now does not need to check that ->gendisk is non-NULL as it is always set by the time that kobject_init is called on mddev->kobj. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Hannes Reinecke Signed-off-by: Song Liu --- drivers/md/md.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7bc967131ac555..f8b6d37c5bdf0e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -678,7 +678,6 @@ static void md_safemode_timeout(struct timer_list *t); void mddev_init(struct mddev *mddev) { - kobject_init(&mddev->kobj, &md_ktype); mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->bitmap_info.mutex); @@ -5590,10 +5589,9 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); - if (mddev->gendisk) { - del_gendisk(mddev->gendisk); - put_disk(mddev->gendisk); - } + del_gendisk(mddev->gendisk); + put_disk(mddev->gendisk); + percpu_ref_exit(&mddev->writes_pending); bioset_exit(&mddev->bio_set); @@ -5617,7 +5615,6 @@ static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); - kobject_del(&mddev->kobj); kobject_put(&mddev->kobj); } @@ -5719,6 +5716,7 @@ int md_alloc(dev_t dev, char *name) if (error) goto out_cleanup_disk; + kobject_init(&mddev->kobj, &md_ktype); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); if (error) goto out_del_gendisk; From 9c39164ac17d7e7119496f013d7c043583436339 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:16 +0200 Subject: [PATCH 0694/1250] md: fix error handling in md_alloc Error handling in md_alloc is a mess.
Untangle it to just free the mddev directly before add_disk is called and thus the gendisk is globally visible. After that clear the hold flag and let the mddev_put take care of cleaning up the mddev through the usual mechanisms. Fixes: 5e55e2f5fc95 ("[PATCH] md: convert compile time warnings into runtime warnings") Fixes: 9be68dd7ac0e ("md: add error handling support for add_disk()") Fixes: 7ad1069166c0 ("md: properly unwind when failing to add the kobject in md_alloc") Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Hannes Reinecke Signed-off-by: Song Liu --- drivers/md/md.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f8b6d37c5bdf0e..c1439d5ab9b1a4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -790,6 +790,15 @@ static struct mddev *mddev_alloc(dev_t unit) return ERR_PTR(error); } +static void mddev_free(struct mddev *mddev) +{ + spin_lock(&all_mddevs_lock); + list_del(&mddev->all_mddevs); + spin_unlock(&all_mddevs_lock); + + kfree(mddev); +} + static const struct attribute_group md_redundancy_group; void mddev_unlock(struct mddev *mddev) @@ -5661,8 +5670,8 @@ int md_alloc(dev_t dev, char *name) mutex_lock(&disks_mutex); mddev = mddev_alloc(dev); if (IS_ERR(mddev)) { - mutex_unlock(&disks_mutex); - return PTR_ERR(mddev); + error = PTR_ERR(mddev); + goto out_unlock; } partitioned = (MAJOR(mddev->unit) != MD_MAJOR); @@ -5680,7 +5689,7 @@ int md_alloc(dev_t dev, char *name) strcmp(mddev2->gendisk->disk_name, name) == 0) { spin_unlock(&all_mddevs_lock); error = -EEXIST; - goto out_unlock_disks_mutex; + goto out_free_mddev; } spin_unlock(&all_mddevs_lock); } @@ -5693,7 +5702,7 @@ int md_alloc(dev_t dev, char *name) error = -ENOMEM; disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk) - goto out_unlock_disks_mutex; + goto out_free_mddev; disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; @@ -5714,26 +5723,36 @@ int 
md_alloc(dev_t dev, char *name) mddev->gendisk = disk; error = add_disk(disk); if (error) - goto out_cleanup_disk; + goto out_put_disk; kobject_init(&mddev->kobj, &md_ktype); error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md"); - if (error) - goto out_del_gendisk; + if (error) { + /* + * The disk is already live at this point. Clear the hold flag + * and let mddev_put take care of the deletion, as it isn't any + * different from a normal close on last release now. + */ + mddev->hold_active = 0; + goto done; + } kobject_uevent(&mddev->kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); - goto out_unlock_disks_mutex; -out_del_gendisk: - del_gendisk(disk); -out_cleanup_disk: - put_disk(disk); -out_unlock_disks_mutex: +done: mutex_unlock(&disks_mutex); mddev_put(mddev); return error; + +out_put_disk: + put_disk(disk); +out_free_mddev: + mddev_free(mddev); +out_unlock: + mutex_unlock(&disks_mutex); + return error; } static void md_probe(dev_t dev) From a46457a5ee790113e80a6677aede76a9125a6a72 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:17 +0200 Subject: [PATCH 0695/1250] md: implement ->free_disk Ensure that all private data is only freed once all accesses are done. 
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Hannes Reinecke Signed-off-by: Song Liu --- drivers/md/md.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index c1439d5ab9b1a4..e07a6f26116cea 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5600,12 +5600,6 @@ static void md_free(struct kobject *ko) del_gendisk(mddev->gendisk); put_disk(mddev->gendisk); - - percpu_ref_exit(&mddev->writes_pending); - - bioset_exit(&mddev->bio_set); - bioset_exit(&mddev->sync_set); - kfree(mddev); } static const struct sysfs_ops md_sysfs_ops = { @@ -7875,6 +7869,17 @@ static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) return ret; } +static void md_free_disk(struct gendisk *disk) +{ + struct mddev *mddev = disk->private_data; + + percpu_ref_exit(&mddev->writes_pending); + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); + + kfree(mddev); +} + const struct block_device_operations md_fops = { .owner = THIS_MODULE, @@ -7888,6 +7893,7 @@ const struct block_device_operations md_fops = .getgeo = md_getgeo, .check_events = md_check_events, .set_read_only = md_set_read_only, + .free_disk = md_free_disk, }; static int md_thread(void *arg) From e7f238227528cae61806ad6ee2326e2db710ed60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:18 +0200 Subject: [PATCH 0696/1250] md: rename md_free to md_kobj_release The md_free name is rather misleading, so pick a better one. 
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Hannes Reinecke Signed-off-by: Song Liu --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e07a6f26116cea..a4a2a10326e23c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5589,7 +5589,7 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, return rv; } -static void md_free(struct kobject *ko) +static void md_kobj_release(struct kobject *ko) { struct mddev *mddev = container_of(ko, struct mddev, kobj); @@ -5607,7 +5607,7 @@ static const struct sysfs_ops md_sysfs_ops = { .store = md_attr_store, }; static struct kobj_type md_ktype = { - .release = md_free, + .release = md_kobj_release, .sysfs_ops = &md_sysfs_ops, .default_groups = md_attr_groups, }; From 93c12a1eb34e5faa64c2c60a72efa68dd497c069 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:19 +0200 Subject: [PATCH 0697/1250] md: factor out the rdev overlaps check from rdev_size_store This splits the code into nicely readable chunks and also avoids the refcount inc/dec manipulations. 
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Hannes Reinecke Signed-off-by: Song Liu --- drivers/md/md.c | 84 +++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 45 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a4a2a10326e23c..3dc39857123969 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3343,14 +3343,33 @@ rdev_size_show(struct md_rdev *rdev, char *page) return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); } -static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) +static int md_rdevs_overlap(struct md_rdev *a, struct md_rdev *b) { /* check if two start/length pairs overlap */ - if (s1+l1 <= s2) - return 0; - if (s2+l2 <= s1) - return 0; - return 1; + if (a->data_offset + a->sectors <= b->data_offset) + return false; + if (b->data_offset + b->sectors <= a->data_offset) + return false; + return true; +} + +static bool md_rdev_overlaps(struct md_rdev *rdev) +{ + struct mddev *mddev; + struct md_rdev *rdev2; + + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev, &all_mddevs, all_mddevs) { + rdev_for_each(rdev2, mddev) { + if (rdev != rdev2 && rdev->bdev == rdev2->bdev && + md_rdevs_overlap(rdev, rdev2)) { + spin_unlock(&all_mddevs_lock); + return true; + } + } + } + spin_unlock(&all_mddevs_lock); + return false; } static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) @@ -3402,46 +3421,21 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) return -EINVAL; /* component must fit device */ rdev->sectors = sectors; - if (sectors > oldsectors && my_mddev->external) { - /* Need to check that all other rdevs with the same - * ->bdev do not overlap. 'rcu' is sufficient to walk - * the rdev lists safely. - * This check does not provide a hard guarantee, it - * just helps avoid dangerous mistakes. 
- */ - struct mddev *mddev; - int overlap = 0; - struct list_head *tmp; - rcu_read_lock(); - for_each_mddev(mddev, tmp) { - struct md_rdev *rdev2; - - rdev_for_each(rdev2, mddev) - if (rdev->bdev == rdev2->bdev && - rdev != rdev2 && - overlaps(rdev->data_offset, rdev->sectors, - rdev2->data_offset, - rdev2->sectors)) { - overlap = 1; - break; - } - if (overlap) { - mddev_put(mddev); - break; - } - } - rcu_read_unlock(); - if (overlap) { - /* Someone else could have slipped in a size - * change here, but doing so is just silly. - * We put oldsectors back because we *know* it is - * safe, and trust userspace not to race with - * itself - */ - rdev->sectors = oldsectors; - return -EBUSY; - } + /* + * Check that all other rdevs with the same bdev do not overlap. This + * check does not provide a hard guarantee, it just helps avoid + * dangerous mistakes. + */ + if (sectors > oldsectors && my_mddev->external && + md_rdev_overlaps(rdev)) { + /* + * Someone else could have slipped in a size change here, but + * doing so is just silly. We put oldsectors back because we + * know it is safe, and trust userspace not to race with itself. + */ + rdev->sectors = oldsectors; + return -EBUSY; } return len; } From 1bb1083754971c6c50bf894ff93b81139e045ab8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:20 +0200 Subject: [PATCH 0698/1250] md: stop using for_each_mddev in md_do_sync Just do a plain list_for_each that only grabs a mddev reference in the case where the thread sleeps and restarts the list iteration. 
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Hannes Reinecke Signed-off-by: Song Liu --- drivers/md/md.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3dc39857123969..f7d5865880f26f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8723,7 +8723,6 @@ void md_do_sync(struct md_thread *thread) unsigned long update_time; sector_t mark_cnt[SYNC_MARKS]; int last_mark,m; - struct list_head *tmp; sector_t last_check; int skipped = 0; struct md_rdev *rdev; @@ -8787,7 +8786,8 @@ void md_do_sync(struct md_thread *thread) try_again: if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) goto skip; - for_each_mddev(mddev2, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { if (mddev2 == mddev) continue; if (!mddev->parallel_resync @@ -8819,7 +8819,8 @@ void md_do_sync(struct md_thread *thread) desc, mdname(mddev), mdname(mddev2)); } - mddev_put(mddev2); + spin_unlock(&all_mddevs_lock); + if (signal_pending(current)) flush_signals(current); schedule(); @@ -8829,6 +8830,7 @@ void md_do_sync(struct md_thread *thread) finish_wait(&resync_wait, &wq); } } + spin_unlock(&all_mddevs_lock); } while (mddev->curr_resync < MD_RESYNC_DELAYED); j = 0; From 32389bb813f970befb44dcbde09704c937509cb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:21 +0200 Subject: [PATCH 0699/1250] md: stop using for_each_mddev in md_notify_reboot Just do a simple list_for_each_entry_safe on all_mddevs, and only grab a reference when we drop the lock. 
Reviewed-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Hannes Reinecke Signed-off-by: Song Liu --- drivers/md/md.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f7d5865880f26f..f5d46694a50642 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9585,11 +9585,13 @@ EXPORT_SYMBOL_GPL(rdev_clear_badblocks); static int md_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { - struct list_head *tmp; - struct mddev *mddev; + struct mddev *mddev, *n; int need_delay = 0; - for_each_mddev(mddev, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); if (mddev_trylock(mddev)) { if (mddev->pers) __md_stop_writes(mddev); @@ -9598,7 +9600,11 @@ static int md_notify_reboot(struct notifier_block *this, mddev_unlock(mddev); } need_delay = 1; + mddev_put(mddev); + spin_lock(&all_mddevs_lock); } + spin_unlock(&all_mddevs_lock); + /* * certain more exotic SCSI devices are known to be * volatile wrt too early system reboots. While the From 45e0d274008202cc336ba2fdf1f2129b4d888ea3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:22 +0200 Subject: [PATCH 0700/1250] md: stop using for_each_mddev in md_exit Just do a simple list_for_each_entry_safe on all_mddevs, and only grab a reference when we drop the lock and delete the now unused for_each_mddev macro. 
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Reviewed-by: Hannes Reinecke Signed-off-by: Song Liu --- drivers/md/md.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f5d46694a50642..709df90454762e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -368,28 +368,6 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); -/* - * iterates through all used mddevs in the system. - * We take care to grab the all_mddevs_lock whenever navigating - * the list, and to always hold a refcount when unlocked. - * Any code which breaks out of this loop while own - * a reference to the current mddev and must mddev_put it. - */ -#define for_each_mddev(_mddev,_tmp) \ - \ - for (({ spin_lock(&all_mddevs_lock); \ - _tmp = all_mddevs.next; \ - _mddev = NULL;}); \ - ({ if (_tmp != &all_mddevs) \ - mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ - spin_unlock(&all_mddevs_lock); \ - if (_mddev) mddev_put(_mddev); \ - _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ - _tmp != &all_mddevs;}); \ - ({ spin_lock(&all_mddevs_lock); \ - _tmp = _tmp->next;}) \ - ) - /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. 
@@ -9923,8 +9901,7 @@ void md_autostart_arrays(int part) static __exit void md_exit(void) { - struct mddev *mddev; - struct list_head *tmp; + struct mddev *mddev, *n; int delay = 1; unregister_blkdev(MD_MAJOR,"md"); @@ -9944,17 +9921,23 @@ static __exit void md_exit(void) } remove_proc_entry("mdstat", NULL); - for_each_mddev(mddev, tmp) { + spin_lock(&all_mddevs_lock); + list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); export_array(mddev); mddev->ctime = 0; mddev->hold_active = 0; /* - * for_each_mddev() will call mddev_put() at the end of each - * iteration. As the mddev is now fully clear, this will - * schedule the mddev for destruction by a workqueue, and the + * As the mddev is now fully clear, mddev_put will schedule + * the mddev for destruction by a workqueue, and the * destroy_workqueue() below will wait for that to complete. */ + mddev_put(mddev); + spin_lock(&all_mddevs_lock); } + spin_unlock(&all_mddevs_lock); + destroy_workqueue(md_rdev_misc_wq); destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); From d9406f31a96407754a05e1b6510b02684f622922 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:23 +0200 Subject: [PATCH 0701/1250] md: only delete entries from all_mddevs when the disk is freed This ensures device names don't get prematurely reused. Instead add a deleted flag to skip already deleted devices in mddev_get and other places that only want to see live mddevs. 
Reported-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/md.c | 56 +++++++++++++++++++++++++++++++++---------------- drivers/md/md.h | 2 ++ 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 709df90454762e..2e9ed4485ed828 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -625,6 +625,10 @@ EXPORT_SYMBOL(md_flush_request); static inline struct mddev *mddev_get(struct mddev *mddev) { + lockdep_assert_held(&all_mddevs_lock); + + if (test_bit(MD_DELETED, &mddev->flags)) + return NULL; atomic_inc(&mddev->active); return mddev; } @@ -639,7 +643,7 @@ static void mddev_put(struct mddev *mddev) mddev->ctime == 0 && !mddev->hold_active) { /* Array is not configured at all, and not held active, * so destroy it */ - list_del_init(&mddev->all_mddevs); + set_bit(MD_DELETED, &mddev->flags); /* * Call queue_work inside the spinlock so that @@ -719,8 +723,8 @@ static struct mddev *mddev_find(dev_t unit) spin_lock(&all_mddevs_lock); mddev = mddev_find_locked(unit); - if (mddev) - mddev_get(mddev); + if (mddev && !mddev_get(mddev)) + mddev = NULL; spin_unlock(&all_mddevs_lock); return mddev; @@ -3338,6 +3342,8 @@ static bool md_rdev_overlaps(struct md_rdev *rdev) spin_lock(&all_mddevs_lock); list_for_each_entry(mddev, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev->flags)) + continue; rdev_for_each(rdev2, mddev) { if (rdev != rdev2 && rdev->bdev == rdev2->bdev && md_rdevs_overlap(rdev, rdev2)) { @@ -5525,11 +5531,10 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); rv = entry->show(mddev, page); @@ -5550,11 +5555,10 @@ md_attr_store(struct kobject *kobj, struct 
attribute *attr, if (!capable(CAP_SYS_ADMIN)) return -EACCES; spin_lock(&all_mddevs_lock); - if (list_empty(&mddev->all_mddevs)) { + if (!mddev_get(mddev)) { spin_unlock(&all_mddevs_lock); return -EBUSY; } - mddev_get(mddev); spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); mddev_put(mddev); @@ -7849,7 +7853,7 @@ static void md_free_disk(struct gendisk *disk) bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); - kfree(mddev); + mddev_free(mddev); } const struct block_device_operations md_fops = @@ -8171,6 +8175,8 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos) if (!l--) { mddev = list_entry(tmp, struct mddev, all_mddevs); mddev_get(mddev); + if (!mddev_get(mddev)) + continue; spin_unlock(&all_mddevs_lock); return mddev; } @@ -8184,25 +8190,35 @@ static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct list_head *tmp; struct mddev *next_mddev, *mddev = v; + struct mddev *to_put = NULL; ++*pos; if (v == (void*)2) return NULL; spin_lock(&all_mddevs_lock); - if (v == (void*)1) + if (v == (void*)1) { tmp = all_mddevs.next; - else + } else { + to_put = mddev; tmp = mddev->all_mddevs.next; - if (tmp != &all_mddevs) - next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); - else { - next_mddev = (void*)2; - *pos = 0x10000; } + + for (;;) { + if (tmp == &all_mddevs) { + next_mddev = (void*)2; + *pos = 0x10000; + break; + } + next_mddev = list_entry(tmp, struct mddev, all_mddevs); + if (mddev_get(next_mddev)) + break; + mddev = next_mddev; + tmp = mddev->all_mddevs.next; + }; spin_unlock(&all_mddevs_lock); - if (v != (void*)1) + if (to_put) mddev_put(mddev); return next_mddev; @@ -8766,6 +8782,8 @@ void md_do_sync(struct md_thread *thread) goto skip; spin_lock(&all_mddevs_lock); list_for_each_entry(mddev2, &all_mddevs, all_mddevs) { + if (test_bit(MD_DELETED, &mddev2->flags)) + continue; if (mddev2 == mddev) continue; if (!mddev->parallel_resync @@ -9568,7 +9586,8 @@ static int 
md_notify_reboot(struct notifier_block *this, spin_lock(&all_mddevs_lock); list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { - mddev_get(mddev); + if (!mddev_get(mddev)) + continue; spin_unlock(&all_mddevs_lock); if (mddev_trylock(mddev)) { if (mddev->pers) @@ -9923,7 +9942,8 @@ static __exit void md_exit(void) spin_lock(&all_mddevs_lock); list_for_each_entry_safe(mddev, n, &all_mddevs, all_mddevs) { - mddev_get(mddev); + if (!mddev_get(mddev)) + continue; spin_unlock(&all_mddevs_lock); export_array(mddev); mddev->ctime = 0; diff --git a/drivers/md/md.h b/drivers/md/md.h index 861088b3d2363f..f6ab73c90b7d27 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -254,6 +254,7 @@ struct md_cluster_info; * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that * array is ready yet. * @MD_BROKEN: This is used to stop writes and mark array as failed. + * @MD_DELETED: This device is being deleted * * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */ @@ -270,6 +271,7 @@ enum mddev_flags { MD_UPDATING_SB, MD_NOT_READY, MD_BROKEN, + MD_DELETED, }; enum mddev_sb_flags { From 4500d5c1791058398698437d55dd6c24912e6067 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jul 2022 11:18:24 +0200 Subject: [PATCH 0702/1250] md: simplify md_open Now that devices are on the all_mddevs list until the gendisk is freed, there can't be any duplicates. Remove the global list lookup and just grab a reference. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Logan Gunthorpe Signed-off-by: Song Liu --- drivers/md/md.c | 42 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 2e9ed4485ed828..fa500ae9863b58 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7783,45 +7783,33 @@ static int md_set_read_only(struct block_device *bdev, bool ro) static int md_open(struct block_device *bdev, fmode_t mode) { - /* - * Succeed if we can lock the mddev, which confirms that - * it isn't being stopped right now. - */ - struct mddev *mddev = mddev_find(bdev->bd_dev); + struct mddev *mddev; int err; + spin_lock(&all_mddevs_lock); + mddev = mddev_get(bdev->bd_disk->private_data); + spin_unlock(&all_mddevs_lock); if (!mddev) return -ENODEV; - if (mddev->gendisk != bdev->bd_disk) { - /* we are racing with mddev_put which is discarding this - * bd_disk. - */ - mddev_put(mddev); - /* Wait until bdev->bd_disk is definitely gone */ - if (work_pending(&mddev->del_work)) - flush_workqueue(md_misc_wq); - return -EBUSY; - } - BUG_ON(mddev != bdev->bd_disk->private_data); - - if ((err = mutex_lock_interruptible(&mddev->open_mutex))) + err = mutex_lock_interruptible(&mddev->open_mutex); + if (err) goto out; - if (test_bit(MD_CLOSING, &mddev->flags)) { - mutex_unlock(&mddev->open_mutex); - err = -ENODEV; - goto out; - } + err = -ENODEV; + if (test_bit(MD_CLOSING, &mddev->flags)) + goto out_unlock; - err = 0; atomic_inc(&mddev->openers); mutex_unlock(&mddev->open_mutex); bdev_check_media_change(bdev); - out: - if (err) - mddev_put(mddev); + return 0; + +out_unlock: + mutex_unlock(&mddev->open_mutex); +out: + mddev_put(mddev); return err; } From 211a3702d5aecd9f20823ebe7dcea5b915fae08f Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Thu, 7 Jul 2022 17:08:34 +0800 Subject: [PATCH 0703/1250] raid5: fix duplicate checks for rdev->saved_raid_disk 'first' will always be greater than or 
equal to 0, it is unnecessary to repeat the 0 check, clean it up. Signed-off-by: Jackie Liu Signed-off-by: Song Liu --- drivers/md/raid5.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 09df30237a653c..3b7887428cd0e8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8261,8 +8261,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) * find the disk ... but prefer rdev->saved_raid_disk * if possible. */ - if (rdev->saved_raid_disk >= 0 && - rdev->saved_raid_disk >= first && + if (rdev->saved_raid_disk >= first && rdev->saved_raid_disk <= last && conf->disks[rdev->saved_raid_disk].rdev == NULL) first = rdev->saved_raid_disk; From cd53a2f6199a0d1e81ab9ec6cb9419c37cd2d008 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 8 Apr 2022 13:17:13 +0300 Subject: [PATCH 0704/1250] clocksource/drivers/timer-ti-dm: Move inline functions to driver for am6 The __omap_dm_timer_* inline functions in the header are no longer needed outside the driver, and the header ifdefs prevent the driver working for ARCH_K3. Let's move the inline functions to the driver and drop the ifdefs and drop the unused functions __omap_dm_timer_override_errata() and __omap_dm_timer_load_start(). 
Cc: Keerthy Cc: Nishanth Menon Cc: Vignesh Raghavendra Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20220408101715.43697-2-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 115 ++++++++++++++++++++++++ include/clocksource/timer-ti-dm.h | 144 ------------------------------ 2 files changed, 115 insertions(+), 144 deletions(-) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 33609be0b304b1..530b71f3ab3729 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -44,6 +44,121 @@ enum { REQUEST_BY_NODE, }; +static inline u32 __omap_dm_timer_read(struct omap_dm_timer *timer, u32 reg, + int posted) +{ + if (posted) + while (readl_relaxed(timer->pend) & (reg >> WPSHIFT)) + cpu_relax(); + + return readl_relaxed(timer->func_base + (reg & 0xff)); +} + +static inline void __omap_dm_timer_write(struct omap_dm_timer *timer, + u32 reg, u32 val, int posted) +{ + if (posted) + while (readl_relaxed(timer->pend) & (reg >> WPSHIFT)) + cpu_relax(); + + writel_relaxed(val, timer->func_base + (reg & 0xff)); +} + +static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer) +{ + u32 tidr; + + /* Assume v1 ip if bits [31:16] are zero */ + tidr = readl_relaxed(timer->io_base); + if (!(tidr >> 16)) { + timer->revision = 1; + timer->irq_stat = timer->io_base + OMAP_TIMER_V1_STAT_OFFSET; + timer->irq_ena = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET; + timer->irq_dis = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET; + timer->pend = timer->io_base + _OMAP_TIMER_WRITE_PEND_OFFSET; + timer->func_base = timer->io_base; + } else { + timer->revision = 2; + timer->irq_stat = timer->io_base + OMAP_TIMER_V2_IRQSTATUS; + timer->irq_ena = timer->io_base + OMAP_TIMER_V2_IRQENABLE_SET; + timer->irq_dis = timer->io_base + OMAP_TIMER_V2_IRQENABLE_CLR; + timer->pend = timer->io_base + + _OMAP_TIMER_WRITE_PEND_OFFSET + + OMAP_TIMER_V2_FUNC_OFFSET; + timer->func_base = 
timer->io_base + OMAP_TIMER_V2_FUNC_OFFSET; + } +} + +/* + * __omap_dm_timer_enable_posted - enables write posted mode + * @timer: pointer to timer instance handle + * + * Enables the write posted mode for the timer. When posted mode is enabled + * writes to certain timer registers are immediately acknowledged by the + * internal bus and hence prevents stalling the CPU waiting for the write to + * complete. Enabling this feature can improve performance for writing to the + * timer registers. + */ +static inline void __omap_dm_timer_enable_posted(struct omap_dm_timer *timer) +{ + if (timer->posted) + return; + + if (timer->errata & OMAP_TIMER_ERRATA_I103_I767) { + timer->posted = OMAP_TIMER_NONPOSTED; + __omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0, 0); + return; + } + + __omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, + OMAP_TIMER_CTRL_POSTED, 0); + timer->context.tsicr = OMAP_TIMER_CTRL_POSTED; + timer->posted = OMAP_TIMER_POSTED; +} + +static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer, + int posted, unsigned long rate) +{ + u32 l; + + l = __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted); + if (l & OMAP_TIMER_CTRL_ST) { + l &= ~0x1; + __omap_dm_timer_write(timer, OMAP_TIMER_CTRL_REG, l, posted); +#ifdef CONFIG_ARCH_OMAP2PLUS + /* Readback to make sure write has completed */ + __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted); + /* + * Wait for functional clock period x 3.5 to make sure that + * timer is stopped + */ + udelay(3500000 / rate + 1); +#endif + } + + /* Ack possibly pending interrupt */ + writel_relaxed(OMAP_TIMER_INT_OVERFLOW, timer->irq_stat); +} + +static inline void __omap_dm_timer_int_enable(struct omap_dm_timer *timer, + unsigned int value) +{ + writel_relaxed(value, timer->irq_ena); + __omap_dm_timer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value, 0); +} + +static inline unsigned int +__omap_dm_timer_read_counter(struct omap_dm_timer *timer, int posted) +{ + return __omap_dm_timer_read(timer, 
OMAP_TIMER_COUNTER_REG, posted); +} + +static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer, + unsigned int value) +{ + writel_relaxed(value, timer->irq_stat); +} + /** * omap_dm_timer_read_reg - read timer registers in posted and non-posted mode * @timer: timer pointer over which read operation to perform diff --git a/include/clocksource/timer-ti-dm.h b/include/clocksource/timer-ti-dm.h index f6da8a1326398d..b0f80cfd2a263b 100644 --- a/include/clocksource/timer-ti-dm.h +++ b/include/clocksource/timer-ti-dm.h @@ -247,148 +247,4 @@ int omap_dm_timers_active(void); #define OMAP_TIMER_TICK_INT_MASK_COUNT_REG \ (_OMAP_TIMER_TICK_INT_MASK_COUNT_OFFSET | (WP_TOWR << WPSHIFT)) -/* - * The below are inlined to optimize code size for system timers. Other code - * should not need these at all. - */ -#if defined(CONFIG_ARCH_OMAP1) || defined(CONFIG_ARCH_OMAP2PLUS) -static inline u32 __omap_dm_timer_read(struct omap_dm_timer *timer, u32 reg, - int posted) -{ - if (posted) - while (readl_relaxed(timer->pend) & (reg >> WPSHIFT)) - cpu_relax(); - - return readl_relaxed(timer->func_base + (reg & 0xff)); -} - -static inline void __omap_dm_timer_write(struct omap_dm_timer *timer, - u32 reg, u32 val, int posted) -{ - if (posted) - while (readl_relaxed(timer->pend) & (reg >> WPSHIFT)) - cpu_relax(); - - writel_relaxed(val, timer->func_base + (reg & 0xff)); -} - -static inline void __omap_dm_timer_init_regs(struct omap_dm_timer *timer) -{ - u32 tidr; - - /* Assume v1 ip if bits [31:16] are zero */ - tidr = readl_relaxed(timer->io_base); - if (!(tidr >> 16)) { - timer->revision = 1; - timer->irq_stat = timer->io_base + OMAP_TIMER_V1_STAT_OFFSET; - timer->irq_ena = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET; - timer->irq_dis = timer->io_base + OMAP_TIMER_V1_INT_EN_OFFSET; - timer->pend = timer->io_base + _OMAP_TIMER_WRITE_PEND_OFFSET; - timer->func_base = timer->io_base; - } else { - timer->revision = 2; - timer->irq_stat = timer->io_base + 
OMAP_TIMER_V2_IRQSTATUS; - timer->irq_ena = timer->io_base + OMAP_TIMER_V2_IRQENABLE_SET; - timer->irq_dis = timer->io_base + OMAP_TIMER_V2_IRQENABLE_CLR; - timer->pend = timer->io_base + - _OMAP_TIMER_WRITE_PEND_OFFSET + - OMAP_TIMER_V2_FUNC_OFFSET; - timer->func_base = timer->io_base + OMAP_TIMER_V2_FUNC_OFFSET; - } -} - -/* - * __omap_dm_timer_enable_posted - enables write posted mode - * @timer: pointer to timer instance handle - * - * Enables the write posted mode for the timer. When posted mode is enabled - * writes to certain timer registers are immediately acknowledged by the - * internal bus and hence prevents stalling the CPU waiting for the write to - * complete. Enabling this feature can improve performance for writing to the - * timer registers. - */ -static inline void __omap_dm_timer_enable_posted(struct omap_dm_timer *timer) -{ - if (timer->posted) - return; - - if (timer->errata & OMAP_TIMER_ERRATA_I103_I767) { - timer->posted = OMAP_TIMER_NONPOSTED; - __omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, 0, 0); - return; - } - - __omap_dm_timer_write(timer, OMAP_TIMER_IF_CTRL_REG, - OMAP_TIMER_CTRL_POSTED, 0); - timer->context.tsicr = OMAP_TIMER_CTRL_POSTED; - timer->posted = OMAP_TIMER_POSTED; -} - -/** - * __omap_dm_timer_override_errata - override errata flags for a timer - * @timer: pointer to timer handle - * @errata: errata flags to be ignored - * - * For a given timer, override a timer errata by clearing the flags - * specified by the errata argument. A specific erratum should only be - * overridden for a timer if the timer is used in such a way the erratum - * has no impact. 
- */ -static inline void __omap_dm_timer_override_errata(struct omap_dm_timer *timer, - u32 errata) -{ - timer->errata &= ~errata; -} - -static inline void __omap_dm_timer_stop(struct omap_dm_timer *timer, - int posted, unsigned long rate) -{ - u32 l; - - l = __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted); - if (l & OMAP_TIMER_CTRL_ST) { - l &= ~0x1; - __omap_dm_timer_write(timer, OMAP_TIMER_CTRL_REG, l, posted); -#ifdef CONFIG_ARCH_OMAP2PLUS - /* Readback to make sure write has completed */ - __omap_dm_timer_read(timer, OMAP_TIMER_CTRL_REG, posted); - /* - * Wait for functional clock period x 3.5 to make sure that - * timer is stopped - */ - udelay(3500000 / rate + 1); -#endif - } - - /* Ack possibly pending interrupt */ - writel_relaxed(OMAP_TIMER_INT_OVERFLOW, timer->irq_stat); -} - -static inline void __omap_dm_timer_load_start(struct omap_dm_timer *timer, - u32 ctrl, unsigned int load, - int posted) -{ - __omap_dm_timer_write(timer, OMAP_TIMER_COUNTER_REG, load, posted); - __omap_dm_timer_write(timer, OMAP_TIMER_CTRL_REG, ctrl, posted); -} - -static inline void __omap_dm_timer_int_enable(struct omap_dm_timer *timer, - unsigned int value) -{ - writel_relaxed(value, timer->irq_ena); - __omap_dm_timer_write(timer, OMAP_TIMER_WAKEUP_EN_REG, value, 0); -} - -static inline unsigned int -__omap_dm_timer_read_counter(struct omap_dm_timer *timer, int posted) -{ - return __omap_dm_timer_read(timer, OMAP_TIMER_COUNTER_REG, posted); -} - -static inline void __omap_dm_timer_write_status(struct omap_dm_timer *timer, - unsigned int value) -{ - writel_relaxed(value, timer->irq_stat); -} -#endif /* CONFIG_ARCH_OMAP1 || CONFIG_ARCH_OMAP2PLUS */ #endif /* __CLOCKSOURCE_DMTIMER_H */ From e6831f1a52d901e337bbd49137882d26fd72f566 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 8 Apr 2022 13:17:14 +0300 Subject: [PATCH 0705/1250] clocksource/drivers/timer-ti-dm: Make timer selectable for ARCH_K3 Let's make timer-ti-dm selectable for ARCH_K3, and add a separate 
option for OMAP_DM_SYSTIMER as there should be no need for it on ARCH_K3. For older TI SoCs, we are already selecting OMAP_DM_TIMER in arch/arm/mach-omap*/Kconfig. For mach-omap2, we need to now also select OMAP_DM_SYSTIMER. Cc: Keerthy Cc: Nishanth Menon Cc: Vignesh Raghavendra Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20220408101715.43697-3-tony@atomide.com Signed-off-by: Daniel Lezcano --- arch/arm/mach-omap2/Kconfig | 2 ++ drivers/clocksource/Kconfig | 8 +++++++- drivers/clocksource/Makefile | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index 02c253de9b6e78..21e99287f5c0e6 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -105,6 +105,7 @@ config ARCH_OMAP2PLUS select MACH_OMAP_GENERIC select MEMORY select MFD_SYSCON + select OMAP_DM_SYSTIMER select OMAP_DM_TIMER select OMAP_GPMC select PINCTRL @@ -160,6 +161,7 @@ config SOC_OMAP2420 bool "OMAP2420 support" depends on ARCH_OMAP2 default y + select OMAP_DM_SYSTIMER select OMAP_DM_TIMER select SOC_HAS_OMAP2_SDRC diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 2dcdf02239c352..967dcfb8dd0dcb 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -22,7 +22,7 @@ config CLKEVT_I8253 config I8253_LOCK bool -config OMAP_DM_TIMER +config OMAP_DM_SYSTIMER bool select TIMER_OF @@ -56,6 +56,12 @@ config DIGICOLOR_TIMER help Enables the support for the digicolor timer driver. +config OMAP_DM_TIMER + tristate "OMAP dual-mode timer driver" if ARCH_K3 || COMPILE_TEST + select TIMER_OF + help + Enables the support for the TI dual-mode timer driver. 
+ config DW_APB_TIMER bool "DW APB timer driver" if COMPILE_TEST help diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 685430d5c604cf..fadb5a8f5e07a3 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -18,7 +18,7 @@ obj-$(CONFIG_CLKSRC_MMIO) += mmio.o obj-$(CONFIG_DAVINCI_TIMER) += timer-davinci.o obj-$(CONFIG_DIGICOLOR_TIMER) += timer-digicolor.o obj-$(CONFIG_OMAP_DM_TIMER) += timer-ti-dm.o -obj-$(CONFIG_OMAP_DM_TIMER) += timer-ti-dm-systimer.o +obj-$(CONFIG_OMAP_DM_SYSTIMER) += timer-ti-dm-systimer.o obj-$(CONFIG_DW_APB_TIMER) += dw_apb_timer.o obj-$(CONFIG_DW_APB_TIMER_OF) += dw_apb_timer_of.o obj-$(CONFIG_FTTMR010_TIMER) += timer-fttmr010.o From 4f64b4ac719e92143d8e1dfdc8702d7df7b6a1ce Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 8 Apr 2022 13:17:15 +0300 Subject: [PATCH 0706/1250] clocksource/drivers/timer-ti-dm: Add compatible for am6 SoCs Add compatible for ti,am654-timer to support the timers. For example, am654 has four timers in the MCU domain and 12 timers in the MAIN domain. 
Cc: Keerthy Cc: Nishanth Menon Cc: Vignesh Raghavendra Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20220408101715.43697-4-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-ti-dm.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/clocksource/timer-ti-dm.c b/drivers/clocksource/timer-ti-dm.c index 530b71f3ab3729..80d7e5aaefde3c 100644 --- a/drivers/clocksource/timer-ti-dm.c +++ b/drivers/clocksource/timer-ti-dm.c @@ -1036,6 +1036,10 @@ static const struct dmtimer_platform_data omap3plus_pdata = { .timer_ops = &dmtimer_ops, }; +static const struct dmtimer_platform_data am6_pdata = { + .timer_ops = &dmtimer_ops, +}; + static const struct of_device_id omap_timer_match[] = { { .compatible = "ti,omap2420-timer", @@ -1064,6 +1068,10 @@ static const struct of_device_id omap_timer_match[] = { .compatible = "ti,dm816-timer", .data = &omap3plus_pdata, }, + { + .compatible = "ti,am654-timer", + .data = &am6_pdata, + }, {}, }; MODULE_DEVICE_TABLE(of, omap_timer_match); From dbed639ec6c20b62949315b5c8bca6998d54018c Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 23 May 2022 18:14:48 +0300 Subject: [PATCH 0707/1250] clocksource/drivers/timer-ti-dm: Make driver selection bool for TI K3 The clocksource drivers do not currently have loadable modules as pointed out by Daniel Lezcano . Let's reconsider this later on once timer removal discussion has been done, and set timer-ti-dm to bool for TI K3 SoC. 
Cc: Keerthy Cc: Nishanth Menon Cc: Vignesh Raghavendra Signed-off-by: Tony Lindgren Link: https://lore.kernel.org/r/20220523151448.23732-1-tony@atomide.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 967dcfb8dd0dcb..5d09cf54c4587b 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -57,7 +57,8 @@ config DIGICOLOR_TIMER Enables the support for the digicolor timer driver. config OMAP_DM_TIMER - tristate "OMAP dual-mode timer driver" if ARCH_K3 || COMPILE_TEST + bool "OMAP dual-mode timer driver" if ARCH_K3 || COMPILE_TEST + default y if ARCH_K3 select TIMER_OF help Enables the support for the TI dual-mode timer driver. From 311d73d6e9453a8842e6bf459e5019db286cfeb6 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 18 Jul 2022 23:36:57 +0200 Subject: [PATCH 0708/1250] clocksource/drivers/tegra186: Put Kconfig option 'tristate' to 'bool' The clocksource are built-in, not as module. We don't know if the core time framework is ready for that. Revert back this option to 'bool'. Signed-off-by: Daniel Lezcano Link: https://lore.kernel.org/r/20220718213657.1303538-1-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano --- drivers/clocksource/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 5d09cf54c4587b..440d4f41d17dd8 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -158,7 +158,7 @@ config TEGRA_TIMER Enables support for the Tegra driver. 
config TEGRA186_TIMER - tristate "NVIDIA Tegra186 timer driver" + bool "NVIDIA Tegra186 timer driver" depends on ARCH_TEGRA || COMPILE_TEST depends on WATCHDOG && WATCHDOG_CORE help From 1e1e5c47d3306edaf9c6b2a2c26f4c18c47b8c43 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 20 Jul 2022 09:53:34 +0200 Subject: [PATCH 0709/1250] dt-bindings: timer: renesas,cmt: Fix R-Car Gen4 fall-out Restore sort order (by family, followed by type). Update the conditional sections specifying the number of interrupts. Fixes: 525b296185b4b0ab ("dt-bindings: timer: renesas,cmt: Add r8a779f0 and generic Gen4 CMT support") Signed-off-by: Geert Uytterhoeven Acked-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/2e3863ae32e17d49f41111580f195dd34e2b769d.1658303544.git.geert+renesas@glider.be Signed-off-by: Daniel Lezcano --- .../devicetree/bindings/timer/renesas,cmt.yaml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/Documentation/devicetree/bindings/timer/renesas,cmt.yaml b/Documentation/devicetree/bindings/timer/renesas,cmt.yaml index 433ddb49620ca5..bde6c9b66bf426 100644 --- a/Documentation/devicetree/bindings/timer/renesas,cmt.yaml +++ b/Documentation/devicetree/bindings/timer/renesas,cmt.yaml @@ -82,12 +82,6 @@ properties: - renesas,r8a77995-cmt0 # 32-bit CMT0 on R-Car D3 - const: renesas,rcar-gen3-cmt0 # 32-bit CMT0 on R-Car Gen3 and RZ/G2 - - items: - - enum: - - renesas,r8a779a0-cmt0 # 32-bit CMT0 on R-Car V3U - - renesas,r8a779f0-cmt0 # 32-bit CMT0 on R-Car S4-8 - - const: renesas,rcar-gen4-cmt0 # 32-bit CMT0 on R-Car Gen4 - - items: - enum: - renesas,r8a774a1-cmt1 # 48-bit CMT on RZ/G2M @@ -104,6 +98,12 @@ properties: - renesas,r8a77995-cmt1 # 48-bit CMT on R-Car D3 - const: renesas,rcar-gen3-cmt1 # 48-bit CMT on R-Car Gen3 and RZ/G2 + - items: + - enum: + - renesas,r8a779a0-cmt0 # 32-bit CMT0 on R-Car V3U + - renesas,r8a779f0-cmt0 # 32-bit CMT0 on R-Car S4-8 + - const: renesas,rcar-gen4-cmt0 # 32-bit CMT0 on R-Car Gen4 + - 
items: - enum: - renesas,r8a779a0-cmt1 # 48-bit CMT on R-Car V3U @@ -145,6 +145,7 @@ allOf: enum: - renesas,rcar-gen2-cmt0 - renesas,rcar-gen3-cmt0 + - renesas,rcar-gen4-cmt0 then: properties: interrupts: @@ -158,6 +159,7 @@ allOf: enum: - renesas,rcar-gen2-cmt1 - renesas,rcar-gen3-cmt1 + - renesas,rcar-gen4-cmt1 then: properties: interrupts: From c19e18637d895e851817b99c65beea74ab7af113 Mon Sep 17 00:00:00 2001 From: XU pengfei Date: Wed, 20 Jul 2022 10:07:35 +0800 Subject: [PATCH 0710/1250] clocksource/drivers/sun4i: Remove unnecessary (void*) conversions Remove unnecessary void* type casting. Signed-off-by: XU pengfei Link: https://lore.kernel.org/r/20220720020735.3771-1-xupengfei@nfschina.com Signed-off-by: Daniel Lezcano --- drivers/clocksource/timer-sun4i.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clocksource/timer-sun4i.c b/drivers/clocksource/timer-sun4i.c index bb6ea6c198295e..94dc6e42e983d8 100644 --- a/drivers/clocksource/timer-sun4i.c +++ b/drivers/clocksource/timer-sun4i.c @@ -128,7 +128,7 @@ static void sun4i_timer_clear_interrupt(void __iomem *base) static irqreturn_t sun4i_timer_interrupt(int irq, void *dev_id) { - struct clock_event_device *evt = (struct clock_event_device *)dev_id; + struct clock_event_device *evt = dev_id; struct timer_of *to = to_timer_of(evt); sun4i_timer_clear_interrupt(timer_of_base(to)); From 55930c20f617defb480ff12cb2a2aca4f2567490 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 20 Jul 2022 13:55:38 +0100 Subject: [PATCH 0711/1250] cacheinfo: Use atomic allocation for percpu cache attributes On a couple of architectures like RISC-V and ARM64, we need to detect cache attributes quite early during the boot when the secondary CPUs start. So we will call detect_cache_attributes in the atomic context and since use of normal allocation can sleep, we will end up getting "sleeping in the atomic context" bug splat.
In order to avoid that, move the allocation to use the atomic version in preparation to move the actual detection of cache attributes in the CPU hotplug path which is atomic. Link: https://lore.kernel.org/r/20220720-arch_topo_fixes-v3-1-43d696288e84@arm.com Cc: Ionela Voinescu Tested-by: Conor Dooley Signed-off-by: Sudeep Holla --- drivers/base/cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index 65d566ff24c41a..4b5cd08c5a657f 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -356,7 +356,7 @@ int detect_cache_attributes(unsigned int cpu) return -ENOENT; per_cpu_cacheinfo(cpu) = kcalloc(cache_leaves(cpu), - sizeof(struct cacheinfo), GFP_KERNEL); + sizeof(struct cacheinfo), GFP_ATOMIC); if (per_cpu_cacheinfo(cpu) == NULL) { cache_leaves(cpu) = 0; return -ENOMEM; From 7c14e55a7dfeff4a8d2ad825eb4b3b3019c4af02 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 20 Jul 2022 13:55:39 +0100 Subject: [PATCH 0712/1250] ACPI: PPTT: Leave the table mapped for the runtime usage Currently, every time information needs to be fetched from the PPTT, the table is mapped via acpi_get_table() and unmapped after the use via acpi_put_table() which is fine. However we do this at runtime especially when the CPU is hotplugged out and plugged in back since we re-populate the cache topology and other information. However, with the support to fetch LLC information from the PPTT in the cpuhotplug path which is executed in the atomic context, it is preferred to avoid mapping and unmapping of the PPTT for every single use as the acpi_get_table() might sleep waiting for a mutex. In order to avoid the same, the table needs to be mapped just once on the boot CPU and is never unmapped allowing it to be used at runtime without the hassle of mapping and unmapping the table. Link: https://lore.kernel.org/r/20220720-arch_topo_fixes-v3-2-43d696288e84@arm.com Reported-by: Guenter Roeck Cc: Rafael J.
Wysocki Signed-off-by: Sudeep Holla --- drivers/acpi/pptt.c | 102 ++++++++++++++++++++------------------------ 1 file changed, 47 insertions(+), 55 deletions(-) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index dd3222a15c9c5d..c91342dcbcd636 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -533,21 +533,37 @@ static int topology_get_acpi_cpu_tag(struct acpi_table_header *table, return -ENOENT; } + +static struct acpi_table_header *acpi_get_pptt(void) +{ + static struct acpi_table_header *pptt; + acpi_status status; + + /* + * PPTT will be used at runtime on every CPU hotplug in path, so we + * don't need to call acpi_put_table() to release the table mapping. + */ + if (!pptt) { + status = acpi_get_table(ACPI_SIG_PPTT, 0, &pptt); + if (ACPI_FAILURE(status)) + acpi_pptt_warn_missing(); + } + + return pptt; +} + static int find_acpi_cpu_topology_tag(unsigned int cpu, int level, int flag) { struct acpi_table_header *table; - acpi_status status; int retval; - status = acpi_get_table(ACPI_SIG_PPTT, 0, &table); - if (ACPI_FAILURE(status)) { - acpi_pptt_warn_missing(); + table = acpi_get_pptt(); + if (!table) return -ENOENT; - } + retval = topology_get_acpi_cpu_tag(table, cpu, level, flag); pr_debug("Topology Setup ACPI CPU %d, level %d ret = %d\n", cpu, level, retval); - acpi_put_table(table); return retval; } @@ -568,16 +584,13 @@ static int find_acpi_cpu_topology_tag(unsigned int cpu, int level, int flag) static int check_acpi_cpu_flag(unsigned int cpu, int rev, u32 flag) { struct acpi_table_header *table; - acpi_status status; u32 acpi_cpu_id = get_acpi_id_for_cpu(cpu); struct acpi_pptt_processor *cpu_node = NULL; int ret = -ENOENT; - status = acpi_get_table(ACPI_SIG_PPTT, 0, &table); - if (ACPI_FAILURE(status)) { - acpi_pptt_warn_missing(); - return ret; - } + table = acpi_get_pptt(); + if (!table) + return -ENOENT; if (table->revision >= rev) cpu_node = acpi_find_processor_node(table, acpi_cpu_id); @@ -585,8 +598,6 @@ static int 
check_acpi_cpu_flag(unsigned int cpu, int rev, u32 flag) if (cpu_node) ret = (cpu_node->flags & flag) != 0; - acpi_put_table(table); - return ret; } @@ -605,18 +616,15 @@ int acpi_find_last_cache_level(unsigned int cpu) u32 acpi_cpu_id; struct acpi_table_header *table; int number_of_levels = 0; - acpi_status status; + + table = acpi_get_pptt(); + if (!table) + return -ENOENT; pr_debug("Cache Setup find last level CPU=%d\n", cpu); acpi_cpu_id = get_acpi_id_for_cpu(cpu); - status = acpi_get_table(ACPI_SIG_PPTT, 0, &table); - if (ACPI_FAILURE(status)) { - acpi_pptt_warn_missing(); - } else { - number_of_levels = acpi_find_cache_levels(table, acpi_cpu_id); - acpi_put_table(table); - } + number_of_levels = acpi_find_cache_levels(table, acpi_cpu_id); pr_debug("Cache Setup find last level level=%d\n", number_of_levels); return number_of_levels; @@ -638,20 +646,16 @@ int acpi_find_last_cache_level(unsigned int cpu) int cache_setup_acpi(unsigned int cpu) { struct acpi_table_header *table; - acpi_status status; - - pr_debug("Cache Setup ACPI CPU %d\n", cpu); - status = acpi_get_table(ACPI_SIG_PPTT, 0, &table); - if (ACPI_FAILURE(status)) { - acpi_pptt_warn_missing(); + table = acpi_get_pptt(); + if (!table) return -ENOENT; - } + + pr_debug("Cache Setup ACPI CPU %d\n", cpu); cache_setup_acpi_cpu(table, cpu); - acpi_put_table(table); - return status; + return 0; } /** @@ -730,50 +734,38 @@ int find_acpi_cpu_topology_package(unsigned int cpu) int find_acpi_cpu_topology_cluster(unsigned int cpu) { struct acpi_table_header *table; - acpi_status status; struct acpi_pptt_processor *cpu_node, *cluster_node; u32 acpi_cpu_id; int retval; int is_thread; - status = acpi_get_table(ACPI_SIG_PPTT, 0, &table); - if (ACPI_FAILURE(status)) { - acpi_pptt_warn_missing(); + table = acpi_get_pptt(); + if (!table) return -ENOENT; - } acpi_cpu_id = get_acpi_id_for_cpu(cpu); cpu_node = acpi_find_processor_node(table, acpi_cpu_id); - if (cpu_node == NULL || !cpu_node->parent) { - retval = -ENOENT; - 
goto put_table; - } + if (!cpu_node || !cpu_node->parent) + return -ENOENT; is_thread = cpu_node->flags & ACPI_PPTT_ACPI_PROCESSOR_IS_THREAD; cluster_node = fetch_pptt_node(table, cpu_node->parent); - if (cluster_node == NULL) { - retval = -ENOENT; - goto put_table; - } + if (!cluster_node) + return -ENOENT; + if (is_thread) { - if (!cluster_node->parent) { - retval = -ENOENT; - goto put_table; - } + if (!cluster_node->parent) + return -ENOENT; + cluster_node = fetch_pptt_node(table, cluster_node->parent); - if (cluster_node == NULL) { - retval = -ENOENT; - goto put_table; - } + if (!cluster_node) + return -ENOENT; } if (cluster_node->flags & ACPI_PPTT_ACPI_PROCESSOR_ID_VALID) retval = cluster_node->acpi_processor_id; else retval = ACPI_PTR_DIFF(cluster_node, table); -put_table: - acpi_put_table(table); - return retval; } From c86b123b03beb8bdead8057f2a44e7b773ed01e7 Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Wed, 20 Jul 2022 13:55:40 +0100 Subject: [PATCH 0713/1250] arch_topology: Fix cache attributes detection in the CPU hotplug path init_cpu_topology() is called only once at the boot and all the cache attributes are detected early for all the possible CPUs. However when the CPUs are hotplugged out, the cacheinfo gets removed. While the attributes are added back when the CPUs are hotplugged back in as part of CPU hotplug state machine, it ends up called quite late after the update_siblings_masks() are called in the secondary_start_kernel() resulting in wrong llc_sibling_masks. Move the call to detect_cache_attributes() inside update_siblings_masks() to ensure the cacheinfo is updated before the LLC sibling masks are updated. This will fix the incorrect LLC sibling masks generated when the CPUs are hotplugged out and hotplugged back in again. 
Link: https://lore.kernel.org/r/20220720-arch_topo_fixes-v3-3-43d696288e84@arm.com Reported-by: Ionela Voinescu Tested-by: Geert Uytterhoeven Tested-by: Ionela Voinescu Reviewed-by: Conor Dooley Reviewed-by: Ionela Voinescu Signed-off-by: Sudeep Holla --- drivers/base/arch_topology.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 441e14ac33a495..0424b59b695ef0 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -732,7 +732,11 @@ const struct cpumask *cpu_clustergroup_mask(int cpu) void update_siblings_masks(unsigned int cpuid) { struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid]; - int cpu; + int cpu, ret; + + ret = detect_cache_attributes(cpuid); + if (ret) + pr_info("Early cacheinfo failed, ret = %d\n", ret); /* update core and thread sibling masks */ for_each_online_cpu(cpu) { @@ -821,7 +825,7 @@ __weak int __init parse_acpi_topology(void) #if defined(CONFIG_ARM64) || defined(CONFIG_RISCV) void __init init_cpu_topology(void) { - int ret, cpu; + int ret; reset_cpu_topology(); ret = parse_acpi_topology(); @@ -836,13 +840,5 @@ void __init init_cpu_topology(void) reset_cpu_topology(); return; } - - for_each_possible_cpu(cpu) { - ret = detect_cache_attributes(cpu); - if (ret) { - pr_info("Early cacheinfo failed, ret = %d\n", ret); - break; - } - } } #endif From 05fe531eb3f1cdc535d46071c7e2067cdc2c3f3c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 20 Jul 2022 22:40:18 +0900 Subject: [PATCH 0714/1250] kernel/hung_task: show backtrace of tasks with locks held Since the cause of a task waiting for some lock in uninterruptible state for too long can be that some other task is running with that lock held, but debug_show_all_locks() does not print locks held by running tasks, also showing backtrace of all tasks with some lock held might be helpful. 
Let's try reporting all tasks with any lock held (except khungtaskd task itself which is holding RCU read lock which will not block hung task). This is an experimental patch. If it turned out that this approach is helpful, I'll propose this code as a lockdep function. Signed-off-by: Tetsuo Handa --- kernel/hung_task.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index cff3ae8c818fd3..2956bc5a90b2f1 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -204,8 +204,23 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); - if (hung_task_show_lock) - debug_show_all_locks(); +#if defined(CONFIG_PROVE_LOCKING) + if (hung_task_show_lock && debug_locks) { + pr_warn("\nShowing all threads with locks held in the system:\n"); + rcu_read_lock(); + for_each_process_thread(g, t) { + if (t->lockdep_depth && t != current) { + sched_show_task(t); + debug_show_held_locks(t); + touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); + } + } + rcu_read_unlock(); + pr_warn("\n"); + pr_warn("=============================================\n\n"); + } +#endif if (hung_task_show_all_bt) { hung_task_show_all_bt = false; From db52f939ccf899528fef4a876aa47e33fe1012ca Mon Sep 17 00:00:00 2001 From: Ying Hsu Date: Mon, 4 Jul 2022 18:33:24 +0800 Subject: [PATCH 0715/1250] Bluetooth: Add default wakeup callback for HCI UART driver Bluetooth HCI devices indicate if they are able to wake up in the wakeup callback since 'commit 4539ca67fe8e ("Bluetooth: Rename driver .prevent_wake to .wakeup")'. This patch adds a default wakeup callback for Bluetooth HCI UART devices. It assumes Bluetooth HCI UART devices are wakeable for backward compatibility. For those who need a customized behavior, one can override it before calling hci_uart_register_device().
Fixes: 4539ca67fe8e ("Bluetooth: Rename driver .prevent_wake to .wakeup") Signed-off-by: Ying Hsu Reviewed-by: Alain Michaud Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_serdev.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c index 4cda890ce6470b..c0e5f42ec6b7df 100644 --- a/drivers/bluetooth/hci_serdev.c +++ b/drivers/bluetooth/hci_serdev.c @@ -231,6 +231,15 @@ static int hci_uart_setup(struct hci_dev *hdev) return 0; } +/* Check if the device is wakeable */ +static bool hci_uart_wakeup(struct hci_dev *hdev) +{ + /* HCI UART devices are assumed to be wakeable by default. + * Implement wakeup callback to override this behavior. + */ + return true; +} + /** hci_uart_write_wakeup - transmit buffer wakeup * @serdev: serial device * @@ -342,6 +351,8 @@ int hci_uart_register_device(struct hci_uart *hu, hdev->flush = hci_uart_flush; hdev->send = hci_uart_send_frame; hdev->setup = hci_uart_setup; + if (!hdev->wakeup) + hdev->wakeup = hci_uart_wakeup; SET_HCIDEV_DEV(hdev, &hu->serdev->dev); if (test_bit(HCI_UART_NO_SUSPEND_NOTIFIER, &hu->flags)) From 5ce548c49ce0c2de9cb1ba8b9b622d1795f43a18 Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:19 +0800 Subject: [PATCH 0716/1250] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x04CA:0x4007 Add the support ID(0x04CA, 0x4007) to usb_device_id table for Realtek RTL8852C. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=04ca ProdID=4007 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) 
MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index fb1a6718941240..f3a68d75509957 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -427,6 +427,10 @@ static const struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x04ca, 0x4006), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + /* Realtek 8852CE Bluetooth devices */ + { USB_DEVICE(0x04ca, 0x4007), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, + /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), .driver_info = BTUSB_REALTEK }, From 35de797dc6c7009c1002f142335da2a25a202267 Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:20 +0800 Subject: [PATCH 0717/1250] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x04C5:0x1675 Add the support ID(0x04c5, 0x1675) to usb_device_id 
table for Realtek RTL8852C. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=04c5 ProdID=1675 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index f3a68d75509957..fe0434eeac78cb 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -430,6 +430,8 @@ static const struct usb_device_id blacklist_table[] = { /* Realtek 8852CE Bluetooth devices */ { USB_DEVICE(0x04ca, 0x4007), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x04c5, 
0x1675), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), From aec73886e6b5a1aab1ac3489e843b50e2f6b7007 Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:21 +0800 Subject: [PATCH 0718/1250] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x0CB8:0xC558 Add the support ID(0x0CB8, 0xC558) to usb_device_id table for Realtek RTL8852C. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0cb8 ProdID=c558 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index fe0434eeac78cb..50f2b0094cc7fc 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -432,6 +432,8 @@ static const struct usb_device_id blacklist_table[] = { BTUSB_WIDEBAND_SPEECH }, { USB_DEVICE(0x04c5, 0x1675), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x0cb8, 0xc558), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), From f0dc2393fbbf696d8bf85f1290e51efca1308ec8 Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:22 +0800 Subject: [PATCH 0719/1250] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x13D3:0x3587 Add the support ID(0x13D3, 0x3587) to usb_device_id table for Realtek RTL8852C. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=13d3 ProdID=3587 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) 
MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 50f2b0094cc7fc..e2da9d2573c9d0 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -434,6 +434,8 @@ static const struct usb_device_id blacklist_table[] = { BTUSB_WIDEBAND_SPEECH }, { USB_DEVICE(0x0cb8, 0xc558), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x13d3, 0x3587), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), From 8dd512ca460932f04114a9829e4014ae1f498503 Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:23 +0800 Subject: [PATCH 0720/1250] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x13D3:0x3586 Add the support ID(0x13D3, 0x3586) to usb_device_id table for Realtek RTL8852C. 
The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=13d3 ProdID=3586 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index e2da9d2573c9d0..aaba2d7371781e 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -436,6 +436,8 @@ static const struct usb_device_id blacklist_table[] = { BTUSB_WIDEBAND_SPEECH }, { USB_DEVICE(0x13d3, 0x3587), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x13d3, 0x3586), .driver_info = BTUSB_REALTEK | + 
BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), From 3c7ec89ab45e7a46b806910de78d7243a114a860 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 20 Jul 2022 14:23:49 +0300 Subject: [PATCH 0721/1250] Bluetooth: clean up error pointer checking The bt_skb_sendmsg() function can't return NULL so there is no need to check for that. Several of these checks were removed previously but this one was missed. Signed-off-by: Dan Carpenter Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 6b48d9e2aab9dc..a8b52175af051a 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -494,7 +494,7 @@ static inline struct sk_buff *bt_skb_sendmmsg(struct sock *sk, struct sk_buff *skb, **frag; skb = bt_skb_sendmsg(sk, msg, len, mtu, headroom, tailroom); - if (IS_ERR_OR_NULL(skb)) + if (IS_ERR(skb)) return skb; len -= skb->len; From c6480829cda7cb7c8ec4c9698f4210a724cfb545 Mon Sep 17 00:00:00 2001 From: Hakan Jansson Date: Thu, 30 Jun 2022 14:45:20 +0200 Subject: [PATCH 0722/1250] dt-bindings: net: broadcom-bluetooth: Add CYW55572 DT binding CYW55572 is a Wi-Fi + Bluetooth combo device from Infineon. Extend the binding with its DT compatible. 
Signed-off-by: Hakan Jansson Acked-by: Krzysztof Kozlowski Reviewed-by: Linus Walleij Signed-off-by: Luiz Augusto von Dentz --- Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml index df59575840fe37..71fe9b17f8f127 100644 --- a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml +++ b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml @@ -24,6 +24,7 @@ properties: - brcm,bcm43540-bt - brcm,bcm4335a0 - brcm,bcm4349-bt + - infineon,cyw55572-bt shutdown-gpios: maxItems: 1 From f5d25901c5ccbefc2f83294d83d9c50456e1f12b Mon Sep 17 00:00:00 2001 From: Hakan Jansson Date: Thu, 30 Jun 2022 14:45:21 +0200 Subject: [PATCH 0723/1250] dt-bindings: net: broadcom-bluetooth: Add conditional constraints Add conditional constraint to make property "reset-gpios" available only for compatible devices actually having the reset pin. Make property "brcm,requires-autobaud-mode" depend on property "shutdown-gpios" as the shutdown pin is required to enter autobaud mode. I looked at all compatible devices and compiled the matrix below before formulating the conditional constraint. This was a pure paper exercise and no verification testing has been performed.
                          d
                          e
                          v h
                          i o
                          c s
                      s   e t
                      h   - -
                      u   w w       v
                      t r a a     v d
                      d e k k     b d
                      o s e e     a i
                      w e u u     t o
                      n t p p     - -
                      - - - -     s s
                      g g g g     u u
                      p p p p t   p p
                      i i i i x l p p
                      o o o o c p l l
                      s s s s o o y y
---------------------------------------
brcm,bcm20702a1       X X X X X X X X
brcm,bcm4329-bt       X X X X X X X X
brcm,bcm4330-bt       X X X X X X X X
brcm,bcm4334-bt       X - X X X X X X
brcm,bcm43438-bt      X - X X X X X X
brcm,bcm4345c5        X - X X X X X X
brcm,bcm43540-bt      X - X X X X X X
brcm,bcm4335a0        X - X X X X X X
brcm,bcm4349-bt       X - X X X X X X
infineon,cyw55572-bt  X - X X X X X X
Signed-off-by: Hakan Jansson Reviewed-by: Rob Herring Reviewed-by: Linus Walleij Signed-off-by: Luiz Augusto von Dentz --- .../bindings/net/broadcom-bluetooth.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml index 71fe9b17f8f127..445b2a5536259e 100644 --- a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml +++ b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml @@ -117,6 +117,22 @@ properties: required: - compatible +dependencies: + brcm,requires-autobaud-mode: [ 'shutdown-gpios' ] + +if: + not: + properties: + compatible: + contains: + enum: + - brcm,bcm20702a1 + - brcm,bcm4329-bt + - brcm,bcm4330-bt +then: + properties: + reset-gpios: false + additionalProperties: false examples: From 7386459d24b3ba5cd9c55776d81d43a27dd5bb07 Mon Sep 17 00:00:00 2001 From: Hakan Jansson Date: Thu, 30 Jun 2022 14:45:22 +0200 Subject: [PATCH 0724/1250] Bluetooth: hci_bcm: Add DT compatible for CYW55572 CYW55572 is a Wi-Fi + Bluetooth combo device from Infineon.
Signed-off-by: Hakan Jansson Reviewed-by: Linus Walleij Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_bcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 6f834ff1b44b16..9a129867a4c068 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -1557,6 +1557,7 @@ static const struct of_device_id bcm_bluetooth_of_match[] = { { .compatible = "brcm,bcm4349-bt", .data = &bcm43438_device_data }, { .compatible = "brcm,bcm43540-bt", .data = &bcm4354_device_data }, { .compatible = "brcm,bcm4335a0" }, + { .compatible = "infineon,cyw55572-bt" }, { }, }; MODULE_DEVICE_TABLE(of, bcm_bluetooth_of_match); From 31e65c6d44a2531c12d00478782f2afcb906666b Mon Sep 17 00:00:00 2001 From: Hakan Jansson Date: Thu, 30 Jun 2022 14:45:23 +0200 Subject: [PATCH 0725/1250] Bluetooth: hci_bcm: Prevent early baudrate setting in autobaud mode Always prevent trying to set device baudrate before calling setup() when using autobaud mode. This was previously happening for devices which had device specific data with member no_early_set_baudrate set to 0. 
Signed-off-by: Hakan Jansson Reviewed-by: Linus Walleij Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_bcm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 9a129867a4c068..0ae627c293c518 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -484,7 +484,7 @@ static int bcm_open(struct hci_uart *hu) /* If oper_speed is set, ldisc/serdev will set the baudrate * before calling setup() */ - if (!bcm->dev->no_early_set_baudrate) + if (!bcm->dev->no_early_set_baudrate && !bcm->dev->use_autobaud_mode) hu->oper_speed = bcm->dev->oper_speed; err = bcm_gpio_set_power(bcm->dev, true); @@ -1204,9 +1204,6 @@ static int bcm_of_probe(struct bcm_device *bdev) { bdev->use_autobaud_mode = device_property_read_bool(bdev->dev, "brcm,requires-autobaud-mode"); - if (bdev->use_autobaud_mode) - bdev->no_early_set_baudrate = true; - device_property_read_u32(bdev->dev, "max-speed", &bdev->oper_speed); device_property_read_u8_array(bdev->dev, "brcm,bt-pcm-int-params", bdev->pcm_int_params, 5); From 719a11a62d1943f11127f55fc8262a798632474c Mon Sep 17 00:00:00 2001 From: Hakan Jansson Date: Thu, 30 Jun 2022 14:53:45 +0200 Subject: [PATCH 0726/1250] Bluetooth: hci_bcm: Increase host baudrate for CYW55572 in autobaud mode Add device specific data for max baudrate in autobaud mode. This allows the host to use a baudrate higher than "init speed" when loading FW in autobaud mode. The device specific max baudrate in autobaud mode for CYW55572 is set to 921600 bps. Devices without device specific max baudrate in autobaud mode will use init speed as before. If no device specific init speed has been specified, it will default to the bcm_proto default 115200 bps. The increased baud rate improves FW load time. The exact load time will depend on the specific system and FW being used. 
As a rough indication, the FW load time dropped from ~9s @ 115.2kbps to ~1.7s @ 921.6kbps in one test. Signed-off-by: Hakan Jansson Reviewed-by: Linus Walleij Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/hci_bcm.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 0ae627c293c518..d7e0b75db8a607 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -53,10 +53,12 @@ * struct bcm_device_data - device specific data * @no_early_set_baudrate: Disallow set baudrate before driver setup() * @drive_rts_on_open: drive RTS signal on ->open() when platform requires it + * @max_autobaud_speed: max baudrate supported by device in autobaud mode */ struct bcm_device_data { bool no_early_set_baudrate; bool drive_rts_on_open; + u32 max_autobaud_speed; }; /** @@ -100,6 +102,7 @@ struct bcm_device_data { * @drive_rts_on_open: drive RTS signal on ->open() when platform requires it * @pcm_int_params: keep the initial PCM configuration * @use_autobaud_mode: start Bluetooth device in autobaud mode + * @max_autobaud_speed: max baudrate supported by device in autobaud mode */ struct bcm_device { /* Must be the first member, hci_serdev.c expects this. 
*/ @@ -139,6 +142,7 @@ struct bcm_device { bool drive_rts_on_open; bool use_autobaud_mode; u8 pcm_int_params[5]; + u32 max_autobaud_speed; }; /* generic bcm uart resources */ @@ -479,7 +483,10 @@ static int bcm_open(struct hci_uart *hu) else if (bcm->dev->drive_rts_on_open) hci_uart_set_flow_control(hu, true); - hu->init_speed = bcm->dev->init_speed; + if (bcm->dev->use_autobaud_mode && bcm->dev->max_autobaud_speed) + hu->init_speed = min(bcm->dev->oper_speed, bcm->dev->max_autobaud_speed); + else + hu->init_speed = bcm->dev->init_speed; /* If oper_speed is set, ldisc/serdev will set the baudrate * before calling setup() @@ -585,8 +592,8 @@ static int bcm_setup(struct hci_uart *hu) return 0; /* Init speed if any */ - if (hu->init_speed) - speed = hu->init_speed; + if (bcm->dev && bcm->dev->init_speed) + speed = bcm->dev->init_speed; else if (hu->proto->init_speed) speed = hu->proto->init_speed; else @@ -1519,6 +1526,7 @@ static int bcm_serdev_probe(struct serdev_device *serdev) data = device_get_match_data(bcmdev->dev); if (data) { + bcmdev->max_autobaud_speed = data->max_autobaud_speed; bcmdev->no_early_set_baudrate = data->no_early_set_baudrate; bcmdev->drive_rts_on_open = data->drive_rts_on_open; } @@ -1542,6 +1550,10 @@ static struct bcm_device_data bcm43438_device_data = { .drive_rts_on_open = true, }; +static struct bcm_device_data cyw55572_device_data = { + .max_autobaud_speed = 921600, +}; + static const struct of_device_id bcm_bluetooth_of_match[] = { { .compatible = "brcm,bcm20702a1" }, { .compatible = "brcm,bcm4329-bt" }, @@ -1554,7 +1566,7 @@ static const struct of_device_id bcm_bluetooth_of_match[] = { { .compatible = "brcm,bcm4349-bt", .data = &bcm43438_device_data }, { .compatible = "brcm,bcm43540-bt", .data = &bcm4354_device_data }, { .compatible = "brcm,bcm4335a0" }, - { .compatible = "infineon,cyw55572-bt" }, + { .compatible = "infineon,cyw55572-bt", .data = &cyw55572_device_data }, { }, }; MODULE_DEVICE_TABLE(of, bcm_bluetooth_of_match); From 
d8252b94a32a6ea09fc04e230bec5c89d930d352 Mon Sep 17 00:00:00 2001 From: Yuri D'Elia Date: Wed, 13 Jul 2022 17:29:13 +0200 Subject: [PATCH 0727/1250] Bluetooth: btusb: Set HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN for MTK This sets HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN for MTK controllers since SCO appear to not work when using HCI_OP_ENHANCED_SETUP_SYNC_CONN. Link: https://bugzilla.kernel.org/show_bug.cgi?id=215576 Signed-off-by: Yuri D'Elia Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index aaba2d7371781e..205b7d3b1cc3a3 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -3810,6 +3810,7 @@ static int btusb_probe(struct usb_interface *intf, hdev->manufacturer = 70; hdev->cmd_timeout = btusb_mtk_cmd_timeout; hdev->set_bdaddr = btmtk_set_bdaddr; + set_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &hdev->quirks); set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks); data->recv_acl = btusb_recv_acl_mtk; } From b4e7b216fd4c560e2c30c00b29be9909178e3dc7 Mon Sep 17 00:00:00 2001 From: He Wang Date: Sat, 25 Jun 2022 17:03:57 +0800 Subject: [PATCH 0728/1250] Bluetooth: btusb: Add a new VID/PID 0489/e0e2 for MT7922 Add VID/PID 0489:e0e2 for MediaTek MT7922 Bluetooth chip. Found and tested with Asus UM5302TA. From /sys/kernel/debug/usb/devices: T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0489 ProdID=e0e2 Rev= 1.00 S: Manufacturer=MediaTek Inc. S: Product=Wireless_Device S: SerialNumber=000000000 C:* #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA A: FirstIf#= 0 IfCount= 3 Cls=e0(wlcon) Sub=01 Prot=01 I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) 
MxPS= 16 Ivl=125us E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms I: If#= 1 Alt= 6 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=83(I) Atr=01(Isoc) MxPS= 63 Ivl=1ms E: Ad=03(O) Atr=01(Isoc) MxPS= 63 Ivl=1ms I:* If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none) E: Ad=8a(I) Atr=03(Int.) MxPS= 64 Ivl=125us E: Ad=0a(O) Atr=03(Int.) MxPS= 64 Ivl=125us I: If#= 2 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none) E: Ad=8a(I) Atr=03(Int.) MxPS= 512 Ivl=125us E: Ad=0a(O) Atr=03(Int.) 
MxPS= 512 Ivl=125us Signed-off-by: He Wang Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 205b7d3b1cc3a3..21135a419bcc3c 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -492,6 +492,9 @@ static const struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x13d3, 0x3568), .driver_info = BTUSB_MEDIATEK | BTUSB_WIDEBAND_SPEECH | BTUSB_VALID_LE_STATES }, + { USB_DEVICE(0x0489, 0xe0e2), .driver_info = BTUSB_MEDIATEK | + BTUSB_WIDEBAND_SPEECH | + BTUSB_VALID_LE_STATES }, /* Additional Realtek 8723AE Bluetooth devices */ { USB_DEVICE(0x0930, 0x021d), .driver_info = BTUSB_REALTEK }, From f7913b8db3c4fe3ff39f541703d1354a2f0b276a Mon Sep 17 00:00:00 2001 From: Zijun Hu Date: Wed, 13 Jul 2022 14:53:14 +0800 Subject: [PATCH 0729/1250] Bluetooth: hci_sync: Correct hci_set_event_mask_page_2_sync() event mask Event HCI_Truncated_Page_Complete should belong to central and HCI_Peripheral_Page_Response_Timeout should belong to peripheral, but hci_set_event_mask_page_2_sync() assigns these two events to the wrong roles, so correct it with this change.
Signed-off-by: Zijun Hu Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 464a5e2c56fb5b..a4f1b209b4f8a3 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3608,7 +3608,7 @@ static int hci_set_event_mask_page_2_sync(struct hci_dev *hdev) if (lmp_cpb_central_capable(hdev)) { events[1] |= 0x40; /* Triggered Clock Capture */ events[1] |= 0x80; /* Synchronization Train Complete */ - events[2] |= 0x10; /* Peripheral Page Response Timeout */ + events[2] |= 0x08; /* Truncated Page Complete */ events[2] |= 0x20; /* CPB Channel Map Change */ changed = true; } @@ -3620,7 +3620,7 @@ static int hci_set_event_mask_page_2_sync(struct hci_dev *hdev) events[2] |= 0x01; /* Synchronization Train Received */ events[2] |= 0x02; /* CPB Receive */ events[2] |= 0x04; /* CPB Timeout */ - events[2] |= 0x08; /* Truncated Page Complete */ + events[2] |= 0x10; /* Peripheral Page Response Timeout */ changed = true; } From 9087782bc5188154a102e621ddfb3aa97926cbec Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 20 Jul 2022 13:58:04 -0400 Subject: [PATCH 0730/1250] dm raid: fix crash if md_handle_request() splits bio Commit ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone") introduced the optimization to _not_ perform bio_associate_blkg()'s relatively costly work when DM core clones its bio. But in doing so it exposed the possibility for DM's cloned bio to alter DM target behavior (e.g. crash) if a target were to issue IO without first calling bio_set_bdev(). The DM raid target happens to trigger a crash due to its need to split the DM bio that is passed to md_handle_request() -- when using raid5 personality, see: raid5.c:chunk_aligned_read(). Fix this by initializing the DM cloned bio's ->bi_blkg, using bio_associate_blkg, before passing the bio to md_handle_request(). 
Fixes: ca522482e3eaf ("dm: pass NULL bdev to bio_alloc_clone") Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index a438efc70e8772..4ada694296289e 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3334,6 +3334,18 @@ static int raid_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) return DM_MAPIO_REQUEUE; + /* + * FIXME: must call bio_associate_blkg() to init bio->bi_blkg; otherwise + * raid5.c:chunk_aligned_read() will crash in submit_bio_noacct() + * (when blk_throtl_bio() dereferences a NULL blkg_to_tg(bio->bi_blkg) + * because bio->bi_blkg is NULL). + * + * This is because raid5.c:chunk_aligned_read() uses @bio to recurse, + * due to bio splitting before issuing any IO. Long-term fix would be + * to refactor md_handle_request() callers to perform bio splitting. + */ + bio_associate_blkg(bio); + md_handle_request(mddev, bio); return DM_MAPIO_SUBMITTED; From 2b85d0d98f18f4e576d948a9891b59a90f89ceb5 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Sun, 17 Jul 2022 12:35:24 +0200 Subject: [PATCH 0731/1250] random: handle archrandom with multiple longs The archrandom interface was originally designed for x86, which supplies RDRAND/RDSEED for receiving random words into registers, resulting in one function to generate an int and another to generate a long. However, other architectures don't follow this. On arm64, the SMCCC TRNG interface can return between one and three longs. On s390, the CPACF TRNG interface can return arbitrary amounts, with four longs having the same cost as one. On UML, the os_getrandom() interface can return arbitrary amounts. So change the api signature to take a "max_longs" parameter designating the maximum number of longs requested, and then return the number of longs generated. 
Since callers need to check this return value and loop anyway, each arch implementation does not bother implementing its own loop to try again to fill the maximum number of longs. Additionally, all existing callers pass in a constant max_longs parameter. Taken together, these two things mean that the codegen doesn't really change much for one-word-at-a-time platforms, while performance is greatly improved on platforms such as s390. Cc: Will Deacon Cc: Alexander Gordeev Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Borislav Petkov Cc: Heiko Carstens Cc: Johannes Berg Cc: Harald Freudenberger Acked-by: Catalin Marinas Acked-by: Mark Rutland Acked-by: Michael Ellerman Signed-off-by: Jason A. Donenfeld --- arch/arm64/include/asm/archrandom.h | 102 ++++++++++++-------------- arch/arm64/kernel/kaslr.c | 2 +- arch/powerpc/include/asm/archrandom.h | 30 ++------ arch/powerpc/kvm/book3s_hv.c | 2 +- arch/s390/include/asm/archrandom.h | 29 ++------ arch/um/include/asm/archrandom.h | 21 ++---- arch/x86/include/asm/archrandom.h | 41 +---------- arch/x86/kernel/espfix_64.c | 2 +- drivers/char/random.c | 45 ++++++++---- include/asm-generic/archrandom.h | 18 +---- include/linux/random.h | 12 +-- 11 files changed, 116 insertions(+), 188 deletions(-) diff --git a/arch/arm64/include/asm/archrandom.h b/arch/arm64/include/asm/archrandom.h index c3b9fa56af67ed..109e2a4454be3e 100644 --- a/arch/arm64/include/asm/archrandom.h +++ b/arch/arm64/include/asm/archrandom.h @@ -58,7 +58,7 @@ static inline bool __arm64_rndrrs(unsigned long *v) return ok; } -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { /* * Only support the generic interface after we have detected @@ -66,27 +66,15 @@ static inline bool __must_check arch_get_random_long(unsigned long *v) * cpufeature code and with potential scheduling between CPUs * with and without the feature. 
*/ - if (cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndr(v)) - return true; - return false; + if (max_longs && cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndr(v)) + return 1; + return 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - if (cpus_have_const_cap(ARM64_HAS_RNG)) { - unsigned long val; - - if (__arm64_rndr(&val)) { - *v = val; - return true; - } - } - return false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - struct arm_smccc_res res; + if (!max_longs) + return 0; /* * We prefer the SMCCC call, since its semantics (return actual @@ -95,10 +83,23 @@ static inline bool __must_check arch_get_random_seed_long(unsigned long *v) * (the output of a pseudo RNG freshly seeded by a TRNG). */ if (smccc_trng_available) { - arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res); + struct arm_smccc_res res; + + max_longs = min_t(size_t, 3, max_longs); + arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, max_longs * 64, &res); if ((int)res.a0 >= 0) { - *v = res.a3; - return true; + switch (max_longs) { + case 3: + *v++ = res.a1; + fallthrough; + case 2: + *v++ = res.a2; + fallthrough; + case 1: + *v++ = res.a3; + break; + } + return max_longs; } } @@ -108,32 +109,9 @@ static inline bool __must_check arch_get_random_seed_long(unsigned long *v) * enough to implement this API if no other entropy source exists. 
*/ if (cpus_have_const_cap(ARM64_HAS_RNG) && __arm64_rndrrs(v)) - return true; + return 1; - return false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) -{ - struct arm_smccc_res res; - unsigned long val; - - if (smccc_trng_available) { - arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 32, &res); - if ((int)res.a0 >= 0) { - *v = res.a3 & GENMASK(31, 0); - return true; - } - } - - if (cpus_have_const_cap(ARM64_HAS_RNG)) { - if (__arm64_rndrrs(&val)) { - *v = val; - return true; - } - } - - return false; + return 0; } static inline bool __init __early_cpu_has_rndr(void) @@ -143,26 +121,40 @@ static inline bool __init __early_cpu_has_rndr(void) return (ftr >> ID_AA64ISAR0_EL1_RNDR_SHIFT) & 0xf; } -static inline bool __init __must_check -arch_get_random_seed_long_early(unsigned long *v) +static inline size_t __init __must_check +arch_get_random_seed_longs_early(unsigned long *v, size_t max_longs) { WARN_ON(system_state != SYSTEM_BOOTING); + if (!max_longs) + return 0; + if (smccc_trng_available) { struct arm_smccc_res res; - arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, 64, &res); + max_longs = min_t(size_t, 3, max_longs); + arm_smccc_1_1_invoke(ARM_SMCCC_TRNG_RND64, max_longs * 64, &res); if ((int)res.a0 >= 0) { - *v = res.a3; - return true; + switch (max_longs) { + case 3: + *v++ = res.a1; + fallthrough; + case 2: + *v++ = res.a2; + fallthrough; + case 1: + *v++ = res.a3; + break; + } + return max_longs; } } if (__early_cpu_has_rndr() && __arm64_rndr(v)) - return true; + return 1; - return false; + return 0; } -#define arch_get_random_seed_long_early arch_get_random_seed_long_early +#define arch_get_random_seed_longs_early arch_get_random_seed_longs_early #endif /* _ASM_ARCHRANDOM_H */ diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 418b2bba1521b4..c5d541f358d324 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -106,7 +106,7 @@ u64 __init kaslr_early_init(void) * and supported. 
*/ - if (arch_get_random_seed_long_early(&raw)) + if (arch_get_random_seed_longs_early(&raw, 1)) seed ^= raw; if (!seed) { diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index 25ba65df6b1ae6..0e365c5b23969d 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -4,34 +4,16 @@ #include -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return false; + return 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - if (ppc_md.get_random_seed) - return ppc_md.get_random_seed(v); - - return false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) -{ - unsigned long val; - bool rc; - - rc = arch_get_random_seed_long(&val); - if (rc) - *v = val; - - return rc; + if (max_longs && ppc_md.get_random_seed && ppc_md.get_random_seed(v)) + return 1; + return 0; } #ifdef CONFIG_PPC_POWERNV diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e08fb3124dcaa0..631062cde6b425 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1207,7 +1207,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; #endif case H_RANDOM: - if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4])) + if (!arch_get_random_seed_longs(&vcpu->arch.regs.gpr[4], 1)) ret = H_HARDWARE; break; case H_RPT_INVALIDATE: diff --git a/arch/s390/include/asm/archrandom.h b/arch/s390/include/asm/archrandom.h index 0a1c2e66c70938..cf5e000df0a140 100644 --- a/arch/s390/include/asm/archrandom.h +++ b/arch/s390/include/asm/archrandom.h @@ -18,34 +18,19 @@ DECLARE_STATIC_KEY_FALSE(s390_arch_random_available); extern 
atomic64_t s390_arch_random_counter; -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return false; + return 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) -{ - return false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - if (static_branch_likely(&s390_arch_random_available)) { - cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v)); - atomic64_add(sizeof(*v), &s390_arch_random_counter); - return true; - } - return false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { if (static_branch_likely(&s390_arch_random_available)) { - cpacf_trng(NULL, 0, (u8 *)v, sizeof(*v)); - atomic64_add(sizeof(*v), &s390_arch_random_counter); - return true; + cpacf_trng(NULL, 0, (u8 *)v, max_longs * sizeof(*v)); + atomic64_add(max_longs * sizeof(*v), &s390_arch_random_counter); + return max_longs; } - return false; + return 0; } #endif /* _ASM_S390_ARCHRANDOM_H */ diff --git a/arch/um/include/asm/archrandom.h b/arch/um/include/asm/archrandom.h index 2f24cb96391d7f..24e16c979c51f9 100644 --- a/arch/um/include/asm/archrandom.h +++ b/arch/um/include/asm/archrandom.h @@ -7,24 +7,19 @@ /* This is from , but better not to #include that in a global header here. 
*/ ssize_t os_getrandom(void *buf, size_t len, unsigned int flags); -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return os_getrandom(v, sizeof(*v), 0) == sizeof(*v); -} + ssize_t ret; -static inline bool __must_check arch_get_random_int(unsigned int *v) -{ - return os_getrandom(v, sizeof(*v), 0) == sizeof(*v); -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - return false; + ret = os_getrandom(v, max_longs * sizeof(*v), 0); + if (ret < 0) + return 0; + return ret / sizeof(*v); } -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return false; + return 0; } #endif diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index fb235b6961753c..02bae8e0758b2b 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -31,20 +31,6 @@ static inline bool __must_check rdrand_long(unsigned long *v) return false; } -static inline bool __must_check rdrand_int(unsigned int *v) -{ - bool ok; - unsigned int retry = RDRAND_RETRY_LOOPS; - do { - asm volatile("rdrand %[out]" - CC_SET(c) - : CC_OUT(c) (ok), [out] "=r" (*v)); - if (ok) - return true; - } while (--retry); - return false; -} - static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; @@ -54,38 +40,19 @@ static inline bool __must_check rdseed_long(unsigned long *v) return ok; } -static inline bool __must_check rdseed_int(unsigned int *v) -{ - bool ok; - asm volatile("rdseed %[out]" - CC_SET(c) - : CC_OUT(c) (ok), [out] "=r" (*v)); - return ok; -} - /* * These are the generic interfaces; they must not be declared if the * stubs in are to be invoked. 
*/ -static inline bool __must_check arch_get_random_long(unsigned long *v) -{ - return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_long(v) : false; -} - -static inline bool __must_check arch_get_random_int(unsigned int *v) -{ - return static_cpu_has(X86_FEATURE_RDRAND) ? rdrand_int(v) : false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_long(v) : false; + return max_longs && static_cpu_has(X86_FEATURE_RDRAND) && rdrand_long(v) ? 1 : 0; } -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return static_cpu_has(X86_FEATURE_RDSEED) ? rdseed_int(v) : false; + return max_longs && static_cpu_has(X86_FEATURE_RDSEED) && rdseed_long(v) ? 1 : 0; } #ifndef CONFIG_UML diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 4fe7af58cfe1aa..9417d5aa730575 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -100,7 +100,7 @@ static void init_espfix_random(void) * This is run before the entropy pools are initialized, * but this is hopefully better than nothing. 
*/ - if (!arch_get_random_long(&rand)) { + if (!arch_get_random_longs(&rand, 1)) { /* The constant is an arbitrary large prime */ rand = rdtsc(); rand *= 0xc345c6b72fd16123UL; diff --git a/drivers/char/random.c b/drivers/char/random.c index 0c6568ae5f68b4..7bf11fa66265dc 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -596,12 +596,20 @@ static void extract_entropy(void *buf, size_t len) unsigned long rdseed[32 / sizeof(long)]; size_t counter; } block; - size_t i; + size_t i, longs; - for (i = 0; i < ARRAY_SIZE(block.rdseed); ++i) { - if (!arch_get_random_seed_long(&block.rdseed[i]) && - !arch_get_random_long(&block.rdseed[i])) - block.rdseed[i] = random_get_entropy(); + for (i = 0; i < ARRAY_SIZE(block.rdseed);) { + longs = arch_get_random_seed_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i); + if (longs) { + i += longs; + continue; + } + longs = arch_get_random_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i); + if (longs) { + i += longs; + continue; + } + block.rdseed[i++] = random_get_entropy(); } spin_lock_irqsave(&input_pool.lock, flags); @@ -776,22 +784,31 @@ static struct notifier_block pm_notifier = { .notifier_call = random_pm_notifica int __init random_init(const char *command_line) { ktime_t now = ktime_get_real(); - unsigned int i, arch_bits; - unsigned long entropy; + size_t i, longs, arch_bits; + unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)]; #if defined(LATENT_ENTROPY_PLUGIN) static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy; _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed)); #endif - for (i = 0, arch_bits = BLAKE2S_BLOCK_SIZE * 8; - i < BLAKE2S_BLOCK_SIZE; i += sizeof(entropy)) { - if (!arch_get_random_seed_long_early(&entropy) && - !arch_get_random_long_early(&entropy)) { - entropy = random_get_entropy(); - arch_bits -= sizeof(entropy) * 8; + for (i = 0, arch_bits = sizeof(entropy) * 8; i < ARRAY_SIZE(entropy);) { + longs = arch_get_random_seed_longs(entropy, 
ARRAY_SIZE(entropy) - i); + if (longs) { + _mix_pool_bytes(entropy, sizeof(*entropy) * longs); + i += longs; + continue; } - _mix_pool_bytes(&entropy, sizeof(entropy)); + longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i); + if (longs) { + _mix_pool_bytes(entropy, sizeof(*entropy) * longs); + i += longs; + continue; + } + entropy[0] = random_get_entropy(); + _mix_pool_bytes(entropy, sizeof(*entropy)); + arch_bits -= sizeof(*entropy) * 8; + ++i; } _mix_pool_bytes(&now, sizeof(now)); _mix_pool_bytes(utsname(), sizeof(*(utsname()))); diff --git a/include/asm-generic/archrandom.h b/include/asm-generic/archrandom.h index 3a5ee202dd86d4..3cd7f980cfdca7 100644 --- a/include/asm-generic/archrandom.h +++ b/include/asm-generic/archrandom.h @@ -2,24 +2,14 @@ #ifndef __ASM_GENERIC_ARCHRANDOM_H__ #define __ASM_GENERIC_ARCHRANDOM_H__ -static inline bool __must_check arch_get_random_long(unsigned long *v) +static inline size_t __must_check arch_get_random_longs(unsigned long *v, size_t max_longs) { - return false; + return 0; } -static inline bool __must_check arch_get_random_int(unsigned int *v) +static inline size_t __must_check arch_get_random_seed_longs(unsigned long *v, size_t max_longs) { - return false; -} - -static inline bool __must_check arch_get_random_seed_long(unsigned long *v) -{ - return false; -} - -static inline bool __must_check arch_get_random_seed_int(unsigned int *v) -{ - return false; + return 0; } #endif diff --git a/include/linux/random.h b/include/linux/random.h index 865770e29f3efb..3fec206487f666 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -112,19 +112,19 @@ declare_get_random_var_wait(long, unsigned long) * Called from the boot CPU during startup; not valid to call once * secondary CPUs are up and preemption is possible. 
*/ -#ifndef arch_get_random_seed_long_early -static inline bool __init arch_get_random_seed_long_early(unsigned long *v) +#ifndef arch_get_random_seed_longs_early +static inline size_t __init arch_get_random_seed_longs_early(unsigned long *v, size_t max_longs) { WARN_ON(system_state != SYSTEM_BOOTING); - return arch_get_random_seed_long(v); + return arch_get_random_seed_longs(v, max_longs); } #endif -#ifndef arch_get_random_long_early -static inline bool __init arch_get_random_long_early(unsigned long *v) +#ifndef arch_get_random_longs_early +static inline bool __init arch_get_random_longs_early(unsigned long *v, size_t max_longs) { WARN_ON(system_state != SYSTEM_BOOTING); - return arch_get_random_long(v); + return arch_get_random_longs(v, max_longs); } #endif From cc983da76a90ae484de551309dae96b1992777b6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 12 Jul 2022 21:05:42 +0800 Subject: [PATCH 0732/1250] mm/hugetlb: avoid corrupting page->mapping in hugetlb_mcopy_atomic_pte In MCOPY_ATOMIC_CONTINUE case with a non-shared VMA, pages in the page cache are installed in the ptes. But hugepage_add_new_anon_rmap is called for them mistakenly because they're not vm_shared. This will corrupt the page->mapping used by page cache code. 
Link: https://lkml.kernel.org/r/20220712130542.18836-1-linmiaohe@huawei.com Fixes: f619147104c8 ("userfaultfd: add UFFDIO_CONTINUE ioctl") Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Axel Rasmussen Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a18c071c294e35..3a16ae3115f0ae 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6015,7 +6015,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (vm_shared) { + if (page_in_pagecache) { page_dup_file_rmap(page, true); } else { ClearHPageRestoreReserve(page); From 8e86a047da7d38a519ae54c5684b394d57b2e688 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 11 Jul 2022 09:59:06 -0700 Subject: [PATCH 0733/1250] userfaultfd: provide properly masked address for huge-pages Commit 824ddc601adc ("userfaultfd: provide unmasked address on page-fault") was introduced to fix an old bug, in which the offset in the address of a page-fault was masked. Concerns were raised - although were never backed by actual code - that some userspace code might break because the bug has been around for quite a while. To address these concerns a new flag was introduced, and only when this flag is set by the user, userfaultfd provides the exact address of the page-fault. The commit however had a bug, and if the flag is unset, the offset was always masked based on a base-page granularity. Yet, for huge-pages, the behavior prior to the commit was that the address is masked to the huge-page granularity. While there are no reports on real breakage, fix this issue. If the flag is unset, use the address with the masking that was done before.
Link: https://lkml.kernel.org/r/20220711165906.2682-1-namit@vmware.com Fixes: 824ddc601adc ("userfaultfd: provide unmasked address on page-fault") Signed-off-by: Nadav Amit Reported-by: James Houghton Reviewed-by: Mike Rapoport Reviewed-by: Peter Xu Reviewed-by: James Houghton Cc: David Hildenbrand Cc: Jan Kara Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e943370107d06a..de86f5b2859f94 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -192,17 +192,19 @@ static inline void msg_init(struct uffd_msg *msg) } static inline struct uffd_msg userfault_msg(unsigned long address, + unsigned long real_address, unsigned int flags, unsigned long reason, unsigned int features) { struct uffd_msg msg; + msg_init(&msg); msg.event = UFFD_EVENT_PAGEFAULT; - if (!(features & UFFD_FEATURE_EXACT_ADDRESS)) - address &= PAGE_MASK; - msg.arg.pagefault.address = address; + msg.arg.pagefault.address = (features & UFFD_FEATURE_EXACT_ADDRESS) ? + real_address : address; + /* * These flags indicate why the userfault occurred: * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault. @@ -488,8 +490,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason, - ctx->features); + uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, + reason, ctx->features); uwq.ctx = ctx; uwq.waken = false; From b109319f5c2b44a07bbfe6e8b9144fa1ddc3408e Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 19 Jul 2022 23:42:46 +0800 Subject: [PATCH 0734/1250] mailmap: update Gao Xiang's email addresses I've been in Alibaba Cloud for more than one year, mainly to address cloud-native challenges (such as high-performance container images) for open source communities. 
Update my email addresses on behalf of my current employer (Alibaba Cloud) to support all my (team) work in this area. Also add an outdated @redhat.com address of me. Link: https://lkml.kernel.org/r/20220719154246.62970-1-xiang@kernel.org Signed-off-by: Gao Xiang Signed-off-by: Andrew Morton --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 56ce99212f3377..d4fe66a8032444 100644 --- a/.mailmap +++ b/.mailmap @@ -132,6 +132,8 @@ Frank Rowand Frank Zago Gao Xiang Gao Xiang +Gao Xiang +Gao Xiang Gerald Schaefer Gerald Schaefer Gerald Schaefer From cef252ca974a944db481ec9bfaed1dff672f30f5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 20 Jul 2022 23:47:55 +0900 Subject: [PATCH 0735/1250] mm: shrinkers: fix double kfree on shrinker name syzbot is reporting double kfree() at free_prealloced_shrinker() [1], for destroy_unused_super() calls free_prealloced_shrinker() even if prealloc_shrinker() returned an error. Explicitly clear shrinker name when prealloc_shrinker() called kfree(). Link: https://syzkaller.appspot.com/bug?extid=8b481578352d4637f510 [1] Link: https://lkml.kernel.org/r/ffa62ece-6a42-2644-16cf-0d33ef32c676@I-love.SAKURA.ne.jp Fixes: e33c267ab70de424 ("mm: shrinkers: provide shrinkers with names") Reported-by: syzbot Signed-off-by: Tetsuo Handa Acked-by: Roman Gushchin Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index f58761cea0a060..f8d97b905f210f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -704,8 +704,10 @@ int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
return -ENOMEM; err = __register_shrinker(shrinker); - if (err) + if (err) { kfree_const(shrinker->name); + shrinker->name = NULL; + } return err; } #else From 90b5a54c4ccb33799f1cd4d3de37761d55646ea4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 20 Jul 2022 09:29:45 -0700 Subject: [PATCH 0736/1250] mm-shrinkers-fix-double-kfree-on-shrinker-name-fix zero shrinker->name in all cases where shrinker->name is freed Link: https://lkml.kernel.org/r/YtgteTnQTgyuKUSY@castle Cc: Tetsuo Handa Signed-off-by: Andrew Morton --- mm/shrinker_debug.c | 1 + mm/vmscan.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index e5b40c43221d07..b05295bab32226 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -251,6 +251,7 @@ void shrinker_debugfs_remove(struct shrinker *shrinker) lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); + shrinker->name = NULL; if (!shrinker->debugfs_entry) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index f8d97b905f210f..fbb4108250ee4e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -644,8 +644,10 @@ int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) return -ENOMEM; err = __prealloc_shrinker(shrinker); - if (err) + if (err) { kfree_const(shrinker->name); + shrinker->name = NULL; + } return err; } @@ -660,6 +662,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker) { #ifdef CONFIG_SHRINKER_DEBUG kfree_const(shrinker->name); + shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); From 1fe3e87f07cd10053ddc95516bc0e87c6244ab1d Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jun 2022 17:22:28 +0800 Subject: [PATCH 0737/1250] mm: hugetlb_vmemmap: delete hugetlb_optimize_vmemmap_enabled() Patch series "Simplify hugetlb vmemmap and improve its readability", v2. This series aims to simplify hugetlb vmemmap and improve its readability. 
This patch (of 8): The name hugetlb_optimize_vmemmap_enabled() is a bit confusing as it tests two conditions (enabled and pages in use). Instead of coming up with an appropriate name, we could just delete it. There is already a discussion about deleting it in thread [1]. There is only one user of hugetlb_optimize_vmemmap_enabled() outside of hugetlb_vmemmap, that is flush_dcache_page() in arch/arm64/mm/flush.c. However, it does not need to call hugetlb_optimize_vmemmap_enabled() in flush_dcache_page() since HugeTLB pages are always fully mapped and only head page will be set PG_dcache_clean meaning only head page's flag may need to be cleared (see commit cf5a501d985b). So it is easy to remove hugetlb_optimize_vmemmap_enabled(). Link: https://lore.kernel.org/all/c77c61c8-8a5a-87e8-db89-d04d8aaab4cc@oracle.com/ [1] Link: https://lkml.kernel.org/r/20220628092235.91270-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Oscar Salvador Reviewed-by: Mike Kravetz Reviewed-by: Catalin Marinas Cc: Will Deacon Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- arch/arm64/mm/flush.c | 13 +++---------- include/linux/page-flags.h | 14 ++------------ 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index fc4f710e9820f1..5f9379b3c8c877 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -76,17 +76,10 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); void flush_dcache_page(struct page *page) { /* - * Only the head page's flags of HugeTLB can be cleared since the tail - * vmemmap pages associated with each HugeTLB page are mapped with - * read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more - * details can refer to vmemmap_remap_pte()).
Although - * __sync_icache_dcache() only set PG_dcache_clean flag on the head - * page struct, there is more than one page struct with PG_dcache_clean - * associated with the HugeTLB page since the head vmemmap page frame - * is reused (more details can refer to the comments above - * page_fixed_fake_head()). + * HugeTLB pages are always fully mapped and only head page will be + * set PG_dcache_clean (see comments in __sync_icache_dcache()). */ - if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page)) + if (PageHuge(page)) page = compound_head(page); if (test_bit(PG_dcache_clean, &page->flags)) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f2ff65f1bf8382..3702f60427d6a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -208,12 +208,6 @@ enum pageflags { DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, hugetlb_optimize_vmemmap_key); -static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) -{ - return static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, - &hugetlb_optimize_vmemmap_key); -} - /* * If the feature of optimizing vmemmap pages associated with each HugeTLB * page is enabled, the head vmemmap page frame is reused and all of the tail @@ -232,7 +226,8 @@ static __always_inline bool hugetlb_optimize_vmemmap_enabled(void) */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!hugetlb_optimize_vmemmap_enabled()) + if (!static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, + &hugetlb_optimize_vmemmap_key)) return page; /* @@ -260,11 +255,6 @@ static inline const struct page *page_fixed_fake_head(const struct page *page) { return page; } - -static inline bool hugetlb_optimize_vmemmap_enabled(void) -{ - return false; -} #endif static __always_inline int page_is_fake_head(struct page *page) From 078c4621934ff3c1fc8c989a894d0ed217d6f256 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jun 2022 
17:22:29 +0800 Subject: [PATCH 0738/1250] mm: hugetlb_vmemmap: optimize vmemmap_optimize_mode handling We hold another reference to hugetlb_optimize_vmemmap_key when turning vmemmap_optimize_mode on, because we use static_key to tell memory_hotplug that memory_hotplug.memmap_on_memory should be overridden. However, this rule went away when we introduced PageVmemmapSelfHosted. Therefore, we could simplify vmemmap_optimize_mode handling by not holding another reference to hugetlb_optimize_vmemmap_key. This also means that we do not incur the extra page_fixed_fake_head checks if there are no vmemmap optimized hugetlb pages after this change. Link: https://lkml.kernel.org/r/20220628092235.91270-3-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Oscar Salvador Reviewed-by: Mike Kravetz Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++-- mm/hugetlb_vmemmap.c | 65 ++++---------------------------------- 2 files changed, 9 insertions(+), 62 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3702f60427d6a5..7477e21bb85e8c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -205,8 +205,7 @@ enum pageflags { #ifndef __GENERATING_BOUNDS_H #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, - hugetlb_optimize_vmemmap_key); +DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* * If the feature of optimizing vmemmap pages associated with each HugeTLB @@ -226,8 +225,7 @@ DECLARE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { - if (!static_branch_maybe(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, - &hugetlb_optimize_vmemmap_key)) + if
(!static_branch_unlikely(&hugetlb_optimize_vmemmap_key)) return page; /* diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 1362feb3c6c986..e5b83a25c2fa8a 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -23,42 +23,15 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) -enum vmemmap_optimize_mode { - VMEMMAP_OPTIMIZE_OFF, - VMEMMAP_OPTIMIZE_ON, -}; - -DEFINE_STATIC_KEY_MAYBE(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON, - hugetlb_optimize_vmemmap_key); +DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); -static enum vmemmap_optimize_mode vmemmap_optimize_mode = +static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); -static void vmemmap_optimize_mode_switch(enum vmemmap_optimize_mode to) -{ - if (vmemmap_optimize_mode == to) - return; - - if (to == VMEMMAP_OPTIMIZE_OFF) - static_branch_dec(&hugetlb_optimize_vmemmap_key); - else - static_branch_inc(&hugetlb_optimize_vmemmap_key); - WRITE_ONCE(vmemmap_optimize_mode, to); -} - static int __init hugetlb_vmemmap_early_param(char *buf) { - bool enable; - enum vmemmap_optimize_mode mode; - - if (kstrtobool(buf, &enable)) - return -EINVAL; - - mode = enable ? 
VMEMMAP_OPTIMIZE_ON : VMEMMAP_OPTIMIZE_OFF; - vmemmap_optimize_mode_switch(mode); - - return 0; + return kstrtobool(buf, &vmemmap_optimize_enabled); } early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); @@ -100,7 +73,7 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) static unsigned int vmemmap_optimizable_pages(struct hstate *h, struct page *head) { - if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF) + if (!READ_ONCE(vmemmap_optimize_enabled)) return 0; if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { @@ -191,7 +164,6 @@ void __init hugetlb_vmemmap_init(struct hstate *h) if (!is_power_of_2(sizeof(struct page))) { pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); - static_branch_disable(&hugetlb_optimize_vmemmap_key); return; } @@ -212,36 +184,13 @@ void __init hugetlb_vmemmap_init(struct hstate *h) } #ifdef CONFIG_PROC_SYSCTL -static int hugetlb_optimize_vmemmap_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, - loff_t *ppos) -{ - int ret; - enum vmemmap_optimize_mode mode; - static DEFINE_MUTEX(sysctl_mutex); - - if (write && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&sysctl_mutex); - mode = vmemmap_optimize_mode; - table->data = &mode; - ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (write && !ret) - vmemmap_optimize_mode_switch(mode); - mutex_unlock(&sysctl_mutex); - - return ret; -} - static struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", - .maxlen = sizeof(enum vmemmap_optimize_mode), + .data = &vmemmap_optimize_enabled, + .maxlen = sizeof(int), .mode = 0644, - .proc_handler = hugetlb_optimize_vmemmap_handler, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, + .proc_handler = proc_dobool, }, { } }; From e45911332f73358aed240fbbcf93d26ce5a075c3 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jun 2022 17:22:30 +0800 Subject: [PATCH 0739/1250] mm: hugetlb_vmemmap: 
introduce the name HVO It is inconvenient to mention the feature of optimizing vmemmap pages associated with HugeTLB pages when communicating with others since there is no specific or abbreviated name for it when it is first introduced. Let us give it the name HVO (HugeTLB Vmemmap Optimization) from now on. This commit also updates the document about "hugetlb_free_vmemmap" by the way discussed in thread [1]. Link: https://lore.kernel.org/all/21aae898-d54d-cc4b-a11f-1bb7fddcfffa@redhat.com/ [1] Link: https://lkml.kernel.org/r/20220628092235.91270-4-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Oscar Salvador Reviewed-by: Mike Kravetz Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 7 ++++--- Documentation/admin-guide/mm/hugetlbpage.rst | 4 ++-- Documentation/admin-guide/mm/memory-hotplug.rst | 4 ++-- Documentation/admin-guide/sysctl/vm.rst | 3 +-- Documentation/mm/vmemmap_dedup.rst | 2 ++ fs/Kconfig | 12 +++++------- include/linux/page-flags.h | 3 +-- mm/hugetlb_vmemmap.c | 8 ++++---- mm/hugetlb_vmemmap.h | 4 ++-- 9 files changed, 23 insertions(+), 24 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2cacd4f8deb75f..764577db97150a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1712,12 +1712,13 @@ hugetlb_free_vmemmap= [KNL] Reguires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP enabled. + Control if HugeTLB Vmemmap Optimization (HVO) is enabled. Allows heavy hugetlb users to free up some more memory (7 * PAGE_SIZE for each 2MB hugetlb page).
- Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) } + Format: { on | off (default) } - [oO][Nn]/Y/y/1: enable the feature - [oO][Ff]/N/n/0: disable the feature + on: enable HVO + off: disable HVO Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y, the default is on. diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst b/Documentation/admin-guide/mm/hugetlbpage.rst index a90330d0a83737..8e2727dc18d4d3 100644 --- a/Documentation/admin-guide/mm/hugetlbpage.rst +++ b/Documentation/admin-guide/mm/hugetlbpage.rst @@ -164,8 +164,8 @@ default_hugepagesz will all result in 256 2M huge pages being allocated. Valid default huge page size is architecture dependent. hugetlb_free_vmemmap - When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing - unused vmemmap pages associated with each HugeTLB page. + When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables HugeTLB + Vmemmap Optimization (HVO). When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages`` indicates the current number of pre-allocated huge pages of the default size. diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 0f56ecd8ac0543..a3c9e8ad8fa0d8 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -653,8 +653,8 @@ block might fail: - Concurrent activity that operates on the same physical memory area, such as allocating gigantic pages, can result in temporary offlining failures. -- Out of memory when dissolving huge pages, especially when freeing unused - vmemmap pages associated with each hugetlb page is enabled. +- Out of memory when dissolving huge pages, especially when HugeTLB Vmemmap + Optimization (HVO) is enabled. 
Offlining code may be able to migrate huge page contents, but may not be able to dissolve the source huge page because it fails allocating (unmovable) pages diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index f74f722ad7028d..9b833e439f0975 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -569,8 +569,7 @@ This knob is not available when the size of 'struct page' (a structure defined in include/linux/mm_types.h) is not power of two (an unusual system config could result in this). -Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages -associated with each HugeTLB page. +Enable (set to 1) or disable (set to 0) HugeTLB Vmemmap Optimization (HVO). Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index c9c495f62d123b..7d7a161aa36467 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -7,6 +7,8 @@ A vmemmap diet for HugeTLB and Device DAX HugeTLB ======= +This section is to explain how HugeTLB Vmemmap Optimization (HVO) works. + The struct page structures (page structs) are used to describe a physical page frame. By default, there is a one-to-one mapping from a page frame to it's corresponding page struct. diff --git a/fs/Kconfig b/fs/Kconfig index 5976eb33535ff0..a547307c1ae824 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -247,8 +247,7 @@ config HUGETLB_PAGE # # Select this config option from the architecture Kconfig, if it is preferred -# to enable the feature of minimizing overhead of struct page associated with -# each HugeTLB page. +# to enable the feature of HugeTLB Vmemmap Optimization (HVO). 
# config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP bool @@ -259,14 +258,13 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON - bool "Default optimizing vmemmap pages of HugeTLB to on" + bool "HugeTLB Vmemmap Optimization (HVO) defaults to on" default n depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap - pages associated with each HugeTLB page is default off. Say Y here - to enable optimizing vmemmap pages of HugeTLB by default. It can then - be disabled on the command line via hugetlb_free_vmemmap=off. + The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to + enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off + (boot command line) or hugetlb_optimize_vmemmap (sysctl). config MEMFD_CREATE def_bool TMPFS || HUGETLBFS diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 7477e21bb85e8c..a2ada8c75d5e85 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -208,8 +208,7 @@ enum pageflags { DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* - * If the feature of optimizing vmemmap pages associated with each HugeTLB - * page is enabled, the head vmemmap page frame is reused and all of the tail + * If HVO is enabled, the head vmemmap page frame is reused and all of the tail * vmemmap addresses map to the head vmemmap page frame (furture details can * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other * words, there are more than one page struct with PG_head associated with each diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index e5b83a25c2fa8a..bcafd9d7639cf2 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Optimize vmemmap pages associated with HugeTLB + * HugeTLB Vmemmap Optimization (HVO) * - * Copyright (c) 2020, Bytedance. All rights reserved. 
+ * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song * @@ -156,8 +156,8 @@ void __init hugetlb_vmemmap_init(struct hstate *h) /* * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP, - * so add a BUILD_BUG_ON to catch invalid usage of the tail struct page. + * page structs that can be used when HVO is enabled, add a BUILD_BUG_ON + * to catch invalid usage of the tail page structs. */ BUILD_BUG_ON(__NR_USED_SUBPAGE >= RESERVE_VMEMMAP_SIZE / sizeof(struct page)); diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 109b0a53b6fe9b..ba66fadad9fcac 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Optimize vmemmap pages associated with HugeTLB + * HugeTLB Vmemmap Optimization (HVO) * - * Copyright (c) 2020, Bytedance. All rights reserved. + * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song */ From 6f8e100d088069f6da72d7b222790f4dc3735ee5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jun 2022 17:22:31 +0800 Subject: [PATCH 0740/1250] mm: hugetlb_vmemmap: move vmemmap code related to HugeTLB to hugetlb_vmemmap.c When I first introduced vmemmap manipulation functions related to HugeTLB, I thought those functions may be reused by other modules (e.g. using similar approach to optimize vmemmap pages, unfortunately, the DAX used the same approach but does not use those functions). After two years, we didn't see any other users. So move those functions to hugetlb_vmemmap.c. Code movement without any functional change. 
Link: https://lkml.kernel.org/r/20220628092235.91270-5-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Oscar Salvador Reviewed-by: Mike Kravetz Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 - mm/hugetlb_vmemmap.c | 399 ++++++++++++++++++++++++++++++++++++++++++- mm/sparse-vmemmap.c | 399 ------------------------------------------- 3 files changed, 398 insertions(+), 407 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 66beb3387824b8..4265bd5728ff15 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3139,13 +3139,6 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) } #endif -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int vmemmap_remap_free(unsigned long start, unsigned long end, - unsigned long reuse); -int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, gfp_t gfp_mask); -#endif - void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index bcafd9d7639cf2..f68e216600b91c 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -10,9 +10,31 @@ */ #define pr_fmt(fmt) "HugeTLB: " fmt -#include +#include +#include +#include +#include #include "hugetlb_vmemmap.h" +/** + * struct vmemmap_remap_walk - walk vmemmap page table + * + * @remap_pte: called for each lowest-level entry (PTE). + * @nr_walked: the number of walked pte. + * @reuse_page: the page which is reused for the tail vmemmap pages. + * @reuse_addr: the virtual address of the @reuse_page page. + * @vmemmap_pages: the list head of the vmemmap pages that can be freed + * or is mapped from. 
+ */ +struct vmemmap_remap_walk { + void (*remap_pte)(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk); + unsigned long nr_walked; + struct page *reuse_page; + unsigned long reuse_addr; + struct list_head *vmemmap_pages; +}; + /* * There are a lot of struct page structures associated with each HugeTLB page. * For tail pages, the value of compound_head is the same. So we can reuse first @@ -23,6 +45,381 @@ #define RESERVE_VMEMMAP_NR 1U #define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + pmd_t __pmd; + int i; + unsigned long addr = start; + struct page *page = pmd_page(*pmd); + pte_t *pgtable = pte_alloc_one_kernel(&init_mm); + + if (!pgtable) + return -ENOMEM; + + pmd_populate_kernel(&init_mm, &__pmd, pgtable); + + for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) { + pte_t entry, *pte; + pgprot_t pgprot = PAGE_KERNEL; + + entry = mk_pte(page + i, pgprot); + pte = pte_offset_kernel(&__pmd, addr); + set_pte_at(&init_mm, addr, pte, entry); + } + + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* + * Higher order allocations from buddy allocator must be able to + * be treated as indepdenent small pages (as they can be freed + * individually). + */ + if (!PageReserved(page)) + split_page(page, get_order(PMD_SIZE)); + + /* Make pte visible before pmd. See comment in pmd_install(). 
*/ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock); + + return 0; +} + +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + +static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + + /* + * The reuse_page is found 'first' in table walk before we start + * remapping (which is calling @walk->remap_pte). + */ + if (!walk->reuse_page) { + walk->reuse_page = pte_page(*pte); + /* + * Because the reuse address is part of the range that we are + * walking, skip the reuse address range. + */ + addr += PAGE_SIZE; + pte++; + walk->nr_walked++; + } + + for (; addr != end; addr += PAGE_SIZE, pte++) { + walk->remap_pte(pte, addr, walk); + walk->nr_walked++; + } +} + +static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pmd_t *pmd; + unsigned long next; + + pmd = pmd_offset(pud, addr); + do { + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret; + + next = pmd_addr_end(addr, end); + vmemmap_pte_range(pmd, addr, next, walk); + } while (pmd++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + pud_t *pud; + unsigned long next; + + pud = pud_offset(p4d, addr); + do { + int ret; + + next = pud_addr_end(addr, end); + ret = vmemmap_pmd_range(pud, addr, next, walk); + if (ret) + return ret; + } while (pud++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_p4d_range(pgd_t 
*pgd, unsigned long addr, + unsigned long end, + struct vmemmap_remap_walk *walk) +{ + p4d_t *p4d; + unsigned long next; + + p4d = p4d_offset(pgd, addr); + do { + int ret; + + next = p4d_addr_end(addr, end); + ret = vmemmap_pud_range(p4d, addr, next, walk); + if (ret) + return ret; + } while (p4d++, addr = next, addr != end); + + return 0; +} + +static int vmemmap_remap_range(unsigned long start, unsigned long end, + struct vmemmap_remap_walk *walk) +{ + unsigned long addr = start; + unsigned long next; + pgd_t *pgd; + + VM_BUG_ON(!PAGE_ALIGNED(start)); + VM_BUG_ON(!PAGE_ALIGNED(end)); + + pgd = pgd_offset_k(addr); + do { + int ret; + + next = pgd_addr_end(addr, end); + ret = vmemmap_p4d_range(pgd, addr, next, walk); + if (ret) + return ret; + } while (pgd++, addr = next, addr != end); + + /* + * We only change the mapping of the vmemmap virtual address range + * [@start + PAGE_SIZE, end), so we only need to flush the TLB which + * belongs to the range. + */ + flush_tlb_kernel_range(start + PAGE_SIZE, end); + + return 0; +} + +/* + * Free a vmemmap page. A vmemmap page can be allocated from the memblock + * allocator or buddy allocator. If the PG_reserved flag is set, it means + * that it allocated from the memblock allocator, just free it via the + * free_bootmem_page(). Otherwise, use __free_page(). + */ +static inline void free_vmemmap_page(struct page *page) +{ + if (PageReserved(page)) + free_bootmem_page(page); + else + __free_page(page); +} + +/* Free a list of the vmemmap pages */ +static void free_vmemmap_page_list(struct list_head *list) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + list_del(&page->lru); + free_vmemmap_page(page); + } +} + +static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + /* + * Remap the tail pages as read-only to catch illegal write operation + * to the tail pages. 
+ */ + pgprot_t pgprot = PAGE_KERNEL_RO; + pte_t entry = mk_pte(walk->reuse_page, pgprot); + struct page *page = pte_page(*pte); + + list_add_tail(&page->lru, walk->vmemmap_pages); + set_pte_at(&init_mm, addr, pte, entry); +} + +/* + * How many struct page structs need to be reset. When we reuse the head + * struct page, the special metadata (e.g. page->flags or page->mapping) + * cannot copy to the tail struct page structs. The invalid value will be + * checked in the free_tail_pages_check(). In order to avoid the message + * of "corrupted mapping in tail page". We need to reset at least 3 (one + * head struct page struct and two tail struct page structs) struct page + * structs. + */ +#define NR_RESET_STRUCT_PAGE 3 + +static inline void reset_struct_pages(struct page *start) +{ + int i; + struct page *from = start + NR_RESET_STRUCT_PAGE; + + for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) + memcpy(start + i, from, sizeof(*from)); +} + +static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, + struct vmemmap_remap_walk *walk) +{ + pgprot_t pgprot = PAGE_KERNEL; + struct page *page; + void *to; + + BUG_ON(pte_page(*pte) != walk->reuse_page); + + page = list_first_entry(walk->vmemmap_pages, struct page, lru); + list_del(&page->lru); + to = page_to_virt(page); + copy_page(to, (void *)walk->reuse_addr); + reset_struct_pages(to); + + set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); +} + +/** + * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) + * to the page which @reuse is mapped to, then free vmemmap + * which the range are mapped to. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. 
+ */ +static int vmemmap_remap_free(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_remap_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* + * In order to make remapping routine most efficient for the huge pages, + * the routine of vmemmap page table walking has the following rules + * (see more details from the vmemmap_pte_range()): + * + * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) + * should be continuous. + * - The @reuse address is part of the range [@reuse, @end) that we are + * walking which is passed to vmemmap_remap_range(). + * - The @reuse address is the first in the complete range. + * + * So we need to make sure that @start and @reuse meet the above rules. + */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_read_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + if (ret && walk.nr_walked) { + end = reuse + walk.nr_walked * PAGE_SIZE; + /* + * vmemmap_pages contains pages from the previous + * vmemmap_remap_range call which failed. These + * are pages which were removed from the vmemmap. + * They will be restored in the following call. 
+ */ + walk = (struct vmemmap_remap_walk) { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + vmemmap_remap_range(reuse, end, &walk); + } + mmap_read_unlock(&init_mm); + + free_vmemmap_page_list(&vmemmap_pages); + + return ret; +} + +static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, + gfp_t gfp_mask, struct list_head *list) +{ + unsigned long nr_pages = (end - start) >> PAGE_SHIFT; + int nid = page_to_nid((struct page *)start); + struct page *page, *next; + + while (nr_pages--) { + page = alloc_pages_node(nid, gfp_mask, 0); + if (!page) + goto out; + list_add_tail(&page->lru, list); + } + + return 0; +out: + list_for_each_entry_safe(page, next, list, lru) + __free_pages(page, 0); + return -ENOMEM; +} + +/** + * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) + * to the page which is from the @vmemmap_pages + * respectively. + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * @gfp_mask: GFP flag for allocating vmemmap pages. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_alloc(unsigned long start, unsigned long end, + unsigned long reuse, gfp_t gfp_mask) +{ + LIST_HEAD(vmemmap_pages); + struct vmemmap_remap_walk walk = { + .remap_pte = vmemmap_restore_pte, + .reuse_addr = reuse, + .vmemmap_pages = &vmemmap_pages, + }; + + /* See the comment in the vmemmap_remap_free(). 
*/ + BUG_ON(start - reuse != PAGE_SIZE); + + if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) + return -ENOMEM; + + mmap_read_lock(&init_mm); + vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return 0; +} + DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index f9ddeaa2fbdfc5..d957d3e368380f 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -27,408 +27,9 @@ #include #include #include -#include -#include #include #include -#include - -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -/** - * struct vmemmap_remap_walk - walk vmemmap page table - * - * @remap_pte: called for each lowest-level entry (PTE). - * @nr_walked: the number of walked pte. - * @reuse_page: the page which is reused for the tail vmemmap pages. - * @reuse_addr: the virtual address of the @reuse_page page. - * @vmemmap_pages: the list head of the vmemmap pages that can be freed - * or is mapped from. 
- */ -struct vmemmap_remap_walk { - void (*remap_pte)(pte_t *pte, unsigned long addr, - struct vmemmap_remap_walk *walk); - unsigned long nr_walked; - struct page *reuse_page; - unsigned long reuse_addr; - struct list_head *vmemmap_pages; -}; - -static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) -{ - pmd_t __pmd; - int i; - unsigned long addr = start; - struct page *page = pmd_page(*pmd); - pte_t *pgtable = pte_alloc_one_kernel(&init_mm); - - if (!pgtable) - return -ENOMEM; - - pmd_populate_kernel(&init_mm, &__pmd, pgtable); - - for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) { - pte_t entry, *pte; - pgprot_t pgprot = PAGE_KERNEL; - - entry = mk_pte(page + i, pgprot); - pte = pte_offset_kernel(&__pmd, addr); - set_pte_at(&init_mm, addr, pte, entry); - } - - spin_lock(&init_mm.page_table_lock); - if (likely(pmd_leaf(*pmd))) { - /* - * Higher order allocations from buddy allocator must be able to - * be treated as indepdenent small pages (as they can be freed - * individually). - */ - if (!PageReserved(page)) - split_page(page, get_order(PMD_SIZE)); - - /* Make pte visible before pmd. See comment in pmd_install(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - flush_tlb_kernel_range(start, start + PMD_SIZE); - } else { - pte_free_kernel(&init_mm, pgtable); - } - spin_unlock(&init_mm.page_table_lock); - - return 0; -} - -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) -{ - int leaf; - - spin_lock(&init_mm.page_table_lock); - leaf = pmd_leaf(*pmd); - spin_unlock(&init_mm.page_table_lock); - - if (!leaf) - return 0; - - return __split_vmemmap_huge_pmd(pmd, start); -} - -static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - pte_t *pte = pte_offset_kernel(pmd, addr); - - /* - * The reuse_page is found 'first' in table walk before we start - * remapping (which is calling @walk->remap_pte). 
- */ - if (!walk->reuse_page) { - walk->reuse_page = pte_page(*pte); - /* - * Because the reuse address is part of the range that we are - * walking, skip the reuse address range. - */ - addr += PAGE_SIZE; - pte++; - walk->nr_walked++; - } - - for (; addr != end; addr += PAGE_SIZE, pte++) { - walk->remap_pte(pte, addr, walk); - walk->nr_walked++; - } -} - -static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - pmd_t *pmd; - unsigned long next; - - pmd = pmd_offset(pud, addr); - do { - int ret; - - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); - if (ret) - return ret; - - next = pmd_addr_end(addr, end); - vmemmap_pte_range(pmd, addr, next, walk); - } while (pmd++, addr = next, addr != end); - - return 0; -} - -static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - pud_t *pud; - unsigned long next; - - pud = pud_offset(p4d, addr); - do { - int ret; - - next = pud_addr_end(addr, end); - ret = vmemmap_pmd_range(pud, addr, next, walk); - if (ret) - return ret; - } while (pud++, addr = next, addr != end); - - return 0; -} - -static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr, - unsigned long end, - struct vmemmap_remap_walk *walk) -{ - p4d_t *p4d; - unsigned long next; - - p4d = p4d_offset(pgd, addr); - do { - int ret; - - next = p4d_addr_end(addr, end); - ret = vmemmap_pud_range(p4d, addr, next, walk); - if (ret) - return ret; - } while (p4d++, addr = next, addr != end); - - return 0; -} - -static int vmemmap_remap_range(unsigned long start, unsigned long end, - struct vmemmap_remap_walk *walk) -{ - unsigned long addr = start; - unsigned long next; - pgd_t *pgd; - - VM_BUG_ON(!PAGE_ALIGNED(start)); - VM_BUG_ON(!PAGE_ALIGNED(end)); - - pgd = pgd_offset_k(addr); - do { - int ret; - - next = pgd_addr_end(addr, end); - ret = vmemmap_p4d_range(pgd, addr, next, walk); - if (ret) - return ret; - } while (pgd++, addr = next, addr 
!= end); - - /* - * We only change the mapping of the vmemmap virtual address range - * [@start + PAGE_SIZE, end), so we only need to flush the TLB which - * belongs to the range. - */ - flush_tlb_kernel_range(start + PAGE_SIZE, end); - - return 0; -} - -/* - * Free a vmemmap page. A vmemmap page can be allocated from the memblock - * allocator or buddy allocator. If the PG_reserved flag is set, it means - * that it allocated from the memblock allocator, just free it via the - * free_bootmem_page(). Otherwise, use __free_page(). - */ -static inline void free_vmemmap_page(struct page *page) -{ - if (PageReserved(page)) - free_bootmem_page(page); - else - __free_page(page); -} - -/* Free a list of the vmemmap pages */ -static void free_vmemmap_page_list(struct list_head *list) -{ - struct page *page, *next; - - list_for_each_entry_safe(page, next, list, lru) { - list_del(&page->lru); - free_vmemmap_page(page); - } -} - -static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, - struct vmemmap_remap_walk *walk) -{ - /* - * Remap the tail pages as read-only to catch illegal write operation - * to the tail pages. - */ - pgprot_t pgprot = PAGE_KERNEL_RO; - pte_t entry = mk_pte(walk->reuse_page, pgprot); - struct page *page = pte_page(*pte); - - list_add_tail(&page->lru, walk->vmemmap_pages); - set_pte_at(&init_mm, addr, pte, entry); -} - -/* - * How many struct page structs need to be reset. When we reuse the head - * struct page, the special metadata (e.g. page->flags or page->mapping) - * cannot copy to the tail struct page structs. The invalid value will be - * checked in the free_tail_pages_check(). In order to avoid the message - * of "corrupted mapping in tail page". We need to reset at least 3 (one - * head struct page struct and two tail struct page structs) struct page - * structs. 
- */ -#define NR_RESET_STRUCT_PAGE 3 - -static inline void reset_struct_pages(struct page *start) -{ - int i; - struct page *from = start + NR_RESET_STRUCT_PAGE; - - for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) - memcpy(start + i, from, sizeof(*from)); -} - -static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, - struct vmemmap_remap_walk *walk) -{ - pgprot_t pgprot = PAGE_KERNEL; - struct page *page; - void *to; - - BUG_ON(pte_page(*pte) != walk->reuse_page); - - page = list_first_entry(walk->vmemmap_pages, struct page, lru); - list_del(&page->lru); - to = page_to_virt(page); - copy_page(to, (void *)walk->reuse_addr); - reset_struct_pages(to); - - set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); -} - -/** - * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) - * to the page which @reuse is mapped to, then free vmemmap - * which the range are mapped to. - * @start: start address of the vmemmap virtual address range that we want - * to remap. - * @end: end address of the vmemmap virtual address range that we want to - * remap. - * @reuse: reuse address. - * - * Return: %0 on success, negative error code otherwise. - */ -int vmemmap_remap_free(unsigned long start, unsigned long end, - unsigned long reuse) -{ - int ret; - LIST_HEAD(vmemmap_pages); - struct vmemmap_remap_walk walk = { - .remap_pte = vmemmap_remap_pte, - .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, - }; - - /* - * In order to make remapping routine most efficient for the huge pages, - * the routine of vmemmap page table walking has the following rules - * (see more details from the vmemmap_pte_range()): - * - * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) - * should be continuous. - * - The @reuse address is part of the range [@reuse, @end) that we are - * walking which is passed to vmemmap_remap_range(). - * - The @reuse address is the first in the complete range. 
- * - * So we need to make sure that @start and @reuse meet the above rules. - */ - BUG_ON(start - reuse != PAGE_SIZE); - - mmap_read_lock(&init_mm); - ret = vmemmap_remap_range(reuse, end, &walk); - if (ret && walk.nr_walked) { - end = reuse + walk.nr_walked * PAGE_SIZE; - /* - * vmemmap_pages contains pages from the previous - * vmemmap_remap_range call which failed. These - * are pages which were removed from the vmemmap. - * They will be restored in the following call. - */ - walk = (struct vmemmap_remap_walk) { - .remap_pte = vmemmap_restore_pte, - .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, - }; - - vmemmap_remap_range(reuse, end, &walk); - } - mmap_read_unlock(&init_mm); - - free_vmemmap_page_list(&vmemmap_pages); - - return ret; -} - -static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, - gfp_t gfp_mask, struct list_head *list) -{ - unsigned long nr_pages = (end - start) >> PAGE_SHIFT; - int nid = page_to_nid((struct page *)start); - struct page *page, *next; - - while (nr_pages--) { - page = alloc_pages_node(nid, gfp_mask, 0); - if (!page) - goto out; - list_add_tail(&page->lru, list); - } - - return 0; -out: - list_for_each_entry_safe(page, next, list, lru) - __free_pages(page, 0); - return -ENOMEM; -} - -/** - * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) - * to the page which is from the @vmemmap_pages - * respectively. - * @start: start address of the vmemmap virtual address range that we want - * to remap. - * @end: end address of the vmemmap virtual address range that we want to - * remap. - * @reuse: reuse address. - * @gfp_mask: GFP flag for allocating vmemmap pages. - * - * Return: %0 on success, negative error code otherwise. 
- */ -int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse, gfp_t gfp_mask) -{ - LIST_HEAD(vmemmap_pages); - struct vmemmap_remap_walk walk = { - .remap_pte = vmemmap_restore_pte, - .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, - }; - - /* See the comment in the vmemmap_remap_free(). */ - BUG_ON(start - reuse != PAGE_SIZE); - - if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) - return -ENOMEM; - - mmap_read_lock(&init_mm); - vmemmap_remap_range(reuse, end, &walk); - mmap_read_unlock(&init_mm); - - return 0; -} -#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ /* * Allocate a block of memory to be used to back the virtual memory map From 4923c01965469edc9d3ad48f1b413fbd88e40080 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jun 2022 17:22:32 +0800 Subject: [PATCH 0741/1250] mm: hugetlb_vmemmap: replace early_param() with core_param() After the following commit: 78f39084b41d ("mm: hugetlb_vmemmap: add hugetlb_optimize_vmemmap sysctl") There is no order requirement between the parameter of "hugetlb_free_vmemmap" and "hugepages" since we have removed the check of whether HVO is enabled from hugetlb_vmemmap_init(). Therefore we can safely replace early_param() with core_param() to simplify the code. 
Link: https://lkml.kernel.org/r/20220628092235.91270-6-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index f68e216600b91c..6c7117c30e5664 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -423,14 +423,8 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); -static bool vmemmap_optimize_enabled = - IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); - -static int __init hugetlb_vmemmap_early_param(char *buf) -{ - return kstrtobool(buf, &vmemmap_optimize_enabled); -} -early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_early_param); +static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); +core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); /* * Previously discarded vmemmap pages will be allocated and remapping From 26a306f7d927b62dc44a7ed1dc11efc46da07e77 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jun 2022 17:22:33 +0800 Subject: [PATCH 0742/1250] mm: hugetlb_vmemmap: improve hugetlb_vmemmap code readability There is a discussion about the name of hugetlb_vmemmap_alloc/free in thread [1]. The suggestion made by David is to rename "alloc/free" to "optimize/restore" to make functionalities clearer to users, "optimize" means the function will optimize vmemmap pages, while "restore" means restoring its vmemmap pages discarded before. This commit does this. Another discussion is the confusion RESERVE_VMEMMAP_NR isn't used explicitly for vmemmap_addr but implicitly for vmemmap_end in hugetlb_vmemmap_alloc/free.
David suggested we can compute what hugetlb_vmemmap_init() does now at runtime. We do not need to worry for the overhead of computing at runtime since the calculation is simple enough and those functions are not in a hot path. This commit has the following improvements: 1) The function suffixed name ("optimize/restore") is more expressive. 2) The logic becomes less weird in hugetlb_vmemmap_optimize/restore(). 3) The hugetlb_vmemmap_init() does not need to be exported anymore. 4) A ->optimize_vmemmap_pages field in struct hstate is killed. 5) There is only one place where checks is_power_of_2(sizeof(struct page)) instead of two places. 6) Add more comments for hugetlb_vmemmap_optimize/restore(). 7) For external users, hugetlb_optimize_vmemmap_pages() is used for detecting if the HugeTLB's vmemmap pages is optimizable originally. In this commit, it is killed and we introduce a new helper hugetlb_vmemmap_optimizable() to replace it. The name is more expressive. Link: https://lore.kernel.org/all/20220404074652.68024-2-songmuchun@bytedance.com/ [1] Link: https://lkml.kernel.org/r/20220628092235.91270-7-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 7 +- include/linux/sysctl.h | 4 ++ mm/hugetlb.c | 15 +++-- mm/hugetlb_vmemmap.c | 143 +++++++++++++++++----------------------- mm/hugetlb_vmemmap.h | 41 ++++++++---- 5 files changed, 102 insertions(+), 108 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4cdfce9766446b..6d0620edf0a60c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -638,9 +638,6 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; -#ifdef 
CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP - unsigned int optimize_vmemmap_pages; -#endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ struct cftype cgroup_files_dfl[8]; @@ -716,7 +713,7 @@ static inline struct hstate *hstate_vma(struct vm_area_struct *vma) return hstate_file(vma->vm_file); } -static inline unsigned long huge_page_size(struct hstate *h) +static inline unsigned long huge_page_size(const struct hstate *h) { return (unsigned long)PAGE_SIZE << h->order; } @@ -745,7 +742,7 @@ static inline bool hstate_is_gigantic(struct hstate *h) return huge_page_order(h) >= MAX_ORDER; } -static inline unsigned int pages_per_huge_page(struct hstate *h) +static inline unsigned int pages_per_huge_page(const struct hstate *h) { return 1 << h->order; } diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 80263f7cdb776c..5a227b9e3ad526 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -266,6 +266,10 @@ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * return NULL; } +static inline void register_sysctl_init(const char *path, struct ctl_table *table) +{ +} + static inline struct ctl_table_header *register_sysctl_mount_point(const char *path) { return NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ad4572d0ba3199..d6aa88d744c0db 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1535,7 +1535,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - if (hugetlb_vmemmap_alloc(h, page)) { + if (hugetlb_vmemmap_restore(h, page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1612,7 +1612,7 @@ static DECLARE_WORK(free_hpage_work, free_hpage_workfn); static inline void flush_free_hpage_work(struct hstate *h) { - if (hugetlb_optimize_vmemmap_pages(h)) + if (hugetlb_vmemmap_optimizable(h)) flush_work(&free_hpage_work); } @@ -1734,7 +1734,7 @@ static void 
__prep_account_new_huge_page(struct hstate *h, int nid) static void __prep_new_huge_page(struct hstate *h, struct page *page) { - hugetlb_vmemmap_free(h, page); + hugetlb_vmemmap_optimize(h, page); INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); hugetlb_set_page_subpool(page, NULL); @@ -2107,7 +2107,7 @@ int dissolve_free_huge_page(struct page *page) * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. */ - rc = hugetlb_vmemmap_alloc(h, head); + rc = hugetlb_vmemmap_restore(h, head); if (!rc) { /* * Move PageHWPoison flag from head page to the raw @@ -3182,8 +3182,10 @@ static void __init report_hugepages(void) char buf[32]; string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); - pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", + pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); + pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", + hugetlb_vmemmap_optimizable_size(h) / SZ_1K, buf); } } @@ -3421,7 +3423,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) remove_hugetlb_page_for_demote(h, page, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_alloc(h, page); + rc = hugetlb_vmemmap_restore(h, page); if (rc) { /* Allocation of vmemmmap failed, we can not demote page */ spin_lock_irq(&hugetlb_lock); @@ -4111,7 +4113,6 @@ void __init hugetlb_add_hstate(unsigned int order) h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); - hugetlb_vmemmap_init(h); parsed_hstate = h; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 6c7117c30e5664..8da2b31bb59f50 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -35,16 +35,6 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -/* - * There are a lot of struct page structures associated with each HugeTLB page. 
- * For tail pages, the value of compound_head is the same. So we can reuse first - * page of head page structures. We map the virtual addresses of all the pages - * of tail page structures to the head page struct, and then free these page - * frames. Therefore, we need to reserve one pages as vmemmap areas. - */ -#define RESERVE_VMEMMAP_NR 1U -#define RESERVE_VMEMMAP_SIZE (RESERVE_VMEMMAP_NR << PAGE_SHIFT) - static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; @@ -426,32 +416,37 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); -/* - * Previously discarded vmemmap pages will be allocated and remapping - * after this function returns zero. +/** + * hugetlb_vmemmap_restore - restore previously optimized (by + * hugetlb_vmemmap_optimize()) vmemmap pages which + * will be reallocated and remapped. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be restored. + * + * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * negative error code otherwise. 
*/ -int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) { int ret; - unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; if (!HPageVmemmapOptimized(head)) return 0; - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_pages = hugetlb_optimize_vmemmap_pages(h); - vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * The pages which the vmemmap virtual address range [@vmemmap_addr, + * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator, and * the range is mapped to the page which @vmemmap_reuse is mapped to. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ - ret = vmemmap_remap_alloc(vmemmap_addr, vmemmap_end, vmemmap_reuse, + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, GFP_KERNEL | __GFP_NORETRY | __GFP_THISNODE); if (!ret) { ClearHPageVmemmapOptimized(head); @@ -461,11 +456,14 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) return ret; } -static unsigned int vmemmap_optimizable_pages(struct hstate *h, - struct page *head) +/* Return true iff a HugeTLB whose vmemmap should and can be optimized. 
*/ +static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { if (!READ_ONCE(vmemmap_optimize_enabled)) - return 0; + return false; + + if (!hugetlb_vmemmap_optimizable(h)) + return false; if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) { pmd_t *pmdp, pmd; @@ -508,73 +506,47 @@ static unsigned int vmemmap_optimizable_pages(struct hstate *h, * +-------------------------------------------+ */ if (PageVmemmapSelfHosted(vmemmap_page)) - return 0; + return false; } - return hugetlb_optimize_vmemmap_pages(h); + return true; } -void hugetlb_vmemmap_free(struct hstate *h, struct page *head) +/** + * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be optimized. + * + * This function only tries to optimize @head's vmemmap pages and does not + * guarantee that the optimization will succeed after it returns. The caller + * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages + * have been optimized. 
+ */ +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { - unsigned long vmemmap_addr = (unsigned long)head; - unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages; + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; - vmemmap_pages = vmemmap_optimizable_pages(h, head); - if (!vmemmap_pages) + if (!vmemmap_should_optimize(h, head)) return; static_branch_inc(&hugetlb_optimize_vmemmap_key); - vmemmap_addr += RESERVE_VMEMMAP_SIZE; - vmemmap_end = vmemmap_addr + (vmemmap_pages << PAGE_SHIFT); - vmemmap_reuse = vmemmap_addr - PAGE_SIZE; + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* - * Remap the vmemmap virtual address range [@vmemmap_addr, @vmemmap_end) + * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) * to the page which @vmemmap_reuse is mapped to, then free the pages - * which the range [@vmemmap_addr, @vmemmap_end] is mapped to. + * which the range [@vmemmap_start, @vmemmap_end] is mapped to. */ - if (vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse)) + if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse)) static_branch_dec(&hugetlb_optimize_vmemmap_key); else SetHPageVmemmapOptimized(head); } -void __init hugetlb_vmemmap_init(struct hstate *h) -{ - unsigned int nr_pages = pages_per_huge_page(h); - unsigned int vmemmap_pages; - - /* - * There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct - * page structs that can be used when HVO is enabled, add a BUILD_BUG_ON - * to catch invalid usage of the tail page structs. 
- */ - BUILD_BUG_ON(__NR_USED_SUBPAGE >= - RESERVE_VMEMMAP_SIZE / sizeof(struct page)); - - if (!is_power_of_2(sizeof(struct page))) { - pr_warn_once("cannot optimize vmemmap pages because \"struct page\" crosses page boundaries\n"); - return; - } - - vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT; - /* - * The head page is not to be freed to buddy allocator, the other tail - * pages will map to the head page, so they can be freed. - * - * Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true - * on some architectures (e.g. aarch64). See Documentation/arm64/ - * hugetlbpage.rst for more details. - */ - if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR)) - h->optimize_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR; - - pr_info("can optimize %d vmemmap pages for %s\n", - h->optimize_vmemmap_pages, h->name); -} - -#ifdef CONFIG_PROC_SYSCTL static struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", @@ -586,16 +558,21 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = { { } }; -static __init int hugetlb_vmemmap_sysctls_init(void) +static int __init hugetlb_vmemmap_init(void) { - /* - * If "struct page" crosses page boundaries, the vmemmap pages cannot - * be optimized. 
- */ - if (is_power_of_2(sizeof(struct page))) - register_sysctl_init("vm", hugetlb_vmemmap_sysctls); - + /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ + BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE); + + if (IS_ENABLED(CONFIG_PROC_SYSCTL)) { + const struct hstate *h; + + for_each_hstate(h) { + if (hugetlb_vmemmap_optimizable(h)) { + register_sysctl_init("vm", hugetlb_vmemmap_sysctls); + break; + } + } + } return 0; } -late_initcall(hugetlb_vmemmap_sysctls_init); -#endif /* CONFIG_PROC_SYSCTL */ +late_initcall(hugetlb_vmemmap_init); diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index ba66fadad9fcac..25bd0e00243140 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -11,35 +11,50 @@ #include #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head); -void hugetlb_vmemmap_free(struct hstate *h, struct page *head); -void hugetlb_vmemmap_init(struct hstate *h); +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); /* - * How many vmemmap pages associated with a HugeTLB page that can be - * optimized and freed to the buddy allocator. + * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See + * Documentation/vm/vmemmap_dedup.rst. */ -static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) +#define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE + +static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { - return h->optimize_vmemmap_pages; + return pages_per_huge_page(h) * sizeof(struct page); +} + +/* + * Return how many vmemmap size associated with a HugeTLB page that can be + * optimized and can be freed to the buddy allocator. 
+ */ +static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) +{ + int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE; + + if (!is_power_of_2(sizeof(struct page))) + return 0; + return size > 0 ? size : 0; } #else -static inline int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) { return 0; } -static inline void hugetlb_vmemmap_free(struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { } -static inline void hugetlb_vmemmap_init(struct hstate *h) +static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { + return 0; } +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ -static inline unsigned int hugetlb_optimize_vmemmap_pages(struct hstate *h) +static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h) { - return 0; + return hugetlb_vmemmap_optimizable_size(h) != 0; } -#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ #endif /* _LINUX_HUGETLB_VMEMMAP_H */ From 83ef48c62abdb51f3a8316ab5693b1b5c9d03c96 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jun 2022 17:22:34 +0800 Subject: [PATCH 0743/1250] mm: hugetlb_vmemmap: move code comments to vmemmap_dedup.rst All the comments which explains how HVO works are moved to vmemmap_dedup.rst since commit 4917f55b4ef9 ("mm/sparse-vmemmap: improve memory savings for compound devmaps") except some comments above page_fixed_fake_head(). This commit moves those comments to vmemmap_dedup.rst and improve vmemmap_dedup.rst as well. 
Link: https://lkml.kernel.org/r/20220628092235.91270-8-songmuchun@bytedance.com Signed-off-by: Muchun Song Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Mike Kravetz Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- Documentation/mm/vmemmap_dedup.rst | 70 ++++++++++++++++++++---------- include/linux/page-flags.h | 15 +------ 2 files changed, 49 insertions(+), 36 deletions(-) diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index 7d7a161aa36467..a4b12ff906c4df 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -9,23 +9,23 @@ HugeTLB This section is to explain how HugeTLB Vmemmap Optimization (HVO) works. -The struct page structures (page structs) are used to describe a physical -page frame. By default, there is a one-to-one mapping from a page frame to -it's corresponding page struct. +The ``struct page`` structures are used to describe a physical page frame. By +default, there is a one-to-one mapping from a page frame to it's corresponding +``struct page``. HugeTLB pages consist of multiple base page size pages and is supported by many architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page consists of 512 base pages and a 1GB HugeTLB page consists of 4096 base pages. -For each base page, there is a corresponding page struct. +For each base page, there is a corresponding ``struct page``. -Within the HugeTLB subsystem, only the first 4 page structs are used to -contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides -this upper limit. The only 'useful' information in the remaining page structs +Within the HugeTLB subsystem, only the first 4 ``struct page`` are used to +contain unique information about a HugeTLB page. 
``__NR_USED_SUBPAGE`` provides +this upper limit. The only 'useful' information in the remaining ``struct page`` is the compound_head field, and this field is the same for all tail pages. -By removing redundant page structs for HugeTLB pages, memory can be returned +By removing redundant ``struct page`` for HugeTLB pages, memory can be returned to the buddy allocator for other uses. Different architectures support different HugeTLB pages. For example, the @@ -46,7 +46,7 @@ page. | | 64KB | 2MB | 512MB | 16GB | | +--------------+-----------+-----------+-----------+-----------+-----------+ -When the system boot up, every HugeTLB page has more than one struct page +When the system boot up, every HugeTLB page has more than one ``struct page`` structs which size is (unit: pages):: struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE @@ -76,10 +76,10 @@ Where n is how many pte entries which one page can contains. So the value of n is (PAGE_SIZE / sizeof(pte_t)). This optimization only supports 64-bit system, so the value of sizeof(pte_t) -is 8. And this optimization also applicable only when the size of struct page -is a power of two. In most cases, the size of struct page is 64 bytes (e.g. +is 8. And this optimization also applicable only when the size of ``struct page`` +is a power of two. In most cases, the size of ``struct page`` is 64 bytes (e.g. x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the -size of struct page structs of it is 8 page frames which size depends on the +size of ``struct page`` structs of it is 8 page frames which size depends on the size of the base page. 
For the HugeTLB page of the pud level mapping, then:: @@ -88,7 +88,7 @@ For the HugeTLB page of the pud level mapping, then:: = PAGE_SIZE / 8 * 8 (pages) = PAGE_SIZE (pages) -Where the struct_size(pmd) is the size of the struct page structs of a +Where the struct_size(pmd) is the size of the ``struct page`` structs of a HugeTLB page of the pmd level mapping. E.g.: A 2MB HugeTLB page on x86_64 consists in 8 page frames while 1GB @@ -96,7 +96,7 @@ HugeTLB page consists in 4096. Next, we take the pmd level mapping of the HugeTLB page as an example to show the internal implementation of this optimization. There are 8 pages -struct page structs associated with a HugeTLB page which is pmd mapped. +``struct page`` structs associated with a HugeTLB page which is pmd mapped. Here is how things look before optimization:: @@ -124,10 +124,10 @@ Here is how things look before optimization:: +-----------+ The value of page->compound_head is the same for all tail pages. The first -page of page structs (page 0) associated with the HugeTLB page contains the 4 -page structs necessary to describe the HugeTLB. The only use of the remaining -pages of page structs (page 1 to page 7) is to point to page->compound_head. -Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs +page of ``struct page`` (page 0) associated with the HugeTLB page contains the 4 +``struct page`` necessary to describe the HugeTLB. The only use of the remaining +pages of ``struct page`` (page 1 to page 7) is to point to page->compound_head. +Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of ``struct page`` will be used for each HugeTLB page. This will allow us to free the remaining 7 pages to the buddy allocator. @@ -169,13 +169,37 @@ entries that can be cached in a single TLB entry. The contiguous bit is used to increase the mapping size at the pmd and pte (last) level. 
So this type of HugeTLB page can be optimized only when its -size of the struct page structs is greater than 1 page. +size of the ``struct page`` structs is greater than **1** page. Notice: The head vmemmap page is not freed to the buddy allocator and all tail vmemmap pages are mapped to the head vmemmap page frame. So we can see -more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page) -associated with each HugeTLB page. The compound_head() can handle this -correctly (more details refer to the comment above compound_head()). +more than one ``struct page`` struct with ``PG_head`` (e.g. 8 per 2 MB HugeTLB +page) associated with each HugeTLB page. The ``compound_head()`` can handle +this correctly. There is only **one** head ``struct page``, the tail +``struct page`` with ``PG_head`` are fake head ``struct page``. We need an +approach to distinguish between those two different types of ``struct page`` so +that ``compound_head()`` can return the real head ``struct page`` when the +parameter is the tail ``struct page`` but with ``PG_head``. The following code +snippet describes how to distinguish between real and fake head ``struct page``. + +.. code-block:: c + + if (test_bit(PG_head, &page->flags)) { + unsigned long head = READ_ONCE(page[1].compound_head); + + if (head & 1) { + if (head == (unsigned long)page + 1) + /* head struct page */ + else + /* tail struct page */ + } else { + /* head struct page */ + } + } + +We can safely access the field of the **page[1]** with ``PG_head`` because the +page is a compound page composed with at least two contiguous pages. +The implementation refers to ``page_fixed_fake_head()``. Device DAX ========== @@ -189,7 +213,7 @@ PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). The differences with HugeTLB are relatively minor. -It only use 3 page structs for storing all information as opposed +It only use 3 ``struct page`` for storing all information as opposed to 4 on HugeTLB pages. 
There's no remapping of vmemmap given that device-dax memory is not part of diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a2ada8c75d5e85..d270b16602cd11 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -208,19 +208,8 @@ enum pageflags { DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); /* - * If HVO is enabled, the head vmemmap page frame is reused and all of the tail - * vmemmap addresses map to the head vmemmap page frame (furture details can - * refer to the figure at the head of the mm/hugetlb_vmemmap.c). In other - * words, there are more than one page struct with PG_head associated with each - * HugeTLB page. We __know__ that there is only one head page struct, the tail - * page structs with PG_head are fake head page structs. We need an approach - * to distinguish between those two different types of page structs so that - * compound_head() can return the real head page struct when the parameter is - * the tail page struct but with PG_head. - * - * The page_fixed_fake_head() returns the real head page struct if the @page is - * fake page head, otherwise, returns @page which can either be a true page - * head or tail. + * Return the real head page struct iff the @page is a fake head page, otherwise + * return the @page itself. See Documentation/mm/vmemmap_dedup.rst. */ static __always_inline const struct page *page_fixed_fake_head(const struct page *page) { From 035c35ace5e2fbdedb2b95d41371c4fc0b8e07be Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Tue, 28 Jun 2022 17:22:35 +0800 Subject: [PATCH 0744/1250] mm: hugetlb_vmemmap: use PTRS_PER_PTE instead of PMD_SIZE / PAGE_SIZE There is already a macro PTRS_PER_PTE to represent the number of page table entries, just use it. 
Link: https://lkml.kernel.org/r/20220628092235.91270-9-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Anshuman Khandual Cc: Catalin Marinas Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Oscar Salvador Cc: Will Deacon Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 8da2b31bb59f50..20f414c0379f9e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -48,7 +48,7 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) pmd_populate_kernel(&init_mm, &__pmd, pgtable); - for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) { + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; From 4693575fba995c9823fe69def560a46346e0a783 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 20 Jul 2022 14:41:31 -0700 Subject: [PATCH 0745/1250] mm: discard __GFP_ATOMIC __GFP_ATOMIC serves little purpose. Its main effect is to set ALLOC_HARDER which adds a few little boosts to increase the chance of an allocation succeeding, one of which is to lower the water-mark at which it will succeed. It is *always* paired with __GFP_HIGH which sets ALLOC_HIGH which also adjusts this watermark. It is probable that other users of __GFP_HIGH should benefit from the other little bonuses that __GFP_ATOMIC gets. __GFP_ATOMIC also gives a warning if used with __GFP_DIRECT_RECLAIM. There is little point to this. We already get a might_sleep() warning if __GFP_DIRECT_RECLAIM is set. __GFP_ATOMIC allows the "watermark_boost" to be side-stepped. It is probable that testing ALLOC_HARDER is a better fit here. __GFP_ATOMIC is used by tegra-smmu.c to check if the allocation might sleep. This should test __GFP_DIRECT_RECLAIM instead. 
This patch: - removes __GFP_ATOMIC - causes __GFP_HIGH to set ALLOC_HARDER unless __GFP_NOMEMALLOC is set (as well as ALLOC_HIGH). - makes other adjustments as suggested by the above. The net result is no change to GFP_ATOMIC allocations. Other allocations that use __GFP_HIGH will benefit from a few different extra privileges. This affects: xen, dm, md, ntfs3 the vermillion frame buffer hibernation ksm swap all of which likely produce more benefit than cost if these selected allocations are more likely to succeed quickly. Link: https://lkml.kernel.org/r/163712397076.13692.4727608274002939094@noble.neil.brown.name Signed-off-by: NeilBrown Reviewed-by: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Thierry Reding Cc: Mel Gorman Signed-off-by: Andrew Morton --- Documentation/mm/balance.rst | 2 +- drivers/iommu/tegra-smmu.c | 4 ++-- include/linux/gfp.h | 12 ++++-------- include/trace/events/mmflags.h | 1 - lib/test_printf.c | 8 ++++---- mm/internal.h | 2 +- mm/page_alloc.c | 16 ++++------------ tools/include/linux/gfp.h | 3 +-- tools/perf/builtin-kmem.c | 1 - 9 files changed, 17 insertions(+), 32 deletions(-) diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst index 6a1fadf3e1735e..e38e9d83c1c72b 100644 --- a/Documentation/mm/balance.rst +++ b/Documentation/mm/balance.rst @@ -6,7 +6,7 @@ Memory Balancing Started Jan 2000 by Kanoj Sarcar -Memory balancing is needed for !__GFP_ATOMIC and !__GFP_KSWAPD_RECLAIM as +Memory balancing is needed for !__GFP_HIGH and !__GFP_KSWAPD_RECLAIM as well as for non __GFP_IO allocations. The first reason why a caller may avoid reclaim is that the caller can not diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 1fea68e551f135..2f2b1203361862 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -671,12 +671,12 @@ static struct page *as_get_pde_page(struct tegra_smmu_as *as, * allocate page in a sleeping context if GFP flags permit.
Hence * spinlock needs to be unlocked and re-locked after allocation. */ - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_unlock_irqrestore(&as->lock, *flags); page = alloc_page(gfp | __GFP_DMA | __GFP_ZERO); - if (!(gfp & __GFP_ATOMIC)) + if (gfp & __GFP_DIRECT_RECLAIM) spin_lock_irqsave(&as->lock, *flags); /* diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 2d2ccae933c207..9a88cce23e173d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,7 +39,7 @@ struct vm_area_struct; #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u -#define ___GFP_ATOMIC 0x200u +/* 0x200u unused */ #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u @@ -124,11 +124,8 @@ struct vm_area_struct; * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. - * For example, creating an IO context to clean pages. - * - * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is - * high priority. Users are typically interrupt handlers. This may be - * used in conjunction with %__GFP_HIGH + * For example creating an IO context to clean pages and requests + * from atomic context. * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed @@ -143,7 +140,6 @@ struct vm_area_struct; * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. 
*/ -#define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) @@ -337,7 +333,7 @@ struct vm_area_struct; * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ -#define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index e87cb2b80ed3c1..11524cda4a9556 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -31,7 +31,6 @@ gfpflag_string(__GFP_HIGHMEM), \ gfpflag_string(GFP_DMA32), \ gfpflag_string(__GFP_HIGH), \ - gfpflag_string(__GFP_ATOMIC), \ gfpflag_string(__GFP_IO), \ gfpflag_string(__GFP_FS), \ gfpflag_string(__GFP_NOWARN), \ diff --git a/lib/test_printf.c b/lib/test_printf.c index 07309c45f32796..8010de49b6c5db 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -673,17 +673,17 @@ flags(void) gfp = GFP_ATOMIC|__GFP_DMA; test("GFP_ATOMIC|GFP_DMA", "%pGg", &gfp); - gfp = __GFP_ATOMIC; - test("__GFP_ATOMIC", "%pGg", &gfp); + gfp = __GFP_HIGH; + test("__GFP_HIGH", "%pGg", &gfp); /* Any flags not translated by the table should remain numeric */ gfp = ~__GFP_BITS_MASK; snprintf(cmp_buffer, BUF_SIZE, "%#lx", (unsigned long) gfp); test(cmp_buffer, "%pGg", &gfp); - snprintf(cmp_buffer, BUF_SIZE, "__GFP_ATOMIC|%#lx", + snprintf(cmp_buffer, BUF_SIZE, "__GFP_HIGH|%#lx", (unsigned long) gfp); - gfp |= __GFP_ATOMIC; + gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); kfree(cmp_buffer); diff --git a/mm/internal.h b/mm/internal.h index 899dab512c5a78..9fe642aab0baca 100644 --- a/mm/internal.h +++ 
b/mm/internal.h @@ -24,7 +24,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC|__GFP_NOLOCKDEP) + __GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 50d96fff88559e..eee2500756fbeb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4060,12 +4060,12 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, free_pages)) return true; /* - * Ignore watermark boosting for GFP_ATOMIC order-0 allocations + * Ignore watermark boosting for GFP_HIGH order-0 allocations * when checking the min watermark. The min watermark is the * point where boosting is ignored so that kswapd is woken up * when below the low watermark. */ - if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost + if (unlikely(!order && (alloc_flags & ALLOC_HARDER) && z->watermark_boost && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) { mark = z->_watermark[WMARK_MIN]; return __zone_watermark_ok(z, order, mark, highest_zoneidx, @@ -4800,12 +4800,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will - * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). + * set both ALLOC_HARDER (unless __GFP_NOMEMALLOC) and ALLOC_HIGH. */ alloc_flags |= (__force int) (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); - if (gfp_mask & __GFP_ATOMIC) { + if (gfp_mask & __GFP_HIGH) { /* * Not worth trying to allocate harder for __GFP_NOMEMALLOC even * if it can't schedule. 
@@ -4998,14 +4998,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; int reserve_flags; - /* - * We also sanity check to catch abuse of atomic reserves being used by - * callers that are not in atomic context. - */ - if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) == - (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) - gfp_mask &= ~__GFP_ATOMIC; - retry_cpuset: compaction_retries = 0; no_progress_loops = 0; diff --git a/tools/include/linux/gfp.h b/tools/include/linux/gfp.h index b238dbc9eb8586..56eec4445bc9e5 100644 --- a/tools/include/linux/gfp.h +++ b/tools/include/linux/gfp.h @@ -12,7 +12,6 @@ #define __GFP_FS 0x80u #define __GFP_NOWARN 0x200u #define __GFP_ZERO 0x8000u -#define __GFP_ATOMIC 0x80000u #define __GFP_ACCOUNT 0x100000u #define __GFP_DIRECT_RECLAIM 0x400000u #define __GFP_KSWAPD_RECLAIM 0x2000000u @@ -20,7 +19,7 @@ #define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM) #define GFP_ZONEMASK 0x0fu -#define GFP_ATOMIC (__GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM) +#define GFP_ATOMIC (__GFP_HIGH | __GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index ebfab2ca17024d..4a06d83f2ac5a3 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -640,7 +640,6 @@ static const struct { { "__GFP_HIGHMEM", "HM" }, { "GFP_DMA32", "D32" }, { "__GFP_HIGH", "H" }, - { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, { "__GFP_NOWARN", "NWR" }, From d37241279398ff30503376be752147e351a85182 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 14:41:31 -0700 Subject: [PATCH 0746/1250] mips: rename mt_init to mips_mt_init Move mt_init out of the way for the maple tree. Use mips_mt prefix to match the rest of the functions in the file. 
Link: https://lkml.kernel.org/r/20220504002554.654642-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/mips/kernel/mips-mt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/mips-mt.c b/arch/mips/kernel/mips-mt.c index d5f7362e8c2459..dc023a97980356 100644 --- a/arch/mips/kernel/mips-mt.c +++ b/arch/mips/kernel/mips-mt.c @@ -230,7 +230,7 @@ void mips_mt_set_cpuoptions(void) struct class *mt_class; -static int __init mt_init(void) +static int __init mips_mt_init(void) { struct class *mtc; @@ -243,4 +243,4 @@ static int __init mt_init(void) return 0; } -subsys_initcall(mt_init); +subsys_initcall(mips_mt_init); From abc461af0a9d8eff73d399747127598be5973039 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 20 Jun 2022 21:09:09 -0400 Subject: [PATCH 0747/1250] android: binder: stop saving a pointer to the VMA Do not record a pointer to a VMA outside of the mmap_lock for later use. This is unsafe and there are a number of failure paths *after* the recorded VMA pointer may be freed during setup. There is no callback to the driver to clear the saved pointer from generic mm code. Furthermore, the VMA pointer may become stale if any number of VMA operations end up freeing the VMA so saving it was fragile to begin with. Instead, change the binder_alloc struct to record the start address of the VMA and use vma_lookup() to get the vma when needed. Add lockdep mmap_lock checks on updates to the vma pointer to ensure the lock is held and depend on that lock for synchronization of readers and writers - which was already the case anyways, so the smp_wmb()/smp_rmb() was not necessary.
Link: https://lkml.kernel.org/r/20220621140212.vpkio64idahetbyf@revolver Fixes: da1b9564e85b ("android: binder: fix the race mmap and alloc_new_buf_locked") Reported-by: syzbot+58b51ac2b04e388ab7b0@syzkaller.appspotmail.com Signed-off-by: Liam R. Howlett Cc: Minchan Kim Cc: Christian Brauner (Microsoft) Cc: Greg Kroah-Hartman Cc: Hridya Valsaraju Cc: Joel Fernandes Cc: Martijn Coenen Cc: Suren Baghdasaryan Cc: Todd Kjos Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 30 ++++++++++++++---------------- drivers/android/binder_alloc.h | 2 +- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 51b502217d000c..f555eebceef6bf 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -213,7 +213,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, if (mm) { mmap_read_lock(mm); - vma = alloc->vma; + vma = vma_lookup(mm, alloc->vma_addr); } if (!vma && need_mm) { @@ -313,16 +313,15 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, static inline void binder_alloc_set_vma(struct binder_alloc *alloc, struct vm_area_struct *vma) { - if (vma) + unsigned long vm_start = 0; + + if (vma) { + vm_start = vma->vm_start; alloc->vma_vm_mm = vma->vm_mm; - /* - * If we see alloc->vma is not NULL, buffer data structures set up - * completely. Look at smp_rmb side binder_alloc_get_vma. - * We also want to guarantee new alloc->vma_vm_mm is always visible - * if alloc->vma is set. 
- */ - smp_wmb(); - alloc->vma = vma; + } + + mmap_assert_write_locked(alloc->vma_vm_mm); + alloc->vma_addr = vm_start; } static inline struct vm_area_struct *binder_alloc_get_vma( @@ -330,11 +329,9 @@ static inline struct vm_area_struct *binder_alloc_get_vma( { struct vm_area_struct *vma = NULL; - if (alloc->vma) { - /* Look at description in binder_alloc_set_vma */ - smp_rmb(); - vma = alloc->vma; - } + if (alloc->vma_addr) + vma = vma_lookup(alloc->vma_vm_mm, alloc->vma_addr); + return vma; } @@ -817,7 +814,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) buffers = 0; mutex_lock(&alloc->mutex); - BUG_ON(alloc->vma); + BUG_ON(alloc->vma_addr && + vma_lookup(alloc->vma_vm_mm, alloc->vma_addr)); while ((n = rb_first(&alloc->allocated_buffers))) { buffer = rb_entry(n, struct binder_buffer, rb_node); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 7dea57a84c79b1..1e4fd37af5e03e 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,7 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct vm_area_struct *vma; + unsigned long vma_addr; struct mm_struct *vma_vm_mm; void __user *buffer; struct list_head buffers; From 543d18605d2fb8bd8a712a9e708b97386c8ce61e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Jun 2022 19:16:17 -0700 Subject: [PATCH 0748/1250] android-binder-stop-saving-a-pointer-to-the-vma-fix fix drivers/android/binder_alloc_selftest.c drivers/android/binder_alloc_selftest.c: In function 'binder_selftest_alloc': drivers/android/binder_alloc_selftest.c:290:43: error: 'struct binder_alloc' has no member named 'vma' 290 | if (!binder_selftest_run || !alloc->vma) Cc: Christian Brauner (Microsoft) Cc: Greg Kroah-Hartman Cc: Hridya Valsaraju Cc: Joel Fernandes Cc: "Liam R. 
Howlett" Cc: Martijn Coenen Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Suren Baghdasaryan Cc: Todd Kjos Signed-off-by: Andrew Morton --- drivers/android/binder_alloc_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index c2b323bc3b3a53..43a881073a4283 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -287,7 +287,7 @@ void binder_selftest_alloc(struct binder_alloc *alloc) if (!binder_selftest_run) return; mutex_lock(&binder_selftest_lock); - if (!binder_selftest_run || !alloc->vma) + if (!binder_selftest_run || !alloc->vma_addr) goto done; pr_info("STARTED\n"); binder_selftest_alloc_offset(alloc, end_offset, 0); From f640f59f69cf7c7989c658033d870ebec734a1b1 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Mon, 27 Jun 2022 15:18:59 +0000 Subject: [PATCH 0749/1250] android: binder: fix lockdep check on clearing vma MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When munmapping a vma, the mmap_lock can be downgraded to a read lock before calling close() on the file handle. The binder close() function calls binder_alloc_set_vma() to clear the vma address, which now has a lockdep check for writing on the mmap_lock. Change the lockdep check to ensure the read lock is held while clearing and keep the write check while writing. Link: https://lkml.kernel.org/r/20220627151857.2316964-1-Liam.Howlett@oracle.com Fixes: 472a68df605b ("android: binder: stop saving a pointer to the VMA") Signed-off-by: Liam R.
Howlett Reported-by: syzbot+da54fa8d793ca89c741f@syzkaller.appspotmail.com Acked-by: Todd Kjos Cc: "Arve Hjønnevåg" Cc: Christian Brauner (Microsoft) Cc: Greg Kroah-Hartman Cc: Hridya Valsaraju Cc: Joel Fernandes Cc: Martijn Coenen Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index f555eebceef6bf..1014beb1280257 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -315,12 +315,19 @@ static inline void binder_alloc_set_vma(struct binder_alloc *alloc, { unsigned long vm_start = 0; + /* + * Allow clearing the vma while holding just the read lock, because + * munmap may downgrade the write lock before freeing and closing the + * file using binder_alloc_vma_close(). + */ if (vma) { vm_start = vma->vm_start; alloc->vma_vm_mm = vma->vm_mm; + mmap_assert_write_locked(alloc->vma_vm_mm); + } else { + mmap_assert_locked(alloc->vma_vm_mm); } - mmap_assert_write_locked(alloc->vma_vm_mm); alloc->vma_addr = vm_start; } From 06b152b7980a04b73b6cf17b81808f748a547a50 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:39 +0000 Subject: [PATCH 0750/1250] Maple Tree: add new data structure Patch series "Introducing the Maple Tree", v12. The maple tree is an RCU-safe range based B-tree designed to use modern processor cache efficiently. There are a number of places in the kernel that a non-overlapping range-based tree would be beneficial, especially one with a simple interface. If you use an rbtree with other data structures to improve performance or an interval tree to track non-overlapping ranges, then this is for you. The tree has a branching factor of 10 for non-leaf nodes and 16 for leaf nodes. With the increased branching factor, it is significantly shorter than the rbtree so it has fewer cache misses.
The removal of the linked list between subsequent entries also reduces the cache misses and the need to pull in the previous and next VMA during many tree alterations. The first user that is covered in this patch set is the vm_area_struct, where three data structures are replaced by the maple tree: the augmented rbtree, the vma cache, and the linked list of VMAs in the mm_struct. The long term goal is to reduce or remove the mmap_lock contention. The plan is to get to the point where we use the maple tree in RCU mode. Readers will not block for writers. A single write operation will be allowed at a time. A reader re-walks if stale data is encountered. VMAs would be RCU enabled and this mode would be entered once multiple tasks are using the mm_struct. Davidlor said : Yes I like the maple tree, and at this stage I don't think we can ask for : more from this series wrt the MM - albeit there seems to still be some : folks reporting breakage. Fundamentally I see Liam's work to (re)move : complexity out of the MM (not to say that the actual maple tree is not : complex) by consolidating the three complimentary data structures very : much worth it considering performance does not take a hit. This was very : much a turn off with the range locking approach, which worst case scenario : incurred in prohibitive overhead. Also as Liam and Matthew have : mentioned, RCU opens up a lot of nice performance opportunities, and in : addition academia[1] has shown outstanding scalability of address spaces : with the foundation of replacing the locked rbtree with RCU aware trees. A similar work has been discovered in the academic press https://pdos.csail.mit.edu/papers/rcuvm:asplos12.pdf Sheer coincidence. We designed our tree with the intention of solving the hardest problem first. Upon settling on a b-tree variant and a rough outline, we researched ranged based b-trees and RCU b-trees and did find that article. 
So it was nice to find reassurances that we were on the right path, but our design choice of using ranges made that paper unusable for us. This patch (of 69): The maple tree is an RCU-safe range based B-tree designed to use modern processor cache efficiently. There are a number of places in the kernel that a non-overlapping range-based tree would be beneficial, especially one with a simple interface. If you use an rbtree with other data structures to improve performance or an interval tree to track non-overlapping ranges, then this is for you. The tree has a branching factor of 10 for non-leaf nodes and 16 for leaf nodes. With the increased branching factor, it is significantly shorter than the rbtree so it has fewer cache misses. The removal of the linked list between subsequent entries also reduces the cache misses and the need to pull in the previous and next VMA during many tree alterations. The first user that is covered in this patch set is the vm_area_struct, where three data structures are replaced by the maple tree: the augmented rbtree, the vma cache, and the linked list of VMAs in the mm_struct. The long term goal is to reduce or remove the mmap_lock contention. The plan is to get to the point where we use the maple tree in RCU mode. Readers will not block for writers. A single write operation will be allowed at a time. A reader re-walks if stale data is encountered. VMAs would be RCU enabled and this mode would be entered once multiple tasks are using the mm_struct. 
Link: https://lkml.kernel.org/r/20220720021727.17018-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220504010716.661115-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220504002554.654642-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220504010716.661115-3-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220511144304.1430851-2-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220517145913.3480729-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220517152209.3486724-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220519150304.1289636-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220607063834.7004-1-lukas.bulwahn@gmail.com Link: https://lkml.kernel.org/r/20220615141921.417598-2-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220615141921.417598-3-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220616011739.802669-3-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220615174213.738849-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220617134609.1771611-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-2-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Matthew Wilcox (Oracle) Tested-by: David Howells Tested-by: Sven Schnelle Signed-off-by: Lukas Bulwahn Cc: Catalin Marinas Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Yang Yingliang Signed-off-by: Andrew Morton --- Documentation/core-api/index.rst | 1 + Documentation/core-api/maple_tree.rst | 217 + MAINTAINERS | 12 + include/linux/maple_tree.h | 684 ++ include/trace/events/maple_tree.h | 123 + init/main.c | 2 + lib/Kconfig.debug | 15 + lib/Makefile | 3 +- lib/maple_tree.c | 7102 +++++++++++++++++ tools/testing/radix-tree/.gitignore | 2 + tools/testing/radix-tree/generated/autoconf.h | 1 + tools/testing/radix-tree/linux/maple_tree.h | 7 + tools/testing/radix-tree/maple.c | 59 + .../radix-tree/trace/events/maple_tree.h | 5 + 14 files changed, 8232 insertions(+), 1 deletion(-) create mode 100644 Documentation/core-api/maple_tree.rst create mode 100644 include/linux/maple_tree.h create mode 100644 include/trace/events/maple_tree.h create mode 100644 lib/maple_tree.c create mode 100644 tools/testing/radix-tree/linux/maple_tree.h create mode 100644 tools/testing/radix-tree/maple.c create mode 100644 tools/testing/radix-tree/trace/events/maple_tree.h diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 5b1188494bcd88..86ae4dbca1f59c 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -36,6 +36,7 @@ Library functionality that is used throughout the kernel. kref assoc_array xarray + maple_tree idr circular-buffers rbtree diff --git a/Documentation/core-api/maple_tree.rst b/Documentation/core-api/maple_tree.rst new file mode 100644 index 00000000000000..45defcf15da71a --- /dev/null +++ b/Documentation/core-api/maple_tree.rst @@ -0,0 +1,217 @@ +.. SPDX-License-Identifier: GPL-2.0+ + + +========== +Maple Tree +========== + +:Author: Liam R. 
Howlett + +Overview +======== + +The Maple Tree is a B-Tree data type which is optimized for storing +non-overlapping ranges, including ranges of size 1. The tree was designed to +be simple to use and does not require a user written search method. It +supports iterating over a range of entries and going to the previous or next +entry in a cache-efficient manner. The tree can also be put into an RCU-safe +mode of operation which allows reading and writing concurrently. Writers must +synchronize on a lock, which can be the default spinlock, or the user can set +the lock to an external lock of a different type. + +The Maple Tree maintains a small memory footprint and was designed to use +modern processor cache efficiently. The majority of the users will be able to +use the normal API. An :ref:`maple-tree-advanced-api` exists for more complex +scenarios. The most important usage of the Maple Tree is the tracking of the +virtual memory areas. + +The Maple Tree can store values between ``0`` and ``ULONG_MAX``. The Maple +Tree reserves values with the bottom two bits set to '10' which are below 4096 +(ie 2, 6, 10 .. 4094) for internal use. If the entries may use reserved +entries then the users can convert the entries using xa_mk_value() and convert +them back by calling xa_to_value(). If the user needs to use a reserved +value, then the user can convert the value when using the +:ref:`maple-tree-advanced-api`, but are blocked by the normal API. + +The Maple Tree can also be configured to support searching for a gap of a given +size (or larger). + +Pre-allocating of nodes is also supported using the +:ref:`maple-tree-advanced-api`. This is useful for users who must guarantee a +successful store operation within a given +code segment when allocating cannot be done. Allocations of nodes are +relatively small at around 256 bytes. + +.. 
_maple-tree-normal-api: + +Normal API +========== + +Start by initialising a maple tree, either with DEFINE_MTREE() for statically +allocated maple trees or mt_init() for dynamically allocated ones. A +freshly-initialised maple tree contains a ``NULL`` pointer for the range ``0`` +- ``ULONG_MAX``. There are currently two types of maple trees supported: the +allocation tree and the regular tree. The regular tree has a higher branching +factor for internal nodes. The allocation tree has a lower branching factor +but allows the user to search for a gap of a given size or larger from either +``0`` upwards or ``ULONG_MAX`` down. An allocation tree can be used by +passing in the ``MT_FLAGS_ALLOC_RANGE`` flag when initialising the tree. + +You can then set entries using mtree_store() or mtree_store_range(). +mtree_store() will overwrite any entry with the new entry and return 0 on +success or an error code otherwise. mtree_store_range() works in the same way +but takes a range. mtree_load() is used to retrieve the entry stored at a +given index. You can use mtree_erase() to erase an entire range by only +knowing one value within that range, or mtree_store() call with an entry of +NULL may be used to partially erase a range or many ranges at once. + +If you want to only store a new entry to a range (or index) if that range is +currently ``NULL``, you can use mtree_insert_range() or mtree_insert() which +return -EEXIST if the range is not empty. + +You can search for an entry from an index upwards by using mt_find(). + +You can walk each entry within a range by calling mt_for_each(). You must +provide a temporary variable to store a cursor. If you want to walk each +element of the tree then ``0`` and ``ULONG_MAX`` may be used as the range. If +the caller is going to hold the lock for the duration of the walk then it is +worth looking at the mas_for_each() API in the :ref:`maple-tree-advanced-api` +section. 
+ +Sometimes it is necessary to ensure the next call to store to a maple tree does +not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case. + +Finally, you can remove all entries from a maple tree by calling +mtree_destroy(). If the maple tree entries are pointers, you may wish to free +the entries first. + +Allocating Nodes +---------------- + +The allocations are handled by the internal tree code. See +:ref:`maple-tree-advanced-alloc` for other options. + +Locking +------- + +You do not have to worry about locking. See :ref:`maple-tree-advanced-locks` +for other options. + +The Maple Tree uses RCU and an internal spinlock to synchronise access: + +Takes RCU read lock: + * mtree_load() + * mt_find() + * mt_for_each() + * mt_next() + * mt_prev() + +Takes ma_lock internally: + * mtree_store() + * mtree_store_range() + * mtree_insert() + * mtree_insert_range() + * mtree_erase() + * mtree_destroy() + * mt_set_in_rcu() + * mt_clear_in_rcu() + +If you want to take advantage of the internal lock to protect the data +structures that you are storing in the Maple Tree, you can call mtree_lock() +before calling mtree_load(), then take a reference count on the object you +have found before calling mtree_unlock(). This will prevent stores from +removing the object from the tree between looking up the object and +incrementing the refcount. You can also use RCU to avoid dereferencing +freed memory, but an explanation of that is beyond the scope of this +document. + +.. _maple-tree-advanced-api: + +Advanced API +============ + +The advanced API offers more flexibility and better performance at the +cost of an interface which can be harder to use and has fewer safeguards. +You must take care of your own locking while using the advanced API. +You can use the ma_lock, RCU or an external lock for protection. +You can mix advanced and normal operations on the same array, as long +as the locking is compatible. 
The :ref:`maple-tree-normal-api` is implemented +in terms of the advanced API. + +The advanced API is based around the ma_state, this is where the 'mas' +prefix originates. The ma_state struct keeps track of tree operations to make +life easier for both internal and external tree users. + +Initialising the maple tree is the same as in the :ref:`maple-tree-normal-api`. +Please see above. + +The maple state keeps track of the range start and end in mas->index and +mas->last, respectively. + +mas_walk() will walk the tree to the location of mas->index and set the +mas->index and mas->last according to the range for the entry. + +You can set entries using mas_store(). mas_store() will overwrite any entry +with the new entry and return the first existing entry that is overwritten. +The range is passed in as members of the maple state: index and last. + +You can use mas_erase() to erase an entire range by setting index and +last of the maple state to the desired range to erase. This will erase +the first range that is found in that range, set the maple state index +and last as the range that was erased and return the entry that existed +at that location. + +You can walk each entry within a range by using mas_for_each(). If you want +to walk each element of the tree then ``0`` and ``ULONG_MAX`` may be used as +the range. If the lock needs to be periodically dropped, see the locking +section mas_pause(). + +Using a maple state allows mas_next() and mas_prev() to function as if the +tree was a linked list. With such a high branching factor the amortized +performance penalty is outweighed by cache optimization. mas_next() will +return the next entry which occurs after the entry at index. mas_prev() +will return the previous entry which occurs before the entry at index. + +mas_find() will find the first entry which exists at or above index on +the first call, and the next entry from every subsequent calls. 
+ +mas_find_rev() will find the first entry which exists at or below the last on +the first call, and the previous entry on every subsequent call. + +If the user needs to yield the lock during an operation, then the maple state +must be paused using mas_pause(). + +There are a few extra interfaces provided when using an allocation tree. +If you wish to search for a gap within a range, then mas_empty_area() +or mas_empty_area_rev() can be used. mas_empty_area() searches for a gap +starting at the lowest index given up to the maximum of the range. +mas_empty_area_rev() searches for a gap starting at the highest index given +and continues downward to the lower bound of the range. + +.. _maple-tree-advanced-alloc: + +Advanced Allocating Nodes +------------------------- + +Allocations are usually handled internally to the tree, however if allocations +need to occur before a write occurs then calling mas_expected_entries() will +allocate the worst-case number of needed nodes to insert the provided number of +ranges. This also causes the tree to enter mass insertion mode. Once +insertions are complete calling mas_destroy() on the maple state will free the +unused allocations. + +.. _maple-tree-advanced-locks: + +Advanced Locking +---------------- + +The maple tree uses a spinlock by default, but external locks can be used for +tree updates as well. To use an external lock, the tree must be initialized +with the ``MT_FLAGS_LOCK_EXTERN`` flag; this is usually done with the +MTREE_INIT_EXT() #define, which takes an external lock as an argument. + +Functions and structures +======================== + +.. kernel-doc:: include/linux/maple_tree.h +.. kernel-doc:: lib/maple_tree.c diff --git a/MAINTAINERS b/MAINTAINERS index 55fb1daa9057e4..770fab39672886 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11826,6 +11826,18 @@ L: linux-man@vger.kernel.org S: Maintained W: http://www.kernel.org/doc/man-pages +MAPLE TREE +M: Liam R.
Howlett +L: linux-mm@kvack.org +S: Supported +F: Documentation/core-api/maple_tree.rst +F: include/linux/maple_tree.h +F: include/trace/events/maple_tree.h +F: lib/maple_tree.c +F: lib/test_maple_tree.c +F: tools/testing/radix-tree/linux/maple_tree.h +F: tools/testing/radix-tree/maple.c + MARDUK (CREATOR CI40) DEVICE TREE SUPPORT M: Rahul Bedarkar L: linux-mips@vger.kernel.org diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h new file mode 100644 index 00000000000000..bdb891b0d2b53f --- /dev/null +++ b/include/linux/maple_tree.h @@ -0,0 +1,684 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#ifndef _LINUX_MAPLE_TREE_H +#define _LINUX_MAPLE_TREE_H +/* + * Maple Tree - An RCU-safe adaptive tree for storing ranges + * Copyright (c) 2018-2022 Oracle + * Authors: Liam R. Howlett + * Matthew Wilcox + */ + +#include +#include +#include +/* #define CONFIG_MAPLE_RCU_DISABLED */ +/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ + +/* + * Allocated nodes are mutable until they have been inserted into the tree, + * at which time they cannot change their type until they have been removed + * from the tree and an RCU grace period has passed. + * + * Removed nodes have their ->parent set to point to themselves. RCU readers + * check ->parent before relying on the value that they loaded from the + * slots array. This lets us reuse the slots array for the RCU head. + * + * Nodes in the tree point to their parent unless bit 0 is set. 
+ */ +#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) +/* 64bit sizes */ +#define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ +#define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ +#define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ +#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */ +#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) +#else +/* 32bit sizes */ +#define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ +#define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ +#define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ +#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */ +#define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) +#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ + +#define MAPLE_NODE_MASK 255UL + +/* + * The node->parent of the root node has bit 0 set and the rest of the pointer + * is a pointer to the tree itself. No more bits are available in this pointer + * (on m68k, the data structure may only be 2-byte aligned). + * + * Internal non-root nodes can only have maple_range_* nodes as parents. The + * parent pointer is 256B aligned like all other tree nodes. When storing a 32 + * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an + * extra bit to store the offset. This extra bit comes from a reuse of the last + * bit in the node type. This is possible by using bit 1 to indicate if bit 2 + * is part of the type or the slot. + * + * Once the type is decided, the decision of an allocation range type or a range + * type is done by examining the immutable tree flag for the MAPLE_ALLOC_RANGE + * flag. 
+ * + * Node types: + * 0x??1 = Root + * 0x?00 = 16 bit nodes + * 0x010 = 32 bit nodes + * 0x110 = 64 bit nodes + * + * Slot size and location in the parent pointer: + * type : slot location + * 0x??1 : Root + * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 + * 0x010 : 32 bit values, type in 0-2, slot in 3-6 + * 0x110 : 64 bit values, type in 0-2, slot in 3-6 + */ + +/* + * This metadata is used to optimize the gap updating code and in reverse + * searching for gaps or any other code that needs to find the end of the data. + */ +struct maple_metadata { + unsigned char end; + unsigned char gap; +}; + +/* + * Leaf nodes do not store pointers to nodes, they store user data. Users may + * store almost any bit pattern. As noted above, the optimisation of storing an + * entry at 0 in the root pointer cannot be done for data which have the bottom + * two bits set to '10'. We also reserve values with the bottom two bits set to + * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs + * return errnos as a negative errno shifted right by two bits and the bottom + * two bits set to '10', and while choosing to store these values in the array + * is not an error, it may lead to confusion if you're testing for an error with + * mas_is_err(). + * + * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits + * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. + * + * In regular B-Tree terms, pivots are called keys. The term pivot is used to + * indicate that the tree is specifying ranges, Pivots may appear in the + * subtree with an entry attached to the value whereas keys are unique to a + * specific position of a B-tree. Pivot values are inclusive of the slot with + * the same index. 
+ */ + +struct maple_range_64 { + struct maple_pnode *parent; + unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; + union { + void __rcu *slot[MAPLE_RANGE64_SLOTS]; + struct { + void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; + struct maple_metadata meta; + }; + }; +}; + +/* + * At tree creation time, the user can specify that they're willing to trade off + * storing fewer entries in a tree in return for storing more information in + * each node. + * + * The maple tree supports recording the largest range of NULL entries available + * in this node, also called gaps. This optimises the tree for allocating a + * range. + */ +struct maple_arange_64 { + struct maple_pnode *parent; + unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; + void __rcu *slot[MAPLE_ARANGE64_SLOTS]; + unsigned long gap[MAPLE_ARANGE64_SLOTS]; + struct maple_metadata meta; +}; + +struct maple_alloc { + unsigned long total; + unsigned char node_count; + unsigned int request_count; + struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; +}; + +struct maple_topiary { + struct maple_pnode *parent; + struct maple_enode *next; /* Overlaps the pivot */ +}; + +enum maple_type { + maple_dense, + maple_leaf_64, + maple_range_64, + maple_arange_64, +}; + + +/** + * DOC: Maple tree flags + * + * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree + * * MT_FLAGS_USE_RCU - Operate in RCU mode + * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags + * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value + * * MT_FLAGS_LOCK_MASK - How the mt_lock is used + * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe + * * MT_FLAGS_LOCK_BH - Acquired bh-safe + * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used + * + * MAPLE_HEIGHT_MAX The largest height that can be stored + */ +#define MT_FLAGS_ALLOC_RANGE 0x01 +#define MT_FLAGS_USE_RCU 0x02 +#define MT_FLAGS_HEIGHT_OFFSET 0x02 +#define MT_FLAGS_HEIGHT_MASK 0x7C +#define MT_FLAGS_LOCK_MASK 0x300 +#define MT_FLAGS_LOCK_IRQ 0x100 +#define MT_FLAGS_LOCK_BH 0x200 +#define 
MT_FLAGS_LOCK_EXTERN 0x300 + +#define MAPLE_HEIGHT_MAX 31 + + +#define MAPLE_NODE_TYPE_MASK 0x0F +#define MAPLE_NODE_TYPE_SHIFT 0x03 + +#define MAPLE_RESERVED_RANGE 4096 + +#ifdef CONFIG_LOCKDEP +typedef struct lockdep_map *lockdep_map_p; +#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock) +#define mt_set_external_lock(mt, lock) \ + (mt)->ma_external_lock = &(lock)->dep_map +#else +typedef struct { /* nothing */ } lockdep_map_p; +#define mt_lock_is_held(mt) 1 +#define mt_set_external_lock(mt, lock) do { } while (0) +#endif + +/* + * If the tree contains a single entry at index 0, it is usually stored in + * tree->ma_root. To optimise for the page cache, an entry which ends in '00', + * '01' or '11' is stored in the root, but an entry which ends in '10' will be + * stored in a node. Bits 3-6 are used to store enum maple_type. + * + * The flags are used both to store some immutable information about this tree + * (set at tree creation time) and dynamic information set under the spinlock. + * + * Another use of flags is to indicate global states of the tree. This is the + * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in + * RCU mode. This mode was added to allow the tree to reuse nodes instead of + * re-allocating and RCU freeing nodes when there is a single user. + */ +struct maple_tree { + union { + spinlock_t ma_lock; + lockdep_map_p ma_external_lock; + }; + void __rcu *ma_root; + unsigned int ma_flags; +}; + +/** + * MTREE_INIT() - Initialize a maple tree + * @name: The maple tree name + * @__flags: The maple tree flags + * + */ +#define MTREE_INIT(name, __flags) { \ + .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ + .ma_flags = __flags, \ + .ma_root = NULL, \ +} + +/** + * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. 
+ * @name: The tree name + * @flags: The maple tree flags + * @lock: The external lock + */ +#ifdef CONFIG_LOCKDEP +#define MTREE_INIT_EXT(name, __flags, __lock) { \ + .ma_external_lock = &(__lock).dep_map, \ + .ma_flags = (__flags), \ + .ma_root = NULL, \ +} +#else +#define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) +#endif + +#define DEFINE_MTREE(name) \ + struct maple_tree name = MTREE_INIT(name, 0) + +#define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) +#define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) + +/* + * The Maple Tree squeezes various bits in at various points which aren't + * necessarily obvious. Usually, this is done by observing that pointers are + * N-byte aligned and thus the bottom log_2(N) bits are available for use. We + * don't use the high bits of pointers to store additional information because + * we don't know what bits are unused on any given architecture. + * + * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 + * low bits for our own purposes. Nodes are currently of 4 types: + * 1. Single pointer (Range is 0-0) + * 2. Non-leaf Allocation Range nodes + * 3. Non-leaf Range nodes + * 4. Leaf Range nodes All nodes consist of a number of node slots, + * pivots, and a parent pointer. + */ + +struct maple_node { + union { + struct { + struct maple_pnode *parent; + void __rcu *slot[MAPLE_NODE_SLOTS]; + }; + struct { + void *pad; + struct rcu_head rcu; + struct maple_enode *piv_parent; + unsigned char parent_slot; + enum maple_type type; + unsigned char slot_len; + unsigned int ma_flags; + }; + struct maple_range_64 mr64; + struct maple_arange_64 ma64; + struct maple_alloc alloc; + }; +}; + +/* + * More complicated stores can cause two nodes to become one or three and + * potentially alter the height of the tree. Either half of the tree may need + * to be rebalanced against the other. 
The ma_topiary struct is used to track + * which nodes have been 'cut' from the tree so that the change can be done + * safely at a later date. This is done to support RCU. + */ +struct ma_topiary { + struct maple_enode *head; + struct maple_enode *tail; + struct maple_tree *mtree; +}; + +void *mtree_load(struct maple_tree *mt, unsigned long index); + +int mtree_insert(struct maple_tree *mt, unsigned long index, + void *entry, gfp_t gfp); +int mtree_insert_range(struct maple_tree *mt, unsigned long first, + unsigned long last, void *entry, gfp_t gfp); +int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp); +int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp); + +int mtree_store_range(struct maple_tree *mt, unsigned long first, + unsigned long last, void *entry, gfp_t gfp); +int mtree_store(struct maple_tree *mt, unsigned long index, + void *entry, gfp_t gfp); +void *mtree_erase(struct maple_tree *mt, unsigned long index); + +void mtree_destroy(struct maple_tree *mt); +void __mt_destroy(struct maple_tree *mt); + +/** + * mtree_empty() - Determine if a tree has any present entries. + * @mt: Maple Tree. + * + * Context: Any context. + * Return: %true if the tree contains only NULL pointers. + */ +static inline bool mtree_empty(const struct maple_tree *mt) +{ + return mt->ma_root == NULL; +} + +/* Advanced API */ + +/* + * The maple state is defined in the struct ma_state and is used to keep track + * of information during operations, and even between operations when using the + * advanced API. + * + * If state->node has bit 0 set then it references a tree location which is not + * a node (eg the root). If bit 1 is set, the rest of the bits are a negative + * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the + * node type. 
+ * + * state->alloc either has a requested number of nodes or an allocated node. If + * state->alloc has a requested number of nodes, the first bit will be set (0x1) + * and the remaining bits are the value. If state->alloc is a node, then the + * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for + * storing more allocated nodes, a total number of nodes allocated, and the + * node_count in this node. node_count is the number of allocated nodes in this + * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further + * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc + * by removing a node from the state->alloc node until state->alloc->node_count + * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted + * to state->alloc. Nodes are pushed onto state->alloc by putting the current + * state->alloc into the pushed node's slot[0]. + * + * The state also contains the implied min/max of the state->node, the depth of + * this search, and the offset. The implied min/max are either from the parent + * node or are 0-oo for the root node. The depth is incremented or decremented + * every time a node is walked down or up. The offset is the slot/pivot of + * interest in the node - either for reading or writing. + * + * When returning a value the maple state index and last respectively contain + * the start and end of the range for the entry. Ranges are inclusive in the + * Maple Tree. 
+ */ +struct ma_state { + struct maple_tree *tree; /* The tree we're operating in */ + unsigned long index; /* The index we're operating on - range start */ + unsigned long last; /* The last index we're operating on - range end */ + struct maple_enode *node; /* The node containing this entry */ + unsigned long min; /* The minimum index of this node - implied pivot min */ + unsigned long max; /* The maximum index of this node - implied pivot max */ + struct maple_alloc *alloc; /* Allocated nodes for this operation */ + unsigned char depth; /* depth of tree descent during write */ + unsigned char offset; + unsigned char mas_flags; +}; + +struct ma_wr_state { + struct ma_state *mas; + struct maple_node *node; /* Decoded mas->node */ + unsigned long r_min; /* range min */ + unsigned long r_max; /* range max */ + enum maple_type type; /* mas->node type */ + unsigned char offset_end; /* The offset where the write ends */ + unsigned char node_end; /* mas->node end */ + unsigned long *pivots; /* mas->node->pivots pointer */ + unsigned long end_piv; /* The pivot at the offset end */ + void __rcu **slots; /* mas->node->slots pointer */ + void *entry; /* The entry to write */ + void *content; /* The existing entry that is being overwritten */ +}; + +#define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) +#define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) + + +/* + * Special values for ma_state.node. + * MAS_START means we have not searched the tree. + * MAS_ROOT means we have searched the tree and the entry we found lives in + * the root of the tree (ie it has index 0, length 1 and is the only entry in + * the tree). + * MAS_NONE means we have searched the tree and there is no node in the + * tree for this entry. For example, we searched for index 1 in an empty + * tree. Or we have a tree which points to a full leaf node and we + * searched for an entry which is larger than can be contained in that + * leaf node. + * MA_ERROR represents an errno. 
After dropping the lock and attempting + * to resolve the error, the walk would have to be restarted from the + * top of the tree as the tree may have been modified. + */ +#define MAS_START ((struct maple_enode *)1UL) +#define MAS_ROOT ((struct maple_enode *)5UL) +#define MAS_NONE ((struct maple_enode *)9UL) +#define MAS_PAUSE ((struct maple_enode *)17UL) +#define MA_ERROR(err) \ + ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) + +#define MA_STATE(name, mt, first, end) \ + struct ma_state name = { \ + .tree = mt, \ + .index = first, \ + .last = end, \ + .node = MAS_START, \ + .min = 0, \ + .max = ULONG_MAX, \ + .alloc = NULL, \ + } + +#define MA_WR_STATE(name, ma_state, wr_entry) \ + struct ma_wr_state name = { \ + .mas = ma_state, \ + .content = NULL, \ + .entry = wr_entry, \ + } + +#define MA_TOPIARY(name, tree) \ + struct ma_topiary name = { \ + .head = NULL, \ + .tail = NULL, \ + .mtree = tree, \ + } + +void *mas_walk(struct ma_state *mas); +void *mas_store(struct ma_state *mas, void *entry); +void *mas_erase(struct ma_state *mas); +int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); +void mas_store_prealloc(struct ma_state *mas, void *entry); +void *mas_find(struct ma_state *mas, unsigned long max); +void *mas_find_rev(struct ma_state *mas, unsigned long min); +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); + +bool mas_nomem(struct ma_state *mas, gfp_t gfp); +void mas_pause(struct ma_state *mas); +void maple_tree_init(void); +void mas_destroy(struct ma_state *mas); +int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); + +void *mas_prev(struct ma_state *mas, unsigned long min); +void *mas_next(struct ma_state *mas, unsigned long max); + +int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, + unsigned long size); + +/* Checks if a mas has not found anything */ +static inline bool mas_is_none(struct ma_state *mas) +{ + return mas->node == MAS_NONE; +} + +/* Checks if a mas 
has been paused */ +static inline bool mas_is_paused(struct ma_state *mas) +{ + return mas->node == MAS_PAUSE; +} + +void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); +void mas_dup_store(struct ma_state *mas, void *entry); + +/* + * This finds an empty area from the highest address to the lowest. + * AKA "Topdown" version, + */ +int mas_empty_area_rev(struct ma_state *mas, unsigned long min, + unsigned long max, unsigned long size); +/** + * mas_reset() - Reset a Maple Tree operation state. + * @mas: Maple Tree operation state. + * + * Resets the error or walk state of the @mas so future walks of the + * array will start from the root. Use this if you have dropped the + * lock and want to reuse the ma_state. + * + * Context: Any context. + */ +static inline void mas_reset(struct ma_state *mas) +{ + mas->node = MAS_START; +} + +/** + * mas_for_each() - Iterate over a range of the maple tree. + * @mas: Maple Tree operation state (maple_state) + * @entry: Entry retrieved from the tree + * @max: maximum index to retrieve from the tree + * + * When returned, mas->index and mas->last will hold the entire range for the + * entry. + * + * Note: may return the zero entry. + * + */ +#define mas_for_each(__mas, __entry, __max) \ + while (((__entry) = mas_find((__mas), (__max))) != NULL) + + +/** + * mas_set_range() - Set up Maple Tree operation state for a different index. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * Move the operation state to refer to a different range. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline +void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) +{ + mas->index = start; + mas->last = last; + mas->node = MAS_START; +} + +/** + * mas_set() - Set up Maple Tree operation state for a different index. 
+ * @mas: Maple Tree operation state. + * @index: New index into the Maple Tree. + * + * Move the operation state to refer to a different index. This will + * have the effect of starting a walk from the top; see mas_next() + * to move to an adjacent index. + */ +static inline void mas_set(struct ma_state *mas, unsigned long index) +{ + + mas_set_range(mas, index, index); +} + +static inline bool mt_external_lock(const struct maple_tree *mt) +{ + return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; +} + +/** + * mt_init_flags() - Initialise an empty maple tree with flags. + * @mt: Maple Tree + * @flags: maple tree flags. + * + * If you need to initialise a Maple Tree with special flags (eg, an + * allocation tree), use this function. + * + * Context: Any context. + */ +static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) +{ + mt->ma_flags = flags; + if (!mt_external_lock(mt)) + spin_lock_init(&mt->ma_lock); + rcu_assign_pointer(mt->ma_root, NULL); +} + +/** + * mt_init() - Initialise an empty maple tree. + * @mt: Maple Tree + * + * An empty Maple Tree. + * + * Context: Any context. + */ +static inline void mt_init(struct maple_tree *mt) +{ + mt_init_flags(mt, 0); +} + +static inline bool mt_in_rcu(struct maple_tree *mt) +{ +#ifdef CONFIG_MAPLE_RCU_DISABLED + return false; +#endif + return mt->ma_flags & MT_FLAGS_USE_RCU; +} + +/** + * mt_clear_in_rcu() - Switch the tree to non-RCU mode. + * @mt: The Maple Tree + */ +static inline void mt_clear_in_rcu(struct maple_tree *mt) +{ + if (!mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + BUG_ON(!mt_lock_is_held(mt)); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags &= ~MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +/** + * mt_set_in_rcu() - Switch the tree to RCU safe mode. 
+ * @mt: The Maple Tree + */ +static inline void mt_set_in_rcu(struct maple_tree *mt) +{ + if (mt_in_rcu(mt)) + return; + + if (mt_external_lock(mt)) { + BUG_ON(!mt_lock_is_held(mt)); + mt->ma_flags |= MT_FLAGS_USE_RCU; + } else { + mtree_lock(mt); + mt->ma_flags |= MT_FLAGS_USE_RCU; + mtree_unlock(mt); + } +} + +void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); +void *mt_find_after(struct maple_tree *mt, unsigned long *index, + unsigned long max); +void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); +void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); + +/** + * mt_for_each - Iterate over each entry starting at index until max. + * @tree: The Maple Tree + * @entry: The current entry + * @index: The index to update to track the location in the tree + * @max: The maximum limit for @index + * + * Note: Will not return the zero entry. + */ +#define mt_for_each(__tree, __entry, __index, __max) \ + for (__entry = mt_find(__tree, &(__index), __max); \ + __entry; __entry = mt_find_after(__tree, &(__index), __max)) + + +#ifdef CONFIG_DEBUG_MAPLE_TREE +extern atomic_t maple_tree_tests_run; +extern atomic_t maple_tree_tests_passed; + +void mt_dump(const struct maple_tree *mt); +void mt_validate(struct maple_tree *mt); +#define MT_BUG_ON(__tree, __x) do { \ + atomic_inc(&maple_tree_tests_run); \ + if (__x) { \ + pr_info("BUG at %s:%d (%u)\n", \ + __func__, __LINE__, __x); \ + mt_dump(__tree); \ + pr_info("Pass: %u Run:%u\n", \ + atomic_read(&maple_tree_tests_passed), \ + atomic_read(&maple_tree_tests_run)); \ + dump_stack(); \ + } else { \ + atomic_inc(&maple_tree_tests_passed); \ + } \ +} while (0) +#else +#define MT_BUG_ON(__tree, __x) BUG_ON(__x) +#endif /* CONFIG_DEBUG_MAPLE_TREE */ + +#endif /*_LINUX_MAPLE_TREE_H */ diff --git a/include/trace/events/maple_tree.h b/include/trace/events/maple_tree.h new file mode 100644 index 00000000000000..2be403bdc2bd8d --- /dev/null +++ 
b/include/trace/events/maple_tree.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM maple_tree + +#if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MM_H + + +#include + +struct ma_state; + +TRACE_EVENT(ma_op, + + TP_PROTO(const char *fn, struct ma_state *mas), + + TP_ARGS(fn, mas), + + TP_STRUCT__entry( + __field(const char *, fn) + __field(unsigned long, min) + __field(unsigned long, max) + __field(unsigned long, index) + __field(unsigned long, last) + __field(void *, node) + ), + + TP_fast_assign( + __entry->fn = fn; + __entry->min = mas->min; + __entry->max = mas->max; + __entry->index = mas->index; + __entry->last = mas->last; + __entry->node = mas->node; + ), + + TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", + __entry->fn, + (void *) __entry->node, + (unsigned long) __entry->min, + (unsigned long) __entry->max, + (unsigned long) __entry->index, + (unsigned long) __entry->last + ) +) +TRACE_EVENT(ma_read, + + TP_PROTO(const char *fn, struct ma_state *mas), + + TP_ARGS(fn, mas), + + TP_STRUCT__entry( + __field(const char *, fn) + __field(unsigned long, min) + __field(unsigned long, max) + __field(unsigned long, index) + __field(unsigned long, last) + __field(void *, node) + ), + + TP_fast_assign( + __entry->fn = fn; + __entry->min = mas->min; + __entry->max = mas->max; + __entry->index = mas->index; + __entry->last = mas->last; + __entry->node = mas->node; + ), + + TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu", + __entry->fn, + (void *) __entry->node, + (unsigned long) __entry->min, + (unsigned long) __entry->max, + (unsigned long) __entry->index, + (unsigned long) __entry->last + ) +) + +TRACE_EVENT(ma_write, + + TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv, + void *val), + + TP_ARGS(fn, mas, piv, val), + + TP_STRUCT__entry( + __field(const char *, fn) + __field(unsigned long, min) + __field(unsigned long, max) + __field(unsigned long, index) + 
__field(unsigned long, last) + __field(unsigned long, piv) + __field(void *, val) + __field(void *, node) + ), + + TP_fast_assign( + __entry->fn = fn; + __entry->min = mas->min; + __entry->max = mas->max; + __entry->index = mas->index; + __entry->last = mas->last; + __entry->piv = piv; + __entry->val = val; + __entry->node = mas->node; + ), + + TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p", + __entry->fn, + (void *) __entry->node, + (unsigned long) __entry->min, + (unsigned long) __entry->max, + (unsigned long) __entry->index, + (unsigned long) __entry->last, + (unsigned long) __entry->piv, + (void *) __entry->val + ) +) +#endif /* _TRACE_MM_H */ + +/* This part must be outside protection */ +#include diff --git a/init/main.c b/init/main.c index 0ee39cdcfcac97..1edb1bfec640d1 100644 --- a/init/main.c +++ b/init/main.c @@ -116,6 +116,7 @@ static int kernel_init(void *); extern void init_IRQ(void); extern void radix_tree_init(void); +extern void maple_tree_init(void); /* * Debug helper: via this flag we know that we are in 'early bootup code' @@ -1001,6 +1002,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); radix_tree_init(); + maple_tree_init(); /* * Set up housekeeping before setting up workqueues to allow the unbound diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0b483a8da409d5..a8a36e5897552c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -821,6 +821,13 @@ config DEBUG_VM_VMACACHE can cause significant overhead, so only enable it in non-production environments. +config DEBUG_VM_MAPLE_TREE + bool "Debug VM maple trees" + depends on DEBUG_VM + select DEBUG_MAPLE_TREE + help + Enable VM maple tree debugging information and extra validations. + If unsure, say N. config DEBUG_VM_RB @@ -1636,6 +1643,14 @@ config BUG_ON_DATA_CORRUPTION If unsure, say N. 
+config DEBUG_MAPLE_TREE + bool "Debug maple trees" + depends on DEBUG_KERNEL + help + Enable maple tree debugging information and extra validations. + + If unsure, say N. + endmenu config DEBUG_CREDENTIALS diff --git a/lib/Makefile b/lib/Makefile index f99bf61f8bbc67..2b845a5e4a80d1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -29,7 +29,8 @@ endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o timerqueue.o xarray.o \ - idr.o extable.o sha1.o irq_regs.o argv_split.o \ + maple_tree.o idr.o extable.o \ + sha1.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ diff --git a/lib/maple_tree.c b/lib/maple_tree.c new file mode 100644 index 00000000000000..d5b310e73068e5 --- /dev/null +++ b/lib/maple_tree.c @@ -0,0 +1,7102 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Maple Tree implementation + * Copyright (c) 2018-2022 Oracle Corporation + * Authors: Liam R. Howlett + * Matthew Wilcox + */ + +/* + * DOC: Interesting implementation details of the Maple Tree + * + * Each node type has a number of slots for entries and a number of slots for + * pivots. In the case of dense nodes, the pivots are implied by the position + * and are simply the slot index + the minimum of the node. + * + * In regular B-Tree terms, pivots are called keys. The term pivot is used to + * indicate that the tree is specifying ranges, Pivots may appear in the + * subtree with an entry attached to the value where as keys are unique to a + * specific position of a B-tree. Pivot values are inclusive of the slot with + * the same index. + * + * + * The following illustrates the layout of a range64 nodes slots and pivots. + * + * + * Slots -> | 0 | 1 | 2 | ... 
| 12 | 13 | 14 | 15 | + * ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ + * │ │ │ │ │ │ │ │ └─ Implied maximum + * │ │ │ │ │ │ │ └─ Pivot 14 + * │ │ │ │ │ │ └─ Pivot 13 + * │ │ │ │ │ └─ Pivot 12 + * │ │ │ │ └─ Pivot 11 + * │ │ │ └─ Pivot 2 + * │ │ └─ Pivot 1 + * │ └─ Pivot 0 + * └─ Implied minimum + * + * Slot contents: + * Internal (non-leaf) nodes contain pointers to other nodes. + * Leaf nodes contain entries. + * + * The location of interest is often referred to as an offset. All offsets have + * a slot, but the last offset has an implied pivot from the node above (or + * ULONG_MAX for the root node). + * + * Ranges complicate certain write activities. When modifying any of + * the B-tree variants, it is known that one entry will either be added or + * deleted. When modifying the Maple Tree, one store operation may overwrite + * the entire data set, or one half of the tree, or the middle half of the tree. + * + */ + + +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#define MA_ROOT_PARENT 1 + +/* Maple state flags */ +#define MA_STATE_BULK 1 +#define MA_STATE_REBALANCE 2 + +#define ma_parent_ptr(x) ((struct maple_pnode *)(x)) +#define ma_mnode_ptr(x) ((struct maple_node *)(x)) +#define ma_enode_ptr(x) ((struct maple_enode *)(x)) +static struct kmem_cache *maple_node_cache; + +#ifdef CONFIG_DEBUG_MAPLE_TREE +static const unsigned long mt_max[] = { + [maple_dense] = MAPLE_NODE_SLOTS, + [maple_leaf_64] = ULONG_MAX, + [maple_range_64] = ULONG_MAX, + [maple_arange_64] = ULONG_MAX, +}; +#define mt_node_max(x) mt_max[mte_node_type(x)] +#endif + +static const unsigned char mt_slots[] = { + [maple_dense] = MAPLE_NODE_SLOTS, + [maple_leaf_64] = MAPLE_RANGE64_SLOTS, + [maple_range_64] = MAPLE_RANGE64_SLOTS, + [maple_arange_64] = MAPLE_ARANGE64_SLOTS, +}; +#define mt_slot_count(x) mt_slots[mte_node_type(x)] + +static const unsigned char mt_pivots[] = { + [maple_dense] = 0, + [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, + [maple_range_64] = 
MAPLE_RANGE64_SLOTS - 1, + [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, +}; +#define mt_pivot_count(x) mt_pivots[mte_node_type(x)] + +static const unsigned char mt_min_slots[] = { + [maple_dense] = MAPLE_NODE_SLOTS / 2, + [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, + [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, + [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, +}; +#define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] + +#define MAPLE_BIG_NODE_SLOTS (MAPLE_RANGE64_SLOTS * 2 + 2) +#define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) + +struct maple_big_node { + struct maple_pnode *parent; + unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; + union { + struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; + struct { + unsigned long padding[MAPLE_BIG_NODE_GAPS]; + unsigned long gap[MAPLE_BIG_NODE_GAPS]; + }; + }; + unsigned char b_end; + enum maple_type type; +}; + +/* + * The maple_subtree_state is used to build a tree to replace a segment of an + * existing tree in a more atomic way. Any walkers of the older tree will hit a + * dead node and restart on updates. 
+ */ +struct maple_subtree_state { + struct ma_state *orig_l; /* Original left side of subtree */ + struct ma_state *orig_r; /* Original right side of subtree */ + struct ma_state *l; /* New left side of subtree */ + struct ma_state *m; /* New middle of subtree (rare) */ + struct ma_state *r; /* New right side of subtree */ + struct ma_topiary *free; /* nodes to be freed */ + struct ma_topiary *destroy; /* Nodes to be destroyed (walked and freed) */ + struct maple_big_node *bn; +}; + +/* Functions */ +static inline struct maple_node *mt_alloc_one(gfp_t gfp) +{ + return kmem_cache_alloc(maple_node_cache, gfp | __GFP_ZERO); +} + +static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) +{ + return kmem_cache_alloc_bulk(maple_node_cache, gfp | __GFP_ZERO, size, + nodes); +} + +static inline void mt_free_bulk(size_t size, void __rcu **nodes) +{ + kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); +} + +static void mt_free_rcu(struct rcu_head *head) +{ + struct maple_node *node = container_of(head, struct maple_node, rcu); + + kmem_cache_free(maple_node_cache, node); +} + +/* + * ma_free_rcu() - Use rcu callback to free a maple node + * @node: The node to free + * + * The maple tree uses the parent pointer to indicate this node is no longer in + * use and will be freed. 
+ */ +static void ma_free_rcu(struct maple_node *node) +{ + node->parent = ma_parent_ptr(node); + call_rcu(&node->rcu, mt_free_rcu); +} + +static unsigned int mt_height(const struct maple_tree *mt) +{ + return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; +} + +static void mas_set_height(struct ma_state *mas) +{ + unsigned int new_flags = mas->tree->ma_flags; + + new_flags &= ~MT_FLAGS_HEIGHT_MASK; + BUG_ON(mas->depth > MAPLE_HEIGHT_MAX); + new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET; + mas->tree->ma_flags = new_flags; +} + +static unsigned int mas_mt_height(struct ma_state *mas) +{ + return mt_height(mas->tree); +} + +static inline enum maple_type mte_node_type(const struct maple_enode *entry) +{ + return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & + MAPLE_NODE_TYPE_MASK; +} + +static inline bool ma_is_dense(const enum maple_type type) +{ + return type < maple_leaf_64; +} + +static inline bool ma_is_leaf(const enum maple_type type) +{ + return type < maple_range_64; +} + +static inline bool mte_is_leaf(const struct maple_enode *entry) +{ + return ma_is_leaf(mte_node_type(entry)); +} + +/* + * We also reserve values with the bottom two bits set to '10' which are + * below 4096 + */ +static inline bool mt_is_reserved(const void *entry) +{ + return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && + xa_is_internal(entry); +} + +static inline void mas_set_err(struct ma_state *mas, long err) +{ + mas->node = MA_ERROR(err); +} + +static inline bool mas_is_ptr(struct ma_state *mas) +{ + return mas->node == MAS_ROOT; +} + +static inline bool mas_is_start(struct ma_state *mas) +{ + return mas->node == MAS_START; +} + +static inline bool mas_is_err(struct ma_state *mas) +{ + return xa_is_err(mas->node); +} + +static inline bool mas_searchable(struct ma_state *mas) +{ + if (mas_is_none(mas)) + return false; + + if (mas_is_ptr(mas)) + return false; + + return true; +} + +static inline struct maple_node *mte_to_node(const struct maple_enode 
*entry) +{ + return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); +} + +/* + * mte_to_mat() - Convert a maple encoded node to a maple topiary node. + * @entry: The maple encoded node + * + * Return: a maple topiary pointer + */ +static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry) +{ + return (struct maple_topiary *) + ((unsigned long)entry & ~MAPLE_NODE_MASK); +} + +/* + * mas_mn() - Get the maple state node. + * @mas: The maple state + * + * Return: the maple node (not encoded - bare pointer). + */ +static inline struct maple_node *mas_mn(const struct ma_state *mas) +{ + return mte_to_node(mas->node); +} + +/* + * mte_set_node_dead() - Set a maple encoded node as dead. + * @mn: The maple encoded node. + */ +static inline void mte_set_node_dead(struct maple_enode *mn) +{ + mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn)); + smp_wmb(); /* Needed for RCU */ +} + +/* Bit 1 indicates the root is a node */ +#define MAPLE_ROOT_NODE 0x02 +/* maple_type stored bit 3-6 */ +#define MAPLE_ENODE_TYPE_SHIFT 0x03 +/* Bit 2 means a NULL somewhere below */ +#define MAPLE_ENODE_NULL 0x04 + +static inline struct maple_enode *mt_mk_node(const struct maple_node *node, + enum maple_type type) +{ + return (void *)((unsigned long)node | + (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); +} + +static inline void *mte_mk_root(const struct maple_enode *node) +{ + return (void *)((unsigned long)node | MAPLE_ROOT_NODE); +} + +static inline void *mte_safe_root(const struct maple_enode *node) +{ + return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); +} + +static inline void mte_set_full(const struct maple_enode *node) +{ + node = (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); +} + +static inline void mte_clear_full(const struct maple_enode *node) +{ + node = (void *)((unsigned long)node | MAPLE_ENODE_NULL); +} + +static inline bool ma_is_root(struct maple_node *node) +{ + return ((unsigned long)node->parent & MA_ROOT_PARENT); +} 
+ +static inline bool mte_is_root(const struct maple_enode *node) +{ + return ma_is_root(mte_to_node(node)); +} + +static inline bool mas_is_root_limits(const struct ma_state *mas) +{ + return !mas->min && mas->max == ULONG_MAX; +} + +static inline bool mt_is_alloc(struct maple_tree *mt) +{ + return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); +} + +/* + * The Parent Pointer + * Excluding root, the parent pointer is 256B aligned like all other tree nodes. + * When storing 32 or 64 bit values, the offset can fit into 5 bits. The 16 + * bit values need an extra bit to store the offset. This extra bit comes from + * a reuse of the last bit in the node type. This is possible by using bit 1 to + * indicate if bit 2 is part of the type or the slot. + * + * Node types: + * 0b??1 = Root + * 0b?00 = 16 bit nodes + * 0b010 = 32 bit nodes + * 0b110 = 64 bit nodes + * + * Slot size and alignment + * 0b??1 : Root + * 0b?00 : 16 bit values, type in 0-1, slot in 2-7 + * 0b010 : 32 bit values, type in 0-2, slot in 3-7 + * 0b110 : 64 bit values, type in 0-2, slot in 3-7 + */ + +#define MAPLE_PARENT_ROOT 0x01 + +#define MAPLE_PARENT_SLOT_SHIFT 0x03 +#define MAPLE_PARENT_SLOT_MASK 0xF8 + +#define MAPLE_PARENT_16B_SLOT_SHIFT 0x02 +#define MAPLE_PARENT_16B_SLOT_MASK 0xFC + +#define MAPLE_PARENT_RANGE64 0x06 +#define MAPLE_PARENT_RANGE32 0x04 +#define MAPLE_PARENT_NOT_RANGE16 0x02 + +/* + * mte_parent_shift() - Get the parent shift for the slot storage. + * @parent: The parent pointer cast as an unsigned long + * Return: The shift into that pointer to the start of the slot + */ +static inline unsigned long mte_parent_shift(unsigned long parent) +{ + /* Note bit 1 == 0 means 16B */ + if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) + return MAPLE_PARENT_SLOT_SHIFT; + + return MAPLE_PARENT_16B_SLOT_SHIFT; +} + +/* + * mte_parent_slot_mask() - Get the slot mask for the parent. + * @parent: The parent pointer cast as an unsigned long. + * Return: The slot mask for that parent. 
+ */ +static inline unsigned long mte_parent_slot_mask(unsigned long parent) +{ + /* Note bit 1 == 0 means 16B */ + if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) + return MAPLE_PARENT_SLOT_MASK; + + return MAPLE_PARENT_16B_SLOT_MASK; +} + +/* + * mte_parent_enum() - Return the maple_type of the parent from the stored + * parent type. + * @p_enode: The parent pointer value stored in the child node + * @mt: The maple tree, used to tell allocation-range trees apart + * + * Return: The node->parent maple_type; 0 when the root bit is set or the + * encoded type is not MAPLE_PARENT_RANGE64. + */ +static inline +enum maple_type mte_parent_enum(struct maple_enode *p_enode, + struct maple_tree *mt) +{ + unsigned long p_type; + + p_type = (unsigned long)p_enode; + if (p_type & MAPLE_PARENT_ROOT) + return 0; /* Validated in the caller. */ + + p_type &= MAPLE_NODE_MASK; /* keep only the metadata bits */ + p_type = p_type & ~(MAPLE_PARENT_ROOT | mte_parent_slot_mask(p_type)); /* drop root and slot bits, leaving the type */ + + switch (p_type) { + case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ + if (mt_is_alloc(mt)) + return maple_arange_64; + return maple_range_64; + } + + return 0; +} + /* mas_parent_enum() - mte_parent_enum() applied to the parent pointer of @enode, using @mas->tree. */ +static inline +enum maple_type mas_parent_enum(struct ma_state *mas, struct maple_enode *enode) +{ + return mte_parent_enum(ma_enode_ptr(mte_to_node(enode)->parent), mas->tree); +} + +/* + * mte_set_parent() - Set the parent node and encode the slot + * @enode: The encoded maple node. + * @parent: The encoded maple node that is the parent of @enode. + * @slot: The slot that @enode resides in @parent. + * + * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the + * parent type.
+ */ +static inline +void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, + unsigned char slot) +{ + unsigned long val = (unsigned long) parent; + unsigned long shift; + unsigned long type; + enum maple_type p_type = mte_node_type(parent); + + BUG_ON(p_type == maple_dense); /* leaf types cannot be a parent */ + BUG_ON(p_type == maple_leaf_64); + + switch (p_type) { + case maple_range_64: + case maple_arange_64: + shift = MAPLE_PARENT_SLOT_SHIFT; + type = MAPLE_PARENT_RANGE64; + break; + default: + case maple_dense: + case maple_leaf_64: + shift = type = 0; /* only reachable for types not rejected by the BUG_ON()s above */ + break; + } + + val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */ + val |= (slot << shift) | type; + mte_to_node(enode)->parent = ma_parent_ptr(val); +} + +/* + * mte_parent_slot() - get the parent slot of @enode. + * @enode: The encoded maple node. + * + * Return: The slot in the parent node where @enode resides, 0 for the root. + */ +static inline unsigned int mte_parent_slot(const struct maple_enode *enode) +{ + unsigned long val = (unsigned long) mte_to_node(enode)->parent; + + /* Root. (bit 0, same value as MAPLE_PARENT_ROOT) */ + if (val & 1) + return 0; + + /* + * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost + * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT + */ + return (val & MAPLE_PARENT_16B_SLOT_MASK) >> mte_parent_shift(val); +} + +/* + * mte_parent() - Get the parent of @enode. + * @enode: The encoded maple node. + * + * Return: The parent maple node (parent pointer with MAPLE_NODE_MASK cleared). + */ +static inline struct maple_node *mte_parent(const struct maple_enode *enode) +{ + return (void *)((unsigned long) + (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); +} + +/* + * ma_dead_node() - check if the @node is dead. + * @node: The maple node (bare pointer, not encoded) + * + * A node is dead when its parent pointer, with the metadata bits masked off, + * refers to the node itself. + * + * Return: true if dead, false otherwise. + */ +static inline bool ma_dead_node(const struct maple_node *node) +{ + struct maple_node *parent = (void *)((unsigned long) + node->parent & ~MAPLE_NODE_MASK); + + return (parent == node); +} +/* + * mte_dead_node() - check if the @enode is dead.
+ * @enode: The encoded maple node + * + * Return: true if dead, false otherwise. + */ +static inline bool mte_dead_node(const struct maple_enode *enode) +{ + struct maple_node *parent, *node; + + node = mte_to_node(enode); + parent = mte_parent(enode); + return (parent == node); /* dead nodes have themselves as parent */ +} + +/* + * mas_allocated() - Get the number of nodes allocated in a maple state. + * @mas: The maple state + * + * The ma_state alloc member is overloaded to hold a pointer to the first + * allocated node or to the number of requested nodes to allocate. If bit 0 is + * set, then the alloc contains the number of requested nodes. If there is an + * allocated node, then the total allocated nodes is in that node. + * + * Return: The total number of nodes allocated + */ +static inline unsigned long mas_allocated(const struct ma_state *mas) +{ + if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) /* nothing allocated, or only an encoded request */ + return 0; + + return mas->alloc->total; +} + +/* + * mas_set_alloc_req() - Set the requested number of allocations. + * @mas: the maple state + * @count: the number of allocations. + * + * The requested number of allocations is either in the first allocated node, + * located in @mas->alloc->request_count, or directly in @mas->alloc if there is + * no allocated node. Set the request either in the node or do the necessary + * encoding to store in @mas->alloc directly. + */ +static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) +{ + if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { + if (!count) + mas->alloc = NULL; + else + mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); /* count in the upper bits, bit 0 marks "request, not pointer" */ + return; + } + + mas->alloc->request_count = count; +} + +/* + * mas_alloc_req() - get the requested number of allocations. + * @mas: The maple state + * + * The alloc count is either stored directly in @mas, or in + * @mas->alloc->request_count if there is at least one node allocated. Decode + * the request count if it's stored directly in @mas->alloc.
+ * + * Return: The allocation request count. + */ +static inline unsigned int mas_alloc_req(const struct ma_state *mas) +{ + if ((unsigned long)mas->alloc & 0x1) + return (unsigned long)(mas->alloc) >> 1; /* undo the encoding done by mas_set_alloc_req() */ + else if (mas->alloc) + return mas->alloc->request_count; + return 0; +} + +/* + * ma_pivots() - Get a pointer to the maple node pivots. + * @node: the maple node + * @type: the node type + * + * Return: A pointer to the maple node pivots, NULL for maple_dense. + */ +static inline unsigned long *ma_pivots(struct maple_node *node, + enum maple_type type) +{ + switch (type) { + case maple_arange_64: + return node->ma64.pivot; + case maple_range_64: + case maple_leaf_64: + return node->mr64.pivot; + case maple_dense: + return NULL; + } + return NULL; +} + +/* + * ma_gaps() - Get a pointer to the maple node gaps. + * @node: the maple node + * @type: the node type + * + * Return: A pointer to the maple node gaps, NULL for every type other than + * maple_arange_64. + */ +static inline unsigned long *ma_gaps(struct maple_node *node, + enum maple_type type) +{ + switch (type) { + case maple_arange_64: + return node->ma64.gap; + case maple_range_64: + case maple_leaf_64: + case maple_dense: + return NULL; + } + return NULL; +} + +/* + * mte_pivot() - Get the pivot at @piv of the maple encoded node. + * @mn: The maple encoded node. + * @piv: The pivot. + * + * Return: the pivot at @piv of @mn, or 0 (after a warning) when @piv is not a + * valid pivot index for the type of @mn. Dense nodes always yield 0. + */ +static inline unsigned long mte_pivot(const struct maple_enode *mn, + unsigned char piv) +{ + struct maple_node *node = mte_to_node(mn); + + /* mt_pivots[] is indexed by node type, not by pivot index. */ + if (piv >= mt_pivots[mte_node_type(mn)]) { + WARN_ON(1); + return 0; + } + switch (mte_node_type(mn)) { + case maple_arange_64: + return node->ma64.pivot[piv]; + case maple_range_64: + case maple_leaf_64: + return node->mr64.pivot[piv]; + case maple_dense: + return 0; + } + return 0; +} + +/* + * mas_safe_pivot() - get the pivot at @piv or mas->max.
+ * @mas: The maple state + * @pivots: The pointer to the maple node pivots + * @piv: The pivot to fetch + * @type: The maple node type + * + * Return: The pivot at @piv within the limit of the @pivots array, @mas->max + * otherwise. + */ +static inline unsigned long +mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, + unsigned char piv, enum maple_type type) +{ + if (piv >= mt_pivots[type]) /* past the stored pivots: the node maximum is implied */ + return mas->max; + + return pivots[piv]; +} + +/* + * mas_safe_min() - Return the minimum for a given offset. + * @mas: The maple state + * @pivots: The pointer to the maple node pivots + * @offset: The offset into the pivot array + * + * Return: The minimum range value that is contained in @offset. + */ +static inline unsigned long +mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) +{ + if (likely(offset)) + return pivots[offset - 1] + 1; /* one past the previous pivot */ + + return mas->min; /* offset 0 starts at the node minimum */ +} + +/* + * mas_logical_pivot() - Get the logical pivot of a given offset. + * @mas: The maple state + * @pivots: The pointer to the maple node pivots + * @offset: The offset into the pivot array + * @type: The maple node type + * + * When there is no value at a pivot (beyond the end of the data), then the + * pivot is actually @mas->max. A zero pivot at offset 0 is returned as is. + * + * Return: the logical pivot of a given @offset. + */ +static inline unsigned long +mas_logical_pivot(struct ma_state *mas, unsigned long *pivots, + unsigned char offset, enum maple_type type) +{ + unsigned long lpiv = mas_safe_pivot(mas, pivots, offset, type); + + if (likely(lpiv)) + return lpiv; + + if (likely(offset)) /* zero pivot at a non-zero offset: treated as unset */ + return mas->max; + + return lpiv; +} + +/* + * mte_set_pivot() - Set a pivot to a value in an encoded maple node.
+ * @mn: The encoded maple node + * @piv: The pivot offset + * @val: The value of the pivot + * + * BUG()s if @piv is outside mt_pivots[] for the node type; nothing is stored + * for maple_dense nodes. + */ +static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, + unsigned long val) +{ + struct maple_node *node = mte_to_node(mn); + enum maple_type type = mte_node_type(mn); + + BUG_ON(piv >= mt_pivots[type]); + switch (type) { + default: + case maple_range_64: + case maple_leaf_64: + node->mr64.pivot[piv] = val; + break; + case maple_arange_64: + node->ma64.pivot[piv] = val; + break; + case maple_dense: + break; + } + +} + +/* + * ma_slots() - Get a pointer to the maple node slots. + * @mn: The maple node + * @mt: The maple node type + * + * Unknown types fall through to the maple_arange_64 layout. + * + * Return: A pointer to the maple node slots + */ +static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) +{ + switch (mt) { + default: + case maple_arange_64: + return mn->ma64.slot; + case maple_range_64: + case maple_leaf_64: + return mn->mr64.slot; + case maple_dense: + return mn->slot; + } +} + /* + * mt_locked() - Lock condition handed to the RCU dereference helpers: + * mt_lock_is_held() for trees using an external lock, otherwise + * lockdep_is_held() on mt->ma_lock. + */ +static inline bool mt_locked(const struct maple_tree *mt) +{ + return mt_external_lock(mt) ? mt_lock_is_held(mt) : + lockdep_is_held(&mt->ma_lock); +} + /* mt_slot() - rcu_dereference_check() of slots[@offset] with mt_locked(@mt) as the condition. */ +static inline void *mt_slot(const struct maple_tree *mt, + void __rcu **slots, unsigned char offset) +{ + return rcu_dereference_check(slots[offset], mt_locked(mt)); +} + +/* + * mas_slot_locked() - Get the slot value when holding the maple tree lock. + * @mas: The maple state + * @slots: The pointer to the slots + * @offset: The offset into the slots array to fetch + * + * Return: The entry stored in @slots at the @offset. + */ +static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, + unsigned char offset) +{ + return rcu_dereference_protected(slots[offset], mt_locked(mas->tree)); +} + +/* + * mas_slot() - Get the slot value when not holding the maple tree lock.
+ * @mas: The maple state + * @slots: The pointer to the slots + * @offset: The offset into the slots array to fetch + * + * Return: The entry stored in @slots at the @offset + */ +static inline void *mas_slot(struct ma_state *mas, void __rcu **slots, + unsigned char offset) +{ + return mt_slot(mas->tree, slots, offset); +} + +/* + * mas_root() - Get the maple tree root. + * @mas: The maple state. + * + * Return: The pointer to the root of the tree + */ +static inline void *mas_root(struct ma_state *mas) +{ + return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); +} + /* mt_root_locked() - rcu_dereference_protected() of mt->ma_root with mt_locked(@mt) as the condition. */ +static inline void *mt_root_locked(struct maple_tree *mt) +{ + return rcu_dereference_protected(mt->ma_root, mt_locked(mt)); +} + +/* + * mas_root_locked() - Get the maple tree root when holding the maple tree lock. + * @mas: The maple state. + * + * Return: The pointer to the root of the tree + */ +static inline void *mas_root_locked(struct ma_state *mas) +{ + return mt_root_locked(mas->tree); +} + /* + * ma_meta() - Get a pointer to the metadata of a node. + * @mn: The maple node + * @mt: The maple node type + * + * Return: &mn->ma64.meta for maple_arange_64, &mn->mr64.meta for any other type. + */ +static inline struct maple_metadata *ma_meta(struct maple_node *mn, + enum maple_type mt) +{ + switch (mt) { + case maple_arange_64: + return &mn->ma64.meta; + default: + return &mn->mr64.meta; + } +} + +/* + * ma_set_meta() - Set the metadata information of a node. + * @mn: The maple node + * @mt: The maple node type + * @offset: The offset of the highest sub-gap in this node. + * @end: The end of the data in this node.
+ */ +static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, + unsigned char offset, unsigned char end) +{ + struct maple_metadata *meta = ma_meta(mn, mt); + + meta->gap = offset; + meta->end = end; +} + +/* + * ma_meta_end() - Get the data end of a node from the metadata + * @mn: The maple node + * @mt: The maple node type + * + * Return: The end value stored in the node metadata. + */ +static inline unsigned char ma_meta_end(struct maple_node *mn, + enum maple_type mt) +{ + struct maple_metadata *meta = ma_meta(mn, mt); + + return meta->end; +} + +/* + * ma_meta_gap() - Get the largest gap location of a node from the metadata + * @mn: The maple node + * @mt: The maple node type, must be maple_arange_64 (BUG otherwise) + * + * Return: The gap offset stored in the ma64 metadata. + */ +static inline unsigned char ma_meta_gap(struct maple_node *mn, + enum maple_type mt) +{ + BUG_ON(mt != maple_arange_64); + + return mn->ma64.meta.gap; +} + +/* + * ma_set_meta_gap() - Set the largest gap location in a nodes metadata + * @mn: The maple node + * @mt: The maple node type + * @offset: The location of the largest gap. + */ +static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, + unsigned char offset) +{ + + struct maple_metadata *meta = ma_meta(mn, mt); + + meta->gap = offset; +} + +/* + * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. + * @mat: the ma_topiary, a linked list of dead nodes. + * @dead_enode: the node to be marked as dead and added to the tail of the list + * + * Add the @dead_enode to the linked list in @mat. + */ +static inline void mat_add(struct ma_topiary *mat, + struct maple_enode *dead_enode) +{ + mte_set_node_dead(dead_enode); + mte_to_mat(dead_enode)->next = NULL; + if (!mat->tail) { /* empty list: the node is both head and tail */ + mat->tail = mat->head = dead_enode; + return; + } + + mte_to_mat(mat->tail)->next = dead_enode; + mat->tail = dead_enode; +} + +static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); +static inline void mas_free(struct ma_state *mas, struct maple_enode *used); + +/* + * mas_mat_free() - Free all nodes in a dead list.
+ * @mas: the maple state + * @mat: the ma_topiary linked list of dead nodes to free. + * + * Free walk a dead list: each node is passed to mas_free() and the list head + * is advanced until the list is empty. + */ +static void mas_mat_free(struct ma_state *mas, struct ma_topiary *mat) +{ + struct maple_enode *next; + + while (mat->head) { + next = mte_to_mat(mat->head)->next; /* read the link before the node is freed */ + mas_free(mas, mat->head); + mat->head = next; + } +} + +/* + * mas_mat_destroy() - Free all nodes and subtrees in a dead list. + * @mas: the maple state (not referenced in the body; the tree comes from + * @mat->mtree) + * @mat: the ma_topiary linked list of dead nodes to free. + * + * Destroy walk a dead list. + */ +static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) +{ + struct maple_enode *next; + + while (mat->head) { + next = mte_to_mat(mat->head)->next; /* read the link before the node is destroyed */ + mte_destroy_walk(mat->head, mat->mtree); + mat->head = next; + } +} +/* + * mas_descend() - Descend into the slot stored in the ma_state. + * @mas: the maple state. + * + * Updates @mas->min, @mas->max and @mas->node for the child at @mas->offset. + * + * Note: Not RCU safe, only use in write side or debug code. + */ +static inline void mas_descend(struct ma_state *mas) +{ + enum maple_type type; + unsigned long *pivots; + struct maple_node *node; + void __rcu **slots; + + node = mas_mn(mas); + type = mte_node_type(mas->node); + pivots = ma_pivots(node, type); + slots = ma_slots(node, type); + + if (mas->offset) /* offset 0 keeps the current minimum */ + mas->min = pivots[mas->offset - 1] + 1; + mas->max = mas_safe_pivot(mas, pivots, mas->offset, type); + mas->node = mas_slot(mas, slots, mas->offset); +} + +/* + * mte_set_gap() - Set a maple node gap. + * @mn: The encoded maple node + * @gap: The offset of the gap to set + * @val: The gap value + * + * Does nothing for node types other than maple_arange_64. + */ +static inline void mte_set_gap(const struct maple_enode *mn, + unsigned char gap, unsigned long val) +{ + switch (mte_node_type(mn)) { + default: + break; + case maple_arange_64: + mte_to_node(mn)->ma64.gap[gap] = val; + break; + } +} + +/* + * mas_ascend() - Walk up a level of the tree. + * @mas: The maple state + * + * Sets the @mas->max and @mas->min to the correct values when walking up.
This + * may cause several levels of walking up to find the correct min and max. + * May find a dead node which will cause a premature return. + * When @mas is already at the root only @mas->offset is reset to 0. + * Return: 1 on dead node, 0 otherwise + */ +static int mas_ascend(struct ma_state *mas) +{ + struct maple_enode *p_enode; /* parent enode. */ + struct maple_enode *a_enode; /* ancestor enode. */ + struct maple_node *a_node; /* ancestor node. */ + struct maple_node *p_node; /* parent node. */ + unsigned char a_slot; + enum maple_type a_type; + unsigned long min, max; + unsigned long *pivots; + unsigned char offset; + bool set_max = false, set_min = false; + + a_node = mas_mn(mas); + if (ma_is_root(a_node)) { + mas->offset = 0; + return 0; + } + + p_node = mte_parent(mas->node); + if (unlikely(a_node == p_node)) /* parent points at itself: dead node */ + return 1; + a_type = mas_parent_enum(mas, mas->node); + offset = mte_parent_slot(mas->node); + a_enode = mt_mk_node(p_node, a_type); + + /* Check to make sure all parent information is still accurate */ + if (p_node != mte_parent(mas->node)) + return 1; + + mas->node = a_enode; + mas->offset = offset; + + if (mte_is_root(a_enode)) { /* the root covers the whole range */ + mas->max = ULONG_MAX; + mas->min = 0; + return 0; + } + + min = 0; + max = ULONG_MAX; + do { /* climb until both limits were found in some ancestor or the root is hit */ + p_enode = a_enode; + a_type = mas_parent_enum(mas, p_enode); + a_node = mte_parent(p_enode); + a_slot = mte_parent_slot(p_enode); + pivots = ma_pivots(a_node, a_type); + a_enode = mt_mk_node(a_node, a_type); + + if (!set_min && a_slot) { + set_min = true; + min = pivots[a_slot - 1] + 1; + } + + if (!set_max && a_slot < mt_pivots[a_type]) { + set_max = true; + max = pivots[a_slot]; + } + + if (unlikely(ma_dead_node(a_node))) + return 1; + + if (unlikely(ma_is_root(a_node))) + break; + + } while (!set_min || !set_max); + + mas->max = max; + mas->min = min; + return 0; +} + +/* + * mas_pop_node() - Get a previously allocated maple node from the maple state. + * @mas: The maple state + * + * Return: A pointer to a maple node.
+ */ +static inline struct maple_node *mas_pop_node(struct ma_state *mas) +{ + struct maple_alloc *ret, *node = mas->alloc; + unsigned long total = mas_allocated(mas); + + /* nothing or a request pending. */ + if (unlikely(!total)) + return NULL; + + if (total == 1) { + /* single allocation in this ma_state */ + mas->alloc = NULL; + ret = node; + goto single_node; + } + + if (!node->node_count) { + /* Single allocation in this node. */ + mas->alloc = node->slot[0]; /* the next list node becomes the head */ + node->slot[0] = NULL; + mas->alloc->total = node->total - 1; + ret = node; + goto new_head; + } + + node->total--; + ret = node->slot[node->node_count]; /* take the last filled slot of the head */ + node->slot[node->node_count--] = NULL; + +single_node: +new_head: + ret->total = 0; /* clear the allocation bookkeeping before handing the node out */ + ret->node_count = 0; + if (ret->request_count) { + mas_set_alloc_req(mas, ret->request_count + 1); /* keep the pending request, incremented by one */ + ret->request_count = 0; + } + return (struct maple_node *)ret; +} + +/* + * mas_push_node() - Push a node back on the maple state allocation. + * @mas: The maple state + * @used: The used maple node + * + * Stores the maple node back into @mas->alloc for reuse. Updates allocated and + * requested node count as necessary.
+ */ +static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) +{ + struct maple_alloc *reuse = (struct maple_alloc *)used; + struct maple_alloc *head = mas->alloc; + unsigned long count; + unsigned int requested = mas_alloc_req(mas); + + memset(reuse, 0, sizeof(*reuse)); + count = mas_allocated(mas); + + if (count && (head->node_count < MAPLE_ALLOC_SLOTS - 1)) { + if (head->slot[0]) + head->node_count++; + head->slot[head->node_count] = reuse; + head->total++; + goto done; + } + + reuse->total = 1; + if ((head) && !((unsigned long)head & 0x1)) { + head->request_count = 0; + reuse->slot[0] = head; + reuse->total += head->total; + } + + mas->alloc = reuse; +done: + if (requested > 1) + mas_set_alloc_req(mas, requested - 1); +} + +/* + * mas_alloc_nodes() - Allocate nodes into a maple state + * @mas: The maple state + * @gfp: The GFP Flags + */ +static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +{ + struct maple_alloc *node; + struct maple_alloc **nodep = &mas->alloc; + unsigned long allocated = mas_allocated(mas); + unsigned long success = allocated; + unsigned int requested = mas_alloc_req(mas); + unsigned int count; + void **slots = NULL; + unsigned int max_req = 0; + + if (!requested) + return; + + mas_set_alloc_req(mas, 0); + if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS - 1) { + node = (struct maple_alloc *)mt_alloc_one(gfp); + if (!node) + goto nomem_one; + + if (allocated) + node->slot[0] = mas->alloc; + + success++; + mas->alloc = node; + requested--; + } + + node = mas->alloc; + while (requested) { + max_req = MAPLE_NODE_SLOTS - 1; + if (node->slot[0]) { + unsigned int offset = node->node_count + 1; + + slots = (void **)&node->slot[offset]; + max_req -= offset; + } else { + slots = (void **)&node->slot; + } + + max_req = min(requested, max_req); + count = mt_alloc_bulk(gfp, max_req, slots); + if (!count) + goto nomem_bulk; + + node->node_count += count; + /* zero indexed. 
*/ + if (slots == (void **)&node->slot) + node->node_count--; + + success += count; + nodep = &node->slot[0]; + node = *nodep; + requested -= count; + } + mas->alloc->total = success; + return; + +nomem_bulk: + /* Clean up potential freed allocations on bulk failure */ + memset(slots, 0, max_req * sizeof(unsigned long)); +nomem_one: + mas_set_alloc_req(mas, requested); + if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) + mas->alloc->total = success; + mas_set_err(mas, -ENOMEM); + return; + +} + +/* + * mas_free() - Free an encoded maple node + * @mas: The maple state + * @used: The encoded maple node to free. + * + * Uses rcu free if necessary, pushes @used back on the maple state allocations + * otherwise. + */ +static inline void mas_free(struct ma_state *mas, struct maple_enode *used) +{ + struct maple_node *tmp = mte_to_node(used); + + if (mt_in_rcu(mas->tree)) + ma_free_rcu(tmp); + else + mas_push_node(mas, tmp); +} + +/* + * mas_node_count() - Check if enough nodes are allocated and request more if + * there is not enough nodes. + * @mas: The maple state + * @count: The number of nodes needed + * @gfp: the gfp flags + */ +static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) +{ + unsigned long allocated = mas_allocated(mas); + + if (allocated < count) { + mas_set_alloc_req(mas, count - allocated); + mas_alloc_nodes(mas, gfp); + } +} + +/* + * mas_node_count() - Check if enough nodes are allocated and request more if + * there is not enough nodes. + * @mas: The maple state + * @count: The number of nodes needed + * + * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. + */ +static void mas_node_count(struct ma_state *mas, int count) +{ + return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); +} + +/* + * mas_start() - Sets up maple state for operations. + * @mas: The maple state. + * + * If mas->node == MAS_START, then set the min, max, depth, and offset to + * defaults. 
+ * + * Return: + * - If mas->node is an error or not MAS_START, return NULL. + * - If it's an empty tree: NULL & mas->node == MAS_NONE + * - If it's a single entry: mas->node == MAS_ROOT; the entry is returned when + * mas->index is 0, NULL otherwise. + * - If it's a tree: NULL & mas->node == safe root node. + */ +static inline struct maple_enode *mas_start(struct ma_state *mas) +{ + if (likely(mas_is_start(mas))) { + struct maple_enode *root; + + mas->node = MAS_NONE; + mas->min = 0; + mas->max = ULONG_MAX; + mas->depth = 0; + mas->offset = 0; + + root = mas_root(mas); + /* Tree with nodes */ + if (likely(xa_is_node(root))) { + mas->node = mte_safe_root(root); + return NULL; + } + + /* empty tree */ + if (unlikely(!root)) { + mas->offset = MAPLE_NODE_SLOTS; + return NULL; + } + + /* Single entry tree */ + mas->node = MAS_ROOT; + mas->offset = MAPLE_NODE_SLOTS; + + /* The single entry is only returned for index 0. */ + if (mas->index > 0) + return NULL; + + return root; + } + + return NULL; +} + +/* + * ma_data_end() - Find the end of the data in a node. + * @node: The maple node + * @type: The maple node type + * @pivots: The array of pivots in the node + * @max: The maximum value in the node + * + * Uses metadata to find the end of the data when possible. + * Return: The zero indexed last slot with data (may be null). + */ +static inline unsigned char ma_data_end(struct maple_node *node, + enum maple_type type, + unsigned long *pivots, + unsigned long max) +{ + unsigned char offset; + + if (type == maple_arange_64) + return ma_meta_end(node, type); + + offset = mt_pivots[type] - 1; + if (likely(!pivots[offset])) /* last pivot unset: the end comes from the metadata */ + return ma_meta_end(node, type); + + if (likely(pivots[offset] == max)) /* last pivot is the node maximum: data ends there */ + return offset; + + return mt_pivots[type]; /* every pivot is in use: data ends in the final slot */ +} + +/* + * mas_data_end() - Find the end of the data (slot). + * @mas: the maple state + * + * This method is optimized to check the metadata of a node if the node type + * supports data end metadata. + * + * Return: The zero indexed last slot with data (may be null).
+ */ +static inline unsigned char mas_data_end(struct ma_state *mas) +{ + enum maple_type type; + struct maple_node *node; + unsigned char offset; + unsigned long *pivots; + + type = mte_node_type(mas->node); + node = mas_mn(mas); + if (type == maple_arange_64) + return ma_meta_end(node, type); + + pivots = ma_pivots(node, type); + offset = mt_pivots[type] - 1; + if (likely(!pivots[offset])) /* last pivot unset: the end comes from the metadata */ + return ma_meta_end(node, type); + + if (likely(pivots[offset] == mas->max)) + return offset; + + return mt_pivots[type]; +} + +/* + * mas_leaf_max_gap() - Returns the largest gap in a leaf node + * @mas: the maple state + * + * Dense nodes are scanned slot by slot for the longest run of empty slots; + * other leaves derive each gap from the pivots surrounding an empty slot. + * + * Return: The maximum gap in the leaf. + */ +static unsigned long mas_leaf_max_gap(struct ma_state *mas) +{ + enum maple_type mt; + unsigned long pstart, gap, max_gap; + struct maple_node *mn; + unsigned long *pivots; + void __rcu **slots; + unsigned char i; + unsigned char max_piv; + + mt = mte_node_type(mas->node); + mn = mas_mn(mas); + slots = ma_slots(mn, mt); + max_gap = 0; + if (unlikely(ma_is_dense(mt))) { + gap = 0; + for (i = 0; i < mt_slots[mt]; i++) { + if (slots[i]) { + if (gap > max_gap) + max_gap = gap; + gap = 0; + } else { + gap++; + } + } + if (gap > max_gap) /* account for a run of empty slots at the end */ + max_gap = gap; + return max_gap; + } + + /* + * Check the first implied pivot optimizes the loop below and slot 1 may + * be skipped if there is a gap in slot 0. + */ + pivots = ma_pivots(mn, mt); + if (likely(!slots[0])) { + max_gap = pivots[0] - mas->min + 1; + i = 2; + } else { + i = 1; + } + + /* reduce max_piv as the special case is checked before the loop */ + max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1; /* NOTE(review): wraps to 255 if ma_data_end() returns 0 - confirm that cannot happen here */ + /* + * Check end implied pivot which can only be a gap on the right most + * node. + */ + if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) { + gap = ULONG_MAX - pivots[max_piv]; + if (gap > max_gap) + max_gap = gap; + } + + for (; i <= max_piv; i++) { + /* data == no gap.
*/ + if (likely(slots[i])) + continue; + + pstart = pivots[i - 1]; + gap = pivots[i] - pstart; + if (gap > max_gap) + max_gap = gap; + + /* There cannot be two gaps in a row. */ + i++; + } + return max_gap; +} + +/* + * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) + * @node: The maple node + * @gaps: The pointer to the gaps + * @mt: The maple node type + * @off: Pointer to store the offset location of the gap; always written. + * + * Uses the metadata data end to scan backwards across set gaps. + * + * Return: The maximum gap value + */ +static inline unsigned long +ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, + unsigned char *off) +{ + unsigned char offset, i; + unsigned long max_gap = 0; + + i = offset = ma_meta_end(node, mt); + do { + if (gaps[i] > max_gap) { + max_gap = gaps[i]; + offset = i; + } + } while (i--); /* post-decrement: index 0 is visited before the loop ends */ + + *off = offset; + return max_gap; +} + +/* + * mas_max_gap() - find the largest gap in the node of the maple state. + * @mas: The maple state. + * + * Leaf nodes are handed to mas_leaf_max_gap(). For other nodes the gap is read + * at the offset recorded in the metadata. + * If the metadata gap is set to MAPLE_ARANGE64_META_MAX, there is no gap. + * + * Return: The gap value. + */ +static inline unsigned long mas_max_gap(struct ma_state *mas) +{ + unsigned long *gaps; + unsigned char offset; + enum maple_type mt; + struct maple_node *node; + + mt = mte_node_type(mas->node); + if (ma_is_leaf(mt)) + return mas_leaf_max_gap(mas); + + node = mas_mn(mas); + offset = ma_meta_gap(node, mt); + if (offset == MAPLE_ARANGE64_META_MAX) + return 0; + + gaps = ma_gaps(node, mt); + return gaps[offset]; +} + +/* + * mas_parent_gap() - Set the parent gap and any gaps above, as needed + * @mas: The maple state + * @offset: The gap offset in the parent to set + * @new: The new gap value. + * + * Set the parent gap then continue to set the gap upwards, using the metadata + * of the parent to see if it is necessary to check the node above.
+ */ +static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, + unsigned long new) +{ + unsigned long meta_gap = 0; + struct maple_node *pnode; + struct maple_enode *penode; + unsigned long *pgaps; + unsigned char meta_offset; + enum maple_type pmt; + + pnode = mte_parent(mas->node); + pmt = mas_parent_enum(mas, mas->node); + penode = mt_mk_node(pnode, pmt); + pgaps = ma_gaps(pnode, pmt); + +ascend: + meta_offset = ma_meta_gap(pnode, pmt); + if (meta_offset == MAPLE_ARANGE64_META_MAX) + meta_gap = 0; + else + meta_gap = pgaps[meta_offset]; + + pgaps[offset] = new; + + if (meta_gap == new) /* the recorded largest gap is unchanged: nothing to propagate */ + return; + + if (offset != meta_offset) { + if (meta_gap > new) /* another slot still holds the largest gap */ + return; + + ma_set_meta_gap(pnode, pmt, offset); + } else if (new < meta_gap) { /* the largest gap shrank: rescan this node */ + meta_offset = 15; /* NOTE(review): overwritten by ma_max_gap(), which always stores to its last argument */ + new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); + ma_set_meta_gap(pnode, pmt, meta_offset); + } + + if (ma_is_root(pnode)) + return; + + /* Go to the parent node. */ + pnode = mte_parent(penode); + pmt = mas_parent_enum(mas, penode); + pgaps = ma_gaps(pnode, pmt); + offset = mte_parent_slot(penode); /* must be read before penode is re-encoded below */ + penode = mt_mk_node(pnode, pmt); + goto ascend; +} + +/* + * mas_update_gap() - Update a nodes gaps and propagate up if necessary. + * @mas: the maple state. + * + * Does nothing unless the tree tracks gaps (mt_is_alloc()) and the node is + * not the root. + */ +static inline void mas_update_gap(struct ma_state *mas) +{ + unsigned char pslot; + unsigned long p_gap; + unsigned long max_gap; + + if (!mt_is_alloc(mas->tree)) + return; + + if (mte_is_root(mas->node)) + return; + + max_gap = mas_max_gap(mas); + + pslot = mte_parent_slot(mas->node); + p_gap = ma_gaps(mte_parent(mas->node), + mas_parent_enum(mas, mas->node))[pslot]; + + if (p_gap != max_gap) /* only touch the parent when its recorded gap is stale */ + mas_parent_gap(mas, pslot, max_gap); +} + +/* + * mas_adopt_children() - Set the parent pointer of all nodes in @parent to + * @parent with the slot encoded. + * @mas - the maple state (for the tree) + * @parent - the maple encoded node containing the children.
+ */ +static inline void mas_adopt_children(struct ma_state *mas, + struct maple_enode *parent) +{ + enum maple_type type = mte_node_type(parent); + struct maple_node *node = mas_mn(mas); /* slots are read from the node in @mas; @parent supplies the type and the value stored in each child */ + void __rcu **slots = ma_slots(node, type); + unsigned long *pivots = ma_pivots(node, type); + struct maple_enode *child; + unsigned char offset; + + offset = ma_data_end(node, type, pivots, mas->max); + do { + child = mas_slot_locked(mas, slots, offset); + mte_set_parent(child, parent, offset); + } while (offset--); /* post-decrement: slot 0 is handled before the loop ends */ +} + +/* + * mas_replace() - Replace a maple node in the tree with mas->node. Uses the + * parent encoding to locate the maple node in the tree. + * @mas: the ma_state to use for operations. + * @advanced: boolean to adopt the child nodes and free the old node (false) or + * leave the node (true) and handle the adoption and free elsewhere. + * + * NOTE(review): the annotation below names mas->tree->lock while mt_locked() + * checks the ma_lock member - confirm which member name is intended. + */ +static inline void mas_replace(struct ma_state *mas, bool advanced) + __must_hold(mas->tree->lock) +{ + struct maple_node *mn = mas_mn(mas); + struct maple_enode *old_enode; + unsigned char offset = 0; + void __rcu **slots = NULL; + + if (ma_is_root(mn)) { + old_enode = mas_root_locked(mas); + } else { + offset = mte_parent_slot(mas->node); + slots = ma_slots(mte_parent(mas->node), + mas_parent_enum(mas, mas->node)); + old_enode = mas_slot_locked(mas, slots, offset); + } + + if (!advanced && !mte_is_leaf(mas->node)) + mas_adopt_children(mas, mas->node); + + if (mte_is_root(mas->node)) { + mn->parent = ma_parent_ptr( + ((unsigned long)mas->tree | MA_ROOT_PARENT)); + rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); + mas_set_height(mas); + } else { + rcu_assign_pointer(slots[offset], mas->node); /* publish the new node in the parent's slot */ + } + + if (!advanced) + mas_free(mas, old_enode); +} + +/* + * mas_new_child() - Find the new child of a node. + * @mas: the maple state + * @child: the maple state to store the child.
+ */
+static inline bool mas_new_child(struct ma_state *mas, struct ma_state *child)
+	__must_hold(mas->tree->lock)
+{
+	enum maple_type mt;
+	unsigned char offset;
+	unsigned char end;
+	unsigned long *pivots;
+	struct maple_enode *entry;
+	struct maple_node *node;
+	void __rcu **slots;
+
+	mt = mte_node_type(mas->node);
+	node = mas_mn(mas);
+	slots = ma_slots(node, mt);
+	pivots = ma_pivots(node, mt);
+	end = ma_data_end(node, mt, pivots, mas->max);
+	for (offset = mas->offset; offset <= end; offset++) {
+		entry = mas_slot_locked(mas, slots, offset);
+		if (mte_parent(entry) == node) { /* parent pointer already refers to this node */
+			*child = *mas;
+			mas->offset = offset + 1; /* a later call resumes after this slot */
+			child->offset = offset;
+			mas_descend(child);
+			child->offset = 0;
+			return true;
+		}
+	}
+	return false;
+}
+
+/*
+ * mab_shift_right() - Shift the data in mab right. Note, does not clean out the
+ * old data or set b_node->b_end.
+ * @b_node: the maple_big_node
+ * @shift: the shift count
+ *
+ * The gap array is only shifted when @b_node->type is maple_arange_64.
+ */
+static inline void mab_shift_right(struct maple_big_node *b_node,
+				 unsigned char shift)
+{
+	unsigned long size = b_node->b_end * sizeof(unsigned long);
+
+	memmove(b_node->pivot + shift, b_node->pivot, size);
+	memmove(b_node->slot + shift, b_node->slot, size); /* NOTE(review): assumes slot entries are sizeof(unsigned long) wide - confirm */
+	if (b_node->type == maple_arange_64)
+		memmove(b_node->gap + shift, b_node->gap, size);
+}
+
+/*
+ * mab_middle_node() - Check if a middle node is needed (unlikely)
+ * @b_node: the maple_big_node that contains the data; the amount of data is
+ * read from @b_node->b_end.
+ * @split: the potential split location
+ * @slot_count: the size that can be stored in a single node being considered.
+ *
+ * Return: true if a middle node is required.
+ */
+static inline bool mab_middle_node(struct maple_big_node *b_node, int split,
+				   unsigned char slot_count)
+{
+	unsigned char size = b_node->b_end;
+
+	if (size >= 2 * slot_count)
+		return true;
+
+	if (!b_node->slot[split] && (size >= 2 * slot_count - 1)) /* one slot short, and the split sits on a NULL */
+		return true;
+
+	return false;
+}
+
+/*
+ * mab_no_null_split() - ensure the split doesn't fall on a NULL
+ * @b_node: the maple_big_node with the data
+ * @split: the suggested split location
+ * @slot_count: the number of slots in the node being considered.
+ *
+ * Return: the split location, moved by one in either direction if the
+ * suggested location held a NULL.
+ */
+static inline int mab_no_null_split(struct maple_big_node *b_node,
+				    unsigned char split, unsigned char slot_count)
+{
+	if (!b_node->slot[split]) {
+		/*
+		 * If the split is less than the max slot && the right side will
+		 * still be sufficient, then increment the split on NULL.
+		 * Otherwise the split is moved one slot to the left.
+		 */
+		if ((split < slot_count - 1) &&
+		    (b_node->b_end - split) > (mt_min_slots[b_node->type]))
+			split++;
+		else
+			split--;
+	}
+	return split;
+}
+
+/*
+ * mab_calc_split() - Calculate the split location and if there needs to be two
+ * splits.
+ * @mas: The maple state; MA_STATE_BULK in @mas->mas_flags selects the bulk
+ * split, which also sets MA_STATE_REBALANCE for leaf data.
+ * @bn: The maple_big_node with the data
+ * @mid_split: The second split, if required. 0 otherwise.
+ * @min: The value subtracted from the pivot at the split when checking how
+ * large a range the left side would cover.
+ *
+ * Return: The first split location. The middle split is set in @mid_split.
+ */
+static inline int mab_calc_split(struct ma_state *mas,
+	 struct maple_big_node *bn, unsigned char *mid_split, unsigned long min)
+{
+	unsigned char b_end = bn->b_end;
+	int split = b_end / 2; /* Assume equal split. */
+	unsigned char slot_min, slot_count = mt_slots[bn->type];
+
+	/*
+	 * To support gap tracking, all NULL entries are kept together and a node cannot
+	 * end on a NULL entry, with the exception of the left-most leaf. The
+	 * limitation means that the split of a node must be checked for this condition
+	 * and be able to put more data in one direction or the other.
+	 */
+	if (unlikely((mas->mas_flags & MA_STATE_BULK))) {
+		*mid_split = 0;
+		if (ma_is_leaf(bn->type))
+			slot_min = 2;
+		else
+			return b_end - mt_min_slots[bn->type];
+
+		split = b_end - slot_min;
+		mas->mas_flags |= MA_STATE_REBALANCE;
+		if (!bn->slot[split])
+			split--;
+		return split;
+	}
+
+	/*
+	 * Although extremely rare, it is possible to enter what is known as the 3-way
+	 * split scenario. The 3-way split comes about by means of a store of a range
+	 * that overwrites the end and beginning of two full nodes. The result is a set
+	 * of entries that cannot be stored in 2 nodes. Sometimes, these two nodes can
+	 * also be located in different parent nodes which are also full. This can
+	 * carry upwards all the way to the root in the worst case.
+	 */
+	if (unlikely(mab_middle_node(bn, split, slot_count))) {
+		split = b_end / 3;
+		*mid_split = split * 2;
+	} else {
+		slot_min = mt_min_slots[bn->type];
+
+		*mid_split = 0;
+		/*
+		 * Avoid having a range less than the slot count unless it
+		 * causes one node to be deficient.
+		 * NOTE: mt_min_slots is 1 based, b_end and split are zero.
+		 */
+		while (((bn->pivot[split] - min) < slot_count - 1) &&
+		       (split < slot_count - 1) && (b_end - split > slot_min))
+			split++;
+	}
+
+	/* Avoid ending a node on a NULL entry */
+	split = mab_no_null_split(bn, split, slot_count);
+	if (!(*mid_split))
+		return split;
+
+	*mid_split = mab_no_null_split(bn, *mid_split, slot_count);
+
+	return split;
+}
+
+/*
+ * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node
+ * and set @b_node->b_end to the next free slot.
+ * @mas: The maple state
+ * @mas_start: The starting slot to copy
+ * @mas_end: The end slot to copy (inclusively)
+ * @b_node: The maple_big_node to place the data
+ * @mab_start: The starting location in maple_big_node to store the data.
+ */
+static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start,
+			unsigned char mas_end, struct maple_big_node *b_node,
+			unsigned char mab_start)
+{
+	enum maple_type mt;
+	struct maple_node *node;
+	void __rcu **slots;
+	unsigned long *pivots, *gaps;
+	int i = mas_start, j = mab_start; /* i indexes the node, j indexes @b_node */
+	unsigned char piv_end;
+
+	node = mas_mn(mas);
+	mt = mte_node_type(mas->node);
+	pivots = ma_pivots(node, mt);
+	if (!i) { /* pivot 0 is copied without the zero-pivot check used in the loop below */
+		b_node->pivot[j] = pivots[i++];
+		if (unlikely(i > mas_end))
+			goto complete;
+		j++;
+	}
+
+	piv_end = min(mas_end, mt_pivots[mt]);
+	for (; i < piv_end; i++, j++) {
+		b_node->pivot[j] = pivots[i];
+		if (unlikely(!b_node->pivot[j])) /* a zero pivot stops the copy */
+			break;
+
+		if (unlikely(mas->max == b_node->pivot[j])) /* reached the node's max */
+			goto complete;
+	}
+
+	if (likely(i <= mas_end))
+		b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt);
+
+complete:
+	b_node->b_end = ++j; /* one past the last pivot written */
+	j -= mab_start; /* j is now the number of entries copied */
+	slots = ma_slots(node, mt);
+	memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j);
+	if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) { /* gaps are only copied for non-leaf nodes of allocation trees */
+		gaps = ma_gaps(node, mt);
+		memcpy(b_node->gap + mab_start, gaps + mas_start,
+		       sizeof(unsigned long) * j);
+	}
+}
+
+/*
+ * mas_leaf_set_meta() - Set the metadata of a leaf if possible.
+ * @mas: The maple state
+ * @node: The maple node
+ * @pivots: pointer to the maple node pivots
+ * @mt: The maple type
+ * @end: The assumed end
+ *
+ * Note, end may be incremented within this function but not modified at the
+ * source. This is fine since the metadata is the last thing to be stored in a
+ * node during a write.
+ */
+static inline void mas_leaf_set_meta(struct ma_state *mas,
+		struct maple_node *node, unsigned long *pivots,
+		enum maple_type mt, unsigned char end)
+{
+	/* There is no room for metadata already */
+	if (mt_pivots[mt] <= end)
+		return;
+
+	if (pivots[end] && pivots[end] < mas->max) /* one more entry follows the last non-zero pivot */
+		end++;
+
+	if (end < mt_slots[mt] - 1)
+		ma_set_meta(node, mt, 0, end);
+}
+
+/*
+ * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node.
+ * @b_node: the maple_big_node that has the data
+ * @mab_start: the start location in @b_node.
+ * @mab_end: The end location in @b_node (inclusively)
+ * @mas: The maple state with the maple encoded node.
+ * @new_max: When true, @mas->max is set to the last pivot copied.
+ *
+ * After the copy the node metadata is written: for non-leaf nodes of an
+ * allocation tree the gaps are copied as well and the offset of the largest
+ * gap is stored, otherwise mas_leaf_set_meta() is used.
+ */
+static inline void mab_mas_cp(struct maple_big_node *b_node,
+			      unsigned char mab_start, unsigned char mab_end,
+			      struct ma_state *mas, bool new_max)
+{
+	int i, j = 0; /* i indexes @b_node, j indexes the destination node */
+	enum maple_type mt = mte_node_type(mas->node);
+	struct maple_node *node = mte_to_node(mas->node);
+	void __rcu **slots = ma_slots(node, mt);
+	unsigned long *pivots = ma_pivots(node, mt);
+	unsigned long *gaps = NULL;
+	unsigned char end;
+
+	if (mab_end - mab_start > mt_pivots[mt])
+		mab_end--;
+
+	if (!pivots[mt_pivots[mt] - 1])
+		slots[mt_pivots[mt]] = NULL;
+
+	i = mab_start;
+	do {
+		pivots[j++] = b_node->pivot[i++];
+	} while (i <= mab_end && likely(b_node->pivot[i]));
+
+	memcpy(slots, b_node->slot + mab_start,
+	       sizeof(void *) * (i - mab_start));
+
+	if (new_max)
+		mas->max = b_node->pivot[i - 1];
+
+	end = j - 1;
+	if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) {
+		unsigned long max_gap = 0;
+		unsigned char offset = 15; /* NOTE(review): looks like a "no gap" sentinel, stored if every gap is 0 - confirm */
+
+		gaps = ma_gaps(node, mt);
+		do { /* copy the gaps backwards while tracking the largest one */
+			gaps[--j] = b_node->gap[--i];
+			if (gaps[j] > max_gap) {
+				offset = j;
+				max_gap = gaps[j];
+			}
+		} while (j);
+
+		ma_set_meta(node, mt, offset, end);
+	} else {
+		mas_leaf_set_meta(mas, node, pivots, mt, end);
+	}
+}
+
+/*
+ * mas_descend_adopt() - Descend through a sub-tree and adopt children.
+ * @mas: the maple state with the maple encoded node of the sub-tree.
+ *
+ * Descend through a sub-tree and adopt children who do not have the correct
+ * parents set. Follow the parents which have the correct parents as they are
+ * the new entries which need to be followed to find other incorrectly set
+ * parents.
+ */
+static inline void mas_descend_adopt(struct ma_state *mas)
+{
+	struct ma_state list[3], next[3]; /* list: nodes of this level, next: nodes found for the level below */
+	int i, n;
+
+	/*
+	 * At each level there may be up to 3 correct parent pointers which indicates
+	 * the new nodes which need to be walked to find any new nodes at a lower level.
+	 */
+
+	for (i = 0; i < 3; i++) {
+		list[i] = *mas;
+		list[i].offset = 0;
+		next[i].offset = 0;
+	}
+	next[0] = *mas;
+
+	while (!mte_is_leaf(list[0].node)) {
+		n = 0;
+		for (i = 0; i < 3; i++) {
+			if (mas_is_none(&list[i]))
+				continue;
+
+			if (i && list[i-1].node == list[i].node) /* same node as the previous entry: already handled */
+				continue;
+
+			while ((n < 3) && (mas_new_child(&list[i], &next[n])))
+				n++;
+
+			mas_adopt_children(&list[i], list[i].node);
+		}
+
+		while (n < 3) /* mark the unused entries so they are skipped on the next level */
+			next[n++].node = MAS_NONE;
+
+		/* descend by setting the list to the children */
+		for (i = 0; i < 3; i++)
+			list[i] = next[i];
+	}
+}
+
+/*
+ * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert.
+ * @mas: The maple state
+ * @end: The maple node end
+ * @mt: The maple node type
+ *
+ * As written, the only effect is to clear MA_STATE_REBALANCE from
+ * @mas->mas_flags when the state is in bulk mode (MA_STATE_BULK), @mas->node
+ * is not the root and @end is above mt_min_slots[@mt].
+ */
+static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end,
+				      enum maple_type mt)
+{
+	if (!(mas->mas_flags & MA_STATE_BULK))
+		return;
+
+	if (mte_is_root(mas->node))
+		return;
+
+	if (end > mt_min_slots[mt]) {
+		mas->mas_flags &= ~MA_STATE_REBALANCE;
+		return;
+	}
+}
+
+/*
+ * mas_store_b_node() - Store an @entry into the b_node while also copying the
+ * data from a maple encoded node.
+ * @wr_mas: the maple write state
+ * @b_node: the maple_big_node to fill with data
+ * @offset_end: the offset to end copying
+ *
+ * Nothing is returned; the end of the data stored is left in @b_node->b_end.
+ * @wr_mas->mas->offset is updated to the slot in @b_node that holds the new
+ * entry.
+ */
+static inline void mas_store_b_node(struct ma_wr_state *wr_mas,
+		struct maple_big_node *b_node, unsigned char offset_end)
+{
+	unsigned char slot;
+	unsigned char b_end;
+	/* Possible underflow of piv will wrap back to 0 before use. */
+	unsigned long piv;
+	struct ma_state *mas = wr_mas->mas;
+
+	b_node->type = wr_mas->type;
+	b_end = 0;
+	slot = mas->offset;
+	if (slot) {
+		/* Copy start data up to insert. */
+		mas_mab_cp(mas, 0, slot - 1, b_node, 0);
+		b_end = b_node->b_end;
+		piv = b_node->pivot[b_end - 1];
+	} else
+		piv = mas->min - 1;
+
+	if (piv + 1 < mas->index) {
+		/* Handle range starting after old range */
+		b_node->slot[b_end] = wr_mas->content;
+		if (!wr_mas->content)
+			b_node->gap[b_end] = mas->index - 1 - piv;
+		b_node->pivot[b_end++] = mas->index - 1;
+	}
+
+	/* Store the new entry. */
+	mas->offset = b_end;
+	b_node->slot[b_end] = wr_mas->entry;
+	b_node->pivot[b_end] = mas->last;
+
+	/* Appended. */
+	if (mas->last >= mas->max)
+		goto b_end;
+
+	/* Handle new range ending before old range ends */
+	piv = mas_logical_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type);
+	if (piv > mas->last) {
+		if (piv == ULONG_MAX)
+			mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type);
+
+		if (offset_end != slot)
+			wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
+							  offset_end);
+
+		b_node->slot[++b_end] = wr_mas->content;
+		if (!wr_mas->content)
+			b_node->gap[b_end] = piv - mas->last + 1; /* NOTE(review): mas->last + 1..piv spans piv - mas->last indices; confirm the + 1 */
+		b_node->pivot[b_end] = piv;
+	}
+
+	slot = offset_end + 1;
+	if (slot > wr_mas->node_end)
+		goto b_end;
+
+	/* Copy end data to the end of the node. */
+	mas_mab_cp(mas, slot, wr_mas->node_end + 1, b_node, ++b_end);
+	b_node->b_end--; /* mas_mab_cp() left b_end one past the last entry */
+	return;
+
+b_end:
+	b_node->b_end = b_end;
+}
+
+/*
+ * mas_prev_sibling() - Find the previous node with the same parent.
+ * @mas: the maple state + * + * Return: True if there is a previous sibling, false otherwise. + */ +static inline bool mas_prev_sibling(struct ma_state *mas) +{ + unsigned int p_slot = mte_parent_slot(mas->node); + + if (mte_is_root(mas->node)) + return false; + + if (!p_slot) + return false; + + mas_ascend(mas); + mas->offset = p_slot - 1; + mas_descend(mas); + return true; +} + +/* + * mas_next_sibling() - Find the next node with the same parent. + * @mas: the maple state + * + * Return: true if there is a next sibling, false otherwise. + */ +static inline bool mas_next_sibling(struct ma_state *mas) +{ + MA_STATE(parent, mas->tree, mas->index, mas->last); + + if (mte_is_root(mas->node)) + return false; + + parent = *mas; + mas_ascend(&parent); + parent.offset = mte_parent_slot(mas->node) + 1; + if (parent.offset > mas_data_end(&parent)) + return false; + + *mas = parent; + mas_descend(mas); + return true; +} + +/* + * mte_node_or_node() - Return the encoded node or MAS_NONE. + * @enode: The encoded maple node. + * + * Shorthand to avoid setting %NULLs in the tree or maple_subtree_state. + * + * Return: @enode or MAS_NONE + */ +static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode) +{ + if (enode) + return enode; + + return ma_enode_ptr(MAS_NONE); +} + +/* + * mas_wr_node_walk() - Find the correct offset for the index in the @mas. + * @wr_mas: The maple write state + * + * Uses mas_slot_locked() and does not need to worry about dead nodes. 
+ */
+static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas)
+{
+	struct ma_state *mas = wr_mas->mas;
+	unsigned char count;
+	unsigned char offset;
+	unsigned long index, min, max;
+
+	if (unlikely(ma_is_dense(wr_mas->type))) { /* dense nodes: no pivots are consulted */
+		wr_mas->r_max = wr_mas->r_min = mas->index;
+		mas->offset = mas->index = mas->min;
+		return;
+	}
+
+	wr_mas->node = mas_mn(wr_mas->mas);
+	wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type);
+	count = wr_mas->node_end = ma_data_end(wr_mas->node, wr_mas->type,
+					       wr_mas->pivots, mas->max);
+	offset = mas->offset; /* the search starts from the caller's offset */
+	min = mas_safe_min(mas, wr_mas->pivots, offset);
+	if (unlikely(offset == count))
+		goto max;
+
+	max = wr_mas->pivots[offset];
+	index = mas->index;
+	if (unlikely(index <= max))
+		goto done;
+
+	if (unlikely(!max && offset))
+		goto max;
+
+	min = max + 1;
+	while (++offset < count) {
+		max = wr_mas->pivots[offset];
+		if (index <= max)
+			goto done;
+		else if (unlikely(!max))
+			break;
+
+		min = max + 1;
+	}
+
+max: /* no pivot bounds the index: the range ends at the node's max */
+	max = mas->max;
+done: /* record the range [r_min, r_max] and the offset it was found at */
+	wr_mas->r_max = max;
+	wr_mas->r_min = min;
+	wr_mas->offset_end = mas->offset = offset;
+}
+
+/*
+ * mas_topiary_range() - Add a range of slots to the topiary.
+ * @mas: The maple state
+ * @destroy: The topiary to add the slots (usually destroy)
+ * @start: The starting slot inclusively
+ * @end: The end slot inclusively
+ *
+ * Entries for which mte_dead_node() is true are skipped. @mas->node must not
+ * be a leaf (MT_BUG_ON).
+ */
+static inline void mas_topiary_range(struct ma_state *mas,
+	struct ma_topiary *destroy, unsigned char start, unsigned char end)
+{
+	void __rcu **slots;
+	unsigned char offset;
+
+	MT_BUG_ON(mas->tree, mte_is_leaf(mas->node));
+	slots = ma_slots(mas_mn(mas), mte_node_type(mas->node));
+	for (offset = start; offset <= end; offset++) {
+		struct maple_enode *enode = mas_slot_locked(mas, slots, offset);
+
+		if (mte_dead_node(enode))
+			continue;
+
+		mat_add(destroy, enode);
+	}
+}
+
+/*
+ * mast_topiary() - Add the portions of the tree to the removal list; either to
+ * be freed or discarded (destroy walk).
+ * @mast: The maple_subtree_state.
+ */
+static inline void mast_topiary(struct maple_subtree_state *mast)
+{
+	MA_WR_STATE(wr_mas, mast->orig_l, NULL);
+	unsigned char r_start, r_end;
+	unsigned char l_start, l_end;
+	void __rcu **l_slots, **r_slots;
+
+	/* Walk the left node so its offset is the slot containing ->last. */
+	wr_mas.type = mte_node_type(mast->orig_l->node);
+	mast->orig_l->index = mast->orig_l->last;
+	mas_wr_node_walk(&wr_mas);
+	l_start = mast->orig_l->offset + 1;
+	l_end = mas_data_end(mast->orig_l);
+	r_start = 0;
+	r_end = mast->orig_r->offset;
+
+	if (r_end)
+		r_end--;
+
+	l_slots = ma_slots(mas_mn(mast->orig_l),
+			   mte_node_type(mast->orig_l->node));
+
+	r_slots = ma_slots(mas_mn(mast->orig_r),
+			   mte_node_type(mast->orig_r->node));
+
+	/* Trim dead nodes off the inner edges of both ranges. */
+	if ((l_start < l_end) &&
+	    mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_start))) {
+		l_start++;
+	}
+
+	if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_end))) {
+		if (r_end)
+			r_end--;
+	}
+
+	if ((l_start > r_end) && (mast->orig_l->node == mast->orig_r->node))
+		return;
+
+	/* At the node where left and right sides meet, add the parts between */
+	if (mast->orig_l->node == mast->orig_r->node) {
+		/*
+		 * mas_topiary_range() returns void, so call it and return
+		 * separately: ISO C does not allow "return <expression>;" in
+		 * a void function.
+		 */
+		mas_topiary_range(mast->orig_l, mast->destroy, l_start, r_end);
+		return;
+	}
+
+	/* mast->orig_r is different and consumed. */
+	if (mte_is_leaf(mast->orig_r->node))
+		return;
+
+	if (mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_end)))
+		l_end--;
+
+	if (l_start <= l_end)
+		mas_topiary_range(mast->orig_l, mast->destroy, l_start, l_end);
+
+	if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_start)))
+		r_start++;
+
+	/*
+	 * Start at r_start rather than a literal 0 so the value computed above
+	 * is actually used. The nodes added are the same either way, because
+	 * mas_topiary_range() skips dead nodes.
+	 */
+	if (r_start <= r_end)
+		mas_topiary_range(mast->orig_r, mast->destroy, r_start, r_end);
+}
+
+/*
+ * mast_rebalance_next() - Rebalance against the next node
+ * @mast: The maple subtree state. The data of @mast->orig_r (the node to the
+ * right) is appended to @mast->bn.
+ */
+static inline void mast_rebalance_next(struct maple_subtree_state *mast)
+{
+	unsigned char b_end = mast->bn->b_end; /* append after the data already in the big node */
+
+	mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node),
+		   mast->bn, b_end);
+	mast->orig_r->last = mast->orig_r->max;
+}
+
+/*
+ * mast_rebalance_prev() - Rebalance against the previous node
+ * @mast: The maple subtree state. The data of @mast->orig_l (the node to the
+ * left) is placed in front of the data already in @mast->bn.
+ */
+static inline void mast_rebalance_prev(struct maple_subtree_state *mast)
+{
+	unsigned char end = mas_data_end(mast->orig_l) + 1; /* number of entries in the left node */
+	unsigned char b_end = mast->bn->b_end; /* saved: mas_mab_cp() below overwrites bn->b_end */
+
+	mab_shift_right(mast->bn, end);
+	mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0);
+	mast->l->min = mast->orig_l->min;
+	mast->orig_l->index = mast->orig_l->min;
+	mast->bn->b_end = end + b_end;
+	mast->l->offset += end;
+}
+
+/*
+ * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring
+ * the node to the right. Checking the nodes to the right then the left at each
+ * level upwards until root is reached. Free and destroy as needed.
+ * Data is copied into the @mast->bn.
+ * @mast: The maple_subtree_state.
+ *
+ * Return: true if data from a neighbouring node was added to @mast->bn, false
+ * if the root was reached without finding a neighbour.
+ */
+static inline
+bool mast_spanning_rebalance(struct maple_subtree_state *mast)
+{
+	struct ma_state r_tmp = *mast->orig_r;
+	struct ma_state l_tmp = *mast->orig_l;
+	struct maple_enode *ancestor = NULL;
+	unsigned char start, end;
+	unsigned char depth = 0;
+
+	r_tmp = *mast->orig_r; /* NOTE(review): repeats the initialiser above */
+	l_tmp = *mast->orig_l; /* NOTE(review): repeats the initialiser above */
+	do {
+		mas_ascend(mast->orig_r);
+		mas_ascend(mast->orig_l);
+		depth++;
+		if (!ancestor &&
+		    (mast->orig_r->node == mast->orig_l->node)) { /* both sides now share a node */
+			ancestor = mast->orig_r->node;
+			end = mast->orig_r->offset - 1;
+			start = mast->orig_l->offset + 1;
+		}
+
+		if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { /* a node exists to the right */
+			if (!ancestor) {
+				ancestor = mast->orig_r->node;
+				start = 0;
+			}
+
+			mast->orig_r->offset++;
+			do { /* descend along slot 0 back down to the starting level */
+				mas_descend(mast->orig_r);
+				mast->orig_r->offset = 0;
+				depth--;
+			} while (depth);
+
+			mast_rebalance_next(mast);
+			do { /* walk the saved right state up to the ancestor, collecting old nodes */
+				unsigned char l_off = 0;
+				struct maple_enode *child = r_tmp.node;
+
+				mas_ascend(&r_tmp);
+				if (ancestor == r_tmp.node)
+					l_off = start;
+
+				if (r_tmp.offset)
+					r_tmp.offset--;
+
+				if (l_off < r_tmp.offset)
+					mas_topiary_range(&r_tmp, mast->destroy,
+							  l_off, r_tmp.offset);
+
+				if (l_tmp.node != child)
+					mat_add(mast->free, child);
+
+			} while (r_tmp.node != ancestor);
+
+			*mast->orig_l = l_tmp; /* the left state is restored to where it started */
+			return true;
+
+		} else if (mast->orig_l->offset != 0) { /* otherwise try the node to the left */
+			if (!ancestor) {
+				ancestor = mast->orig_l->node;
+				end = mas_data_end(mast->orig_l);
+			}
+
+			mast->orig_l->offset--;
+			do { /* descend along the last slot back down to the starting level */
+				mas_descend(mast->orig_l);
+				mast->orig_l->offset =
+					mas_data_end(mast->orig_l);
+				depth--;
+			} while (depth);
+
+			mast_rebalance_prev(mast);
+			do { /* walk the saved left state up to the ancestor, collecting old nodes */
+				unsigned char r_off;
+				struct maple_enode *child = l_tmp.node;
+
+				mas_ascend(&l_tmp);
+				if (ancestor == l_tmp.node)
+					r_off = end;
+				else
+					r_off = mas_data_end(&l_tmp);
+
+				if (l_tmp.offset < r_off)
+					l_tmp.offset++;
+
+				if (l_tmp.offset < r_off)
+					mas_topiary_range(&l_tmp, mast->destroy,
+							  l_tmp.offset, r_off);
+
+				if (r_tmp.node != child)
+					mat_add(mast->free, child);
+
+			} while (l_tmp.node != ancestor);
+
+			*mast->orig_r = r_tmp; /* the right state is restored to where it started */
+			return true;
+		}
+	} while (!mte_is_root(mast->orig_r->node));
+
+	*mast->orig_r = r_tmp; /* no neighbour found: restore both states */
+	*mast->orig_l = l_tmp;
+	return false;
+}
+
+/*
+ * mast_ascend_free() - Add current original maple state nodes to the free list
+ * and ascend.
+ * @mast: the maple subtree state.
+ *
+ * Ascend the original left and right sides and add the previous nodes to the
+ * free list. Set the slots to point to the correct location in the new nodes.
+ * On return @mast->bn->type is the node type of the new left node.
+ */
+static inline void
+mast_ascend_free(struct maple_subtree_state *mast)
+{
+	MA_WR_STATE(wr_mas, mast->orig_r, NULL);
+	struct maple_enode *left = mast->orig_l->node;
+	struct maple_enode *right = mast->orig_r->node;
+
+	mas_ascend(mast->orig_l);
+	mas_ascend(mast->orig_r);
+	mat_add(mast->free, left);
+
+	if (left != right) /* add the shared node only once */
+		mat_add(mast->free, right);
+
+	mast->orig_r->offset = 0;
+	mast->orig_r->index = mast->r->max;
+	/* last should be larger than or equal to index */
+	if (mast->orig_r->last < mast->orig_r->index)
+		mast->orig_r->last = mast->orig_r->index;
+	/*
+	 * The node may not contain the value so set slot to ensure all
+	 * of the nodes contents are freed or destroyed.
+	 */
+	wr_mas.type = mte_node_type(mast->orig_r->node);
+	mas_wr_node_walk(&wr_mas);
+	/* Set up the left side of things; wr_mas is re-pointed at orig_l. */
+	mast->orig_l->offset = 0;
+	mast->orig_l->index = mast->l->min;
+	wr_mas.mas = mast->orig_l;
+	wr_mas.type = mte_node_type(mast->orig_l->node);
+	mas_wr_node_walk(&wr_mas);
+
+	mast->bn->type = wr_mas.type;
+}
+
+/*
+ * mas_new_ma_node() - Create and return a new maple node. Helper function.
+ * @mas: the maple state with the allocations.
+ * @b_node: the maple_big_node with the type encoding.
+ *
+ * Use the node type from the maple_big_node to allocate a new node from the
+ * ma_state. This function exists mainly for code readability.
+ * + * Return: A new maple encoded node + */ +static inline struct maple_enode +*mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node) +{ + return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type); +} + +/* + * mas_mab_to_node() - Set up right and middle nodes + * + * @mas: the maple state that contains the allocations. + * @b_node: the node which contains the data. + * @left: The pointer which will have the left node + * @right: The pointer which may have the right node + * @middle: the pointer which may have the middle node (rare) + * @mid_split: the split location for the middle node + * + * Return: the split of left. + */ +static inline unsigned char mas_mab_to_node(struct ma_state *mas, + struct maple_big_node *b_node, struct maple_enode **left, + struct maple_enode **right, struct maple_enode **middle, + unsigned char *mid_split, unsigned long min) +{ + unsigned char split = 0; + unsigned char slot_count = mt_slots[b_node->type]; + + *left = mas_new_ma_node(mas, b_node); + *right = NULL; + *middle = NULL; + *mid_split = 0; + + if (b_node->b_end < slot_count) { + split = b_node->b_end; + } else { + split = mab_calc_split(mas, b_node, mid_split, min); + *right = mas_new_ma_node(mas, b_node); + } + + if (*mid_split) + *middle = mas_new_ma_node(mas, b_node); + + return split; + +} + +/* + * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end + * pointer. + * @b_node - the big node to add the entry + * @mas - the maple state to get the pivot (mas->max) + * @entry - the entry to add, if NULL nothing happens. + */ +static inline void mab_set_b_end(struct maple_big_node *b_node, + struct ma_state *mas, + void *entry) +{ + if (!entry) + return; + + b_node->slot[b_node->b_end] = entry; + if (mt_is_alloc(mas->tree)) + b_node->gap[b_node->b_end] = mas_max_gap(mas); + b_node->pivot[b_node->b_end++] = mas->max; +} + +/* + * mas_set_split_parent() - combine_then_separate helper function. 
Sets the parent + * of @mas->node to either @left or @right, depending on @slot and @split + * + * @mas - the maple state with the node that needs a parent + * @left - possible parent 1 + * @right - possible parent 2 + * @slot - the slot the mas->node was placed + * @split - the split location between @left and @right + */ +static inline void mas_set_split_parent(struct ma_state *mas, + struct maple_enode *left, + struct maple_enode *right, + unsigned char *slot, unsigned char split) +{ + if (mas_is_none(mas)) + return; + + if ((*slot) <= split) + mte_set_parent(mas->node, left, *slot); + else if (right) + mte_set_parent(mas->node, right, (*slot) - split - 1); + + (*slot)++; +} + +/* + * mte_mid_split_check() - Check if the next node passes the mid-split + * @**l: Pointer to left encoded maple node. + * @**m: Pointer to middle encoded maple node. + * @**r: Pointer to right encoded maple node. + * @slot: The offset + * @*split: The split location. + * @mid_split: The middle split. + */ +static inline void mte_mid_split_check(struct maple_enode **l, + struct maple_enode **r, + struct maple_enode *right, + unsigned char slot, + unsigned char *split, + unsigned char mid_split) +{ + if (*r == right) + return; + + if (slot < mid_split) + return; + + *l = *r; + *r = right; + *split = mid_split; +} + +/* + * mast_set_split_parents() - Helper function to set three nodes parents. Slot + * is taken from @mast->l. + * @mast - the maple subtree state + * @left - the left node + * @right - the right node + * @split - the split location. 
+ */
+static inline void mast_set_split_parents(struct maple_subtree_state *mast,
+					  struct maple_enode *left,
+					  struct maple_enode *middle,
+					  struct maple_enode *right,
+					  unsigned char split,
+					  unsigned char mid_split)
+{
+	struct ma_state *states[3];
+	struct maple_enode *l = left;
+	struct maple_enode *r = middle ? middle : right;
+	unsigned char slot;
+	int i;
+
+	if (mas_is_none(mast->l))
+		return;
+
+	states[0] = mast->l;
+	states[1] = mast->m;
+	states[2] = mast->r;
+	slot = mast->l->offset;
+
+	/*
+	 * Handle l, m and r in order. Each state that is not MAS_NONE takes
+	 * the current slot and advances it; the l/r pair is re-checked against
+	 * the mid-split before every one of them.
+	 */
+	for (i = 0; i < 3; i++) {
+		mte_mid_split_check(&l, &r, right, slot, &split, mid_split);
+		mas_set_split_parent(states[i], l, r, &slot, split);
+	}
+}
+
+/*
+ * mas_wmb_replace() - Write memory barrier and replace
+ * @mas: The maple state
+ * @free: the maple topiary list of nodes to free
+ * @destroy: The maple topiary list of nodes to destroy (walk and free)
+ *
+ * Updates gap as necessary.
+ */
+static inline void mas_wmb_replace(struct ma_state *mas,
+				   struct ma_topiary *free,
+				   struct ma_topiary *destroy)
+{
+	/* All nodes must see old data as dead prior to replacing that data */
+	smp_wmb(); /* Needed for RCU */
+
+	/* Insert the new data in the tree; adoption and freeing are done below */
+	mas_replace(mas, true);
+
+	if (!mte_is_leaf(mas->node))
+		mas_descend_adopt(mas);
+
+	mas_mat_free(mas, free);
+
+	if (destroy) /* @destroy may be NULL */
+		mas_mat_destroy(mas, destroy);
+
+	if (mte_is_leaf(mas->node)) /* the gap update is skipped when the new node is a leaf */
+		return;
+
+	mas_update_gap(mas);
+}
+
+/*
+ * mast_new_root() - Set a new tree root during subtree creation
+ * @mast: The maple subtree state
+ * @mas: The maple state
+ *
+ * Marks @mast->l's node as having the tree as its parent, then ascends the
+ * original left side to the root, adding the nodes passed on the way to the
+ * free and destroy lists via mast_ascend_free() and mast_topiary().
+ */
+static inline void mast_new_root(struct maple_subtree_state *mast,
+				 struct ma_state *mas)
+{
+	mas_mn(mast->l)->parent =
+		ma_parent_ptr(((unsigned long)mas->tree | MA_ROOT_PARENT));
+	if (!mte_dead_node(mast->orig_l->node) &&
+	    !mte_is_root(mast->orig_l->node)) {
+		do {
+			mast_ascend_free(mast);
+			mast_topiary(mast);
+		} while (!mte_is_root(mast->orig_l->node));
+	}
+	if ((mast->orig_l->node != mas->node) &&
+	    (mast->l->depth > mas_mt_height(mas))) {
+		mat_add(mast->free, mas->node);
+	}
+}
+
+/*
+ * mast_cp_to_nodes() - Copy data out to nodes.
+ * @mast: The maple subtree state
+ * @left: The left encoded maple node
+ * @middle: The middle encoded maple node
+ * @right: The right encoded maple node
+ * @split: The location to split between left and (middle ? middle : right)
+ * @mid_split: The location to split between middle and right.
+ */
+static inline void mast_cp_to_nodes(struct maple_subtree_state *mast,
+	struct maple_enode *left, struct maple_enode *middle,
+	struct maple_enode *right, unsigned char split, unsigned char mid_split)
+{
+	bool new_lmax = true;
+
+	mast->l->node = mte_node_or_none(left); /* NULL nodes become MAS_NONE */
+	mast->m->node = mte_node_or_none(middle);
+	mast->r->node = mte_node_or_none(right);
+
+	mast->l->min = mast->orig_l->min;
+	if (split == mast->bn->b_end) { /* all data goes left: keep the original right max */
+		mast->l->max = mast->orig_r->max;
+		new_lmax = false;
+	}
+
+	mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax);
+
+	if (middle) {
+		mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true);
+		mast->m->min = mast->bn->pivot[split] + 1;
+		split = mid_split; /* the right node starts after the middle split */
+	}
+
+	mast->r->max = mast->orig_r->max;
+	if (right) {
+		mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false);
+		mast->r->min = mast->bn->pivot[split] + 1;
+	}
+}
+
+/*
+ * mast_combine_cp_left - Copy in the original left side of the tree into the
+ * combined data set in the maple subtree state big node.
+ * @mast: The maple subtree state
+ *
+ * Slots 0 to @mast->orig_l->offset - 1 are copied to the start of @mast->bn.
+ * Nothing is copied when the offset is 0.
+ */
+static inline void mast_combine_cp_left(struct maple_subtree_state *mast)
+{
+	unsigned char l_slot = mast->orig_l->offset;
+
+	if (!l_slot)
+		return;
+
+	mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0);
+}
+
+/*
+ * mast_combine_cp_right: Copy in the original right side of the tree into the
+ * combined data set in the maple subtree state big node.
+ * @mast: The maple subtree state
+ *
+ * Nothing is copied when the last pivot already in @mast->bn reaches
+ * @mast->orig_r->max. Otherwise the slots after @mast->orig_r->offset are
+ * appended at @mast->bn->b_end.
+ */
+static inline void mast_combine_cp_right(struct maple_subtree_state *mast)
+{
+	if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max)
+		return;
+
+	mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1,
+		   mt_slot_count(mast->orig_r->node), mast->bn,
+		   mast->bn->b_end);
+	mast->orig_r->last = mast->orig_r->max;
+}
+
+/*
+ * mast_sufficient: Check if the maple subtree state has enough data in the big
+ * node to create at least one sufficient node
+ * @mast: the maple subtree state
+ *
+ * Return: true if @mast->bn->b_end is above the minimum slot count of
+ * @mast->orig_l->node, false otherwise.
+ */
+static inline bool mast_sufficient(struct maple_subtree_state *mast)
+{
+	if (mast->bn->b_end > mt_min_slot_count(mast->orig_l->node))
+		return true;
+
+	return false;
+}
+
+/*
+ * mast_overflow: Check if there is too much data in the subtree state for a
+ * single node.
+ * @mast: The maple subtree state
+ *
+ * Return: true if @mast->bn->b_end has reached the slot count of
+ * @mast->orig_l->node, false otherwise.
+ */
+static inline bool mast_overflow(struct maple_subtree_state *mast)
+{
+	if (mast->bn->b_end >= mt_slot_count(mast->orig_l->node))
+		return true;
+
+	return false;
+}
+/*
+ * mtree_range_walk() - Walk from @mas->node down to the leaf slot that covers
+ * @mas->index.
+ * @mas: The maple state
+ *
+ * On success @mas->node is the leaf, @mas->offset the slot within it,
+ * @mas->index and @mas->last the range covered by that slot, and @mas->min and
+ * @mas->max the limits of the leaf.
+ *
+ * Return: the content of the leaf slot, or NULL after mas_reset() if a dead
+ * node was seen during the walk.
+ */
+static inline void *mtree_range_walk(struct ma_state *mas)
+{
+	unsigned long *pivots;
+	unsigned char offset;
+	struct maple_node *node;
+	struct maple_enode *next, *last;
+	enum maple_type type;
+	void __rcu **slots;
+	unsigned char end;
+	unsigned long max, min;
+	unsigned long prev_max, prev_min;
+
+	last = next = mas->node;
+	prev_min = min = mas->min;
+	max = mas->max;
+	do {
+		offset = 0;
+		last = next;
+		node = mte_to_node(next);
+		type = mte_node_type(next);
+		pivots = ma_pivots(node, type);
+		end = ma_data_end(node, type, pivots, max);
+		if (unlikely(ma_dead_node(node)))
+			goto dead_node;
+
+		if (pivots[offset] >= mas->index) { /* the index falls in slot 0 */
+			prev_max = max;
+			prev_min = min;
+			max = pivots[offset];
+			goto next;
+		}
+
+		do { /* find the first pivot that is not below the index */
+			offset++;
+		} while ((offset < end) && (pivots[offset] < mas->index));
+
+		prev_min = min;
+		min = pivots[offset - 1] + 1;
+		prev_max = max;
+		if (likely(offset < end && pivots[offset]))
+			max = pivots[offset];
+
+next:
+		slots = ma_slots(node, type);
+		next = mt_slot(mas->tree, slots, offset);
+		if (unlikely(ma_dead_node(node))) /* checked again after the slot was read */
+			goto dead_node;
+	} while (!ma_is_leaf(type));
+
+	mas->offset = offset;
+	mas->index = min;
+	mas->last = max;
+	mas->min = prev_min;
+	mas->max = prev_max;
+	mas->node = last;
+	return (void *) next;
+
+dead_node:
+	mas_reset(mas);
+	return NULL;
+}
+
+/*
+ * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers.
+ * @mas: The starting maple state
+ * @mast: The maple_subtree_state, keeps track of 4 maple states.
+ * @count: The estimated count of iterations needed.
+ *
+ * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root
+ * is hit. First @b_node is split into two entries which are inserted into the
+ * next iteration of the loop. @b_node is returned populated with the final
+ * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the
+ * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last
+ * to account of what has been copied into the new sub-tree. The update of
+ * orig_l_mas->last is used in mas_consume to find the slots that will need to
+ * be either freed or destroyed. orig_l_mas->depth keeps track of the height of
+ * the new sub-tree in case the sub-tree becomes the full tree.
+ *
+ * Return: the number of elements in b_node during the last loop.
+ */
+static int mas_spanning_rebalance(struct ma_state *mas,
+		struct maple_subtree_state *mast, unsigned char count)
+{
+	unsigned char split, mid_split;
+	unsigned char slot = 0;
+	struct maple_enode *left = NULL, *middle = NULL, *right = NULL;
+
+	MA_STATE(l_mas, mas->tree, mas->index, mas->index);
+	MA_STATE(r_mas, mas->tree, mas->index, mas->last);
+	MA_STATE(m_mas, mas->tree, mas->index, mas->index);
+	MA_TOPIARY(free, mas->tree);
+	MA_TOPIARY(destroy, mas->tree);
+
+	/*
+	 * The tree needs to be rebalanced and leaves need to be kept at the same level.
+	 * Rebalancing is done by use of the ``struct maple_topiary``.
+ */ + mast->l = &l_mas; + mast->m = &m_mas; + mast->r = &r_mas; + mast->free = &free; + mast->destroy = &destroy; + l_mas.node = r_mas.node = m_mas.node = MAS_NONE; + if (!(mast->orig_l->min && mast->orig_r->max == ULONG_MAX) && + unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) + mast_spanning_rebalance(mast); + + mast->orig_l->depth = 0; + + /* + * Each level of the tree is examined and balanced, pushing data to the left or + * right, or rebalancing against left or right nodes is employed to avoid + * rippling up the tree to limit the amount of churn. Once a new sub-section of + * the tree is created, there may be a mix of new and old nodes. The old nodes + * will have the incorrect parent pointers and currently be in two trees: the + * original tree and the partially new tree. To remedy the parent pointers in + * the old tree, the new data is swapped into the active tree and a walk down + * the tree is performed and the parent pointers are updated. + * See mas_descend_adopt() for more information.. + */ + while (count--) { + mast->bn->b_end--; + mast->bn->type = mte_node_type(mast->orig_l->node); + split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, + &mid_split, mast->orig_l->min); + mast_set_split_parents(mast, left, middle, right, split, + mid_split); + mast_cp_to_nodes(mast, left, middle, right, split, mid_split); + + /* + * Copy data from next level in the tree to mast->bn from next + * iteration + */ + memset(mast->bn, 0, sizeof(struct maple_big_node)); + mast->bn->type = mte_node_type(left); + mast->orig_l->depth++; + + /* Root already stored in l->node. */ + if (mas_is_root_limits(mast->l)) + goto new_root; + + mast_ascend_free(mast); + mast_combine_cp_left(mast); + l_mas.offset = mast->bn->b_end; + mab_set_b_end(mast->bn, &l_mas, left); + mab_set_b_end(mast->bn, &m_mas, middle); + mab_set_b_end(mast->bn, &r_mas, right); + + /* Copy anything necessary out of the right node. 
*/ + mast_combine_cp_right(mast); + mast_topiary(mast); + mast->orig_l->last = mast->orig_l->max; + + if (mast_sufficient(mast)) + continue; + + if (mast_overflow(mast)) + continue; + + /* May be a new root stored in mast->bn */ + if (mas_is_root_limits(mast->orig_l)) + break; + + mast_spanning_rebalance(mast); + + /* rebalancing from other nodes may require another loop. */ + if (!count) + count++; + } + + l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), + mte_node_type(mast->orig_l->node)); + mast->orig_l->depth++; + mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); + mte_set_parent(left, l_mas.node, slot); + if (middle) + mte_set_parent(middle, l_mas.node, ++slot); + + if (right) + mte_set_parent(right, l_mas.node, ++slot); + + if (mas_is_root_limits(mast->l)) { +new_root: + mast_new_root(mast, mas); + } else { + mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; + } + + if (!mte_dead_node(mast->orig_l->node)) + mat_add(&free, mast->orig_l->node); + + mas->depth = mast->orig_l->depth; + *mast->orig_l = l_mas; + mte_set_node_dead(mas->node); + + /* Set up mas for insertion. */ + mast->orig_l->depth = mas->depth; + mast->orig_l->alloc = mas->alloc; + *mas = *mast->orig_l; + mas_wmb_replace(mas, &free, &destroy); + mtree_range_walk(mas); + return mast->bn->b_end; +} + +/* + * mas_rebalance() - Rebalance a given node. + * @mas: The maple state + * @b_node: The big maple node. + * + * Rebalance two nodes into a single node or two new nodes that are sufficient. + * Continue upwards until tree is sufficient. + * + * Return: the number of elements in b_node during the last loop. 
+ */
+static inline int mas_rebalance(struct ma_state *mas,
+				struct maple_big_node *b_node)
+{
+	/*
+	 * The height is handed to mas_spanning_rebalance() as an unsigned char
+	 * count, so keep it unsigned here as well; the signedness of plain
+	 * char differs between architectures.
+	 */
+	unsigned char empty_count = mas_mt_height(mas);
+	struct maple_subtree_state mast;
+	/* Note: b_node->b_end is incremented before the allocation check. */
+	unsigned char shift, b_end = ++b_node->b_end;
+
+	MA_STATE(l_mas, mas->tree, mas->index, mas->last);
+	MA_STATE(r_mas, mas->tree, mas->index, mas->last);
+
+	trace_ma_op(__func__, mas);
+
+	/*
+	 * Rebalancing occurs if a node is insufficient.  Data is rebalanced
+	 * against the node to the right if it exists, otherwise the node to the
+	 * left of this node is rebalanced against this node.  If rebalancing
+	 * causes just one node to be produced instead of two, then the parent
+	 * is also examined and rebalanced if it is insufficient.  Every level
+	 * tries to combine the data in the same way.  If one node contains the
+	 * entire range of the tree, then that node is used as a new root node.
+	 */
+	mas_node_count(mas, 1 + empty_count * 3);
+	if (mas_is_err(mas))
+		return 0;
+
+	mast.orig_l = &l_mas;
+	mast.orig_r = &r_mas;
+	mast.bn = b_node;
+	mast.bn->type = mte_node_type(mas->node);
+
+	l_mas = r_mas = *mas;
+
+	if (mas_next_sibling(&r_mas)) {
+		/* Append the right sibling's data after the existing data. */
+		mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end);
+		r_mas.last = r_mas.index = r_mas.max;
+	} else {
+		/* No right sibling: make room and prepend the left sibling. */
+		mas_prev_sibling(&l_mas);
+		shift = mas_data_end(&l_mas) + 1;
+		mab_shift_right(b_node, shift);
+		mas->offset += shift;
+		mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0);
+		b_node->b_end = shift + b_end;
+		l_mas.index = l_mas.last = l_mas.min;
+	}
+
+	return mas_spanning_rebalance(mas, &mast, empty_count);
+}
+
+/*
+ * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple
+ * state.
+ * @mas: The maple state
+ * @end: The end of the left-most node.
+ *
+ * During a mass-insert event (such as forking), it may be necessary to
+ * rebalance the left-most node when it is not sufficient.
+ */ +static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end) +{ + enum maple_type mt = mte_node_type(mas->node); + struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; + struct maple_enode *eparent; + unsigned char offset, tmp, split = mt_slots[mt] / 2; + void __rcu **l_slots, **slots; + unsigned long *l_pivs, *pivs, gap; + bool in_rcu = mt_in_rcu(mas->tree); + + MA_STATE(l_mas, mas->tree, mas->index, mas->last); + + l_mas = *mas; + mas_prev_sibling(&l_mas); + + /* set up node. */ + if (in_rcu) { + /* Allocate for both left and right as well as parent. */ + mas_node_count(mas, 3); + if (mas_is_err(mas)) + return; + + newnode = mas_pop_node(mas); + } else { + newnode = &reuse; + } + + node = mas_mn(mas); + newnode->parent = node->parent; + slots = ma_slots(newnode, mt); + pivs = ma_pivots(newnode, mt); + left = mas_mn(&l_mas); + l_slots = ma_slots(left, mt); + l_pivs = ma_pivots(left, mt); + if (!l_slots[split]) + split++; + tmp = mas_data_end(&l_mas) - split; + + memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp); + memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp); + pivs[tmp] = l_mas.max; + memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end); + memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end); + + l_mas.max = l_pivs[split]; + mas->min = l_mas.max + 1; + eparent = mt_mk_node(mte_parent(l_mas.node), + mas_parent_enum(&l_mas, l_mas.node)); + tmp += end; + if (!in_rcu) { + unsigned char max_p = mt_pivots[mt]; + unsigned char max_s = mt_slots[mt]; + + if (tmp < max_p) + memset(pivs + tmp, 0, + sizeof(unsigned long *) * (max_p - tmp)); + + if (tmp < mt_slots[mt]) + memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); + + memcpy(node, newnode, sizeof(struct maple_node)); + ma_set_meta(node, mt, 0, tmp - 1); + mte_set_pivot(eparent, mte_parent_slot(l_mas.node), + l_pivs[split]); + + /* Remove data from l_pivs. 
*/ + tmp = split + 1; + memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); + memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); + ma_set_meta(left, mt, 0, split); + + goto done; + } + + /* RCU requires replacing both l_mas, mas, and parent. */ + mas->node = mt_mk_node(newnode, mt); + ma_set_meta(newnode, mt, 0, tmp); + + new_left = mas_pop_node(mas); + new_left->parent = left->parent; + mt = mte_node_type(l_mas.node); + slots = ma_slots(new_left, mt); + pivs = ma_pivots(new_left, mt); + memcpy(slots, l_slots, sizeof(void *) * split); + memcpy(pivs, l_pivs, sizeof(unsigned long) * split); + ma_set_meta(new_left, mt, 0, split); + l_mas.node = mt_mk_node(new_left, mt); + + /* replace parent. */ + offset = mte_parent_slot(mas->node); + mt = mas_parent_enum(&l_mas, l_mas.node); + parent = mas_pop_node(mas); + slots = ma_slots(parent, mt); + pivs = ma_pivots(parent, mt); + memcpy(parent, mte_to_node(eparent), sizeof(struct maple_node)); + rcu_assign_pointer(slots[offset], mas->node); + rcu_assign_pointer(slots[offset - 1], l_mas.node); + pivs[offset - 1] = l_mas.max; + eparent = mt_mk_node(parent, mt); +done: + gap = mas_leaf_max_gap(mas); + mte_set_gap(eparent, mte_parent_slot(mas->node), gap); + gap = mas_leaf_max_gap(&l_mas); + mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); + mas_ascend(mas); + + if (in_rcu) + mas_replace(mas, false); + + mas_update_gap(mas); +} + +/* + * mas_split_final_node() - Split the final node in a subtree operation. + * @mast: the maple subtree state + * @mas: The maple state + * @height: The height of the tree in case it's a new root. + */ +static inline bool mas_split_final_node(struct maple_subtree_state *mast, + struct ma_state *mas, int height) +{ + struct maple_enode *ancestor; + + if (mte_is_root(mas->node)) { + if (mt_is_alloc(mas->tree)) + mast->bn->type = maple_arange_64; + else + mast->bn->type = maple_range_64; + mas->depth = height; + } + /* + * Only a single node is used here, could be root. 
+ * The Big_node data should just fit in a single node. + */ + ancestor = mas_new_ma_node(mas, mast->bn); + mte_set_parent(mast->l->node, ancestor, mast->l->offset); + mte_set_parent(mast->r->node, ancestor, mast->r->offset); + mte_to_node(ancestor)->parent = mas_mn(mas)->parent; + + mast->l->node = ancestor; + mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); + mas->offset = mast->bn->b_end - 1; + return true; +} + +/* + * mast_fill_bnode() - Copy data into the big node in the subtree state + * @mast: The maple subtree state + * @mas: the maple state + * @skip: The number of entries to skip for new nodes insertion. + */ +static inline void mast_fill_bnode(struct maple_subtree_state *mast, + struct ma_state *mas, + unsigned char skip) +{ + bool cp = true; + struct maple_enode *old = mas->node; + unsigned char split; + + memset(mast->bn->gap, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->gap)); + memset(mast->bn->slot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->slot)); + memset(mast->bn->pivot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->pivot)); + mast->bn->b_end = 0; + + if (mte_is_root(mas->node)) { + cp = false; + } else { + mas_ascend(mas); + mat_add(mast->free, old); + mas->offset = mte_parent_slot(mas->node); + } + + if (cp && mast->l->offset) + mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0); + + split = mast->bn->b_end; + mab_set_b_end(mast->bn, mast->l, mast->l->node); + mast->r->offset = mast->bn->b_end; + mab_set_b_end(mast->bn, mast->r, mast->r->node); + if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max) + cp = false; + + if (cp) + mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1, + mast->bn, mast->bn->b_end); + + mast->bn->b_end--; + mast->bn->type = mte_node_type(mas->node); +} + +/* + * mast_split_data() - Split the data in the subtree state big node into regular + * nodes. 
+ * @mast: The maple subtree state
+ * @mas: The maple state
+ * @split: The location to split the big node
+ */
+static inline void mast_split_data(struct maple_subtree_state *mast,
+	   struct ma_state *mas, unsigned char split)
+{
+	unsigned char p_slot;
+
+	/* Entries 0..split go to the left node, the remainder to the right. */
+	mab_mas_cp(mast->bn, 0, split, mast->l, true);
+	mte_set_pivot(mast->r->node, 0, mast->r->max);
+	mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false);
+
+	/* The pivot at the split point separates the two new ranges. */
+	mast->l->offset = mte_parent_slot(mas->node);
+	mast->l->max = mast->bn->pivot[split];
+	mast->r->min = mast->l->max + 1;
+
+	/* Only non-leaf nodes have children whose parent must be updated. */
+	if (!mte_is_leaf(mas->node)) {
+		p_slot = mast->orig_l->offset;
+		mas_set_split_parent(mast->orig_l, mast->l->node,
+				     mast->r->node, &p_slot, split);
+		mas_set_split_parent(mast->orig_r, mast->l->node,
+				     mast->r->node, &p_slot, split);
+	}
+}
+
+/*
+ * mas_push_data() - Instead of splitting a node, it is beneficial to push the
+ * data to the right or left node if there is room.
+ * @mas: The maple state
+ * @height: The current height of the maple state
+ * @mast: The maple subtree state
+ * @left: Push left or not.
+ *
+ * Keeping the height of the tree low means faster lookups.
+ *
+ * Return: True if pushed, false otherwise.
+ */
+static inline bool mas_push_data(struct ma_state *mas, int height,
+				 struct maple_subtree_state *mast, bool left)
+{
+	unsigned char slot_total = mast->bn->b_end;
+	unsigned char end, space, split;
+
+	MA_STATE(tmp_mas, mas->tree, mas->index, mas->last);
+	tmp_mas = *mas;
+	tmp_mas.depth = mast->l->depth;
+
+	/* There must be a sibling on the requested side. */
+	if (left ? !mas_prev_sibling(&tmp_mas) : !mas_next_sibling(&tmp_mas))
+		return false;
+
+	end = mas_data_end(&tmp_mas);
+	slot_total += end;
+	/* -2 instead of -1 to ensure there isn't a triple split */
+	space = 2 * mt_slot_count(mas->node) - 2;
+	if (ma_is_leaf(mast->bn->type))
+		space--;
+
+	if (mas->max == ULONG_MAX)
+		space--;
+
+	/* The combined data does not fit in two nodes. */
+	if (slot_total >= space)
+		return false;
+
+	/* Get the data; Fill mast->bn */
+	mast->bn->b_end++;
+	if (left) {
+		/* Make room at the front for the sibling's data. */
+		mab_shift_right(mast->bn, end + 1);
+		mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0);
+		mast->bn->b_end = slot_total + 1;
+	} else {
+		mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end);
+	}
+
+	/* Configure mast for splitting of mast->bn */
+	split = mt_slots[mast->bn->type] - 2;
+	if (left) {
+		/* Switch mas to prev node */
+		mat_add(mast->free, mas->node);
+		*mas = tmp_mas;
+		/* Start using mast->l for the left side. */
+		tmp_mas.node = mast->l->node;
+		*mast->l = tmp_mas;
+	} else {
+		mat_add(mast->free, tmp_mas.node);
+		tmp_mas.node = mast->r->node;
+		*mast->r = tmp_mas;
+		split = slot_total - split;
+	}
+	split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]);
+
+	/* Update parent slot for split calculation. */
+	if (left)
+		mast->orig_l->offset += end + 1;
+
+	mast_split_data(mast, mas, split);
+	mast_fill_bnode(mast, mas, 2);
+	mas_split_final_node(mast, mas, height + 1);
+	return true;
+}
+
+/*
+ * mas_split() - Split data that is too big for one node into two.
+ * @mas: The maple state
+ * @b_node: The maple big node
+ * Return: 1 on success, 0 on failure.
+ */ +static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) +{ + + struct maple_subtree_state mast; + int height = 0; + unsigned char mid_split, split = 0; + + /* + * Splitting is handled differently from any other B-tree; the Maple + * Tree splits upwards. Splitting up means that the split operation + * occurs when the walk of the tree hits the leaves and not on the way + * down. The reason for splitting up is that it is impossible to know + * how much space will be needed until the leaf is (or leaves are) + * reached. Since overwriting data is allowed and a range could + * overwrite more than one range or result in changing one entry into 3 + * entries, it is impossible to know if a split is required until the + * data is examined. + * + * Splitting is a balancing act between keeping allocations to a minimum + * and avoiding a 'jitter' event where a tree is expanded to make room + * for an entry followed by a contraction when the entry is removed. To + * accomplish the balance, there are empty slots remaining in both left + * and right nodes after a split. + */ + MA_STATE(l_mas, mas->tree, mas->index, mas->last); + MA_STATE(r_mas, mas->tree, mas->index, mas->last); + MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); + MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); + MA_TOPIARY(mat, mas->tree); + + trace_ma_op(__func__, mas); + mas->depth = mas_mt_height(mas); + /* Allocation failures will happen early. 
*/ + mas_node_count(mas, 1 + mas->depth * 2); + if (mas_is_err(mas)) + return 0; + + mast.l = &l_mas; + mast.r = &r_mas; + mast.orig_l = &prev_l_mas; + mast.orig_r = &prev_r_mas; + mast.free = &mat; + mast.bn = b_node; + + while (height++ <= mas->depth) { + if (mt_slots[b_node->type] > b_node->b_end) { + mas_split_final_node(&mast, mas, height); + break; + } + + l_mas = r_mas = *mas; + l_mas.node = mas_new_ma_node(mas, b_node); + r_mas.node = mas_new_ma_node(mas, b_node); + /* + * Another way that 'jitter' is avoided is to terminate a split up early if the + * left or right node has space to spare. This is referred to as "pushing left" + * or "pushing right" and is similar to the B* tree, except the nodes left or + * right can rarely be reused due to RCU, but the ripple upwards is halted which + * is a significant savings. + */ + /* Try to push left. */ + if (mas_push_data(mas, height, &mast, true)) + break; + + /* Try to push right. */ + if (mas_push_data(mas, height, &mast, false)) + break; + + split = mab_calc_split(mas, b_node, &mid_split, prev_l_mas.min); + mast_split_data(&mast, mas, split); + /* + * Usually correct, mab_mas_cp in the above call overwrites + * r->max. + */ + mast.r->max = mas->max; + mast_fill_bnode(&mast, mas, 1); + prev_l_mas = *mast.l; + prev_r_mas = *mast.r; + } + + /* Set the original node as dead */ + mat_add(mast.free, mas->node); + mas->node = l_mas.node; + mas_wmb_replace(mas, mast.free, NULL); + mtree_range_walk(mas); + return 1; +} + +/* + * mas_reuse_node() - Reuse the node to store the data. + * @wr_mas: The maple write state + * @bn: The maple big node + * @end: The end of the data. + * + * Will always return false in RCU mode. + * + * Return: True if node was reused, false otherwise. + */ +static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, + struct maple_big_node *bn, unsigned char end) +{ + /* Need to be rcu safe. 
*/ + if (mt_in_rcu(wr_mas->mas->tree)) + return false; + + if (end > bn->b_end) { + int clear = mt_slots[wr_mas->type] - bn->b_end; + + memset(wr_mas->slots + bn->b_end, 0, sizeof(void *) * clear--); + memset(wr_mas->pivots + bn->b_end, 0, sizeof(void *) * clear); + } + mab_mas_cp(bn, 0, bn->b_end, wr_mas->mas, false); + return true; +} + +/* + * mas_commit_b_node() - Commit the big node into the tree. + * @wr_mas: The maple write state + * @b_node: The maple big node + * @end: The end of the data. + */ +static inline int mas_commit_b_node(struct ma_wr_state *wr_mas, + struct maple_big_node *b_node, unsigned char end) +{ + struct maple_node *node; + unsigned char b_end = b_node->b_end; + enum maple_type b_type = b_node->type; + + if ((b_end < mt_min_slots[b_type]) && + (!mte_is_root(wr_mas->mas->node)) && + (mas_mt_height(wr_mas->mas) > 1)) + return mas_rebalance(wr_mas->mas, b_node); + + if (b_end >= mt_slots[b_type]) + return mas_split(wr_mas->mas, b_node); + + if (mas_reuse_node(wr_mas, b_node, end)) + goto reuse_node; + + mas_node_count(wr_mas->mas, 1); + if (mas_is_err(wr_mas->mas)) + return 0; + + node = mas_pop_node(wr_mas->mas); + node->parent = mas_mn(wr_mas->mas)->parent; + wr_mas->mas->node = mt_mk_node(node, b_type); + mab_mas_cp(b_node, 0, b_end, wr_mas->mas, true); + + mas_replace(wr_mas->mas, false); +reuse_node: + mas_update_gap(wr_mas->mas); + return 1; +} + +/* + * mas_root_expand() - Expand a root to a node + * @mas: The maple state + * @entry: The entry to store into the tree + */ +static inline int mas_root_expand(struct ma_state *mas, void *entry) +{ + void *contents = mas_root_locked(mas); + enum maple_type type = maple_leaf_64; + struct maple_node *node; + void __rcu **slots; + unsigned long *pivots; + int slot = 0; + + mas_node_count(mas, 1); + if (unlikely(mas_is_err(mas))) + return 0; + + node = mas_pop_node(mas); + pivots = ma_pivots(node, type); + slots = ma_slots(node, type); + node->parent = ma_parent_ptr( + ((unsigned long)mas->tree 
| MA_ROOT_PARENT)); + mas->node = mt_mk_node(node, type); + + if (mas->index) { + if (contents) { + rcu_assign_pointer(slots[slot], contents); + if (likely(mas->index > 1)) + slot++; + } + pivots[slot++] = mas->index - 1; + } + + rcu_assign_pointer(slots[slot], entry); + mas->offset = slot; + pivots[slot] = mas->last; + if (mas->last != ULONG_MAX) + slot++; + mas->depth = 1; + mas_set_height(mas); + + /* swap the new root into the tree */ + rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); + ma_set_meta(node, maple_leaf_64, 0, slot); + return slot; +} + +static inline void mas_store_root(struct ma_state *mas, void *entry) +{ + if (likely((mas->last != 0) || (mas->index != 0))) + mas_root_expand(mas, entry); + else if (((unsigned long) (entry) & 3) == 2) + mas_root_expand(mas, entry); + else { + rcu_assign_pointer(mas->tree->ma_root, entry); + mas->node = MAS_START; + } +} + +/* + * mas_is_span_wr() - Check if the write needs to be treated as a write that + * spans the node. + * @mas: The maple state + * @piv: The pivot value being written + * @type: The maple node type + * @entry: The data to write + * + * Spanning writes are writes that start in one node and end in another OR if + * the write of a %NULL will cause the node to end with a %NULL. + * + * Return: True if this is a spanning write, false otherwise. + */ +static bool mas_is_span_wr(struct ma_wr_state *wr_mas) +{ + unsigned long max; + unsigned long last = wr_mas->mas->last; + unsigned long piv = wr_mas->r_max; + enum maple_type type = wr_mas->type; + void *entry = wr_mas->entry; + + /* Contained in this pivot */ + if (piv > last) + return false; + + max = wr_mas->mas->max; + if (unlikely(ma_is_leaf(type))) { + /* Fits in the node, but may span slots. */ + if (last < max) + return false; + + /* Writes to the end of the node but not null. 
*/ + if ((last == max) && entry) + return false; + + /* + * Writing ULONG_MAX is not a spanning write regardless of the + * value being written as long as the range fits in the node. + */ + if ((last == ULONG_MAX) && (last == max)) + return false; + } else if (piv == last) { + if (entry) + return false; + + /* Detect spanning store wr walk */ + if (last == ULONG_MAX) + return false; + } + + trace_ma_write(__func__, wr_mas->mas, piv, entry); + + return true; +} + +static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas) +{ + wr_mas->mas->depth++; + wr_mas->type = mte_node_type(wr_mas->mas->node); + mas_wr_node_walk(wr_mas); + wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); +} + +static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas) +{ + wr_mas->mas->max = wr_mas->r_max; + wr_mas->mas->min = wr_mas->r_min; + wr_mas->mas->node = wr_mas->content; + wr_mas->mas->offset = 0; +} +/* + * mas_wr_walk() - Walk the tree for a write. + * @wr_mas: The maple write state + * + * Uses mas_slot_locked() and does not need to worry about dead nodes. + * + * Return: True if it's contained in a node, false on spanning write. + */ +static bool mas_wr_walk(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + + while (true) { + mas_wr_walk_descend(wr_mas); + if (unlikely(mas_is_span_wr(wr_mas))) + return false; + + wr_mas->content = mas_slot_locked(mas, wr_mas->slots, + mas->offset); + if (ma_is_leaf(wr_mas->type)) + return true; + + mas_wr_walk_traverse(wr_mas); + } + + return true; +} + +static bool mas_wr_walk_index(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + + while (true) { + mas_wr_walk_descend(wr_mas); + wr_mas->content = mas_slot_locked(mas, wr_mas->slots, + mas->offset); + if (ma_is_leaf(wr_mas->type)) + return true; + mas_wr_walk_traverse(wr_mas); + + } + return true; +} +/* + * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. 
+ * @l_wr_mas: The left maple write state + * @r_wr_mas: The right maple write state + */ +static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, + struct ma_wr_state *r_wr_mas) +{ + struct ma_state *r_mas = r_wr_mas->mas; + struct ma_state *l_mas = l_wr_mas->mas; + unsigned char l_slot; + + l_slot = l_mas->offset; + if (!l_wr_mas->content) + l_mas->index = l_wr_mas->r_min; + + if ((l_mas->index == l_wr_mas->r_min) && + (l_slot && + !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) { + if (l_slot > 1) + l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1; + else + l_mas->index = l_mas->min; + + l_mas->offset = l_slot - 1; + } + + if (!r_wr_mas->content) { + if (r_mas->last < r_wr_mas->r_max) + r_mas->last = r_wr_mas->r_max; + r_mas->offset++; + } else if ((r_mas->last == r_wr_mas->r_max) && + (r_mas->last < r_mas->max) && + !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) { + r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, + r_wr_mas->type, r_mas->offset + 1); + r_mas->offset++; + } +} + +static inline void *mas_state_walk(struct ma_state *mas) +{ + void *entry; + + entry = mas_start(mas); + if (mas_is_none(mas)) + return NULL; + + if (mas_is_ptr(mas)) + return entry; + + return mtree_range_walk(mas); +} + +/* + * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up + * to date. + * + * @mas: The maple state. + * + * Note: Leaves mas in undesirable state. + * Return: The entry for @mas->index or %NULL on dead node. 
+ */
+static inline void *mtree_lookup_walk(struct ma_state *mas)
+{
+	unsigned long *pivots;
+	unsigned char offset;
+	struct maple_node *node;
+	struct maple_enode *next;
+	enum maple_type type;
+	void __rcu **slots;
+	unsigned char end;
+	unsigned long max;
+
+	next = mas->node;
+	max = ULONG_MAX;
+	do {
+		offset = 0;
+		node = mte_to_node(next);
+		type = mte_node_type(next);
+		pivots = ma_pivots(node, type);
+		end = ma_data_end(node, type, pivots, max);
+		if (unlikely(ma_dead_node(node)))
+			goto dead_node;
+
+		/* The first slot already covers the index. */
+		if (pivots[offset] >= mas->index)
+			goto next;
+
+		/* Find the first pivot that is not below the index. */
+		do {
+			offset++;
+		} while ((offset < end) && (pivots[offset] < mas->index));
+
+		/*
+		 * Narrow max to the chosen slot for the next level, as
+		 * mtree_range_walk() does.  The slot at @end has no pivot of
+		 * its own, so only offsets below @end may index pivots[].
+		 */
+		if (likely(offset < end))
+			max = pivots[offset];
+
+next:
+		slots = ma_slots(node, type);
+		next = mt_slot(mas->tree, slots, offset);
+		/* Checked again after the slot has been read. */
+		if (unlikely(ma_dead_node(node)))
+			goto dead_node;
+	} while (!ma_is_leaf(type));
+
+	return (void *) next;
+
+dead_node:
+	mas_reset(mas);
+	return NULL;
+}
+
+/*
+ * mas_new_root() - Create a new root node that only contains the entry passed
+ * in.
+ * @mas: The maple state
+ * @entry: The entry to store.
+ *
+ * Only valid when the index == 0 and the last == ULONG_MAX
+ *
+ * Return 0 on error, 1 on success.
+ */
+static inline int mas_new_root(struct ma_state *mas, void *entry)
+{
+	struct maple_enode *root = mas_root_locked(mas); /* old root, destroyed at done: if it is a node */
+	enum maple_type type = maple_leaf_64;
+	struct maple_node *node;
+	void __rcu **slots;
+	unsigned long *pivots;
+
+	if (!entry && !mas->index && mas->last == ULONG_MAX) { /* NULL over the whole range: empty tree */
+		mas->depth = 0;
+		mas_set_height(mas);
+		rcu_assign_pointer(mas->tree->ma_root, entry);
+		mas->node = MAS_START;
+		goto done;
+	}
+
+	mas_node_count(mas, 1);
+	if (mas_is_err(mas))
+		return 0;
+
+	node = mas_pop_node(mas);
+	pivots = ma_pivots(node, type);
+	slots = ma_slots(node, type);
+	node->parent = ma_parent_ptr(
+		      ((unsigned long)mas->tree | MA_ROOT_PARENT));
+	mas->node = mt_mk_node(node, type);
+	rcu_assign_pointer(slots[0], entry); /* single entry in slot 0, ending at mas->last */
+	pivots[0] = mas->last;
+	mas->depth = 1;
+	mas_set_height(mas);
+	rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
+
+done:
+	if (xa_is_node(root))
+		mte_destroy_walk(root, mas->tree);
+
+	return 1;
+}
+/*
+ * mas_wr_spanning_store() - Create a subtree with the store operation completed
+ * and new nodes where necessary, then place the sub-tree in the actual tree.
+ * Note that mas is expected to point to the node which caused the store to
+ * span.
+ * @wr_mas: The maple write state
+ *
+ * The return value is that of mas_new_root() when the store covers the whole
+ * range, otherwise that of mas_spanning_rebalance().
+ *
+ * Return: 0 on error, positive on success.
+ */
+static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas)
+{
+	struct maple_subtree_state mast;
+	struct maple_big_node b_node;
+	struct ma_state *mas;
+	unsigned char height;
+
+	/* Left and Right side of spanning store */
+	MA_STATE(l_mas, NULL, 0, 0);
+	MA_STATE(r_mas, NULL, 0, 0);
+
+	MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry);
+	MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry);
+
+	/*
+	 * A store operation that spans multiple nodes is called a spanning
+	 * store and is handled early in the store call stack by the function
+	 * mas_is_span_wr().  When a spanning store is identified, the maple
+	 * state is duplicated.  The first maple state walks the left tree path
+	 * to ``index``, the duplicate walks the right tree path to ``last``.
+	 * The data in the two nodes are combined into a single node, two nodes,
+	 * or possibly three nodes (see the 3-way split above).  A ``NULL``
+	 * written to the last entry of a node is considered a spanning store as
+	 * a rebalance is required for the operation to complete and an overflow
+	 * of data may happen.
+	 */
+	mas = wr_mas->mas;
+	trace_ma_op(__func__, mas);
+
+	if (unlikely(!mas->index && mas->last == ULONG_MAX))
+		return mas_new_root(mas, wr_mas->entry);
+	/*
+	 * Node rebalancing may occur due to this store, so there may be three new
+	 * entries per level plus a new root.
+	 */
+	height = mas_mt_height(mas);
+	mas_node_count(mas, 1 + height * 3);
+	if (mas_is_err(mas))
+		return 0;
+
+	/*
+	 * Set up right side.  Need to get to the next offset after the spanning
+	 * store to ensure it's not NULL and to combine both the next node and
+	 * the node with the start together.
+	 */
+	r_mas = *mas;
+	/* Avoid overflow, walk to next slot in the tree. */
+	if (r_mas.last + 1)
+		r_mas.last++;
+
+	r_mas.index = r_mas.last;
+	mas_wr_walk_index(&r_wr_mas);
+	r_mas.last = r_mas.index = mas->last; /* restore the real end of the store after the walk */
+
+	/* Set up left side. */
+	l_mas = *mas;
+	mas_wr_walk_index(&l_wr_mas);
+
+	if (!wr_mas->entry) {
+		mas_extend_spanning_null(&l_wr_mas, &r_wr_mas);
+		mas->offset = l_mas.offset;
+		mas->index = l_mas.index;
+		mas->last = l_mas.last = r_mas.last;
+	}
+
+	/* expanding NULLs may make this cover the entire range */
+	if (!l_mas.index && r_mas.last == ULONG_MAX) {
+		mas_set_range(mas, 0, ULONG_MAX);
+		return mas_new_root(mas, wr_mas->entry);
+	}
+
+	memset(&b_node, 0, sizeof(struct maple_big_node));
+	/* Copy l_mas and store the value in b_node. */
+	mas_store_b_node(&l_wr_mas, &b_node, l_wr_mas.node_end);
+	/* Copy r_mas into b_node. */
+	if (r_mas.offset <= r_wr_mas.node_end)
+		mas_mab_cp(&r_mas, r_mas.offset, r_wr_mas.node_end,
+			   &b_node, b_node.b_end + 1);
+	else
+		b_node.b_end++;
+
+	/* Stop spanning searches by searching for just index. */
+	l_mas.index = l_mas.last = mas->index;
+
+	mast.bn = &b_node;
+	mast.orig_l = &l_mas;
+	mast.orig_r = &r_mas;
+	/* Combine l_mas and r_mas and split them up evenly again. */
+	return mas_spanning_rebalance(mas, &mast, height + 1);
+}
+
+/*
+ * mas_wr_node_store() - Attempt to store the value in a node
+ * @wr_mas: The maple write state
+ *
+ * Attempts to reuse the node, but may allocate.
+ *
+ * Returns false before any data is copied when the result would need at least
+ * as many slots as the node has, when a non-root node outside bulk mode would
+ * end at or below mt_min_slots, or when the allocation fails in RCU mode.
+ *
+ * Return: True if stored, false otherwise
+ */
+static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas)
+{
+	struct ma_state *mas = wr_mas->mas;
+	void __rcu **dst_slots;
+	unsigned long *dst_pivots;
+	unsigned char dst_offset;
+	unsigned char new_end = wr_mas->node_end;
+	unsigned char offset;
+	unsigned char node_slots = mt_slots[wr_mas->type];
+	struct maple_node reuse, *newnode;
+	unsigned char copy_size, max_piv = mt_pivots[wr_mas->type];
+	bool in_rcu = mt_in_rcu(mas->tree);
+
+	offset = mas->offset;
+	if (mas->last == wr_mas->r_max) {
+		/* runs right to the end of the node */
+		if (mas->last == mas->max)
+			new_end = offset;
+		/* don't copy this offset */
+		wr_mas->offset_end++;
+	} else if (mas->last < wr_mas->r_max) {
+		/* new range ends in this range */
+		if (unlikely(wr_mas->r_max == ULONG_MAX))
+			mas_bulk_rebalance(mas, wr_mas->node_end, wr_mas->type);
+
+		new_end++;
+	} else {
+		/* new range ends beyond this range: the overwritten slots go away */
+		if (wr_mas->end_piv == mas->last)
+			wr_mas->offset_end++;
+
+		new_end -= wr_mas->offset_end - offset - 1;
+	}
+
+	/* new range starts within a range */
+	if (wr_mas->r_min < mas->index)
+		new_end++;
+
+	/* Not enough room */
+	if (new_end >= node_slots)
+		return false;
+
+	/* Not enough data. */
+	if (!mte_is_root(mas->node) && (new_end <= mt_min_slots[wr_mas->type]) &&
+	    !(mas->mas_flags & MA_STATE_BULK))
+		return false;
+
+	/* set up node. */
+	if (in_rcu) {
+		mas_node_count(mas, 1);
+		if (mas_is_err(mas))
+			return false;
+
+		newnode = mas_pop_node(mas);
+	} else {
+		/* build the new contents on the stack; copied over the node at done: */
+		memset(&reuse, 0, sizeof(struct maple_node));
+		newnode = &reuse;
+	}
+
+	newnode->parent = mas_mn(mas)->parent;
+	dst_pivots = ma_pivots(newnode, wr_mas->type);
+	dst_slots = ma_slots(newnode, wr_mas->type);
+	/* Copy from start to insert point */
+	memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * (offset + 1));
+	memcpy(dst_slots, wr_mas->slots, sizeof(void *) * (offset + 1));
+	dst_offset = offset;
+
+	/* Handle insert of new range starting after old range */
+	if (wr_mas->r_min < mas->index) {
+		mas->offset++; /* NOTE(review): overwritten by the assignment of dst_offset below */
+		rcu_assign_pointer(dst_slots[dst_offset], wr_mas->content);
+		dst_pivots[dst_offset++] = mas->index - 1;
+	}
+
+	/* Store the new entry and range end. */
+	if (dst_offset < max_piv)
+		dst_pivots[dst_offset] = mas->last;
+	mas->offset = dst_offset;
+	rcu_assign_pointer(dst_slots[dst_offset], wr_mas->entry);
+
+	/*
+	 * this range wrote to the end of the node or it overwrote the rest of
+	 * the data
+	 */
+	if (wr_mas->offset_end > wr_mas->node_end || mas->last >= mas->max) {
+		new_end = dst_offset;
+		goto done;
+	}
+
+	dst_offset++;
+	/* Copy to the end of node if necessary. */
+	copy_size = wr_mas->node_end - wr_mas->offset_end + 1;
+	memcpy(dst_slots + dst_offset, wr_mas->slots + wr_mas->offset_end,
+	       sizeof(void *) * copy_size);
+	if (dst_offset < max_piv) {
+		/* there is one pivot fewer than slots: clamp the pivot copy */
+		if (copy_size > max_piv - dst_offset)
+			copy_size = max_piv - dst_offset;
+
+		memcpy(dst_pivots + dst_offset,
+		       wr_mas->pivots + wr_mas->offset_end,
+		       sizeof(unsigned long) * copy_size);
+	}
+
+	if ((wr_mas->node_end == node_slots - 1) && (new_end < node_slots - 1))
+		dst_pivots[new_end] = mas->max;
+
+done:
+	mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end);
+	if (in_rcu) {
+		mas->node = mt_mk_node(newnode, wr_mas->type);
+		mas_replace(mas, false);
+	} else {
+		memcpy(wr_mas->node, newnode, sizeof(struct maple_node));
+	}
+	trace_ma_write(__func__, mas, 0, wr_mas->entry);
+	mas_update_gap(mas);
+	return true;
+}
+
+/*
+ * mas_wr_slot_store: Attempt to store a value in a slot.
+ * @wr_mas: the maple write state
+ *
+ * Return: True if stored, false otherwise
+ */
+static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas)
+{
+	struct ma_state *mas = wr_mas->mas;
+	unsigned long lmax; /* Logical max. */
+	unsigned char offset = mas->offset;
+
+	if ((wr_mas->r_max > mas->last) && ((wr_mas->r_min != mas->index) ||
+					  (offset != wr_mas->node_end)))
+		return false;
+
+	if (offset == wr_mas->node_end - 1)
+		lmax = mas->max;
+	else
+		lmax = wr_mas->pivots[offset + 1];
+
+	/* going to overwrite too many slots. */
+	if (lmax < mas->last)
+		return false;
+
+	if (wr_mas->r_min == mas->index) {
+		/* overwriting two or more ranges with one. */
+		if (lmax == mas->last)
+			return false;
+
+		/* Overwriting all of offset and a portion of offset + 1. */
+		rcu_assign_pointer(wr_mas->slots[offset], wr_mas->entry);
+		wr_mas->pivots[offset] = mas->last;
+		goto done;
+	}
+
+	/* Doesn't end on the next range end.
*/ + if (lmax != mas->last) + return false; + + /* Overwriting a portion of offset and all of offset + 1 */ + if ((offset + 1 < mt_pivots[wr_mas->type]) && + (wr_mas->entry || wr_mas->pivots[offset + 1])) + wr_mas->pivots[offset + 1] = mas->last; + + rcu_assign_pointer(wr_mas->slots[offset + 1], wr_mas->entry); + wr_mas->pivots[offset] = mas->index - 1; + mas->offset++; /* Keep mas accurate. */ + +done: + trace_ma_write(__func__, mas, 0, wr_mas->entry); + mas_update_gap(mas); + return true; +} + +static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) +{ + while ((wr_mas->mas->last > wr_mas->end_piv) && + (wr_mas->offset_end < wr_mas->node_end)) + wr_mas->end_piv = wr_mas->pivots[++wr_mas->offset_end]; + + if (wr_mas->mas->last > wr_mas->end_piv) + wr_mas->end_piv = wr_mas->mas->max; +} + +static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + + if (mas->last < wr_mas->end_piv && !wr_mas->slots[wr_mas->offset_end]) + mas->last = wr_mas->end_piv; + + /* Check next slot(s) if we are overwriting the end */ + if ((mas->last == wr_mas->end_piv) && + (wr_mas->node_end != wr_mas->offset_end) && + !wr_mas->slots[wr_mas->offset_end + 1]) { + wr_mas->offset_end++; + if (wr_mas->offset_end == wr_mas->node_end) + mas->last = mas->max; + else + mas->last = wr_mas->pivots[wr_mas->offset_end]; + wr_mas->end_piv = mas->last; + } + + if (!wr_mas->content) { + /* If this one is null, the next and prev are not */ + mas->index = wr_mas->r_min; + } else { + /* Check prev slot if we are overwriting the start */ + if (mas->index == wr_mas->r_min && mas->offset && + !wr_mas->slots[mas->offset - 1]) { + mas->offset--; + wr_mas->r_min = mas->index = + mas_safe_min(mas, wr_mas->pivots, mas->offset); + wr_mas->r_max = wr_mas->pivots[mas->offset]; + } + } +} + +static inline bool mas_wr_append(struct ma_wr_state *wr_mas) +{ + unsigned char end = wr_mas->node_end; + unsigned char new_end = end + 1; + struct ma_state *mas = wr_mas->mas; 
+ unsigned char node_pivots = mt_pivots[wr_mas->type]; + + if ((mas->index != wr_mas->r_min) && (mas->last == wr_mas->r_max)) { + if (new_end < node_pivots) + wr_mas->pivots[new_end] = wr_mas->pivots[end]; + + if (new_end < node_pivots) + ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end); + + rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->entry); + mas->offset = new_end; + wr_mas->pivots[end] = mas->index - 1; + + return true; + } + + if ((mas->index == wr_mas->r_min) && (mas->last < wr_mas->r_max)) { + if (new_end < node_pivots) + wr_mas->pivots[new_end] = wr_mas->pivots[end]; + + rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->content); + if (new_end < node_pivots) + ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end); + + wr_mas->pivots[end] = mas->last; + rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry); + return true; + } + + return false; +} + +static inline void mas_wr_modify(struct ma_wr_state *wr_mas) +{ + unsigned char node_slots; + unsigned char node_size; + struct ma_state *mas = wr_mas->mas; + struct maple_big_node b_node; + + /* Direct replacement */ + if (wr_mas->r_min == mas->index && wr_mas->r_max == mas->last) { + rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); + if (!!wr_mas->entry ^ !!wr_mas->content) + mas_update_gap(mas); + return; + } + + /* Attempt to append */ + node_slots = mt_slots[wr_mas->type]; + node_size = wr_mas->node_end - wr_mas->offset_end + mas->offset + 2; + if (mas->max == ULONG_MAX) + node_size++; + + /* slot and node store will not fit, go to the slow path */ + if (unlikely(node_size >= node_slots)) + goto slow_path; + + if (wr_mas->entry && (wr_mas->node_end < node_slots - 1) && + (mas->offset == wr_mas->node_end) && mas_wr_append(wr_mas)) { + if (!wr_mas->content || !wr_mas->entry) + mas_update_gap(mas); + return; + } + + if ((wr_mas->offset_end - mas->offset <= 1) && mas_wr_slot_store(wr_mas)) + return; + else if (mas_wr_node_store(wr_mas)) + return; + + if (mas_is_err(mas)) + return; + 
+slow_path: + memset(&b_node, 0, sizeof(struct maple_big_node)); + mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); + trace_ma_write(__func__, mas, 0, wr_mas->entry); + mas_commit_b_node(wr_mas, &b_node, wr_mas->node_end); +} + +/* + * mas_wr_store_entry() - Internal call to store a value + * @mas: The maple state + * @entry: The entry to store. + * + * Return: The contents that was stored at the index. + */ +static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + + wr_mas->content = mas_start(mas); + if (mas_is_none(mas) || mas_is_ptr(mas)) { + mas_store_root(mas, wr_mas->entry); + return wr_mas->content; + } + + if (unlikely(!mas_wr_walk(wr_mas))) { + mas_wr_spanning_store(wr_mas); + return wr_mas->content; + } + + /* At this point, we are at the leaf node that needs to be altered. */ + wr_mas->end_piv = wr_mas->r_max; + mas_wr_end_piv(wr_mas); + + if (!wr_mas->entry) + mas_wr_extend_null(wr_mas); + + /* New root for a single pointer */ + if (unlikely(!mas->index && mas->last == ULONG_MAX)) { + mas_new_root(mas, wr_mas->entry); + return wr_mas->content; + } + + mas_wr_modify(wr_mas); + return wr_mas->content; +} + +/** + * mas_insert() - Internal call to insert a value + * @mas: The maple state + * @entry: The entry to store + * + * Return: %NULL or the contents that already exists at the requested index + * otherwise. The maple state needs to be checked for error conditions. + */ +static inline void *mas_insert(struct ma_state *mas, void *entry) +{ + MA_WR_STATE(wr_mas, mas, entry); + + /* + * Inserting a new range inserts either 0, 1, or 2 pivots within the + * tree. If the insert fits exactly into an existing gap with a value + * of NULL, then the slot only needs to be written with the new value. + * If the range being inserted is adjacent to another range, then only a + * single pivot needs to be inserted (as well as writing the entry). 
If + * the new range is within a gap but does not touch any other ranges, + * then two pivots need to be inserted: the start - 1, and the end. As + * usual, the entry must be written. Most operations require a new node + * to be allocated and replace an existing node to ensure RCU safety, + * when in RCU mode. The exception to requiring a newly allocated node + * is when inserting at the end of a node (appending). When done + * carefully, appending can reuse the node in place. + */ + wr_mas.content = mas_start(mas); + if (wr_mas.content) + goto exists; + + if (mas_is_none(mas) || mas_is_ptr(mas)) { + mas_store_root(mas, entry); + return NULL; + } + + /* spanning writes always overwrite something */ + if (!mas_wr_walk(&wr_mas)) + goto exists; + + /* At this point, we are at the leaf node that needs to be altered. */ + wr_mas.offset_end = mas->offset; + wr_mas.end_piv = wr_mas.r_max; + + if (wr_mas.content || (mas->last > wr_mas.r_max)) + goto exists; + + if (!entry) + return NULL; + + mas_wr_modify(&wr_mas); + return wr_mas.content; + +exists: + mas_set_err(mas, -EEXIST); + return wr_mas.content; + +} + +/* + * mas_prev_node() - Find the prev non-null entry at the same level in the + * tree. The prev value will be mas->node[mas->offset] or MAS_NONE. + * @mas: The maple state + * @min: The lower limit to search + * + * The prev node value will be mas->node[mas->offset] or MAS_NONE. + * Return: 1 if the node is dead, 0 otherwise. + */ +static inline int mas_prev_node(struct ma_state *mas, unsigned long min) +{ + enum maple_type mt; + int offset, level; + void __rcu **slots; + struct maple_node *node; + struct maple_enode *enode; + unsigned long *pivots; + + if (mas_is_none(mas)) + return 0; + + level = 0; + do { + node = mas_mn(mas); + if (ma_is_root(node)) + goto no_entry; + + /* Walk up. 
*/ + if (unlikely(mas_ascend(mas))) + return 1; + offset = mas->offset; + level++; + } while (!offset); + + offset--; + mt = mte_node_type(mas->node); + node = mas_mn(mas); + slots = ma_slots(node, mt); + pivots = ma_pivots(node, mt); + mas->max = pivots[offset]; + if (offset) + mas->min = pivots[offset - 1] + 1; + if (unlikely(ma_dead_node(node))) + return 1; + + if (mas->max < min) + goto no_entry_min; + + while (level > 1) { + level--; + enode = mas_slot(mas, slots, offset); + if (unlikely(ma_dead_node(node))) + return 1; + + mas->node = enode; + mt = mte_node_type(mas->node); + node = mas_mn(mas); + slots = ma_slots(node, mt); + pivots = ma_pivots(node, mt); + offset = ma_data_end(node, mt, pivots, mas->max); + if (offset) + mas->min = pivots[offset - 1] + 1; + + if (offset < mt_pivots[mt]) + mas->max = pivots[offset]; + + if (mas->max < min) + goto no_entry; + } + + mas->node = mas_slot(mas, slots, offset); + if (unlikely(ma_dead_node(node))) + return 1; + + mas->offset = mas_data_end(mas); + if (unlikely(mte_dead_node(mas->node))) + return 1; + + return 0; + +no_entry_min: + mas->offset = offset; + if (offset) + mas->min = pivots[offset - 1] + 1; +no_entry: + if (unlikely(ma_dead_node(node))) + return 1; + + mas->node = MAS_NONE; + return 0; +} + +/* + * mas_next_node() - Get the next node at the same level in the tree. + * @mas: The maple state + * @max: The maximum pivot value to check. + * + * The next value will be mas->node[mas->offset] or MAS_NONE. + * Return: 1 on dead node, 0 otherwise. 
+ */ +static inline int mas_next_node(struct ma_state *mas, struct maple_node *node, + unsigned long max) +{ + unsigned long min, pivot; + unsigned long *pivots; + struct maple_enode *enode; + int level = 0; + unsigned char offset; + enum maple_type mt; + void __rcu **slots; + + if (mas->max >= max) + goto no_entry; + + level = 0; + do { + if (ma_is_root(node)) + goto no_entry; + + min = mas->max + 1; + if (min > max) + goto no_entry; + + if (unlikely(mas_ascend(mas))) + return 1; + + offset = mas->offset; + level++; + node = mas_mn(mas); + mt = mte_node_type(mas->node); + pivots = ma_pivots(node, mt); + } while (unlikely(offset == ma_data_end(node, mt, pivots, mas->max))); + + slots = ma_slots(node, mt); + pivot = mas_safe_pivot(mas, pivots, ++offset, mt); + while (unlikely(level > 1)) { + /* Descend, if necessary */ + enode = mas_slot(mas, slots, offset); + if (unlikely(ma_dead_node(node))) + return 1; + + mas->node = enode; + level--; + node = mas_mn(mas); + mt = mte_node_type(mas->node); + slots = ma_slots(node, mt); + pivots = ma_pivots(node, mt); + offset = 0; + pivot = pivots[0]; + } + + enode = mas_slot(mas, slots, offset); + if (unlikely(ma_dead_node(node))) + return 1; + + mas->node = enode; + mas->min = min; + mas->max = pivot; + return 0; + +no_entry: + if (unlikely(ma_dead_node(node))) + return 1; + + mas->node = MAS_NONE; + return 0; +} + +/* + * mas_next_nentry() - Get the next node entry + * @mas: The maple state + * @max: The maximum value to check + * @*range_start: Pointer to store the start of the range. + * + * Sets @mas->offset to the offset of the next node entry, @mas->last to the + * pivot of the entry. 
+ * + * Return: The next entry, %NULL otherwise + */ +static inline void *mas_next_nentry(struct ma_state *mas, + struct maple_node *node, unsigned long max, enum maple_type type) +{ + unsigned char count; + unsigned long pivot; + unsigned long *pivots; + void __rcu **slots; + void *entry; + + if (mas->last == mas->max) { + mas->index = mas->max; + return NULL; + } + + pivots = ma_pivots(node, type); + slots = ma_slots(node, type); + mas->index = mas_safe_min(mas, pivots, mas->offset); + if (ma_dead_node(node)) + return NULL; + + if (mas->index > max) + return NULL; + + count = ma_data_end(node, type, pivots, mas->max); + if (mas->offset > count) + return NULL; + + while (mas->offset < count) { + pivot = pivots[mas->offset]; + entry = mas_slot(mas, slots, mas->offset); + if (ma_dead_node(node)) + return NULL; + + if (entry) + goto found; + + if (pivot >= max) + return NULL; + + mas->index = pivot + 1; + mas->offset++; + } + + if (mas->index > mas->max) { + mas->index = mas->last; + return NULL; + } + + pivot = mas_safe_pivot(mas, pivots, mas->offset, type); + entry = mas_slot(mas, slots, mas->offset); + if (ma_dead_node(node)) + return NULL; + + if (!pivot) + return NULL; + + if (!entry) + return NULL; + +found: + mas->last = pivot; + return entry; +} + +static inline void mas_rewalk(struct ma_state *mas, unsigned long index) +{ + +retry: + mas_set(mas, index); + mas_state_walk(mas); + if (mas_is_start(mas)) + goto retry; + + return; + +} + +/* + * mas_next_entry() - Internal function to get the next entry. + * @mas: The maple state + * @limit: The maximum range start. + * + * Set the @mas->node to the next entry and the range_start to + * the beginning value for the entry. Does not check beyond @limit. + * Sets @mas->index and @mas->last to the limit if it is hit. + * Restarts on dead nodes. + * + * Return: the next entry or %NULL. 
+ */ +static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) +{ + void *entry = NULL; + struct maple_enode *prev_node; + struct maple_node *node; + unsigned char offset; + unsigned long last; + enum maple_type mt; + + last = mas->last; +retry: + offset = mas->offset; + prev_node = mas->node; + node = mas_mn(mas); + mt = mte_node_type(mas->node); + mas->offset++; + if (unlikely(mas->offset >= mt_slots[mt])) { + mas->offset = mt_slots[mt] - 1; + goto next_node; + } + + while (!mas_is_none(mas)) { + entry = mas_next_nentry(mas, node, limit, mt); + if (unlikely(ma_dead_node(node))) { + mas_rewalk(mas, last); + goto retry; + } + + if (likely(entry)) + return entry; + + if (unlikely((mas->index > limit))) + break; + +next_node: + prev_node = mas->node; + offset = mas->offset; + if (unlikely(mas_next_node(mas, node, limit))) { + mas_rewalk(mas, last); + goto retry; + } + mas->offset = 0; + node = mas_mn(mas); + mt = mte_node_type(mas->node); + } + + mas->index = mas->last = limit; + mas->offset = offset; + mas->node = prev_node; + return NULL; +} + +/* + * mas_prev_nentry() - Get the previous node entry. + * @mas: The maple state. + * @limit: The lower limit to check for a value. + * + * Return: the entry, %NULL otherwise. 
+ */ +static inline void *mas_prev_nentry(struct ma_state *mas, unsigned long limit, + unsigned long index) +{ + unsigned long pivot, min; + unsigned char offset; + struct maple_node *mn; + enum maple_type mt; + unsigned long *pivots; + void __rcu **slots; + void *entry; + +retry: + if (!mas->offset) + return NULL; + + mn = mas_mn(mas); + mt = mte_node_type(mas->node); + offset = mas->offset - 1; + if (offset >= mt_slots[mt]) + offset = mt_slots[mt] - 1; + + slots = ma_slots(mn, mt); + pivots = ma_pivots(mn, mt); + if (offset == mt_pivots[mt]) + pivot = mas->max; + else + pivot = pivots[offset]; + + if (unlikely(ma_dead_node(mn))) { + mas_rewalk(mas, index); + goto retry; + } + + while (offset && ((!mas_slot(mas, slots, offset) && pivot >= limit) || + !pivot)) + pivot = pivots[--offset]; + + min = mas_safe_min(mas, pivots, offset); + entry = mas_slot(mas, slots, offset); + if (unlikely(ma_dead_node(mn))) { + mas_rewalk(mas, index); + goto retry; + } + + if (likely(entry)) { + mas->offset = offset; + mas->last = pivot; + mas->index = min; + } + return entry; +} + +static inline void *mas_prev_entry(struct ma_state *mas, unsigned long min) +{ + void *entry; + +retry: + while (likely(!mas_is_none(mas))) { + entry = mas_prev_nentry(mas, min, mas->index); + if (unlikely(mas->last < min)) + goto not_found; + + if (likely(entry)) + return entry; + + if (unlikely(mas_prev_node(mas, min))) { + mas_rewalk(mas, mas->index); + goto retry; + } + + mas->offset++; + } + + mas->offset--; +not_found: + mas->index = mas->last = min; + return NULL; +} + +/* + * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the + * highest gap address of a given size in a given node and descend. + * @mas: The maple state + * @size: The needed size. + * + * Return: True if found in a leaf, false otherwise. 
+ * + */ +static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) +{ + enum maple_type type = mte_node_type(mas->node); + struct maple_node *node = mas_mn(mas); + unsigned long *pivots, *gaps; + void __rcu **slots; + unsigned long gap = 0; + unsigned long max, min, index; + unsigned char offset; + + if (unlikely(mas_is_err(mas))) + return true; + + if (ma_is_dense(type)) { + /* dense nodes. */ + mas->offset = (unsigned char)(mas->index - mas->min); + return true; + } + + pivots = ma_pivots(node, type); + slots = ma_slots(node, type); + gaps = ma_gaps(node, type); + offset = mas->offset; + min = mas_safe_min(mas, pivots, offset); + /* Skip out of bounds. */ + while (mas->last < min) + min = mas_safe_min(mas, pivots, --offset); + + max = mas_safe_pivot(mas, pivots, offset, type); + index = mas->index; + while (index <= max) { + gap = 0; + if (gaps) + gap = gaps[offset]; + else if (!mas_slot(mas, slots, offset)) + gap = max - min + 1; + + if (gap) { + if ((size <= gap) && (size <= mas->last - min + 1)) + break; + + if (!gaps) { + /* Skip the next slot, it cannot be a gap. */ + if (offset < 2) + goto ascend; + + offset -= 2; + max = pivots[offset]; + min = mas_safe_min(mas, pivots, offset); + continue; + } + } + + if (!offset) + goto ascend; + + offset--; + max = min - 1; + min = mas_safe_min(mas, pivots, offset); + } + + if (unlikely(index > max)) { + mas_set_err(mas, -EBUSY); + return false; + } + + if (unlikely(ma_is_leaf(type))) { + mas->offset = offset; + mas->min = min; + mas->max = min + gap - 1; + return true; + } + + /* descend, only happens under lock. 
*/ + mas->node = mas_slot(mas, slots, offset); + mas->min = min; + mas->max = max; + mas->offset = mas_data_end(mas); + return false; + +ascend: + if (mte_is_root(mas->node)) + mas_set_err(mas, -EBUSY); + + return false; +} + +static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) +{ + enum maple_type type = mte_node_type(mas->node); + unsigned long pivot, min, gap = 0; + unsigned char count, offset; + unsigned long *gaps = NULL, *pivots = ma_pivots(mas_mn(mas), type); + void __rcu **slots = ma_slots(mas_mn(mas), type); + bool found = false; + + if (ma_is_dense(type)) { + mas->offset = (unsigned char)(mas->index - mas->min); + return true; + } + + gaps = ma_gaps(mte_to_node(mas->node), type); + offset = mas->offset; + count = mt_slots[type]; + min = mas_safe_min(mas, pivots, offset); + for (; offset < count; offset++) { + pivot = mas_safe_pivot(mas, pivots, offset, type); + if (offset && !pivot) + break; + + /* Not within lower bounds */ + if (mas->index > pivot) + goto next_slot; + + if (gaps) + gap = gaps[offset]; + else if (!mas_slot(mas, slots, offset)) + gap = min(pivot, mas->last) - max(mas->index, min) + 1; + else + goto next_slot; + + if (gap >= size) { + if (ma_is_leaf(type)) { + found = true; + goto done; + } + if (mas->index <= pivot) { + mas->node = mas_slot(mas, slots, offset); + mas->min = min; + mas->max = pivot; + offset = 0; + type = mte_node_type(mas->node); + count = mt_slots[type]; + break; + } + } +next_slot: + min = pivot + 1; + if (mas->last <= pivot) { + mas_set_err(mas, -EBUSY); + return true; + } + } + + if (mte_is_root(mas->node)) + found = true; +done: + mas->offset = offset; + return found; +} + +/** + * mas_walk() - Search for @mas->index in the tree. + * @mas: The maple state. + * + * mas->index and mas->last will be set to the range if there is a value. If + * mas->node is MAS_NONE, reset to MAS_START. + * + * Return: the entry at the location or %NULL. 
+ */ +void *mas_walk(struct ma_state *mas) +{ + void *entry; + +retry: + entry = mas_state_walk(mas); + if (mas_is_start(mas)) + goto retry; + + if (mas_is_ptr(mas)) { + if (!mas->index) { + mas->last = 0; + } else { + mas->index = 1; + mas->last = ULONG_MAX; + } + return entry; + } + + if (mas_is_none(mas)) { + mas->index = 0; + mas->last = ULONG_MAX; + } + + return entry; +} + +static inline bool mas_rewind_node(struct ma_state *mas) +{ + unsigned char slot; + + do { + if (mte_is_root(mas->node)) { + slot = mas->offset; + if (!slot) + return false; + } else { + mas_ascend(mas); + slot = mas->offset; + } + } while (!slot); + + mas->offset = --slot; + return true; +} + +/* + * mas_skip_node() - Internal function. Skip over a node. + * @mas: The maple state. + * + * Return: true if there is another node, false otherwise. + */ +static inline bool mas_skip_node(struct ma_state *mas) +{ + unsigned char slot, slot_count; + unsigned long *pivots; + enum maple_type mt; + + mt = mte_node_type(mas->node); + slot_count = mt_slots[mt] - 1; + do { + if (mte_is_root(mas->node)) { + slot = mas->offset; + if (slot > slot_count) { + mas_set_err(mas, -EBUSY); + return false; + } + } else { + mas_ascend(mas); + slot = mas->offset; + mt = mte_node_type(mas->node); + slot_count = mt_slots[mt] - 1; + } + } while (slot > slot_count); + + mas->offset = ++slot; + pivots = ma_pivots(mas_mn(mas), mt); + if (slot > 0) + mas->min = pivots[slot - 1] + 1; + + if (slot <= slot_count) + mas->max = pivots[slot]; + + return true; +} + +/* + * mas_awalk() - Allocation walk. Search from low address to high, for a gap of + * @size + * @mas: The maple state + * @size: The size of the gap required + * + * Search between @mas->index and @mas->last for a gap of @size. + */ +static inline void mas_awalk(struct ma_state *mas, unsigned long size) +{ + struct maple_enode *last = NULL; + + /* + * There are 4 options: + * go to child (descend) + * go back to parent (ascend) + * no gap found. 
(return, slot == MAPLE_NODE_SLOTS) + * found the gap. (return, slot != MAPLE_NODE_SLOTS) + */ + while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { + if (last == mas->node) + mas_skip_node(mas); + else + last = mas->node; + } +} + +/* + * mas_fill_gap() - Fill a located gap with @entry. + * @mas: The maple state + * @entry: The value to store + * @slot: The offset into the node to store the @entry + * @size: The size of the entry + * @index: The start location + */ +static inline void mas_fill_gap(struct ma_state *mas, void *entry, + unsigned char slot, unsigned long size, unsigned long *index) +{ + MA_WR_STATE(wr_mas, mas, entry); + unsigned char pslot = mte_parent_slot(mas->node); + struct maple_enode *mn = mas->node; + unsigned long *pivots; + enum maple_type ptype; + /* + * mas->index is the start address for the search + * which may no longer be needed. + * mas->last is the end address for the search + */ + + *index = mas->index; + mas->last = mas->index + size - 1; + + /* + * It is possible that using mas->max and mas->min to correctly + * calculate the index and last will cause an issue in the gap + * calculation, so fix the ma_state here + */ + mas_ascend(mas); + ptype = mte_node_type(mas->node); + pivots = ma_pivots(mas_mn(mas), ptype); + mas->max = mas_safe_pivot(mas, pivots, pslot, ptype); + mas->min = mas_safe_min(mas, pivots, pslot); + mas->node = mn; + mas->offset = slot; + mas_wr_store_entry(&wr_mas); +} + +/* + * mas_sparse_area() - Internal function. Return upper or lower limit when + * searching for a gap in an empty tree. 
+ * @mas: The maple state + * @min: the minimum range + * @max: The maximum range + * @size: The size of the gap + * @fwd: Searching forward or back + */ +static inline void mas_sparse_area(struct ma_state *mas, unsigned long min, + unsigned long max, unsigned long size, bool fwd) +{ + unsigned long start = 0; + + if (!unlikely(mas_is_none(mas))) + start++; + /* mas_is_ptr */ + + if (start < min) + start = min; + + if (fwd) { + mas->index = start; + mas->last = start + size - 1; + return; + } + + mas->index = max; +} + +/* + * mas_empty_area() - Get the lowest address within the range that is + * sufficient for the size requested. + * @mas: The maple state + * @min: The lowest value of the range + * @max: The highest value of the range + * @size: The size needed + */ +int mas_empty_area(struct ma_state *mas, unsigned long min, + unsigned long max, unsigned long size) +{ + unsigned char offset; + unsigned long *pivots; + enum maple_type mt; + + if (mas_is_start(mas)) + mas_start(mas); + else if (mas->offset >= 2) + mas->offset -= 2; + else if (!mas_skip_node(mas)) + return -EBUSY; + + /* Empty set */ + if (mas_is_none(mas) || mas_is_ptr(mas)) { + mas_sparse_area(mas, min, max, size, true); + return 0; + } + + /* The start of the window can only be within these values */ + mas->index = min; + mas->last = max; + mas_awalk(mas, size); + + if (unlikely(mas_is_err(mas))) + return xa_err(mas->node); + + offset = mas->offset; + if (unlikely(offset == MAPLE_NODE_SLOTS)) + return -EBUSY; + + mt = mte_node_type(mas->node); + pivots = ma_pivots(mas_mn(mas), mt); + if (offset) + mas->min = pivots[offset - 1] + 1; + + if (offset < mt_pivots[mt]) + mas->max = pivots[offset]; + + if (mas->index < mas->min) + mas->index = mas->min; + + mas->last = mas->index + size - 1; + return 0; +} + +/* + * mas_empty_area_rev() - Get the highest address within the range that is + * sufficient for the size requested. 
+ * @mas: The maple state + * @min: The lowest value of the range + * @max: The highest value of the range + * @size: The size needed + */ +int mas_empty_area_rev(struct ma_state *mas, unsigned long min, + unsigned long max, unsigned long size) +{ + struct maple_enode *last = mas->node; + + if (mas_is_start(mas)) { + mas_start(mas); + mas->offset = mas_data_end(mas); + } else if (mas->offset >= 2) { + mas->offset -= 2; + } else if (!mas_rewind_node(mas)) { + return -EBUSY; + } + + /* Empty set. */ + if (mas_is_none(mas) || mas_is_ptr(mas)) { + mas_sparse_area(mas, min, max, size, false); + return 0; + } + + /* The start of the window can only be within these values. */ + mas->index = min; + mas->last = max; + + while (!mas_rev_awalk(mas, size)) { + if (last == mas->node) { + if (!mas_rewind_node(mas)) + return -EBUSY; + } else { + last = mas->node; + } + } + + if (mas_is_err(mas)) + return xa_err(mas->node); + + if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) + return -EBUSY; + + /* + * mas_rev_awalk() has set mas->min and mas->max to the gap values. If + * the maximum is outside the window we are searching, then use the last + * location in the search. + * mas->max and mas->min is the range of the gap. + * mas->index and mas->last are currently set to the search range. + */ + + /* Trim the upper limit to the max. */ + if (mas->max <= mas->last) + mas->last = mas->max; + + mas->index = mas->last - size + 1; + return 0; +} + +static inline int mas_alloc(struct ma_state *mas, void *entry, + unsigned long size, unsigned long *index) +{ + unsigned long min; + + mas_start(mas); + if (mas_is_none(mas) || mas_is_ptr(mas)) { + mas_root_expand(mas, entry); + if (mas_is_err(mas)) + return xa_err(mas->node); + + if (!mas->index) + return mte_pivot(mas->node, 0); + return mte_pivot(mas->node, 1); + } + + /* Must be walking a tree. 
*/ + mas_awalk(mas, size); + if (mas_is_err(mas)) + return xa_err(mas->node); + + if (mas->offset == MAPLE_NODE_SLOTS) + goto no_gap; + + /* + * At this point, mas->node points to the right node and we have an + * offset that has a sufficient gap. + */ + min = mas->min; + if (mas->offset) + min = mte_pivot(mas->node, mas->offset - 1) + 1; + + if (mas->index < min) + mas->index = min; + + mas_fill_gap(mas, entry, mas->offset, size, index); + return 0; + +no_gap: + return -EBUSY; +} + +static inline int mas_rev_alloc(struct ma_state *mas, unsigned long min, + unsigned long max, void *entry, + unsigned long size, unsigned long *index) +{ + int ret = 0; + + ret = mas_empty_area_rev(mas, min, max, size); + if (ret) + return ret; + + if (mas_is_err(mas)) + return xa_err(mas->node); + + if (mas->offset == MAPLE_NODE_SLOTS) + goto no_gap; + + mas_fill_gap(mas, entry, mas->offset, size, index); + return 0; + +no_gap: + return -EBUSY; +} + +/* + * mas_dead_leaves() - Mark all leaves of a node as dead. + * @mas: The maple state + * @slots: Pointer to the slot array + * + * Must hold the write lock. + * + * Return: The number of leaves marked as dead. 
+ */ +static inline +unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots) +{ + struct maple_node *node; + enum maple_type type; + void *entry; + int offset; + + for (offset = 0; offset < mt_slot_count(mas->node); offset++) { + entry = mas_slot_locked(mas, slots, offset); + type = mte_node_type(entry); + node = mte_to_node(entry); + /* Use both node and type to catch LE & BE metadata */ + if (!node || !type) + break; + + mte_set_node_dead(entry); + smp_wmb(); /* Needed for RCU */ + node->type = type; + rcu_assign_pointer(slots[offset], node); + } + + return offset; +} + +static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset) +{ + struct maple_node *node, *next; + void __rcu **slots = NULL; + + next = mas_mn(mas); + do { + mas->node = ma_enode_ptr(next); + node = mas_mn(mas); + slots = ma_slots(node, node->type); + next = mas_slot_locked(mas, slots, offset); + offset = 0; + } while (!ma_is_leaf(next->type)); + + return slots; +} + +static void mt_free_walk(struct rcu_head *head) +{ + void __rcu **slots; + struct maple_node *node, *start; + struct maple_tree mt; + unsigned char offset; + enum maple_type type; + MA_STATE(mas, &mt, 0, 0); + + node = container_of(head, struct maple_node, rcu); + + if (ma_is_leaf(node->type)) + goto free_leaf; + + mt_init_flags(&mt, node->ma_flags); + mas_lock(&mas); + start = node; + mas.node = mt_mk_node(node, node->type); + slots = mas_dead_walk(&mas, 0); + node = mas_mn(&mas); + do { + mt_free_bulk(node->slot_len, slots); + offset = node->parent_slot + 1; + mas.node = node->piv_parent; + if (mas_mn(&mas) == node) + goto start_slots_free; + + type = mte_node_type(mas.node); + slots = ma_slots(mte_to_node(mas.node), type); + if ((offset < mt_slots[type]) && (slots[offset])) + slots = mas_dead_walk(&mas, offset); + + node = mas_mn(&mas); + } while ((node != start) || (node->slot_len < offset)); + + slots = ma_slots(node, node->type); + mt_free_bulk(node->slot_len, slots); + +start_slots_free: + 
mas_unlock(&mas); +free_leaf: + mt_free_rcu(&node->rcu); +} + +static inline void __rcu **mas_destroy_descend(struct ma_state *mas, + struct maple_enode *prev, unsigned char offset) +{ + struct maple_node *node; + struct maple_enode *next = mas->node; + void __rcu **slots = NULL; + + do { + mas->node = next; + node = mas_mn(mas); + slots = ma_slots(node, mte_node_type(mas->node)); + next = mas_slot_locked(mas, slots, 0); + if ((mte_dead_node(next))) + next = mas_slot_locked(mas, slots, 1); + + mte_set_node_dead(mas->node); + node->type = mte_node_type(mas->node); + node->piv_parent = prev; + node->parent_slot = offset; + offset = 0; + prev = mas->node; + } while (!mte_is_leaf(next)); + + return slots; +} + +static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, + bool free) +{ + void __rcu **slots; + struct maple_node *node = mte_to_node(enode); + struct maple_enode *start; + struct maple_tree mt; + + MA_STATE(mas, &mt, 0, 0); + + if (mte_is_leaf(enode)) + goto free_leaf; + + mt_init_flags(&mt, ma_flags); + mas_lock(&mas); + + mas.node = start = enode; + slots = mas_destroy_descend(&mas, start, 0); + node = mas_mn(&mas); + do { + enum maple_type type; + unsigned char offset; + struct maple_enode *parent, *tmp; + + node->slot_len = mas_dead_leaves(&mas, slots); + if (free) + mt_free_bulk(node->slot_len, slots); + offset = node->parent_slot + 1; + mas.node = node->piv_parent; + if (mas_mn(&mas) == node) + goto start_slots_free; + + type = mte_node_type(mas.node); + slots = ma_slots(mte_to_node(mas.node), type); + if (offset >= mt_slots[type]) + goto next; + + tmp = mas_slot_locked(&mas, slots, offset); + if (mte_node_type(tmp) && mte_to_node(tmp)) { + parent = mas.node; + mas.node = tmp; + slots = mas_destroy_descend(&mas, parent, offset); + } +next: + node = mas_mn(&mas); + } while (start != mas.node); + + node = mas_mn(&mas); + node->slot_len = mas_dead_leaves(&mas, slots); + if (free) + mt_free_bulk(node->slot_len, slots); + 
+start_slots_free: + mas_unlock(&mas); + +free_leaf: + if (free) + mt_free_rcu(&node->rcu); +} + +/* + * mte_destroy_walk() - Free a tree or sub-tree. + * @enode - the encoded maple node (maple_enode) to start + * @mt - the tree to free - needed for node types. + * + * If the tree is in RCU mode (mt_in_rcu()), mt_destroy_walk() is called with + * free == false and the start node is passed to call_rcu() with mt_free_walk() + * as the callback. Otherwise mt_destroy_walk() is called with free == true. + * + * Must hold the write lock. + */ +static inline void mte_destroy_walk(struct maple_enode *enode, + struct maple_tree *mt) +{ + struct maple_node *node = mte_to_node(enode); + + if (mt_in_rcu(mt)) { + mt_destroy_walk(enode, mt->ma_flags, false); + call_rcu(&node->rcu, mt_free_walk); + } else { + mt_destroy_walk(enode, mt->ma_flags, true); + } +} + +static void mas_wr_store_setup(struct ma_wr_state *wr_mas) +{ + if (!mas_is_start(wr_mas->mas)) { + if (mas_is_none(wr_mas->mas)) { + mas_reset(wr_mas->mas); + } else { + wr_mas->r_max = wr_mas->mas->max; + wr_mas->type = mte_node_type(wr_mas->mas->node); + if (mas_is_span_wr(wr_mas)) + mas_reset(wr_mas->mas); + } + } + +} + +/* Interface */ + +/** + * mas_store() - Store an @entry. + * @mas: The maple state. + * @entry: The entry to store. + * + * The @mas->index and @mas->last are used to set the range for the @entry. + * Note: The @mas should have pre-allocated entries to ensure there is memory to + * store the entry. Please see mas_expected_entries()/mas_destroy() for more details. + * + * Return: the first entry between mas->index and mas->last or %NULL. + */ +void *mas_store(struct ma_state *mas, void *entry) +{ + MA_WR_STATE(wr_mas, mas, entry); + + trace_ma_write(__func__, mas, 0, entry); +#ifdef CONFIG_DEBUG_MAPLE_TREE + if (mas->index > mas->last) + pr_err("Error %lu > %lu %p\n", mas->index, mas->last, entry); + MT_BUG_ON(mas->tree, mas->index > mas->last); + if (mas->index > mas->last) { + mas_set_err(mas, -EINVAL); + return NULL; + } + +#endif + + /* + * Storing is the same operation as insert with the added caveat that it + * can overwrite entries. 
Although this seems simple enough, one may + * want to examine what happens if a single store operation was to + * overwrite multiple entries within a self-balancing B-Tree. + */ + mas_wr_store_setup(&wr_mas); + mas_wr_store_entry(&wr_mas); + return wr_mas.content; +} + +/** + * mas_store_gfp() - Store a value into the tree. + * @mas: The maple state + * @entry: The entry to store + * @gfp: The GFP_FLAGS to use for allocations if necessary. + * + * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not + * be allocated. + */ +int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) +{ + MA_WR_STATE(wr_mas, mas, entry); + + mas_wr_store_setup(&wr_mas); + trace_ma_write(__func__, mas, 0, entry); +retry: + mas_wr_store_entry(&wr_mas); + if (unlikely(mas_nomem(mas, gfp))) + goto retry; + + if (unlikely(mas_is_err(mas))) + return xa_err(mas->node); + + return 0; +} + +/** + * mas_store_prealloc() - Store a value into the tree using memory + * preallocated in the maple state. + * @mas: The maple state + * @entry: The entry to store. + */ +void mas_store_prealloc(struct ma_state *mas, void *entry) +{ + MA_WR_STATE(wr_mas, mas, entry); + + mas_wr_store_setup(&wr_mas); + trace_ma_write(__func__, mas, 0, entry); + mas_wr_store_entry(&wr_mas); + BUG_ON(mas_is_err(mas)); + mas_destroy(mas); +} + +/** + * mas_preallocate() - Preallocate enough nodes for a store operation + * @mas: The maple state + * @entry: The entry that will be stored + * @gfp: The GFP_FLAGS to use for allocations. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated. 
+ */ +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +{ + int ret; + + mas_node_count_gfp(mas, 1 + mas_mt_height(mas) * 3, gfp); + if (likely(!mas_is_err(mas))) + return 0; + + mas_set_alloc_req(mas, 0); + ret = xa_err(mas->node); + mas_reset(mas); + mas_destroy(mas); + mas_reset(mas); + return ret; +} + +/* + * mas_expected_entries() - Set the expected number of entries that will be inserted. + * @mas: The maple state + * @nr_entries: The number of expected entries. + * + * This will attempt to pre-allocate enough nodes to store the expected number + * of entries. The allocations will occur using the bulk allocator interface + * for speed. Please call mas_destroy() on the @mas after inserting the entries + * to ensure any unused nodes are freed. + * + * Return: 0 on success, -ENOMEM if memory could not be allocated. + */ +int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) +{ + int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2; + struct maple_enode *enode = mas->node; + int nr_nodes; + int ret; + + /* + * Sometimes it is necessary to duplicate a tree to a new tree, such as + * forking a process and duplicating the VMAs from one tree to a new + * tree. When such a situation arises, it is known that the new tree is + * not going to be used until the entire tree is populated. For + * performance reasons, it is best to use a bulk load with RCU disabled. + * This allows for optimistic splitting that favours the left and reuse + * of nodes during the operation. + */ + + /* Optimize splitting for bulk insert in-order */ + mas->mas_flags |= MA_STATE_BULK; + + /* + * Avoid overflow, assume a gap between each entry and a trailing null. + * If this is wrong, it just means allocation can happen during + * insertion of entries. 
+ */ + nr_nodes = max(nr_entries, nr_entries * 2 + 1); + + if (!mt_is_alloc(mas->tree)) + nonleaf_cap = MAPLE_RANGE64_SLOTS - 2; + + /* Leaves */ + nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 1); + /* Internal nodes */ + nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); + mas_node_count(mas, nr_nodes); + + if (!mas_is_err(mas)) + return 0; + + ret = xa_err(mas->node); + mas->node = enode; + return ret; + +} + +/* + * mas_destroy() - destroy a maple state. + * @mas: The maple state + * + * Upon completion, check the left-most node and rebalance against the node to + * the right if necessary. Frees any allocated nodes associated with this maple + * state. + */ +void mas_destroy(struct ma_state *mas) +{ + struct maple_alloc *node; + + /* + * When using mas_for_each() to insert an expected number of elements, + * it is possible that the number inserted is less than the expected + * number. To fix an invalid final node, a check is performed here to + * rebalance the previous node with the final node. + */ + if (mas->mas_flags & MA_STATE_REBALANCE) { + unsigned char end; + + if (mas_is_start(mas)) + mas_start(mas); + + mtree_range_walk(mas); + end = mas_data_end(mas) + 1; + if (end < mt_min_slot_count(mas->node) - 1) + mas_destroy_rebalance(mas, end); + + mas->mas_flags &= ~MA_STATE_REBALANCE; + } + mas->mas_flags &= ~MA_STATE_BULK; + + while (mas->alloc && !((unsigned long)mas->alloc & 0x1)) { + node = mas->alloc; + mas->alloc = node->slot[0]; + if (node->node_count > 0) + mt_free_bulk(node->node_count, + (void __rcu **)&node->slot[1]); + kmem_cache_free(maple_node_cache, node); + } + mas->alloc = NULL; +} + +/** + * mas_next() - Get the next entry. + * @mas: The maple state + * @max: The maximum index to check. + * + * Returns the next entry after @mas->index. + * Must hold rcu_read_lock or the write lock. + * Can return the zero entry. 
+ * + * Return: The next entry or %NULL + */ +void *mas_next(struct ma_state *mas, unsigned long max) +{ + if (mas_is_none(mas) || mas_is_paused(mas)) + mas->node = MAS_START; + + if (mas_is_start(mas)) + mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ + + if (mas_is_ptr(mas)) { + if (!mas->index) { + mas->index = 1; + mas->last = ULONG_MAX; + } + return NULL; + } + + if (mas->last == ULONG_MAX) + return NULL; + + /* Retries on dead nodes handled by mas_next_entry */ + return mas_next_entry(mas, max); +} +EXPORT_SYMBOL_GPL(mas_next); + +/** + * mt_next() - get the next value in the maple tree + * @mt: The maple tree + * @index: The start index + * @max: The maximum index to check + * + * Takes the RCU read lock around the call to mas_next(). + * + * Return: The next entry after @index as returned by mas_next(), or %NULL if + * nothing is found. + */ +void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) +{ + void *entry = NULL; + MA_STATE(mas, mt, index, index); + + rcu_read_lock(); + entry = mas_next(&mas, max); + rcu_read_unlock(); + return entry; +} +EXPORT_SYMBOL_GPL(mt_next); + +/** + * mas_prev() - Get the previous entry + * @mas: The maple state + * @min: The minimum value to check. + * + * Must hold rcu_read_lock or the write lock. + * Will reset mas to MAS_START if the node is MAS_NONE. Will stop on not + * searchable nodes. + * + * Return: the previous value or %NULL. 
+ */ +void *mas_prev(struct ma_state *mas, unsigned long min) +{ + if (!mas->index) { + /* Nothing comes before 0 */ + mas->last = 0; + return NULL; + } + + if (unlikely(mas_is_ptr(mas))) + return NULL; + + if (mas_is_none(mas) || mas_is_paused(mas)) + mas->node = MAS_START; + + if (mas_is_start(mas)) { + mas_walk(mas); + if (!mas->index) + return NULL; + } + + if (mas_is_ptr(mas)) { + if (!mas->index) { + mas->last = 0; + return NULL; + } + + mas->index = mas->last = 0; + return mas_root_locked(mas); + } + return mas_prev_entry(mas, min); +} +EXPORT_SYMBOL_GPL(mas_prev); + +/** + * mt_prev() - get the previous value in the maple tree + * @mt: The maple tree + * @index: The start index + * @min: The minimum index to check + * + * Takes the RCU read lock around the call to mas_prev(). + * + * Return: The previous entry before @index as returned by mas_prev(), or %NULL + * if nothing is found. + */ +void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) +{ + void *entry = NULL; + MA_STATE(mas, mt, index, index); + + rcu_read_lock(); + entry = mas_prev(&mas, min); + rcu_read_unlock(); + return entry; +} +EXPORT_SYMBOL_GPL(mt_prev); + +/** + * mas_pause() - Pause a mas_find/mas_for_each to drop the lock. + * @mas: The maple state to pause + * + * Some users need to pause a walk and drop the lock they're holding in + * order to yield to a higher priority thread or carry out an operation + * on an entry. Those users should call this function before they drop + * the lock. It resets the @mas to be suitable for the next iteration + * of the loop after the user has reacquired the lock. If most entries + * found during a walk require you to call mas_pause(), the mt_for_each() + * iterator may be more appropriate. + * + */ +void mas_pause(struct ma_state *mas) +{ + mas->node = MAS_PAUSE; +} +EXPORT_SYMBOL_GPL(mas_pause); + +/** + * mas_find() - On the first call, find the entry at or after mas->index up to + * %max. Otherwise, find the entry after mas->index. + * @mas: The maple state + * @max: The maximum value to check. 
+ * + * Must hold rcu_read_lock or the write lock. + * If an entry exists, last and index are updated accordingly. + * May set @mas->node to MAS_NONE. + * + * Return: The entry or %NULL. + */ +void *mas_find(struct ma_state *mas, unsigned long max) +{ + if (unlikely(mas_is_paused(mas))) { + if (unlikely(mas->last == ULONG_MAX)) { + mas->node = MAS_NONE; + return NULL; + } + mas->node = MAS_START; + mas->index = ++mas->last; + } + + if (unlikely(mas_is_start(mas))) { + /* First run or continue */ + void *entry; + + if (mas->index > max) + return NULL; + + entry = mas_walk(mas); + if (entry) + return entry; + } + + if (unlikely(!mas_searchable(mas))) + return NULL; + + /* Retries on dead nodes handled by mas_next_entry */ + return mas_next_entry(mas, max); +} +EXPORT_SYMBOL_GPL(mas_find); + +/** + * mas_find_rev() - On the first call, find the first non-null entry at or below + * mas->index down to %min. Otherwise find the first non-null entry below + * mas->index down to %min. + * @mas: The maple state + * @min: The minimum value to check. + * + * Must hold rcu_read_lock or the write lock. + * If an entry exists, last and index are updated accordingly. + * May set @mas->node to MAS_NONE. + * A paused state resumes the search at mas->index - 1. + * + * Return: The entry or %NULL. + */ +void *mas_find_rev(struct ma_state *mas, unsigned long min) +{ + if (unlikely(mas_is_paused(mas))) { + /* + * Nothing can be found below @min, and decrementing an index of + * 0 would wrap around to ULONG_MAX. + */ + if (unlikely(mas->index <= min)) { + mas->node = MAS_NONE; + return NULL; + } + mas->node = MAS_START; + mas->last = --mas->index; + } + + if (unlikely(mas_is_start(mas))) { + /* First run or continue */ + void *entry; + + if (mas->index < min) + return NULL; + + entry = mas_walk(mas); + if (entry) + return entry; + } + + if (unlikely(!mas_searchable(mas))) + return NULL; + + if (mas->index < min) + return NULL; + + /* Retries on dead nodes are left to mas_prev_entry */ + return mas_prev_entry(mas, min); +} +EXPORT_SYMBOL_GPL(mas_find_rev); + +/** + * mas_erase() - Find the range in which index resides and erase the entire + * range. 
+ * @mas: The maple state + * + * Must hold the write lock. + * Searches for @mas->index, sets @mas->index and @mas->last to the range and + * erases that range. + * + * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated. + */ +void *mas_erase(struct ma_state *mas) +{ + void *entry; + MA_WR_STATE(wr_mas, mas, NULL); + + if (mas_is_none(mas) || mas_is_paused(mas)) + mas->node = MAS_START; + + /* Retry unnecessary when holding the write lock. */ + entry = mas_state_walk(mas); + if (!entry) + return NULL; + +write_retry: + /* Must reset to ensure spanning writes of last slot are detected */ + mas_reset(mas); + mas_wr_store_setup(&wr_mas); + mas_wr_store_entry(&wr_mas); + if (mas_nomem(mas, GFP_KERNEL)) + goto write_retry; + + return entry; +} +EXPORT_SYMBOL_GPL(mas_erase); + +/** + * mas_nomem() - Check if there was an error allocating and do the allocation + * if necessary If there are allocations, then free them. + * @mas: The maple state + * @gfp: The GFP_FLAGS to use for allocations + * Return: true on allocation, false otherwise. 
+ */ +bool mas_nomem(struct ma_state *mas, gfp_t gfp) + __must_hold(mas->tree->lock) +{ + if (likely(mas->node != MA_ERROR(-ENOMEM))) { + mas_destroy(mas); + return false; + } + + if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { + mtree_unlock(mas->tree); + mas_alloc_nodes(mas, gfp); + mtree_lock(mas->tree); + } else { + mas_alloc_nodes(mas, gfp); + } + + if (!mas_allocated(mas)) + return false; + + mas->node = MAS_START; + return true; +} + +void __init maple_tree_init(void) +{ + maple_node_cache = kmem_cache_create("maple_node", + sizeof(struct maple_node), sizeof(struct maple_node), + SLAB_PANIC, NULL); +} + +/** + * mtree_load() - Load a value stored in a maple tree + * @mt: The maple tree + * @index: The index to load + * + * Takes the RCU read lock for the lookup. The walk is restarted when + * mtree_lookup_walk() finds nothing and leaves the state at the start. + * When the state is a pointer (mas_is_ptr()) after mas_start(), only @index 0 + * returns that entry. A zero entry (xa_is_zero()) is returned as %NULL. + * + * Return: the entry or %NULL + */ +void *mtree_load(struct maple_tree *mt, unsigned long index) +{ + MA_STATE(mas, mt, index, index); + void *entry; + + trace_ma_read(__func__, &mas); + rcu_read_lock(); +retry: + entry = mas_start(&mas); + if (unlikely(mas_is_none(&mas))) + goto unlock; + + if (unlikely(mas_is_ptr(&mas))) { + if (index) + entry = NULL; + + goto unlock; + } + + entry = mtree_lookup_walk(&mas); + if (!entry && unlikely(mas_is_start(&mas))) + goto retry; +unlock: + rcu_read_unlock(); + if (xa_is_zero(entry)) + return NULL; + + return entry; +} +EXPORT_SYMBOL(mtree_load); + +/** + * mtree_store_range() - Store an entry at a given range. + * @mt: The maple tree + * @index: The start of the range + * @last: The end of the range + * @entry: The entry to store + * @gfp: The GFP_FLAGS to use for allocations + * + * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not + * be allocated. 
+ */ +int mtree_store_range(struct maple_tree *mt, unsigned long index, + unsigned long last, void *entry, gfp_t gfp) +{ + MA_STATE(mas, mt, index, last); + MA_WR_STATE(wr_mas, &mas, entry); + + trace_ma_write(__func__, &mas, 0, entry); + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return -EINVAL; + + if (index > last) + return -EINVAL; + + mtree_lock(mt); +retry: + mas_wr_store_entry(&wr_mas); + if (mas_nomem(&mas, gfp)) + goto retry; + + mtree_unlock(mt); + if (mas_is_err(&mas)) + return xa_err(mas.node); + + return 0; +} +EXPORT_SYMBOL(mtree_store_range); + +/** + * mtree_store() - Store an entry at a given index. + * @mt: The maple tree + * @index: The index to store the value + * @entry: The entry to store + * @gfp: The GFP_FLAGS to use for allocations + * + * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not + * be allocated. + */ +int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, + gfp_t gfp) +{ + return mtree_store_range(mt, index, index, entry, gfp); +} +EXPORT_SYMBOL(mtree_store); + +/** + * mtree_insert_range() - Insert an entry at a give range if there is no value. + * @mt: The maple tree + * @first: The start of the range + * @last: The end of the range + * @entry: The entry to store + * @gfp: The GFP_FLAGS to use for allocations. + * + * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid + * request, -ENOMEM if memory could not be allocated. 
+ */ +int mtree_insert_range(struct maple_tree *mt, unsigned long first, + unsigned long last, void *entry, gfp_t gfp) +{ + MA_STATE(ms, mt, first, last); + + if (WARN_ON_ONCE(xa_is_advanced(entry))) + return -EINVAL; + + if (first > last) + return -EINVAL; + + mtree_lock(mt); +retry: + mas_insert(&ms, entry); + if (mas_nomem(&ms, gfp)) + goto retry; + + mtree_unlock(mt); + if (mas_is_err(&ms)) + return xa_err(ms.node); + + return 0; +} +EXPORT_SYMBOL(mtree_insert_range); + +/** + * mtree_insert() - Insert an entry at a given index if there is no value. + * @mt: The maple tree + * @index: The index to store the value + * @entry: The entry to store + * @gfp: The GFP_FLAGS to use for allocations. + * + * Return: 0 on success, -EEXIST if the range is occupied, -EINVAL on invalid + * request, -ENOMEM if memory could not be allocated. + */ +int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, + gfp_t gfp) +{ + return mtree_insert_range(mt, index, index, entry, gfp); +} +EXPORT_SYMBOL(mtree_insert); + +int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp) +{ + int ret = 0; + + MA_STATE(mas, mt, min, max - size); + if (!mt_is_alloc(mt)) + return -EINVAL; + + if (WARN_ON_ONCE(mt_is_reserved(entry))) + return -EINVAL; + + if (min > max) + return -EINVAL; + + if (max < size) + return -EINVAL; + + if (!size) + return -EINVAL; + + mtree_lock(mt); +retry: + mas.offset = 0; + mas.index = min; + mas.last = max - size; + ret = mas_alloc(&mas, entry, size, startp); + if (mas_nomem(&mas, gfp)) + goto retry; + + mtree_unlock(mt); + return ret; +} +EXPORT_SYMBOL(mtree_alloc_range); + +int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long size, unsigned long min, + unsigned long max, gfp_t gfp) +{ + int ret = 0; + + MA_STATE(mas, mt, min, max - size); + if (!mt_is_alloc(mt)) + return -EINVAL; + + if 
(WARN_ON_ONCE(mt_is_reserved(entry))) + return -EINVAL; + + if (min >= max) + return -EINVAL; + + if (max < size - 1) + return -EINVAL; + + if (!size) + return -EINVAL; + + mtree_lock(mt); +retry: + ret = mas_rev_alloc(&mas, min, max, entry, size, startp); + if (mas_nomem(&mas, gfp)) + goto retry; + + mtree_unlock(mt); + return ret; +} +EXPORT_SYMBOL(mtree_alloc_rrange); + +/** + * mtree_erase() - Find an index and erase the entire range. + * @mt: The maple tree + * @index: The index to erase + * + * Erasing is the same as a walk to an entry then a store of a NULL to that + * ENTIRE range. In fact, it is implemented as such using the advanced API. + * + * Return: The entry stored at the @index or %NULL + */ +void *mtree_erase(struct maple_tree *mt, unsigned long index) +{ + void *entry = NULL; + + MA_STATE(mas, mt, index, index); + trace_ma_op(__func__, &mas); + + mtree_lock(mt); + entry = mas_erase(&mas); + mtree_unlock(mt); + + return entry; +} +EXPORT_SYMBOL(mtree_erase); + +/** + * __mt_destroy() - Walk and free all nodes of a locked maple tree. + * @mt: The maple tree + * + * Note: Does not handle locking. + */ +void __mt_destroy(struct maple_tree *mt) +{ + void *root = mt_root_locked(mt); + + rcu_assign_pointer(mt->ma_root, NULL); + if (xa_is_node(root)) + mte_destroy_walk(root, mt); + + mt->ma_flags = 0; +} +EXPORT_SYMBOL_GPL(__mt_destroy); + +/** + * mtree_destroy() - Destroy a maple tree + * @mt: The maple tree + * + * Frees all resources used by the tree. Handles locking. + */ +void mtree_destroy(struct maple_tree *mt) +{ + mtree_lock(mt); + __mt_destroy(mt); + mtree_unlock(mt); +} +EXPORT_SYMBOL(mtree_destroy); + +/** + * mt_find() - Search from the start up until an entry is found. + * @mt: The maple tree + * @index: Pointer which contains the start location of the search + * @max: The maximum value to check + * + * Handles locking. @index will be incremented to one beyond the range. 
+ * + * Return: The entry at or after the @index or %NULL + */ +void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) +{ + MA_STATE(mas, mt, *index, *index); + void *entry; +#ifdef CONFIG_DEBUG_MAPLE_TREE + unsigned long copy = *index; +#endif + + trace_ma_read(__func__, &mas); + + if ((*index) > max) + return NULL; + + rcu_read_lock(); +retry: + entry = mas_state_walk(&mas); + if (mas_is_start(&mas)) + goto retry; + + if (unlikely(xa_is_zero(entry))) + entry = NULL; + + if (entry) + goto unlock; + + while (mas_searchable(&mas) && (mas.index < max)) { + entry = mas_next_entry(&mas, max); + if (likely(entry && !xa_is_zero(entry))) + break; + } + + if (unlikely(xa_is_zero(entry))) + entry = NULL; +unlock: + rcu_read_unlock(); + if (likely(entry)) { + *index = mas.last + 1; +#ifdef CONFIG_DEBUG_MAPLE_TREE + if ((*index) && (*index) <= copy) + pr_err("index not increased! %lx <= %lx\n", + *index, copy); + MT_BUG_ON(mt, (*index) && ((*index) <= copy)); +#endif + } + + return entry; +} +EXPORT_SYMBOL(mt_find); + +/** + * mt_find_after() - Search from the start up until an entry is found. 
+ * @mt: The maple tree + * @index: Pointer which contains the start location of the search + * @max: The maximum value to check + * + * Handles locking, detects wrapping on index == 0 + * + * Return: The entry at or after the @index or %NULL + */ +void *mt_find_after(struct maple_tree *mt, unsigned long *index, + unsigned long max) +{ + if (!(*index)) + return NULL; + + return mt_find(mt, index, max); +} +EXPORT_SYMBOL(mt_find_after); + +#ifdef CONFIG_DEBUG_MAPLE_TREE +atomic_t maple_tree_tests_run; +EXPORT_SYMBOL_GPL(maple_tree_tests_run); +atomic_t maple_tree_tests_passed; +EXPORT_SYMBOL_GPL(maple_tree_tests_passed); + +#ifndef __KERNEL__ +extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int); +void mt_set_non_kernel(unsigned int val) +{ + kmem_cache_set_non_kernel(maple_node_cache, val); +} + +extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); +unsigned long mt_get_alloc_size(void) +{ + return kmem_cache_get_alloc(maple_node_cache); +} + +extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *); +void mt_zero_nr_tallocated(void) +{ + kmem_cache_zero_nr_tallocated(maple_node_cache); +} + +extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *); +unsigned int mt_nr_tallocated(void) +{ + return kmem_cache_nr_tallocated(maple_node_cache); +} + +extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *); +unsigned int mt_nr_allocated(void) +{ + return kmem_cache_nr_allocated(maple_node_cache); +} + +/* + * mas_dead_node() - Check if the maple state is pointing to a dead node. + * @mas: The maple state + * @index: The index to restore in @mas. + * + * Used in test code. + * Return: 1 if @mas has been reset to MAS_START, 0 otherwise. 
+ */ +static inline int mas_dead_node(struct ma_state *mas, unsigned long index) +{ + if (unlikely(!mas_searchable(mas) || mas_is_start(mas))) + return 0; + + if (likely(!mte_dead_node(mas->node))) + return 0; + + mas_rewalk(mas, index); + return 1; +} +#endif /* not defined __KERNEL__ */ + +/* + * mas_get_slot() - Get the entry in the maple state node stored at @offset. + * @mas: The maple state + * @offset: The offset into the slot array to fetch. + * + * Return: The entry stored at @offset. + */ +static inline struct maple_enode *mas_get_slot(struct ma_state *mas, + unsigned char offset) +{ + return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)), + offset); +} + + +/* + * mas_first_entry() - Go the first leaf and find the first entry. + * @mas: the maple state. + * @limit: the maximum index to check. + * @*r_start: Pointer to set to the range start. + * + * Sets mas->offset to the offset of the entry, r_start to the range minimum. + * + * Return: The first entry or MAS_NONE. 
+ */ +static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, + unsigned long limit, enum maple_type mt) + +{ + unsigned long max; + unsigned long *pivots; + void __rcu **slots; + void *entry = NULL; + + mas->index = mas->min; + if (mas->index > limit) + goto none; + + max = mas->max; + mas->offset = 0; + while (likely(!ma_is_leaf(mt))) { + MT_BUG_ON(mas->tree, mte_dead_node(mas->node)); + slots = ma_slots(mn, mt); + pivots = ma_pivots(mn, mt); + max = pivots[0]; + entry = mas_slot(mas, slots, 0); + if (unlikely(ma_dead_node(mn))) + return NULL; + mas->node = entry; + mn = mas_mn(mas); + mt = mte_node_type(mas->node); + } + MT_BUG_ON(mas->tree, mte_dead_node(mas->node)); + + mas->max = max; + slots = ma_slots(mn, mt); + entry = mas_slot(mas, slots, 0); + if (unlikely(ma_dead_node(mn))) + return NULL; + + /* Slot 0 or 1 must be set */ + if (mas->index > limit) + goto none; + + if (likely(entry)) + return entry; + + pivots = ma_pivots(mn, mt); + mas->index = pivots[0] + 1; + mas->offset = 1; + entry = mas_slot(mas, slots, 1); + if (unlikely(ma_dead_node(mn))) + return NULL; + + if (mas->index > limit) + goto none; + + if (likely(entry)) + return entry; + +none: + if (likely(!ma_dead_node(mn))) + mas->node = MAS_NONE; + return NULL; +} + +/* Depth first search, post-order */ +static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) +{ + + struct maple_enode *p = MAS_NONE, *mn = mas->node; + unsigned long p_min, p_max; + + mas_next_node(mas, mas_mn(mas), max); + if (!mas_is_none(mas)) + return; + + if (mte_is_root(mn)) + return; + + mas->node = mn; + mas_ascend(mas); + while (mas->node != MAS_NONE) { + p = mas->node; + p_min = mas->min; + p_max = mas->max; + mas_prev_node(mas, 0); + } + + if (p == MAS_NONE) + return; + + mas->node = p; + mas->max = p_max; + mas->min = p_min; +} + +/* Tree validations */ +static void mt_dump_node(const struct maple_tree *mt, void *entry, + unsigned long min, unsigned long max, unsigned int depth); 
+static void mt_dump_range(unsigned long min, unsigned long max, + unsigned int depth) +{ + static const char spaces[] = " "; + + if (min == max) + pr_info("%.*s%lu: ", depth * 2, spaces, min); + else + pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); +} + +static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, + unsigned int depth) +{ + mt_dump_range(min, max, depth); + + if (xa_is_value(entry)) + pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry), + xa_to_value(entry), entry); + else if (xa_is_zero(entry)) + pr_cont("zero (%ld)\n", xa_to_internal(entry)); + else if (mt_is_reserved(entry)) + pr_cont("UNKNOWN ENTRY (%p)\n", entry); + else + pr_cont("%p\n", entry); +} + +static void mt_dump_range64(const struct maple_tree *mt, void *entry, + unsigned long min, unsigned long max, unsigned int depth) +{ + struct maple_range_64 *node = &mte_to_node(entry)->mr64; + bool leaf = mte_is_leaf(entry); + unsigned long first = min; + int i; + + pr_cont(" contents: "); + for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) + pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + pr_cont("%p\n", node->slot[i]); + for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { + unsigned long last = max; + + if (i < (MAPLE_RANGE64_SLOTS - 1)) + last = node->pivot[i]; + else if (!node->slot[i] && max != mt_max[mte_node_type(entry)]) + break; + if (last == 0 && i > 0) + break; + if (leaf) + mt_dump_entry(mt_slot(mt, node->slot, i), + first, last, depth + 1); + else if (node->slot[i]) + mt_dump_node(mt, mt_slot(mt, node->slot, i), + first, last, depth + 1); + + if (last == max) + break; + if (last > max) { + pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + node, last, max, i); + break; + } + first = last + 1; + } +} + +static void mt_dump_arange64(const struct maple_tree *mt, void *entry, + unsigned long min, unsigned long max, unsigned int depth) +{ + struct maple_arange_64 *node = &mte_to_node(entry)->ma64; + bool leaf = mte_is_leaf(entry); + unsigned long first = min; 
+ int i; + + pr_cont(" contents: "); + for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) + pr_cont("%lu ", node->gap[i]); + pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); + for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) + pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + pr_cont("%p\n", node->slot[i]); + for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { + unsigned long last = max; + + if (i < (MAPLE_ARANGE64_SLOTS - 1)) + last = node->pivot[i]; + else if (!node->slot[i]) + break; + if (last == 0 && i > 0) + break; + if (leaf) + mt_dump_entry(mt_slot(mt, node->slot, i), + first, last, depth + 1); + else if (node->slot[i]) + mt_dump_node(mt, mt_slot(mt, node->slot, i), + first, last, depth + 1); + + if (last == max) + break; + if (last > max) { + pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + node, last, max, i); + break; + } + first = last + 1; + } +} + +static void mt_dump_node(const struct maple_tree *mt, void *entry, + unsigned long min, unsigned long max, unsigned int depth) +{ + struct maple_node *node = mte_to_node(entry); + unsigned int type = mte_node_type(entry); + unsigned int i; + + mt_dump_range(min, max, depth); + + pr_cont("node %p depth %d type %d parent %p", node, depth, type, + node ? 
node->parent : NULL); + switch (type) { + case maple_dense: + pr_cont("\n"); + for (i = 0; i < MAPLE_NODE_SLOTS; i++) { + if (min + i > max) + pr_cont("OUT OF RANGE: "); + mt_dump_entry(mt_slot(mt, node->slot, i), + min + i, min + i, depth); + } + break; + case maple_leaf_64: + case maple_range_64: + mt_dump_range64(mt, entry, min, max, depth); + break; + case maple_arange_64: + mt_dump_arange64(mt, entry, min, max, depth); + break; + + default: + pr_cont(" UNKNOWN TYPE\n"); + } +} + +void mt_dump(const struct maple_tree *mt) +{ + void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); + + pr_info("maple_tree(%p) flags %X, height %u root %p\n", + mt, mt->ma_flags, mt_height(mt), entry); + if (!xa_is_node(entry)) + mt_dump_entry(entry, 0, 0, 0); + else if (entry) + mt_dump_node(mt, entry, 0, mt_max[mte_node_type(entry)], 0); +} + +/* + * Calculate the maximum gap in a node and check if that's what is reported in + * the parent (unless root). + */ +static void mas_validate_gaps(struct ma_state *mas) +{ + struct maple_enode *mte = mas->node; + struct maple_node *p_mn; + unsigned long gap = 0, max_gap = 0; + unsigned long p_end, p_start = mas->min; + unsigned char p_slot; + unsigned long *gaps = NULL; + unsigned long *pivots = ma_pivots(mte_to_node(mte), mte_node_type(mte)); + int i; + + if (ma_is_dense(mte_node_type(mte))) { + for (i = 0; i < mt_slot_count(mte); i++) { + if (mas_get_slot(mas, i)) { + if (gap > max_gap) + max_gap = gap; + gap = 0; + continue; + } + gap++; + } + goto counted; + } + + gaps = ma_gaps(mte_to_node(mte), mte_node_type(mte)); + for (i = 0; i < mt_slot_count(mte); i++) { + p_end = mas_logical_pivot(mas, pivots, i, mte_node_type(mte)); + + if (!gaps) { + if (mas_get_slot(mas, i)) { + gap = 0; + goto not_empty; + } + + gap += p_end - p_start + 1; + } else { + void *entry = mas_get_slot(mas, i); + + gap = gaps[i]; + if (!entry) { + if (gap != p_end - p_start + 1) { + pr_err("%p[%u] -> %p %lu != %lu - %lu + 1\n", + mas_mn(mas), i, + 
mas_get_slot(mas, i), gap, + p_end, p_start); + mt_dump(mas->tree); + + MT_BUG_ON(mas->tree, + gap != p_end - p_start + 1); + } + } else { + if (gap > p_end - p_start + 1) { + pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", + mas_mn(mas), i, gap, p_end, p_start, + p_end - p_start + 1); + MT_BUG_ON(mas->tree, + gap > p_end - p_start + 1); + } + } + } + + if (gap > max_gap) + max_gap = gap; +not_empty: + p_start = p_end + 1; + if (p_end >= mas->max) + break; + } + +counted: + if (mte_is_root(mte)) + return; + + p_slot = mte_parent_slot(mas->node); + p_mn = mte_parent(mte); + MT_BUG_ON(mas->tree, max_gap > mas->max); + if (ma_gaps(p_mn, mas_parent_enum(mas, mte))[p_slot] != max_gap) { + pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap); + mt_dump(mas->tree); + } + + MT_BUG_ON(mas->tree, + ma_gaps(p_mn, mas_parent_enum(mas, mte))[p_slot] != max_gap); +} + +static void mas_validate_parent_slot(struct ma_state *mas) +{ + struct maple_node *parent; + struct maple_enode *node; + enum maple_type p_type = mas_parent_enum(mas, mas->node); + unsigned char p_slot = mte_parent_slot(mas->node); + void __rcu **slots; + int i; + + if (mte_is_root(mas->node)) + return; + + parent = mte_parent(mas->node); + slots = ma_slots(parent, p_type); + MT_BUG_ON(mas->tree, mas_mn(mas) == parent); + + /* Check prev/next parent slot for duplicate node entry */ + + for (i = 0; i < mt_slots[p_type]; i++) { + node = mas_slot(mas, slots, i); + if (i == p_slot) { + if (node != mas->node) + pr_err("parent %p[%u] does not have %p\n", + parent, i, mas_mn(mas)); + MT_BUG_ON(mas->tree, node != mas->node); + } else if (node == mas->node) { + pr_err("Invalid child %p at parent %p[%u] p_slot %u\n", + mas_mn(mas), parent, i, p_slot); + MT_BUG_ON(mas->tree, node == mas->node); + } + } +} + +static void mas_validate_child_slot(struct ma_state *mas) +{ + enum maple_type type = mte_node_type(mas->node); + void __rcu **slots = ma_slots(mte_to_node(mas->node), type); + unsigned long *pivots = 
ma_pivots(mte_to_node(mas->node), type); + struct maple_enode *child; + unsigned char i; + + if (mte_is_leaf(mas->node)) + return; + + for (i = 0; i < mt_slots[type]; i++) { + child = mas_slot(mas, slots, i); + if (!pivots[i] || pivots[i] == mas->max) + break; + + if (!child) + break; + + if (mte_parent_slot(child) != i) { + pr_err("Slot error at %p[%u]: child %p has pslot %u\n", + mas_mn(mas), i, mte_to_node(child), + mte_parent_slot(child)); + MT_BUG_ON(mas->tree, 1); + } + + if (mte_parent(child) != mte_to_node(mas->node)) { + pr_err("child %p has parent %p not %p\n", + mte_to_node(child), mte_parent(child), + mte_to_node(mas->node)); + MT_BUG_ON(mas->tree, 1); + } + } +} + +/* + * Validate all pivots are within mas->min and mas->max. + */ +static void mas_validate_limits(struct ma_state *mas) +{ + int i; + unsigned long prev_piv = 0; + enum maple_type type = mte_node_type(mas->node); + void __rcu **slots = ma_slots(mte_to_node(mas->node), type); + unsigned long *pivots = ma_pivots(mas_mn(mas), type); + + /* all limits are fine here. 
*/ + if (mte_is_root(mas->node)) + return; + + for (i = 0; i < mt_slots[type]; i++) { + unsigned long piv; + + piv = mas_safe_pivot(mas, pivots, i, type); + + if (!piv && (i != 0)) + break; + + if (!mte_is_leaf(mas->node)) { + void *entry = mas_slot(mas, slots, i); + + if (!entry) + pr_err("%p[%u] cannot be null\n", + mas_mn(mas), i); + + MT_BUG_ON(mas->tree, !entry); + } + + if (prev_piv > piv) { + pr_err("%p[%u] piv %lu < prev_piv %lu\n", + mas_mn(mas), i, piv, prev_piv); + MT_BUG_ON(mas->tree, piv < prev_piv); + } + + if (piv < mas->min) { + pr_err("%p[%u] %lu < %lu\n", mas_mn(mas), i, + piv, mas->min); + MT_BUG_ON(mas->tree, piv < mas->min); + } + if (piv > mas->max) { + pr_err("%p[%u] %lu > %lu\n", mas_mn(mas), i, + piv, mas->max); + MT_BUG_ON(mas->tree, piv > mas->max); + } + prev_piv = piv; + if (piv == mas->max) + break; + } + for (i += 1; i < mt_slots[type]; i++) { + void *entry = mas_slot(mas, slots, i); + + if (entry && (i != mt_slots[type] - 1)) { + pr_err("%p[%u] should not have entry %p\n", mas_mn(mas), + i, entry); + MT_BUG_ON(mas->tree, entry != NULL); + } + + if (i < mt_pivots[type]) { + unsigned long piv = pivots[i]; + + if (!piv) + continue; + + pr_err("%p[%u] should not have piv %lu\n", + mas_mn(mas), i, piv); + MT_BUG_ON(mas->tree, i < mt_pivots[type] - 1); + } + } +} + +static void mt_validate_nulls(struct maple_tree *mt) +{ + void *entry, *last = (void *)1; + unsigned char offset = 0; + void __rcu **slots; + MA_STATE(mas, mt, 0, 0); + + mas_start(&mas); + if (mas_is_none(&mas) || (mas.node == MAS_ROOT)) + return; + + while (!mte_is_leaf(mas.node)) + mas_descend(&mas); + + slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); + do { + entry = mas_slot(&mas, slots, offset); + if (!last && !entry) { + pr_err("Sequential nulls end at %p[%u]\n", + mas_mn(&mas), offset); + } + MT_BUG_ON(mt, !last && !entry); + last = entry; + if (offset == mas_data_end(&mas)) { + mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); + if (mas_is_none(&mas)) + 
return; + offset = 0; + slots = ma_slots(mte_to_node(mas.node), + mte_node_type(mas.node)); + } else { + offset++; + } + + } while (!mas_is_none(&mas)); +} + +/* + * validate a maple tree by checking: + * 1. The limits (pivots are within mas->min to mas->max) + * 2. The gap is correctly set in the parents + */ +void mt_validate(struct maple_tree *mt) +{ + unsigned char end; + + MA_STATE(mas, mt, 0, 0); + rcu_read_lock(); + mas_start(&mas); + if (!mas_searchable(&mas)) + goto done; + + mas_first_entry(&mas, mas_mn(&mas), ULONG_MAX, mte_node_type(mas.node)); + while (!mas_is_none(&mas)) { + MT_BUG_ON(mas.tree, mte_dead_node(mas.node)); + if (!mte_is_root(mas.node)) { + end = mas_data_end(&mas); + if ((end < mt_min_slot_count(mas.node)) && + (mas.max != ULONG_MAX)) { + pr_err("Invalid size %u of %p\n", end, + mas_mn(&mas)); + MT_BUG_ON(mas.tree, 1); + } + + } + mas_validate_parent_slot(&mas); + mas_validate_child_slot(&mas); + mas_validate_limits(&mas); + if (mt_is_alloc(mt)) + mas_validate_gaps(&mas); + mas_dfs_postorder(&mas, ULONG_MAX); + } + mt_validate_nulls(mt); +done: + rcu_read_unlock(); + +} + +#endif /* CONFIG_DEBUG_MAPLE_TREE */ diff --git a/tools/testing/radix-tree/.gitignore b/tools/testing/radix-tree/.gitignore index d971516401e68f..c901d96dd013ef 100644 --- a/tools/testing/radix-tree/.gitignore +++ b/tools/testing/radix-tree/.gitignore @@ -6,3 +6,5 @@ main multiorder radix-tree.c xarray +maple +ma_xa_benchmark diff --git a/tools/testing/radix-tree/generated/autoconf.h b/tools/testing/radix-tree/generated/autoconf.h index 2218b3cc184e41..e7da803502362b 100644 --- a/tools/testing/radix-tree/generated/autoconf.h +++ b/tools/testing/radix-tree/generated/autoconf.h @@ -1 +1,2 @@ #define CONFIG_XARRAY_MULTI 1 +#define CONFIG_64BIT 1 diff --git a/tools/testing/radix-tree/linux/maple_tree.h b/tools/testing/radix-tree/linux/maple_tree.h new file mode 100644 index 00000000000000..7d8d1f445b8998 --- /dev/null +++ b/tools/testing/radix-tree/linux/maple_tree.h @@ 
-0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +#define atomic_t int32_t +#include "../../../../include/linux/maple_tree.h" +#define atomic_inc(x) uatomic_inc(x) +#define atomic_read(x) uatomic_read(x) +#define atomic_set(x, y) do {} while (0) +#define U8_MAX UCHAR_MAX diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c new file mode 100644 index 00000000000000..35082671928ad5 --- /dev/null +++ b/tools/testing/radix-tree/maple.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * maple_tree.c: Userspace shim for maple tree test-suite + * Copyright (c) 2018 Liam R. Howlett + */ + +#define CONFIG_DEBUG_MAPLE_TREE +#define CONFIG_MAPLE_SEARCH +#include "test.h" + +#define module_init(x) +#define module_exit(x) +#define MODULE_AUTHOR(x) +#define MODULE_LICENSE(x) +#define dump_stack() assert(0) + +#include "../../../lib/maple_tree.c" +#undef CONFIG_DEBUG_MAPLE_TREE +#include "../../../lib/test_maple_tree.c" + +void farmer_tests(void) +{ + struct maple_node *node; + DEFINE_MTREE(tree); + + mt_dump(&tree); + + tree.ma_root = xa_mk_value(0); + mt_dump(&tree); + + node = mt_alloc_one(GFP_KERNEL); + node->parent = (void *)((unsigned long)(&tree) | 1); + node->slot[0] = xa_mk_value(0); + node->slot[1] = xa_mk_value(1); + node->mr64.pivot[0] = 0; + node->mr64.pivot[1] = 1; + node->mr64.pivot[2] = 0; + tree.ma_root = mt_mk_node(node, maple_leaf_64); + mt_dump(&tree); + + ma_free_rcu(node); +} + +void maple_tree_tests(void) +{ + farmer_tests(); + maple_tree_seed(); + maple_tree_harvest(); +} + +int __weak main(void) +{ + maple_tree_init(); + maple_tree_tests(); + rcu_barrier(); + if (nr_allocated) + printf("nr_allocated = %d\n", nr_allocated); + return 0; +} diff --git a/tools/testing/radix-tree/trace/events/maple_tree.h b/tools/testing/radix-tree/trace/events/maple_tree.h new file mode 100644 index 00000000000000..97d0e1ddcf08ea --- /dev/null +++ b/tools/testing/radix-tree/trace/events/maple_tree.h @@ -0,0 +1,5 @@ +/* 
SPDX-License-Identifier: GPL-2.0+ */ + +#define trace_ma_op(a, b) do {} while (0) +#define trace_ma_read(a, b) do {} while (0) +#define trace_ma_write(a, b, c, d) do {} while (0) From 1bd1d9c088071b2b21d3047541a5b6373dae4070 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:39 +0000 Subject: [PATCH 0751/1250] radix tree test suite: add pr_err define define pr_err to printk Link: https://lkml.kernel.org/r/20220404143501.2016403-2-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220504010716.661115-4-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-3-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- tools/testing/radix-tree/linux/kernel.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 39867fd80c8faa..c5c9d05f29da95 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -14,6 +14,7 @@ #include "../../../include/linux/kconfig.h" #define printk printf +#define pr_err printk #define pr_info printk #define pr_debug printk #define pr_cont printk From 3c95b0ea4b52c13bce2d9a24071cf4fb4ec82283 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:39 +0000 Subject: [PATCH 0752/1250] radix tree test suite: add kmem_cache_set_non_kernel() kmem_cache_set_non_kernel() is a mechanism to allow a certain number of kmem_cache_alloc requests to succeed even when GFP_KERNEL is not set in the flags. This functionality allows for testing different paths through the code.
Link: https://lkml.kernel.org/r/20220504010716.661115-5-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-4-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- tools/testing/radix-tree/linux.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index d5c1bcba86fe00..277aa8b70abce3 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -23,15 +23,26 @@ struct kmem_cache { int nr_objs; void *objs; void (*ctor)(void *); + unsigned int non_kernel; }; +void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) +{ + cachep->non_kernel = val; +} + void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, int gfp) + { void *p; - if (!(gfp & __GFP_DIRECT_RECLAIM)) - return NULL; + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (!cachep->non_kernel) + return NULL; + + cachep->non_kernel--; + } pthread_mutex_lock(&cachep->lock); if (cachep->nr_objs) { @@ -90,5 +101,6 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, ret->nr_objs = 0; ret->objs = NULL; ret->ctor = ctor; + ret->non_kernel = 0; return ret; } From e55102778f2fe06d204aef7275135b133c77f10e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:40 +0000 Subject: [PATCH 0753/1250] radix tree test suite: add allocation counts and size to kmem_cache Add functions to get the number of allocations, and total allocations from a kmem_cache. Also add a function to get the allocated size and a way to zero the total allocations. 
Link: https://lkml.kernel.org/r/20220504010716.661115-6-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-5-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- tools/testing/radix-tree/linux.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index 277aa8b70abce3..f20529ae4dbefe 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -24,6 +24,8 @@ struct kmem_cache { void *objs; void (*ctor)(void *); unsigned int non_kernel; + unsigned long nr_allocated; + unsigned long nr_tallocated; }; void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) @@ -31,9 +33,28 @@ void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) cachep->non_kernel = val; } +unsigned long kmem_cache_get_alloc(struct kmem_cache *cachep) +{ + return cachep->size * cachep->nr_allocated; +} + +unsigned long kmem_cache_nr_allocated(struct kmem_cache *cachep) +{ + return cachep->nr_allocated; +} + +unsigned long kmem_cache_nr_tallocated(struct kmem_cache *cachep) +{ + return cachep->nr_tallocated; +} + +void kmem_cache_zero_nr_tallocated(struct kmem_cache *cachep) +{ + cachep->nr_tallocated = 0; +} + void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, int gfp) - { void *p; @@ -64,7 +85,9 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, memset(p, 0, cachep->size); } + uatomic_inc(&cachep->nr_allocated); uatomic_inc(&nr_allocated); + uatomic_inc(&cachep->nr_tallocated); if (kmalloc_verbose) 
printf("Allocating %p from slab\n", p); return p; @@ -74,6 +97,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) { assert(objp); uatomic_dec(&nr_allocated); + uatomic_dec(&cachep->nr_allocated); if (kmalloc_verbose) printf("Freeing %p to slab\n", objp); pthread_mutex_lock(&cachep->lock); @@ -99,6 +123,8 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, ret->size = size; ret->align = align; ret->nr_objs = 0; + ret->nr_allocated = 0; + ret->nr_tallocated = 0; ret->objs = NULL; ret->ctor = ctor; ret->non_kernel = 0; From 954b2a53d95aa61ab394108af2819410f91b85d7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:40 +0000 Subject: [PATCH 0754/1250] radix tree test suite: add support for slab bulk APIs Add support for kmem_cache_free_bulk() and kmem_cache_alloc_bulk() to the radix tree test suite. Link: https://lkml.kernel.org/r/20220504010716.661115-7-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-6-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- tools/include/linux/slab.h | 4 ++ tools/testing/radix-tree/linux.c | 118 ++++++++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 2 deletions(-) diff --git a/tools/include/linux/slab.h b/tools/include/linux/slab.h index 0616409513eb79..311759ea25e921 100644 --- a/tools/include/linux/slab.h +++ b/tools/include/linux/slab.h @@ -41,4 +41,8 @@ struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align, unsigned int flags, void (*ctor)(void *)); +void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list); +int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, + void **list); + #endif /* _TOOLS_SLAB_H */ diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index f20529ae4dbefe..2048d12c31df36 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -93,14 +93,13 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return p; } -void kmem_cache_free(struct kmem_cache *cachep, void *objp) +void kmem_cache_free_locked(struct kmem_cache *cachep, void *objp) { assert(objp); uatomic_dec(&nr_allocated); uatomic_dec(&cachep->nr_allocated); if (kmalloc_verbose) printf("Freeing %p to slab\n", objp); - pthread_mutex_lock(&cachep->lock); if (cachep->nr_objs > 10 || cachep->align) { memset(objp, POISON_FREE, cachep->size); free(objp); @@ -110,9 +109,80 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) node->parent = cachep->objs; cachep->objs = node; } +} + +void kmem_cache_free(struct kmem_cache *cachep, void *objp) +{ + pthread_mutex_lock(&cachep->lock); + kmem_cache_free_locked(cachep, objp); pthread_mutex_unlock(&cachep->lock); 
} +void kmem_cache_free_bulk(struct kmem_cache *cachep, size_t size, void **list) +{ + if (kmalloc_verbose) + pr_debug("Bulk free %p[0-%lu]\n", list, size - 1); + + pthread_mutex_lock(&cachep->lock); + for (int i = 0; i < size; i++) + kmem_cache_free_locked(cachep, list[i]); + pthread_mutex_unlock(&cachep->lock); +} + +int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, + void **p) +{ + size_t i; + + if (kmalloc_verbose) + pr_debug("Bulk alloc %lu\n", size); + + if (!(gfp & __GFP_DIRECT_RECLAIM)) { + if (cachep->non_kernel < size) + return 0; + + cachep->non_kernel -= size; + } + + pthread_mutex_lock(&cachep->lock); + if (cachep->nr_objs >= size) { + struct radix_tree_node *node; + + for (i = 0; i < size; i++) { + node = cachep->objs; + cachep->nr_objs--; + cachep->objs = node->parent; + p[i] = node; + node->parent = NULL; + } + pthread_mutex_unlock(&cachep->lock); + } else { + pthread_mutex_unlock(&cachep->lock); + for (i = 0; i < size; i++) { + if (cachep->align) { + posix_memalign(&p[i], cachep->align, + cachep->size); /* one object per slot, not the whole batch */ + } else { + p[i] = malloc(cachep->size); /* same size the memset below and the free path use */ + } + if (cachep->ctor) + cachep->ctor(p[i]); + else if (gfp & __GFP_ZERO) + memset(p[i], 0, cachep->size); + } + } + + for (i = 0; i < size; i++) { + uatomic_inc(&nr_allocated); + uatomic_inc(&cachep->nr_allocated); + uatomic_inc(&cachep->nr_tallocated); + if (kmalloc_verbose) + printf("Allocating %p from slab\n", p[i]); + } + + return size; +} + struct kmem_cache * kmem_cache_create(const char *name, unsigned int size, unsigned int align, unsigned int flags, void (*ctor)(void *)) @@ -130,3 +200,47 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, ret->non_kernel = 0; return ret; } + +/* + * Test the test infrastructure for kmem_cache_alloc/free and bulk counterparts.
+ */ +void test_kmem_cache_bulk(void) +{ + int i; + void *list[12]; + static struct kmem_cache *test_cache, *test_cache2; + + /* + * Testing the bulk allocators without aligned kmem_cache to force the + * bulk alloc/free to reuse + */ + test_cache = kmem_cache_create("test_cache", 256, 0, SLAB_PANIC, NULL); + + for (i = 0; i < 5; i++) + list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); + + for (i = 0; i < 5; i++) + kmem_cache_free(test_cache, list[i]); + assert(test_cache->nr_objs == 5); + + kmem_cache_alloc_bulk(test_cache, __GFP_DIRECT_RECLAIM, 5, list); + kmem_cache_free_bulk(test_cache, 5, list); + + for (i = 0; i < 12 ; i++) + list[i] = kmem_cache_alloc(test_cache, __GFP_DIRECT_RECLAIM); + + for (i = 0; i < 12; i++) + kmem_cache_free(test_cache, list[i]); + + /* The last free will not be kept around */ + assert(test_cache->nr_objs == 11); + + /* Aligned caches will immediately free */ + test_cache2 = kmem_cache_create("test_cache2", 128, 128, SLAB_PANIC, NULL); + + kmem_cache_alloc_bulk(test_cache2, __GFP_DIRECT_RECLAIM, 10, list); + kmem_cache_free_bulk(test_cache2, 10, list); + assert(!test_cache2->nr_objs); + + +} From e07d1b9d26e582cfab75092ebcd8e888393144be Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:41 +0000 Subject: [PATCH 0755/1250] radix tree test suite: add lockdep_is_held to header maple tree uses lockdep_is_held, so define it as external in the header. Link: https://lkml.kernel.org/r/20220504010716.661115-8-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-7-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- tools/testing/radix-tree/linux/lockdep.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/radix-tree/linux/lockdep.h b/tools/testing/radix-tree/linux/lockdep.h index 016cff473cfc48..62473ab57f99c2 100644 --- a/tools/testing/radix-tree/linux/lockdep.h +++ b/tools/testing/radix-tree/linux/lockdep.h @@ -11,4 +11,6 @@ static inline void lockdep_set_class(spinlock_t *lock, struct lock_class_key *key) { } + +extern int lockdep_is_held(const void *); #endif /* _LINUX_LOCKDEP_H */ From dd9239d21d921aa523fc1eee4101832e54b89ae0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:45 +0000 Subject: [PATCH 0756/1250] lib/test_maple_tree: add testing for maple tree This is a test suite that uses the radix test infrastructure. It has been split into its own commit to allow for easier review of the maple tree code. Link: https://lkml.kernel.org/r/20220504010716.661115-9-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220511144304.1430851-3-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220615141921.417598-4-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-8-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 38206 ++++++++++++++++++++++++++++ tools/testing/radix-tree/Makefile | 9 +- 2 files changed, 38213 insertions(+), 2 deletions(-) create mode 100644 lib/test_maple_tree.c diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c new file mode 100644 index 00000000000000..0ff43ff52365cd --- /dev/null +++ b/lib/test_maple_tree.c @@ -0,0 +1,38206 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * test_maple_tree.c: Test the maple tree API + * Copyright (c) 2018 Liam R. Howlett + * Author: Liam R. Howlett + */ + +#include +#include +#include +#include + +#define MTREE_ALLOC_MAX 0x2000000000000Ul +#define CONFIG_DEBUG_MAPLE_TREE +#define CONFIG_MAPLE_SEARCH +/* #define BENCH_SLOT_STORE */ +/* #define BENCH_NODE_STORE */ +/* #define BENCH_AWALK */ +/* #define BENCH_WALK */ +/* #define BENCH_MT_FOR_EACH */ +/* #define BENCH_FORK */ +static +int mtree_insert_index(struct maple_tree *mt, unsigned long index, gfp_t gfp) +{ + return mtree_insert(mt, index, xa_mk_value(index & LONG_MAX), gfp); +} + +static void mtree_erase_index(struct maple_tree *mt, unsigned long index) +{ + MT_BUG_ON(mt, mtree_erase(mt, index) != xa_mk_value(index & LONG_MAX)); + MT_BUG_ON(mt, mtree_load(mt, index) != NULL); +} + +static int mtree_test_insert(struct maple_tree *mt, unsigned long index, + void *ptr) +{ + return mtree_insert(mt, index, ptr, GFP_KERNEL); +} + +static int mtree_test_store_range(struct maple_tree *mt, unsigned long start, + unsigned long end, void *ptr) +{ + return mtree_store_range(mt, start, end, ptr, GFP_KERNEL); +} + +static int mtree_test_store(struct maple_tree *mt, unsigned long start, + void *ptr) +{ + return mtree_test_store_range(mt, start, start, ptr); +} + +static int 
mtree_test_insert_range(struct maple_tree *mt, unsigned long start, + unsigned long end, void *ptr) +{ + return mtree_insert_range(mt, start, end, ptr, GFP_KERNEL); +} + +static void *mtree_test_load(struct maple_tree *mt, unsigned long index) +{ + return mtree_load(mt, index); +} + +static void *mtree_test_erase(struct maple_tree *mt, unsigned long index) +{ + return mtree_erase(mt, index); +} + +static noinline void check_mtree_alloc_range(struct maple_tree *mt, + unsigned long start, unsigned long end, unsigned long size, + unsigned long expected, int eret, void *ptr) +{ + + unsigned long result = expected + 1; + int ret; + + ret = mtree_alloc_range(mt, &result, ptr, size, start, end, + GFP_KERNEL); + MT_BUG_ON(mt, ret != eret); + if (ret) + return; + + MT_BUG_ON(mt, result != expected); +} + +static noinline void check_mtree_alloc_rrange(struct maple_tree *mt, + unsigned long start, unsigned long end, unsigned long size, + unsigned long expected, int eret, void *ptr) +{ + + unsigned long result = expected + 1; + int ret; + + ret = mtree_alloc_rrange(mt, &result, ptr, size, start, end - 1, + GFP_KERNEL); + MT_BUG_ON(mt, ret != eret); + if (ret) + return; + + MT_BUG_ON(mt, result != expected); +} + +static noinline void check_load(struct maple_tree *mt, unsigned long index, + void *ptr) +{ + void *ret = mtree_test_load(mt, index); + + if (ret != ptr) + pr_err("Load %lu returned %p expect %p\n", index, ret, ptr); + MT_BUG_ON(mt, ret != ptr); +} + +static noinline void check_store_range(struct maple_tree *mt, + unsigned long start, unsigned long end, void *ptr, int expected) +{ + int ret = -EINVAL; + unsigned long i; + + ret = mtree_test_store_range(mt, start, end, ptr); + MT_BUG_ON(mt, ret != expected); + + if (ret) + return; + + for (i = start; i <= end; i++) + check_load(mt, i, ptr); +} + +static noinline void check_insert_range(struct maple_tree *mt, + unsigned long start, unsigned long end, void *ptr, int expected) +{ + int ret = -EINVAL; + unsigned long i; + 
+ ret = mtree_test_insert_range(mt, start, end, ptr); + MT_BUG_ON(mt, ret != expected); + + if (ret) + return; + + for (i = start; i <= end; i++) + check_load(mt, i, ptr); +} + +static noinline void check_insert(struct maple_tree *mt, unsigned long index, + void *ptr) +{ + int ret = -EINVAL; + + ret = mtree_test_insert(mt, index, ptr); + MT_BUG_ON(mt, ret != 0); +} + +static noinline void check_erase(struct maple_tree *mt, unsigned long index, + void *ptr) +{ + MT_BUG_ON(mt, mtree_test_erase(mt, index) != ptr); +} + +static noinline void check_dup_insert(struct maple_tree *mt, + unsigned long index, void *ptr) +{ + int ret = -EINVAL; + + ret = mtree_test_insert(mt, index, ptr); + MT_BUG_ON(mt, ret != -EEXIST); +} + + +static noinline +void check_index_load(struct maple_tree *mt, unsigned long index) +{ + return check_load(mt, index, xa_mk_value(index & LONG_MAX)); +} + +static noinline void check_nomem(struct maple_tree *mt) +{ + MA_STATE(ms, mt, 1, 1); + + MT_BUG_ON(mt, !mtree_empty(mt)); + /* Ensure no bypassing of allocation failures */ + mt_set_non_kernel(0); + + /* Storing something at 1 requires memory allocation */ + MT_BUG_ON(mt, mtree_insert(mt, 1, &ms, GFP_ATOMIC) != -ENOMEM); + /* Storing something at 0 does not */ + MT_BUG_ON(mt, mtree_insert(mt, 0, &ms, GFP_ATOMIC) != 0); + + /* + * Simulate two threads racing; the first one fails to allocate + * memory to insert an entry at 1, then the second one succeeds + * in allocating memory to insert an entry at 2. The first one + * then needs to free the node it allocated. LeakSanitizer will + * notice this, as will the 'nr_allocated' debugging aid in the + * userspace test suite. + */ + mtree_lock(mt); + mas_store(&ms, &ms); /* insert 1 -> &ms, fails. */ + MT_BUG_ON(mt, ms.node != MA_ERROR(-ENOMEM)); + mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. 
*/ + MT_BUG_ON(mt, ms.node != MAS_START); + mtree_unlock(mt); + MT_BUG_ON(mt, mtree_insert(mt, 2, mt, GFP_KERNEL) != 0); + mtree_lock(mt); + mas_store(&ms, &ms); /* insert 1 -> &ms */ + mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. */ + mtree_unlock(mt); + mtree_destroy(mt); +} + +static inline int not_empty(struct maple_node *node) +{ + int i; + + if (node->parent) + return 1; + + for (i = 0; i < ARRAY_SIZE(node->slot); i++) + if (node->slot[i]) + return 1; + + return 0; +} + +static noinline void check_new_node(struct maple_tree *mt) +{ + + struct maple_node *mn, *mn2, *mn3; + struct maple_alloc *smn; + struct maple_node *nodes[100]; + int i, j, total; + + MA_STATE(mas, mt, 0, 0); + + /* Try allocating 3 nodes */ + mtree_lock(mt); + /* request 3 nodes to be allocated. */ + mas_node_count(&mas, 3); + /* Allocation request of 3. */ + MT_BUG_ON(mt, mas_alloc_req(&mas) != 3); + /* Allocate failed. */ + MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + + MT_BUG_ON(mt, mas_allocated(&mas) != 3); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mn == NULL); + MT_BUG_ON(mt, mas.alloc == NULL); + MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); + mas_push_node(&mas, mn); + mas_nomem(&mas, GFP_KERNEL); /* free */ + mtree_unlock(mt); + + + /* Try allocating 1 node, then 2 more */ + mtree_lock(mt); + /* Set allocation request to 1. */ + mas_set_alloc_req(&mas, 1); + /* Check Allocation request of 1. */ + MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); + mas_set_err(&mas, -ENOMEM); + /* Validate allocation request. */ + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + /* Eat the requested node. */ + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mn == NULL); + MT_BUG_ON(mt, mn->slot[0] != NULL); + MT_BUG_ON(mt, mn->slot[1] != NULL); + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + + ma_free_rcu(mn); + mas.node = MAS_START; + mas_nomem(&mas, GFP_KERNEL); + /* Allocate 3 nodes, will fail. 
*/ + mas_node_count(&mas, 3); + /* Drop the lock and allocate 3 nodes. */ + mas_nomem(&mas, GFP_KERNEL); + /* Ensure 3 are allocated. */ + MT_BUG_ON(mt, mas_allocated(&mas) != 3); + /* Allocation request of 0. */ + MT_BUG_ON(mt, mas_alloc_req(&mas) != 0); + + MT_BUG_ON(mt, mas.alloc == NULL); + MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); + MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); + /* Ensure we counted 3. */ + MT_BUG_ON(mt, mas_allocated(&mas) != 3); + /* Free. */ + mas_nomem(&mas, GFP_KERNEL); + + /* Set allocation request to 1. */ + mas_set_alloc_req(&mas, 1); + MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); + mas_set_err(&mas, -ENOMEM); + /* Validate allocation request. */ + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + MT_BUG_ON(mt, mas_allocated(&mas) != 1); + /* Check the node is only one node. */ + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + MT_BUG_ON(mt, mn == NULL); + MT_BUG_ON(mt, mn->slot[0] != NULL); + MT_BUG_ON(mt, mn->slot[1] != NULL); + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + mas_push_node(&mas, mn); + MT_BUG_ON(mt, mas_allocated(&mas) != 1); + MT_BUG_ON(mt, mas.alloc->node_count); + + mas_set_alloc_req(&mas, 2); /* request 2 more. */ + MT_BUG_ON(mt, mas_alloc_req(&mas) != 2); + mas_set_err(&mas, -ENOMEM); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + MT_BUG_ON(mt, mas_allocated(&mas) != 3); + MT_BUG_ON(mt, mas.alloc == NULL); + MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); + MT_BUG_ON(mt, mas.alloc->slot[1] == NULL); + for (i = 2; i >= 0; i--) { + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, mas_allocated(&mas) != i); + MT_BUG_ON(mt, !mn); + MT_BUG_ON(mt, not_empty(mn)); + ma_free_rcu(mn); + } + + total = 64; + mas_set_alloc_req(&mas, total); /* request 2 more. 
*/ + MT_BUG_ON(mt, mas_alloc_req(&mas) != total); + mas_set_err(&mas, -ENOMEM); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + for (i = total; i > 0; i--) { + unsigned int e = 0; /* expected node_count */ + + if (i >= 35) + e = i - 35; + else if (i >= 5) + e = i - 5; + else if (i >= 2) + e = i - 2; + MT_BUG_ON(mt, mas.alloc->node_count != e); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mas_allocated(&mas) != i - 1); + MT_BUG_ON(mt, !mn); + ma_free_rcu(mn); + } + + total = 100; + for (i = 1; i < total; i++) { + mas_set_alloc_req(&mas, i); + mas_set_err(&mas, -ENOMEM); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + for (j = i; j > 0; j--) { + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); + MT_BUG_ON(mt, !mn); + MT_BUG_ON(mt, not_empty(mn)); + mas_push_node(&mas, mn); + MT_BUG_ON(mt, mas_allocated(&mas) != j); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mas_allocated(&mas) != j - 1); + ma_free_rcu(mn); + } + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + + mas_set_alloc_req(&mas, i); + mas_set_err(&mas, -ENOMEM); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + for (j = 0; j <= i/2; j++) { + MT_BUG_ON(mt, mas_allocated(&mas) != i - j); + nodes[j] = mas_pop_node(&mas); + MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); + } + + while (j) { + j--; + mas_push_node(&mas, nodes[j]); + MT_BUG_ON(mt, mas_allocated(&mas) != i - j); + } + MT_BUG_ON(mt, mas_allocated(&mas) != i); + for (j = 0; j <= i/2; j++) { + MT_BUG_ON(mt, mas_allocated(&mas) != i - j); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + ma_free_rcu(mn); + MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1); + } + MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); + + } + + /* Set allocation request. */ + total = 500; + mas_node_count(&mas, total); + /* Drop the lock and allocate the nodes. 
*/ + mas_nomem(&mas, GFP_KERNEL); + MT_BUG_ON(mt, !mas.alloc); + i = 1; + smn = mas.alloc; + while (i < total) { + for (j = 0; j < MAPLE_ALLOC_SLOTS; j++) { + i++; + MT_BUG_ON(mt, !smn->slot[j]); + if (i == total) + break; + } + smn = smn->slot[0]; /* next. */ + } + MT_BUG_ON(mt, mas_allocated(&mas) != total); + mas_nomem(&mas, GFP_KERNEL); /* Free. */ + + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + for (i = 1; i < 128; i++) { + mas_node_count(&mas, i); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ + for (j = i; j > 0; j--) { /*Free the requests */ + mn = mas_pop_node(&mas); /* get the next node. */ + MT_BUG_ON(mt, mn == NULL); + MT_BUG_ON(mt, not_empty(mn)); + ma_free_rcu(mn); + } + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + } + + for (i = 1; i < MAPLE_NODE_MASK + 1; i++) { + MA_STATE(mas2, mt, 0, 0); + mas_node_count(&mas, i); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + MT_BUG_ON(mt, mas_allocated(&mas) != i); /* check request filled */ + for (j = 1; j <= i; j++) { /* Move the allocations to mas2 */ + mn = mas_pop_node(&mas); /* get the next node. */ + MT_BUG_ON(mt, mn == NULL); + MT_BUG_ON(mt, not_empty(mn)); + mas_push_node(&mas2, mn); + MT_BUG_ON(mt, mas_allocated(&mas2) != j); + } + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + MT_BUG_ON(mt, mas_allocated(&mas2) != i); + + for (j = i; j > 0; j--) { /*Free the requests */ + MT_BUG_ON(mt, mas_allocated(&mas2) != j); + mn = mas_pop_node(&mas2); /* get the next node. 
*/ + MT_BUG_ON(mt, mn == NULL); + MT_BUG_ON(mt, not_empty(mn)); + ma_free_rcu(mn); + } + MT_BUG_ON(mt, mas_allocated(&mas2) != 0); + } + + + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ + MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + + mn = mas_pop_node(&mas); /* get the next node. */ + MT_BUG_ON(mt, mn == NULL); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 2); + + mas_push_node(&mas, mn); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + + /* Check the limit of pop/push/pop */ + mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 2); /* Request */ + MT_BUG_ON(mt, mas_alloc_req(&mas) != 1); + MT_BUG_ON(mt, mas.node != MA_ERROR(-ENOMEM)); + MT_BUG_ON(mt, !mas_nomem(&mas, GFP_KERNEL)); + MT_BUG_ON(mt, mas_alloc_req(&mas)); + MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); + MT_BUG_ON(mt, mas.alloc->node_count != MAPLE_ALLOC_SLOTS - 1); + mas_push_node(&mas, mn); + MT_BUG_ON(mt, mas.alloc->node_count); + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + ma_free_rcu(mn); + for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) { + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, not_empty(mn)); + ma_free_rcu(mn); + } + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + + + for (i = 3; i < MAPLE_NODE_MASK * 3; i++) { + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, i); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + mn = 
mas_pop_node(&mas); /* get the next node. */ + mas_push_node(&mas, mn); /* put it back */ + mas_destroy(&mas); + + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, i); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + mn = mas_pop_node(&mas); /* get the next node. */ + mn2 = mas_pop_node(&mas); /* get the next node. */ + mas_push_node(&mas, mn); /* put them back */ + mas_push_node(&mas, mn2); + mas_destroy(&mas); + + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, i); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + mn = mas_pop_node(&mas); /* get the next node. */ + mn2 = mas_pop_node(&mas); /* get the next node. */ + mn3 = mas_pop_node(&mas); /* get the next node. */ + mas_push_node(&mas, mn); /* put them back */ + mas_push_node(&mas, mn2); + mas_push_node(&mas, mn3); + mas_destroy(&mas); + + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, i); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + mn = mas_pop_node(&mas); /* get the next node. */ + ma_free_rcu(mn); + mas_destroy(&mas); + + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, i); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + mn = mas_pop_node(&mas); /* get the next node. */ + ma_free_rcu(mn); + mn = mas_pop_node(&mas); /* get the next node. */ + ma_free_rcu(mn); + mn = mas_pop_node(&mas); /* get the next node. 
*/ + ma_free_rcu(mn); + mas_destroy(&mas); + } + + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, 5); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + MT_BUG_ON(mt, mas_allocated(&mas) != 5); + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, 10); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + mas.node = MAS_START; + MT_BUG_ON(mt, mas_allocated(&mas) != 10); + mas_destroy(&mas); + + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, MAPLE_ALLOC_SLOTS - 1); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS - 1); + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, 10 + MAPLE_ALLOC_SLOTS - 1); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + mas.node = MAS_START; + MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1); + mas_destroy(&mas); + + mtree_unlock(mt); +} +
/*
 * check_rev_seq() - Insert indices @max down to 1 and verify after each step.
 * @mt: The maple tree; must be empty on entry.
 * @max: The highest index to insert.
 * @verbose: Dump the tree and print allocation statistics when true.
 *
 * After every insert, each index from the current one up to @max is checked
 * with check_index_load(), the index just below is checked to load NULL, and
 * the tree height is asserted to be non-zero both with and without the
 * in-RCU flag set.
 */
static noinline void check_rev_seq(struct maple_tree *mt, unsigned long max,
		bool verbose)
{
	unsigned long i = max, j;

	MT_BUG_ON(mt, !mtree_empty(mt));

	mt_zero_nr_tallocated();
	while (i) {
		MT_BUG_ON(mt, mtree_insert_index(mt, i, GFP_KERNEL));
		for (j = i; j <= max; j++)
			check_index_load(mt, j);

		check_load(mt, i - 1, NULL);
		mt_set_in_rcu(mt);
		MT_BUG_ON(mt, !mt_height(mt));
		mt_clear_in_rcu(mt);
		MT_BUG_ON(mt, !mt_height(mt));
		i--;
	}
	check_load(mt, max + 1, NULL);

	if (verbose) {
		rcu_barrier();
		mt_dump(mt);
		pr_info(" %s test of 0-%lu %luK in %d active (%d total)\n",
			__func__, max, mt_get_alloc_size()/1024, mt_nr_allocated(),
			mt_nr_tallocated());
	}
}

/*
 * check_seq() - Insert indices 0 up to @max and verify after each step.
 * @mt: The maple tree; must be empty on entry.
 * @max: The highest index to insert.
 * @verbose: Dump the tree and print allocation statistics when true.
 *
 * After every insert, each index inserted so far is checked with
 * check_index_load() and the next index is checked to load NULL.  The tree
 * height is asserted to be non-zero once index 1 or above has been inserted.
 */
static noinline void check_seq(struct maple_tree *mt, unsigned long max,
		bool verbose)
{
	unsigned long i, j;

	MT_BUG_ON(mt, !mtree_empty(mt));

	mt_zero_nr_tallocated();
	for (i = 0; i <= max; i++) {
		MT_BUG_ON(mt, mtree_insert_index(mt, i, GFP_KERNEL));
		for (j = 0; j <= i; j++)
			check_index_load(mt, j);

		if (i)
			MT_BUG_ON(mt, !mt_height(mt));
		check_load(mt, i + 1, NULL);
	}
	if (verbose) {
		rcu_barrier();
		mt_dump(mt);
		pr_info(" seq test of 0-%lu %luK in %d active (%d total)\n",
			max, mt_get_alloc_size()/1024, mt_nr_allocated(),
			mt_nr_tallocated());
	}
}

/*
 * check_lb_not_empty() - Insert at successively halved indices.
 * @mt: The maple tree; it is not required to be empty.
 *
 * Starts at 4,000,000,000 and halves the index while it stays above 4096.
 * After each insert, every index inserted so far is re-checked together with
 * its two neighbours, which must load NULL.  The tree is destroyed on return.
 */
static noinline void check_lb_not_empty(struct maple_tree *mt)
{
	unsigned long i, j;
	unsigned long huge = 4000UL * 1000 * 1000;


	i = huge;
	while (i > 4096) {
		check_insert(mt, i, (void *) i);
		for (j = huge; j >= i; j /= 2) {
			check_load(mt, j-1, NULL);
			check_load(mt, j, (void *) j);
			check_load(mt, j+1, NULL);
		}
		i /= 2;
	}
	mtree_destroy(mt);
}

/*
 * check_lower_bound_split() - Run check_lb_not_empty() on an empty tree.
 * @mt: The maple tree; must be empty on entry.
 */
static noinline void check_lower_bound_split(struct maple_tree *mt)
{
	MT_BUG_ON(mt, !mtree_empty(mt));
	check_lb_not_empty(mt);
}

/*
 * check_upper_bound_split() - Insert at successively doubled indices.
 * @mt: The maple tree; must be empty on entry.
 *
 * Starts at 4096 and doubles the index while it stays below 4,000,000,000.
 * After each insert, every index inserted so far is re-checked together with
 * its two neighbours, which must load NULL.  The tree is destroyed on return.
 */
static noinline void check_upper_bound_split(struct maple_tree *mt)
{
	unsigned long i, j;
	unsigned long huge = 4000UL * 1000 * 1000;

	MT_BUG_ON(mt, !mtree_empty(mt));

	i = 4096;
	while (i < huge) {
		check_insert(mt, i, (void *) i);
		/*
		 * Walk back down over every index inserted so far.  The
		 * previous bound (j >= huge, doubling) was never true because
		 * i < huge, so nothing was verified.  Halving also cannot
		 * overflow.
		 */
		for (j = i; j >= 4096; j /= 2) {
			check_load(mt, j-1, NULL);
			check_load(mt, j, (void *) j);
			check_load(mt, j+1, NULL);
		}
		i *= 2;
	}
	mtree_destroy(mt);
}

/*
 * check_mid_split() - Run check_lb_not_empty() between two existing entries.
 * @mt: The maple tree.
 *
 * Seeds the tree with an entry at 8,000,000,000 and one at 0 so the halved
 * inserts of check_lb_not_empty() all land between existing entries.
 */
static noinline void check_mid_split(struct maple_tree *mt)
{
	unsigned long huge = 8000UL * 1000 * 1000;

	check_insert(mt, huge, (void *) huge);
	check_insert(mt, 0, xa_mk_value(0));
	check_lb_not_empty(mt);
}

/*
 * check_rev_find() - Exercise mas_find_rev() over a set of stored ranges.
 * @mt: The maple tree.
 *
 * Stores xa_mk_value(i) over [i*10, i*10 + 5] for i in 0..200, then checks
 * reverse searches from several starting indices and lower limits, including
 * searches that must return NULL.
 */
static noinline void check_rev_find(struct maple_tree *mt)
{
	int i, nr_entries = 200;
	void *val;
	MA_STATE(mas, mt, 0, 0);

	for (i = 0; i <= nr_entries; i++)
		mtree_store_range(mt, i*10, i*10 + 5,
				  xa_mk_value(i), GFP_KERNEL);

	/* Index 1000 is the start of range 100; a second search finds nothing. */
	mas_set(&mas, 1000);
	val = mas_find_rev(&mas, 1000);
	MT_BUG_ON(mt, val != xa_mk_value(100));
	val = mas_find_rev(&mas, 1000);
	MT_BUG_ON(mt, val != NULL);

	/* 997..999 lies in the gap between ranges 99 and 100. */
	mas_set(&mas, 999);
	val = mas_find_rev(&mas, 997);
	MT_BUG_ON(mt, val != NULL);

	mas_set(&mas, 1000);
	val = mas_find_rev(&mas, 900);
	MT_BUG_ON(mt, val != xa_mk_value(100));
	val = mas_find_rev(&mas, 900);
	MT_BUG_ON(mt, val != xa_mk_value(99));

	/* Walk back from 20 to the start of the tree. */
	mas_set(&mas, 20);
	val = mas_find_rev(&mas, 0);
	MT_BUG_ON(mt, val != xa_mk_value(2));
	val = mas_find_rev(&mas, 0);
	MT_BUG_ON(mt, val != xa_mk_value(1));
	val = mas_find_rev(&mas, 0);
	MT_BUG_ON(mt, val != xa_mk_value(0));
	val = mas_find_rev(&mas, 0);
	MT_BUG_ON(mt, val != NULL);
}

+static noinline void check_find(struct maple_tree *mt) +{ + unsigned long val = 0; + unsigned long count = 20; + unsigned long max; + unsigned long last = 0, index = 0; + void *entry, *entry2; + + MA_STATE(mas, mt, 0, 0); + + /* Insert 0. */ + MT_BUG_ON(mt, mtree_insert_index(mt, val++, GFP_KERNEL)); + + for (int i = 0; i <= count; i++) { + if (val != 64) + MT_BUG_ON(mt, mtree_insert_index(mt, val, GFP_KERNEL)); + else + MT_BUG_ON(mt, mtree_insert(mt, val, + XA_ZERO_ENTRY, GFP_KERNEL)); + + val <<= 2; + } + + val = 0; + mas_set(&mas, val); + mas_lock(&mas); + while ((entry = mas_find(&mas, 268435456)) != NULL) { + if (val != 64) + MT_BUG_ON(mt, xa_mk_value(val) != entry); + else + MT_BUG_ON(mt, entry != XA_ZERO_ENTRY); + + val <<= 2; + /* For zero check. */ + if (!val) + val = 1; + } + mas_unlock(&mas); + + val = 0; + mas_set(&mas, val); + mas_lock(&mas); + mas_for_each(&mas, entry, ULONG_MAX) { + if (val != 64) + MT_BUG_ON(mt, xa_mk_value(val) != entry); + else + MT_BUG_ON(mt, entry != XA_ZERO_ENTRY); + val <<= 2; + /* For zero check. */ + if (!val) + val = 1; + } + mas_unlock(&mas); + + /* Test mas_pause */ + val = 0; + mas_set(&mas, val); + mas_lock(&mas); + mas_for_each(&mas, entry, ULONG_MAX) { + if (val != 64) + MT_BUG_ON(mt, xa_mk_value(val) != entry); + else + MT_BUG_ON(mt, entry != XA_ZERO_ENTRY); + val <<= 2; + /* For zero check. */ + if (!val) + val = 1; + + mas_pause(&mas); + mas_unlock(&mas); + mas_lock(&mas); + } + mas_unlock(&mas); + + val = 0; + max = 300; /* A value big enough to include XA_ZERO_ENTRY at 64.
*/ + mt_for_each(mt, entry, index, max) { + MT_BUG_ON(mt, xa_mk_value(val) != entry); + val <<= 2; + if (val == 64) /* Skip zero entry. */ + val <<= 2; + /* For zero check. */ + if (!val) + val = 1; + } + + val = 0; + max = 0; + index = 0; + MT_BUG_ON(mt, mtree_insert_index(mt, ULONG_MAX, GFP_KERNEL)); + mt_for_each(mt, entry, index, ULONG_MAX) { + if (val == 4398046511104) + MT_BUG_ON(mt, entry != + xa_mk_value(ULONG_MAX & LONG_MAX)); + else + MT_BUG_ON(mt, xa_mk_value(val) != entry); + val <<= 2; + if (val == 64) /* Skip zero entry. */ + val <<= 2; + /* For zero check. */ + if (!val) + val = 1; + max++; + MT_BUG_ON(mt, max > 25); + } + mtree_erase_index(mt, ULONG_MAX); + + mas_reset(&mas); + index = 17; + entry = mt_find(mt, &index, 512); + MT_BUG_ON(mt, xa_mk_value(256) != entry); + + mas_reset(&mas); + index = 17; + entry = mt_find(mt, &index, 20); + MT_BUG_ON(mt, entry != NULL); + + + /* Range check.. */ + /* Insert ULONG_MAX */ + MT_BUG_ON(mt, mtree_insert_index(mt, ULONG_MAX, GFP_KERNEL)); + + val = 0; + mas_set(&mas, 0); + mas_lock(&mas); + mas_for_each(&mas, entry, ULONG_MAX) { + if (val == 64) + MT_BUG_ON(mt, entry != XA_ZERO_ENTRY); + else if (val == 4398046511104) + MT_BUG_ON(mt, entry != xa_mk_value(ULONG_MAX & LONG_MAX)); + else + MT_BUG_ON(mt, xa_mk_value(val) != entry); + val <<= 2; + + /* For zero check. */ + if (!val) + val = 1; + mas_pause(&mas); + mas_unlock(&mas); + mas_lock(&mas); + } + mas_unlock(&mas); + + mas_set(&mas, 1048576); + mas_lock(&mas); + entry = mas_find(&mas, 1048576); + mas_unlock(&mas); + MT_BUG_ON(mas.tree, entry == NULL); + + /* + * Find last value. + * 1. get the expected value, leveraging the existence of an end entry + * 2. delete end entry + * 3. find the last value but searching for ULONG_MAX and then using + * prev + */ + /* First, get the expected result. */ + mas_lock(&mas); + mas_reset(&mas); + mas.index = ULONG_MAX; /* start at max.. 
*/ + entry = mas_find(&mas, ULONG_MAX); + entry = mas_prev(&mas, 0); + index = mas.index; + last = mas.last; + + /* Erase the last entry. */ + mas_reset(&mas); + mas.index = ULONG_MAX; + mas.last = ULONG_MAX; + mas_erase(&mas); + + /* Get the previous value from MAS_START */ + mas_reset(&mas); + entry2 = mas_prev(&mas, 0); + + /* Check results. */ + MT_BUG_ON(mt, entry != entry2); + MT_BUG_ON(mt, index != mas.index); + MT_BUG_ON(mt, last != mas.last); + + + mas.node = MAS_NONE; + mas.index = ULONG_MAX; + mas.last = ULONG_MAX; + entry2 = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != entry2); + + mas_set(&mas, 0); + MT_BUG_ON(mt, mas_prev(&mas, 0) != NULL); + + mas_unlock(&mas); + mtree_destroy(mt); +} +
/*
 * check_find_2() - Check mas_for_each() while a tree is filled and drained.
 * @mt: The maple tree.
 *
 * Inserts indices 0..255 one at a time, iterating the whole tree after each
 * insert and expecting xa_mk_value(0), xa_mk_value(1), ... in order.  Then
 * erases the same indices in ascending order, again iterating after each
 * erase; zero entries are skipped and the remaining values must run from
 * i + 1 up to 255.
 */
static noinline void check_find_2(struct maple_tree *mt)
{
	unsigned long i, j;
	void *entry;

	MA_STATE(mas, mt, 0, 0);

	/* Nothing has been inserted here yet: any entry found is a failure. */
	rcu_read_lock();
	mas_for_each(&mas, entry, ULONG_MAX)
		MT_BUG_ON(mt, true);
	rcu_read_unlock();

	for (i = 0; i < 256; i++) {
		/* Fail at the insert itself rather than at the count below. */
		MT_BUG_ON(mt, mtree_insert_index(mt, i, GFP_KERNEL));
		j = 0;
		mas_set(&mas, 0);
		rcu_read_lock();
		mas_for_each(&mas, entry, ULONG_MAX) {
			MT_BUG_ON(mt, entry != xa_mk_value(j));
			j++;
		}
		rcu_read_unlock();
		MT_BUG_ON(mt, j != i + 1);
	}

	for (i = 0; i < 256; i++) {
		mtree_erase_index(mt, i);
		j = i + 1;
		mas_set(&mas, 0);
		rcu_read_lock();
		mas_for_each(&mas, entry, ULONG_MAX) {
			if (xa_is_zero(entry))
				continue;

			MT_BUG_ON(mt, entry != xa_mk_value(j));
			j++;
		}
		rcu_read_unlock();
		MT_BUG_ON(mt, j != 256);
	}

	/*MT_BUG_ON(mt, !mtree_empty(mt)); */
}

/*
 * Helpers for check_erase_testset(): they expect a local index array named
 * "set" and a two-element pointer array named "entry" in the calling scope,
 * and alternate between the two stored pointers by index parity.
 */
#define erase_ptr(i) entry[(i) % 2]
#define erase_check_load(mt, i) check_load(mt, set[i], entry[(i) % 2])
#define erase_check_insert(mt, i) check_insert(mt, set[i], entry[(i) % 2])
#define erase_check_erase(mt, i) check_erase(mt, set[i], entry[(i) % 2])

+static noinline void check_erase_testset(struct maple_tree *mt) +{ + unsigned long set[] = { 5015, 5014, 5017, 25, 1000, + 1001, 1002, 1003, 1005, 0, + 6003, 6002, 6008, 6012, 6015, + 7003, 7002, 7008,
7012, 7015, + 8003, 8002, 8008, 8012, 8015, + 9003, 9002, 9008, 9012, 9015, + 10003, 10002, 10008, 10012, 10015, + 11003, 11002, 11008, 11012, 11015, + 12003, 12002, 12008, 12012, 12015, + 13003, 13002, 13008, 13012, 13015, + 14003, 14002, 14008, 14012, 14015, + 15003, 15002, 15008, 15012, 15015, + }; + + + void *ptr = &set; + void *entry[2] = { ptr, mt }; + void *root_node; + + + rcu_register_thread(); + mt_set_in_rcu(mt); + for (int i = 0; i < 4; i++) + erase_check_insert(mt, i); + for (int i = 0; i < 4; i++) + erase_check_load(mt, i); + + mt_set_non_kernel(2); + erase_check_erase(mt, 1); + erase_check_load(mt, 0); + check_load(mt, set[1], NULL); + for (int i = 2; i < 4; i++) + erase_check_load(mt, i); + + + erase_check_erase(mt, 2); + erase_check_load(mt, 0); + check_load(mt, set[1], NULL); + check_load(mt, set[2], NULL); + + erase_check_insert(mt, 1); + erase_check_insert(mt, 2); + + for (int i = 0; i < 4; i++) + erase_check_load(mt, i); + + /* Check erase and load without an allocation. */ + erase_check_load(mt, 3); + erase_check_erase(mt, 1); + erase_check_load(mt, 0); + check_load(mt, set[1], NULL); + for (int i = 2; i < 4; i++) + erase_check_load(mt, i); + + /* + * Set the newly erased node. This will produce a different allocated + * node to avoid busy slots. 
+ */ + root_node = mt->ma_root; + erase_check_insert(mt, 1); + + erase_check_load(mt, 0); + check_load(mt, 5016, NULL); + erase_check_load(mt, 1); + check_load(mt, 5013, NULL); + erase_check_load(mt, 2); + check_load(mt, 5018, NULL); + erase_check_load(mt, 3); + + erase_check_erase(mt, 2); /* erase 5017 to check append */ + erase_check_load(mt, 0); + check_load(mt, 5016, NULL); + erase_check_load(mt, 1); + check_load(mt, 5013, NULL); + check_load(mt, set[2], NULL); + check_load(mt, 5018, NULL); + + erase_check_load(mt, 3); + + root_node = mt->ma_root; + erase_check_insert(mt, 2); + + erase_check_load(mt, 0); + check_load(mt, 5016, NULL); + erase_check_load(mt, 1); + check_load(mt, 5013, NULL); + erase_check_load(mt, 2); + check_load(mt, 5018, NULL); + erase_check_load(mt, 3); + + mt_set_non_kernel(1); + erase_check_erase(mt, 2); /* erase 5017 to check append */ + erase_check_load(mt, 0); + check_load(mt, 5016, NULL); + check_load(mt, set[2], NULL); + erase_check_erase(mt, 0); /* erase 5015 to check append */ + check_load(mt, set[0], NULL); + check_load(mt, 5016, NULL); + erase_check_insert(mt, 4); /* 1000 < Should not split. */ + check_load(mt, set[0], NULL); + check_load(mt, 5016, NULL); + erase_check_load(mt, 1); + check_load(mt, 5013, NULL); + check_load(mt, set[2], NULL); + check_load(mt, 5018, NULL); + erase_check_load(mt, 4); + check_load(mt, 999, NULL); + check_load(mt, 1001, NULL); + erase_check_load(mt, 4); + if (mt_in_rcu(mt)) + MT_BUG_ON(mt, root_node == mt->ma_root); + else + MT_BUG_ON(mt, root_node != mt->ma_root); + + /* Should not have split. 
*/ + MT_BUG_ON(mt, !mte_is_leaf(mt->ma_root)); + + + /* Coalesce testing */ + erase_check_insert(mt, 0); + erase_check_insert(mt, 2); + + for (int i = 5; i < 25; i++) { + erase_check_insert(mt, i); + for (int j = i; j >= 0; j--) + erase_check_load(mt, j); + } + + erase_check_erase(mt, 14); /*6015 */ + for (int i = 0; i < 25; i++) { + if (i == 14) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + erase_check_erase(mt, 16); /*7002 */ + for (int i = 0; i < 25; i++) { + if (i == 16 || i == 14) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + + + mt_set_non_kernel(1); + erase_check_erase(mt, 13); /*6012 */ + for (int i = 0; i < 25; i++) { + if (i == 16 || i == 14 || i == 13) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + + erase_check_erase(mt, 15); /*7003 */ + for (int i = 0; i < 25; i++) { + if (i <= 16 && i >= 13) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + + mt_set_non_kernel(2); + erase_check_erase(mt, 17); /*7008 *should* cause coalesce. 
*/ + for (int i = 0; i < 25; i++) { + if (i <= 17 && i >= 13) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + + erase_check_erase(mt, 18); /*7012 */ + for (int i = 0; i < 25; i++) { + if (i <= 18 && i >= 13) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + + mt_set_non_kernel(2); + erase_check_erase(mt, 19); /*7015 */ + for (int i = 0; i < 25; i++) { + if (i <= 19 && i >= 13) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + + erase_check_erase(mt, 20); /*8003 */ + for (int i = 0; i < 25; i++) { + if (i <= 20 && i >= 13) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + + erase_check_erase(mt, 21); /*8002 */ + for (int i = 0; i < 25; i++) { + if (i <= 21 && i >= 13) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + + mt_set_non_kernel(2); + erase_check_erase(mt, 22); /*8008 */ + for (int i = 0; i < 25; i++) { + if (i <= 22 && i >= 13) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + for (int i = 23; i < 25; i++) + erase_check_erase(mt, i); + + for (int i = 0; i < 25; i++) { + if (i <= 25 && i >= 13) + check_load(mt, set[i], NULL); + else + erase_check_load(mt, i); + } + + /* Shrinking tree test. 
*/ + + for (int i = 13; i < ARRAY_SIZE(set); i++) + erase_check_insert(mt, i); + + mt_set_non_kernel(99); + for (int i = 18; i < ARRAY_SIZE(set); i++) { + erase_check_erase(mt, i); + for (int j = 0; j < ARRAY_SIZE(set); j++) { + if (j < 18 || j > i) + erase_check_load(mt, j); + else + check_load(mt, set[j], NULL); + } + } + mt_set_non_kernel(35); + for (int i = 0; i < 18; i++) { + erase_check_erase(mt, i); + for (int j = 0; j < ARRAY_SIZE(set); j++) { + if (j < 18 && j > i) + erase_check_load(mt, j); + else + check_load(mt, set[j], NULL); + } + } + erase_check_insert(mt, 8); + erase_check_insert(mt, 9); + erase_check_erase(mt, 8); + rcu_unregister_thread(); +} +
/* Store @ptr over the range [a[i], a[i + 1]]. */
#define erase_check_store_range(mt, a, i, ptr) mtree_test_store_range(mt, \
			a[(i)], a[(i + 1)], ptr)
/* Operation codes used as the first word of each triple in the test sets. */
#define STORE 1
#define SNULL 2
#define ERASE 3
#define ec_type_str(x) \
	(((x) == STORE) ? \
	  "STORE" : \
		  (((x) == SNULL) ? \
		  "SNULL" : "ERASE") \
	)
#define check_erase2_debug 0
void *mas_next(struct ma_state *mas, unsigned long max);

/*
 * mas_ce2_over_count() - Calculate the overwritten entries.
 * @mas_start: Maple state positioned at the start of the written range
 * @mas_end: Maple state positioned at the end of the written range
 * @s_entry: The entry found at the start of the range (may be NULL)
 * @s_min: The minimum index of the range holding @s_entry
 * @e_entry: The entry found at the end of the range (may be NULL)
 * @e_max: The maximum index of the range holding @e_entry
 * @set: Unused
 * @i: Unused
 * @null_entry: True when NULL is being written, false for a store
 *
 * Walks a copy of @mas_start forward with mas_next() up to @mas_end->last,
 * counting the entries returned, then adjusts the count depending on whether
 * the write splits or fully covers the first and last ranges.
 *
 * Return: The adjusted count; the caller subtracts it from its running
 * entry count.
 */
int mas_ce2_over_count(struct ma_state *mas_start, struct ma_state *mas_end,
		      void *s_entry, unsigned long s_min,
		      void *e_entry, unsigned long e_max,
		      unsigned long *set, int i, bool null_entry)
{
	int count = 0, span = 0;
	unsigned long retry = 0;
	void *entry;
	struct ma_state tmp;


	/* count slots */
	memcpy(&tmp, mas_start, sizeof(tmp));
	entry = mas_next(&tmp, mas_end->last);
	while (entry) {
		BUG_ON(retry > 50); /* stop infinite retry on testing. */
		/*
		 * Skip zero entries without counting them.  This must test
		 * the entry just returned and advance the cursor: testing
		 * the loop-invariant @s_entry and continuing in place could
		 * never make progress and always ended in the BUG_ON() above.
		 */
		if (xa_is_zero(entry)) {
			retry++;
			entry = mas_next(&tmp, mas_end->last);
			continue;
		}
		count++;
		span++;
		entry = mas_next(&tmp, mas_end->last);
	}

	if (null_entry) {
		/* Check splitting end. */
		if (e_entry && (e_max > mas_end->last))
			count--;

		/* check overwrite of entire start */
		if (s_entry && (s_min == mas_start->index))
			count++;
	} else { /* !null_entry (store) */
		bool esplit = e_max > mas_end->last;
		bool ssplit = s_min != mas_start->index;

		if (s_entry && e_entry) {
			/*
			 * A split start always costs one; a split end alone
			 * only does so when at least one entry was spanned.
			 */
			if (ssplit)
				count--;
			else if (esplit && span)
				count--;
		} else if (s_entry && !e_entry) {
			if (ssplit)
				count--;
		} else if (!s_entry && e_entry) {
			if (esplit)
				count--;
			count--;
		} else {
			count--;
		}
	}
	return count;
}

/*
 * mas_node_walk() - Walk a maple node to offset of the index.
 * @mas: The maple state
 * @node: The maple node being walked
 * @type: The maple node type
 * @*range_min: Pointer to store the minimum range of the offset
 * @*range_max: Pointer to store the maximum range of the offset
 *
 * The offset will be stored in the maple state.  For a leaf the range of the
 * located slot is written to @range_min and @range_max; for any other node
 * type mas->min and mas->max are narrowed instead.  The function returns
 * early, leaving the state partially updated, whenever ma_dead_node()
 * reports the node dead.
 *
 * NOTE(review): the dense branch assigns mas->min to both mas->offset and
 * mas->index; this looks suspicious but is kept as in the original.
 */
static inline void mas_node_walk(struct ma_state *mas, struct maple_node *node,
		enum maple_type type, unsigned long *range_min,
		unsigned long *range_max)

{
	unsigned long *pivots;
	unsigned char count;
	unsigned long prev, max;
	unsigned char offset;
	unsigned long index;

	if (unlikely(ma_is_dense(type))) {
		(*range_max) = (*range_min) = mas->index;
		if (unlikely(ma_dead_node(node)))
			return;

		mas->offset = mas->index = mas->min;
		return;
	}

	pivots = ma_pivots(node, type);
	max = pivots[0];
	/* The pivot was read first; only trust it if the node is still live. */
	if (unlikely(ma_dead_node(node)))
		return;

	offset = 0;
	prev = mas->min;
	index = mas->index;
	if (unlikely(index <= max))
		goto offset_zero;

	count = mt_pivots[type];
	while (++offset < count) {
		prev = max;
		max = pivots[offset];
		if (unlikely(ma_dead_node(node)))
			return;

		if (index <= max)
			goto offset_found;
		else if (unlikely(!max))
			goto mas_max; /* A zero pivot ends the used pivots. */
	}

	prev = max;
mas_max:
	max = mas->max;
offset_found:
	/* The slot's range starts one past the previous pivot. */
	prev++;
offset_zero:
	mas->offset = offset;
	if (ma_is_leaf(type)) {
		*range_max = max;
		*range_min = prev;
	} else {
		mas->max = max;
		mas->min = prev;
	}
}

/*
 * mas_descend_walk(): Locates a value and sets the mas->node and slot
 * accordingly.  range_min and range_max are set to the range which the entry is
 * valid.
 * @mas: The maple state
 * @*range_min: A pointer to store the minimum of the range
 * @*range_max: A pointer to store the maximum of the range
 *
 * Check mas->node is still valid on return of any value.
 *
 * Return: true if pointing to a valid node and offset.  False otherwise.
 */
static inline bool mas_descend_walk(struct ma_state *mas,
			unsigned long *range_min, unsigned long *range_max)
{
	struct maple_enode *next;
	struct maple_node *node;
	enum maple_type type;

	next = mas->node;
	while (true) {
		node = mte_to_node(next);
		type = mte_node_type(next);
		mas_node_walk(mas, node, type, range_min, range_max);
		next = mas_slot(mas, ma_slots(node, type), mas->offset);
		/* The slot was read first; discard it if the node died. */
		if (unlikely(ma_dead_node(node)))
			return false;

		if (unlikely(ma_is_leaf(type)))
			return true;

		/* Descend. */
		mas->node = next;
	}
	return false;
}

/*
 * mas_tree_walk() - Walk to @mas->index and set the range values.
 * @mas: The maple state.
 * @*range_min: The minimum range to be set.
 * @*range_max: The maximum range to be set.
 *
 * Ranges are only valid if there is a valid entry at @mas->index.
 *
 * The walk restarts from MAS_START whenever the node it ended on is found to
 * be dead.  When nothing can be found, mas->offset is set to
 * MAPLE_NODE_SLOTS.
 *
 * Return: True if a value exists, false otherwise.
 */
static inline bool mas_tree_walk(struct ma_state *mas, unsigned long *range_min,
				 unsigned long *range_max)
{
	bool ret;

retry:
	ret = false;
	mas_start(mas);
	if (mas_is_none(mas))
		goto not_found;

	if (mas_is_ptr(mas)) {
		/* A pointer-state tree only has an entry at index 0. */
		*range_min = *range_max = 0;
		if (!mas->index)
			return true;

		goto not_found;
	}

	ret = mas_descend_walk(mas, range_min, range_max);
	if (unlikely(mte_dead_node(mas->node))) {
		mas->node = MAS_START;
		goto retry;
	}

	return ret;

not_found:
	mas->offset = MAPLE_NODE_SLOTS;
	return false;
}

/*
 * mas_range_load() - Load the entry at @mas->index and report its range.
 * @mas: The maple state.
 * @*range_min: Pointer to store the minimum of the range.
 * @*range_max: Pointer to store the maximum of the range.
 *
 * A state that is none or paused is restarted from MAS_START.  The load is
 * retried while mas_dead_node() reports that the node died underneath it.
 *
 * Return: The entry at @mas->index, or NULL if the walk left no valid
 * offset.
 */
static inline void *mas_range_load(struct ma_state *mas,
	   unsigned long *range_min, unsigned long *range_max)

{
	void *entry = NULL;
	unsigned long index = mas->index;

	if (mas_is_none(mas) || mas_is_paused(mas))
		mas->node = MAS_START;
retry:
	if (mas_tree_walk(mas, range_min, range_max) &&
	    unlikely(mas->node == MAS_ROOT))
		return mas_root(mas);

	if (likely(mas->offset != MAPLE_NODE_SLOTS))
		entry = mas_get_slot(mas, mas->offset);

	if (mas_dead_node(mas, index))
		goto retry;

	return entry;
}
+static noinline void check_erase2_testset(struct maple_tree *mt, + unsigned long *set, unsigned long size) +{ + int entry_count = 0; + int check = 0; + void *foo; + unsigned long addr = 0; + void *s_entry = NULL, *e_entry = NULL; + + MA_STATE(mas, mt, 0, 0); + + for (int i = 0; i < size; i += 3) { + unsigned long s_min, s_max; + unsigned long e_min, e_max; + void *value = NULL; + + MA_STATE(mas_start, mt, set[i+1], set[i+1]); + MA_STATE(mas_end, mt, set[i+2], set[i+2]); + mt_set_non_kernel(127); +#if check_erase2_debug + pr_err("%s: %d %s %lu - %lu\n", __func__, i, + ec_type_str(set[i]), + set[i+1], set[i+2]); +#endif + s_entry = mas_range_load(&mas_start, &s_min, &s_max); + e_entry = mas_range_load(&mas_end, &e_min, &e_max); + + switch (set[i]) { + case SNULL: + if ((s_min == set[i+1]) && (s_max == set[i+2])) { + if (s_entry) + entry_count--; + } else if ((s_min != set[i+1]) && (s_max != set[i+2]))
{ + entry_count++; + } else if ((mas_start.node != mas_end.node) || + (mas_start.offset != mas_end.offset)) { + entry_count -= + mas_ce2_over_count(&mas_start, &mas_end, + s_entry, s_min, + e_entry, e_max, set, i, + true); + } + + + erase_check_store_range(mt, set, i + 1, value); + break; + case STORE: + value = xa_mk_value(set[i + 1]); + if (mas_start.offset > mt_slot_count(mas_start.node)) { + entry_count++; /* appending an entry. */ + } else if ((s_min == e_min) && (s_max == e_max)) { + if (!entry_count) + entry_count++; + + else if (s_entry) { + if (e_max > mas_end.last) + entry_count++; + + if (s_min < mas_start.index) + entry_count++; + + } else { + entry_count++; + } + } else { + entry_count -= + mas_ce2_over_count(&mas_start, &mas_end, + s_entry, s_min, + e_entry, e_max, set, i, + false); + } + + erase_check_store_range(mt, set, i + 1, value); + break; + case ERASE: + if (!s_entry) + break; + check_erase(mt, set[i+1], xa_mk_value(set[i+1])); + entry_count--; + break; + } + mt_validate(mt); + if (entry_count) + MT_BUG_ON(mt, !mt_height(mt)); +#if check_erase2_debug > 1 + mt_dump(mt); +#endif +#if check_erase2_debug + pr_err("Done\n"); +#endif + + check = 0; + addr = 0; + mt_for_each(mt, foo, addr, ULONG_MAX) { + check++; +#if check_erase2_debug > 2 + pr_err("mt: %lu -> %p (%d)\n", addr+1, foo, check); +#endif + if (check > entry_count) + break; + } + +#if check_erase2_debug > 2 + pr_err("mt_for_each %d and count %d\n", check, entry_count); +#endif + + MT_BUG_ON(mt, check != entry_count); + + check = 0; + addr = 0; + mas_reset(&mas); + mas.index = 0; + rcu_read_lock(); + mas_for_each(&mas, foo, ULONG_MAX) { + if (xa_is_zero(foo)) { + if (addr == mas.index) { + mt_dump(mas.tree); + pr_err("retry failed %lu - %lu\n", + mas.index, mas.last); + MT_BUG_ON(mt, 1); + } + addr = mas.index; + continue; + } +#if check_erase2_debug > 2 + pr_err("mas: %lu -> %p\n", mas.index, foo); +#endif + check++; + if (check > entry_count) + break; + } + rcu_read_unlock(); +#if 
check_erase2_debug > 2 + pr_err("mas_for_each %d and count %d\n", check, entry_count); + mt_validate(mt); +#endif + + MT_BUG_ON(mt, check != entry_count); + + MT_BUG_ON(mt, mtree_load(mas.tree, 0) != NULL); + } +} + + +/* These tests were pulled from kvm tests. */ +static noinline void check_erase2_sets(struct maple_tree *mt) +{ + void *entry; + unsigned long start = 0; + unsigned long set[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140721266458624, 140737488351231, +ERASE, 140721266458624, 140737488351231, +STORE, 140721266458624, 140721266462719, +STORE, 94735788949504, 94735789121535, +ERASE, 94735788949504, 94735789121535, +STORE, 94735788949504, 94735788965887, +STORE, 94735788965888, 94735789121535, +ERASE, 94735788965888, 94735789121535, +STORE, 94735788965888, 94735789068287, +STORE, 94735789068288, 94735789109247, +STORE, 94735789109248, 94735789121535, +STORE, 140253902692352, 140253902864383, +ERASE, 140253902692352, 140253902864383, +STORE, 140253902692352, 140253902696447, +STORE, 140253902696448, 140253902864383, + }; + unsigned long set2[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140735933583360, 140737488351231, +ERASE, 140735933583360, 140737488351231, +STORE, 140735933583360, 140735933587455, +STORE, 94811003260928, 94811003432959, +ERASE, 94811003260928, 94811003432959, +STORE, 94811003260928, 94811003277311, +STORE, 94811003277312, 94811003432959, +ERASE, 94811003277312, 94811003432959, +STORE, 94811003277312, 94811003379711, +STORE, 94811003379712, 94811003420671, +STORE, 94811003420672, 94811003432959, +STORE, 140277094653952, 140277094825983, +ERASE, 140277094653952, 140277094825983, +STORE, 140277094653952, 140277094658047, +STORE, 140277094658048, 140277094825983, +ERASE, 140277094658048, 140277094825983, +STORE, 140277094658048, 140277094780927, +STORE, 140277094780928, 140277094813695, +STORE, 140277094813696, 140277094821887, +STORE, 140277094821888, 140277094825983, +STORE, 140735933906944, 140735933911039, + }; 
+ unsigned long set3[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140735790264320, 140737488351231, +ERASE, 140735790264320, 140737488351231, +STORE, 140735790264320, 140735790268415, +STORE, 94016597282816, 94016597454847, +ERASE, 94016597282816, 94016597454847, +STORE, 94016597282816, 94016597299199, +STORE, 94016597299200, 94016597454847, +ERASE, 94016597299200, 94016597454847, +STORE, 94016597299200, 94016597401599, +STORE, 94016597401600, 94016597442559, +STORE, 94016597442560, 94016597454847, +STORE, 140496959283200, 140496959455231, +ERASE, 140496959283200, 140496959455231, +STORE, 140496959283200, 140496959287295, +STORE, 140496959287296, 140496959455231, +ERASE, 140496959287296, 140496959455231, +STORE, 140496959287296, 140496959410175, +STORE, 140496959410176, 140496959442943, +STORE, 140496959442944, 140496959451135, +STORE, 140496959451136, 140496959455231, +STORE, 140735791718400, 140735791722495, +STORE, 140735791706112, 140735791718399, +STORE, 47135835713536, 47135835721727, +STORE, 47135835721728, 47135835729919, +STORE, 47135835729920, 47135835893759, +ERASE, 47135835729920, 47135835893759, +STORE, 47135835729920, 47135835742207, +STORE, 47135835742208, 47135835893759, +STORE, 47135835840512, 47135835893759, +STORE, 47135835742208, 47135835840511, +ERASE, 47135835742208, 47135835840511, +STORE, 47135835742208, 47135835840511, +STORE, 47135835885568, 47135835893759, +STORE, 47135835840512, 47135835885567, +ERASE, 47135835840512, 47135835885567, +STORE, 47135835840512, 47135835893759, +ERASE, 47135835840512, 47135835893759, +STORE, 47135835840512, 47135835885567, +STORE, 47135835885568, 47135835893759, + }; + + unsigned long set4[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140728251703296, 140737488351231, +ERASE, 140728251703296, 140737488351231, +STORE, 140728251703296, 140728251707391, +STORE, 94668429205504, 94668429377535, +ERASE, 94668429205504, 94668429377535, +STORE, 94668429205504, 94668429221887, +STORE, 
94668429221888, 94668429377535, +ERASE, 94668429221888, 94668429377535, +STORE, 94668429221888, 94668429324287, +STORE, 94668429324288, 94668429365247, +STORE, 94668429365248, 94668429377535, +STORE, 47646523273216, 47646523445247, +ERASE, 47646523273216, 47646523445247, +STORE, 47646523273216, 47646523277311, +STORE, 47646523277312, 47646523445247, +ERASE, 47646523277312, 47646523445247, +STORE, 47646523277312, 47646523400191, + }; + + unsigned long set5[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140726874062848, 140737488351231, +ERASE, 140726874062848, 140737488351231, +STORE, 140726874062848, 140726874066943, +STORE, 94248892870656, 94248893042687, +ERASE, 94248892870656, 94248893042687, +STORE, 94248892870656, 94248892887039, +STORE, 94248892887040, 94248893042687, +ERASE, 94248892887040, 94248893042687, +STORE, 94248892887040, 94248892989439, +STORE, 94248892989440, 94248893030399, +STORE, 94248893030400, 94248893042687, +STORE, 47884786266112, 47884786438143, +ERASE, 47884786266112, 47884786438143, +STORE, 47884786266112, 47884786270207, +STORE, 47884786270208, 47884786438143, +ERASE, 47884786270208, 47884786438143, +STORE, 47884786270208, 47884786393087, +STORE, 47884786393088, 47884786425855, +STORE, 47884786425856, 47884786434047, +STORE, 47884786434048, 47884786438143, +STORE, 140726874513408, 140726874517503, +STORE, 140726874501120, 140726874513407, +STORE, 47884786438144, 47884786446335, +STORE, 47884786446336, 47884786454527, +STORE, 47884786454528, 47884786618367, +ERASE, 47884786454528, 47884786618367, +STORE, 47884786454528, 47884786466815, +STORE, 47884786466816, 47884786618367, +STORE, 47884786565120, 47884786618367, +STORE, 47884786466816, 47884786565119, +ERASE, 47884786466816, 47884786565119, +STORE, 47884786466816, 47884786565119, +STORE, 47884786610176, 47884786618367, +STORE, 47884786565120, 47884786610175, +ERASE, 47884786565120, 47884786610175, +STORE, 47884786565120, 47884786618367, +ERASE, 47884786565120, 47884786618367, 
+STORE, 47884786565120, 47884786610175, +STORE, 47884786610176, 47884786618367, +ERASE, 47884786610176, 47884786618367, +STORE, 47884786610176, 47884786618367, +STORE, 47884786618368, 47884789669887, +STORE, 47884787163136, 47884789669887, +STORE, 47884786618368, 47884787163135, +ERASE, 47884787163136, 47884789669887, +STORE, 47884787163136, 47884789448703, +STORE, 47884789448704, 47884789669887, +STORE, 47884788858880, 47884789448703, +STORE, 47884787163136, 47884788858879, +ERASE, 47884787163136, 47884788858879, +STORE, 47884787163136, 47884788858879, +STORE, 47884789444608, 47884789448703, +STORE, 47884788858880, 47884789444607, +ERASE, 47884788858880, 47884789444607, +STORE, 47884788858880, 47884789444607, +STORE, 47884789653504, 47884789669887, +STORE, 47884789448704, 47884789653503, +ERASE, 47884789448704, 47884789653503, +STORE, 47884789448704, 47884789653503, +ERASE, 47884789653504, 47884789669887, +STORE, 47884789653504, 47884789669887, +STORE, 47884789669888, 47884791508991, +STORE, 47884789809152, 47884791508991, +STORE, 47884789669888, 47884789809151, +ERASE, 47884789809152, 47884791508991, +STORE, 47884789809152, 47884791468031, +STORE, 47884791468032, 47884791508991, +STORE, 47884791152640, 47884791468031, +STORE, 47884789809152, 47884791152639, +ERASE, 47884789809152, 47884791152639, +STORE, 47884789809152, 47884791152639, +STORE, 47884791463936, 47884791468031, +STORE, 47884791152640, 47884791463935, +ERASE, 47884791152640, 47884791463935, +STORE, 47884791152640, 47884791463935, +STORE, 47884791492608, 47884791508991, +STORE, 47884791468032, 47884791492607, +ERASE, 47884791468032, 47884791492607, +STORE, 47884791468032, 47884791492607, +ERASE, 47884791492608, 47884791508991, +STORE, 47884791492608, 47884791508991, +STORE, 47884791508992, 47884791644159, +ERASE, 47884791508992, 47884791644159, +STORE, 47884791508992, 47884791533567, +STORE, 47884791533568, 47884791644159, +STORE, 47884791595008, 47884791644159, +STORE, 47884791533568, 47884791595007, 
+ERASE, 47884791533568, 47884791595007, +STORE, 47884791533568, 47884791595007, +STORE, 47884791619584, 47884791644159, +STORE, 47884791595008, 47884791619583, +ERASE, 47884791595008, 47884791619583, +STORE, 47884791595008, 47884791644159, +ERASE, 47884791595008, 47884791644159, +STORE, 47884791595008, 47884791619583, +STORE, 47884791619584, 47884791644159, +STORE, 47884791627776, 47884791644159, +STORE, 47884791619584, 47884791627775, +ERASE, 47884791619584, 47884791627775, +STORE, 47884791619584, 47884791627775, +ERASE, 47884791627776, 47884791644159, +STORE, 47884791627776, 47884791644159, +STORE, 47884791644160, 47884791664639, +ERASE, 47884791644160, 47884791664639, +STORE, 47884791644160, 47884791648255, +STORE, 47884791648256, 47884791664639, +STORE, 47884791652352, 47884791664639, +STORE, 47884791648256, 47884791652351, +ERASE, 47884791648256, 47884791652351, +STORE, 47884791648256, 47884791652351, +STORE, 47884791656448, 47884791664639, +STORE, 47884791652352, 47884791656447, +ERASE, 47884791652352, 47884791656447, +STORE, 47884791652352, 47884791664639, +ERASE, 47884791652352, 47884791664639, +STORE, 47884791652352, 47884791656447, +STORE, 47884791656448, 47884791664639, +ERASE, 47884791656448, 47884791664639, +STORE, 47884791656448, 47884791664639, +STORE, 47884791664640, 47884791672831, +ERASE, 47884791468032, 47884791492607, +STORE, 47884791468032, 47884791484415, +STORE, 47884791484416, 47884791492607, +ERASE, 47884791656448, 47884791664639, +STORE, 47884791656448, 47884791660543, +STORE, 47884791660544, 47884791664639, +ERASE, 47884791619584, 47884791627775, +STORE, 47884791619584, 47884791623679, +STORE, 47884791623680, 47884791627775, + }; + + unsigned long set6[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140722999021568, 140737488351231, +ERASE, 140722999021568, 140737488351231, +STORE, 140722999021568, 140722999025663, +STORE, 94901500268544, 94901500440575, +ERASE, 94901500268544, 94901500440575, +STORE, 94901500268544, 
94901500284927, +STORE, 94901500284928, 94901500440575, +ERASE, 94901500284928, 94901500440575, +STORE, 94901500284928, 94901500387327, +STORE, 94901500387328, 94901500428287, +STORE, 94901500428288, 94901500440575, +STORE, 47430426660864, 47430426832895, +ERASE, 47430426660864, 47430426832895, +STORE, 47430426660864, 47430426664959, +STORE, 47430426664960, 47430426832895, +ERASE, 47430426664960, 47430426832895, +STORE, 47430426664960, 47430426787839, +STORE, 47430426787840, 47430426820607, +STORE, 47430426820608, 47430426828799, +STORE, 47430426828800, 47430426832895, +STORE, 140722999115776, 140722999119871, +STORE, 140722999103488, 140722999115775, +STORE, 47430426832896, 47430426841087, +STORE, 47430426841088, 47430426849279, +STORE, 47430426849280, 47430427013119, +ERASE, 47430426849280, 47430427013119, +STORE, 47430426849280, 47430426861567, +STORE, 47430426861568, 47430427013119, +STORE, 47430426959872, 47430427013119, +STORE, 47430426861568, 47430426959871, +ERASE, 47430426861568, 47430426959871, +STORE, 47430426861568, 47430426959871, +STORE, 47430427004928, 47430427013119, +STORE, 47430426959872, 47430427004927, +ERASE, 47430426959872, 47430427004927, +STORE, 47430426959872, 47430427013119, +ERASE, 47430426959872, 47430427013119, +STORE, 47430426959872, 47430427004927, +STORE, 47430427004928, 47430427013119, +ERASE, 47430427004928, 47430427013119, +STORE, 47430427004928, 47430427013119, +STORE, 47430427013120, 47430430064639, +STORE, 47430427557888, 47430430064639, +STORE, 47430427013120, 47430427557887, +ERASE, 47430427557888, 47430430064639, +STORE, 47430427557888, 47430429843455, +STORE, 47430429843456, 47430430064639, +STORE, 47430429253632, 47430429843455, +STORE, 47430427557888, 47430429253631, +ERASE, 47430427557888, 47430429253631, +STORE, 47430427557888, 47430429253631, +STORE, 47430429839360, 47430429843455, +STORE, 47430429253632, 47430429839359, +ERASE, 47430429253632, 47430429839359, +STORE, 47430429253632, 47430429839359, +STORE, 
47430430048256, 47430430064639, +STORE, 47430429843456, 47430430048255, +ERASE, 47430429843456, 47430430048255, +STORE, 47430429843456, 47430430048255, +ERASE, 47430430048256, 47430430064639, +STORE, 47430430048256, 47430430064639, +STORE, 47430430064640, 47430431903743, +STORE, 47430430203904, 47430431903743, +STORE, 47430430064640, 47430430203903, +ERASE, 47430430203904, 47430431903743, +STORE, 47430430203904, 47430431862783, +STORE, 47430431862784, 47430431903743, +STORE, 47430431547392, 47430431862783, +STORE, 47430430203904, 47430431547391, +ERASE, 47430430203904, 47430431547391, +STORE, 47430430203904, 47430431547391, +STORE, 47430431858688, 47430431862783, +STORE, 47430431547392, 47430431858687, +ERASE, 47430431547392, 47430431858687, +STORE, 47430431547392, 47430431858687, +STORE, 47430431887360, 47430431903743, +STORE, 47430431862784, 47430431887359, +ERASE, 47430431862784, 47430431887359, +STORE, 47430431862784, 47430431887359, +ERASE, 47430431887360, 47430431903743, +STORE, 47430431887360, 47430431903743, +STORE, 47430431903744, 47430432038911, +ERASE, 47430431903744, 47430432038911, +STORE, 47430431903744, 47430431928319, +STORE, 47430431928320, 47430432038911, +STORE, 47430431989760, 47430432038911, +STORE, 47430431928320, 47430431989759, +ERASE, 47430431928320, 47430431989759, +STORE, 47430431928320, 47430431989759, +STORE, 47430432014336, 47430432038911, +STORE, 47430431989760, 47430432014335, +ERASE, 47430431989760, 47430432014335, +STORE, 47430431989760, 47430432038911, +ERASE, 47430431989760, 47430432038911, +STORE, 47430431989760, 47430432014335, +STORE, 47430432014336, 47430432038911, +STORE, 47430432022528, 47430432038911, +STORE, 47430432014336, 47430432022527, +ERASE, 47430432014336, 47430432022527, +STORE, 47430432014336, 47430432022527, +ERASE, 47430432022528, 47430432038911, +STORE, 47430432022528, 47430432038911, +STORE, 47430432038912, 47430432059391, +ERASE, 47430432038912, 47430432059391, +STORE, 47430432038912, 47430432043007, +STORE, 
47430432043008, 47430432059391, +STORE, 47430432047104, 47430432059391, +STORE, 47430432043008, 47430432047103, +ERASE, 47430432043008, 47430432047103, +STORE, 47430432043008, 47430432047103, +STORE, 47430432051200, 47430432059391, +STORE, 47430432047104, 47430432051199, +ERASE, 47430432047104, 47430432051199, +STORE, 47430432047104, 47430432059391, +ERASE, 47430432047104, 47430432059391, +STORE, 47430432047104, 47430432051199, +STORE, 47430432051200, 47430432059391, +ERASE, 47430432051200, 47430432059391, +STORE, 47430432051200, 47430432059391, +STORE, 47430432059392, 47430432067583, +ERASE, 47430431862784, 47430431887359, +STORE, 47430431862784, 47430431879167, +STORE, 47430431879168, 47430431887359, +ERASE, 47430432051200, 47430432059391, +STORE, 47430432051200, 47430432055295, +STORE, 47430432055296, 47430432059391, +ERASE, 47430432014336, 47430432022527, +STORE, 47430432014336, 47430432018431, +STORE, 47430432018432, 47430432022527, + }; + unsigned long set7[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140729808330752, 140737488351231, +ERASE, 140729808330752, 140737488351231, +STORE, 140729808330752, 140729808334847, +STORE, 94629632020480, 94629632192511, +ERASE, 94629632020480, 94629632192511, +STORE, 94629632020480, 94629632036863, +STORE, 94629632036864, 94629632192511, +ERASE, 94629632036864, 94629632192511, +STORE, 94629632036864, 94629632139263, +STORE, 94629632139264, 94629632180223, +STORE, 94629632180224, 94629632192511, +STORE, 47439981776896, 47439981948927, +ERASE, 47439981776896, 47439981948927, +STORE, 47439981776896, 47439981780991, +STORE, 47439981780992, 47439981948927, +ERASE, 47439981780992, 47439981948927, +STORE, 47439981780992, 47439981903871, +STORE, 47439981903872, 47439981936639, +STORE, 47439981936640, 47439981944831, +STORE, 47439981944832, 47439981948927, +STORE, 140729808474112, 140729808478207, +STORE, 140729808461824, 140729808474111, +STORE, 47439981948928, 47439981957119, +STORE, 47439981957120, 47439981965311, 
+STORE, 47439981965312, 47439982129151, +ERASE, 47439981965312, 47439982129151, +STORE, 47439981965312, 47439981977599, +STORE, 47439981977600, 47439982129151, +STORE, 47439982075904, 47439982129151, +STORE, 47439981977600, 47439982075903, +ERASE, 47439981977600, 47439982075903, +STORE, 47439981977600, 47439982075903, +STORE, 47439982120960, 47439982129151, +STORE, 47439982075904, 47439982120959, +ERASE, 47439982075904, 47439982120959, +STORE, 47439982075904, 47439982129151, +ERASE, 47439982075904, 47439982129151, +STORE, 47439982075904, 47439982120959, +STORE, 47439982120960, 47439982129151, +ERASE, 47439982120960, 47439982129151, +STORE, 47439982120960, 47439982129151, +STORE, 47439982129152, 47439985180671, +STORE, 47439982673920, 47439985180671, +STORE, 47439982129152, 47439982673919, +ERASE, 47439982673920, 47439985180671, +STORE, 47439982673920, 47439984959487, +STORE, 47439984959488, 47439985180671, +STORE, 47439984369664, 47439984959487, +STORE, 47439982673920, 47439984369663, +ERASE, 47439982673920, 47439984369663, +STORE, 47439982673920, 47439984369663, +STORE, 47439984955392, 47439984959487, +STORE, 47439984369664, 47439984955391, +ERASE, 47439984369664, 47439984955391, +STORE, 47439984369664, 47439984955391, +STORE, 47439985164288, 47439985180671, +STORE, 47439984959488, 47439985164287, +ERASE, 47439984959488, 47439985164287, +STORE, 47439984959488, 47439985164287, +ERASE, 47439985164288, 47439985180671, +STORE, 47439985164288, 47439985180671, +STORE, 47439985180672, 47439987019775, +STORE, 47439985319936, 47439987019775, +STORE, 47439985180672, 47439985319935, +ERASE, 47439985319936, 47439987019775, +STORE, 47439985319936, 47439986978815, +STORE, 47439986978816, 47439987019775, +STORE, 47439986663424, 47439986978815, +STORE, 47439985319936, 47439986663423, +ERASE, 47439985319936, 47439986663423, +STORE, 47439985319936, 47439986663423, +STORE, 47439986974720, 47439986978815, +STORE, 47439986663424, 47439986974719, +ERASE, 47439986663424, 47439986974719, 
+STORE, 47439986663424, 47439986974719, +STORE, 47439987003392, 47439987019775, +STORE, 47439986978816, 47439987003391, +ERASE, 47439986978816, 47439987003391, +STORE, 47439986978816, 47439987003391, +ERASE, 47439987003392, 47439987019775, +STORE, 47439987003392, 47439987019775, +STORE, 47439987019776, 47439987154943, +ERASE, 47439987019776, 47439987154943, +STORE, 47439987019776, 47439987044351, +STORE, 47439987044352, 47439987154943, +STORE, 47439987105792, 47439987154943, +STORE, 47439987044352, 47439987105791, +ERASE, 47439987044352, 47439987105791, +STORE, 47439987044352, 47439987105791, +STORE, 47439987130368, 47439987154943, +STORE, 47439987105792, 47439987130367, +ERASE, 47439987105792, 47439987130367, +STORE, 47439987105792, 47439987154943, +ERASE, 47439987105792, 47439987154943, +STORE, 47439987105792, 47439987130367, +STORE, 47439987130368, 47439987154943, +STORE, 47439987138560, 47439987154943, +STORE, 47439987130368, 47439987138559, +ERASE, 47439987130368, 47439987138559, +STORE, 47439987130368, 47439987138559, +ERASE, 47439987138560, 47439987154943, +STORE, 47439987138560, 47439987154943, +STORE, 47439987154944, 47439987175423, +ERASE, 47439987154944, 47439987175423, +STORE, 47439987154944, 47439987159039, +STORE, 47439987159040, 47439987175423, +STORE, 47439987163136, 47439987175423, +STORE, 47439987159040, 47439987163135, +ERASE, 47439987159040, 47439987163135, +STORE, 47439987159040, 47439987163135, +STORE, 47439987167232, 47439987175423, +STORE, 47439987163136, 47439987167231, +ERASE, 47439987163136, 47439987167231, +STORE, 47439987163136, 47439987175423, +ERASE, 47439987163136, 47439987175423, +STORE, 47439987163136, 47439987167231, +STORE, 47439987167232, 47439987175423, +ERASE, 47439987167232, 47439987175423, +STORE, 47439987167232, 47439987175423, +STORE, 47439987175424, 47439987183615, +ERASE, 47439986978816, 47439987003391, +STORE, 47439986978816, 47439986995199, +STORE, 47439986995200, 47439987003391, +ERASE, 47439987167232, 47439987175423, 
+STORE, 47439987167232, 47439987171327, +STORE, 47439987171328, 47439987175423, +ERASE, 47439987130368, 47439987138559, +STORE, 47439987130368, 47439987134463, +STORE, 47439987134464, 47439987138559, + }; + unsigned long set8[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140722482974720, 140737488351231, +ERASE, 140722482974720, 140737488351231, +STORE, 140722482974720, 140722482978815, +STORE, 94121505034240, 94121505206271, +ERASE, 94121505034240, 94121505206271, +STORE, 94121505034240, 94121505050623, +STORE, 94121505050624, 94121505206271, +ERASE, 94121505050624, 94121505206271, +STORE, 94121505050624, 94121505153023, +STORE, 94121505153024, 94121505193983, +STORE, 94121505193984, 94121505206271, +STORE, 47708483284992, 47708483457023, +ERASE, 47708483284992, 47708483457023, +STORE, 47708483284992, 47708483289087, +STORE, 47708483289088, 47708483457023, +ERASE, 47708483289088, 47708483457023, +STORE, 47708483289088, 47708483411967, +STORE, 47708483411968, 47708483444735, +STORE, 47708483444736, 47708483452927, +STORE, 47708483452928, 47708483457023, +STORE, 140722483142656, 140722483146751, +STORE, 140722483130368, 140722483142655, +STORE, 47708483457024, 47708483465215, +STORE, 47708483465216, 47708483473407, +STORE, 47708483473408, 47708483637247, +ERASE, 47708483473408, 47708483637247, +STORE, 47708483473408, 47708483485695, +STORE, 47708483485696, 47708483637247, +STORE, 47708483584000, 47708483637247, +STORE, 47708483485696, 47708483583999, +ERASE, 47708483485696, 47708483583999, +STORE, 47708483485696, 47708483583999, +STORE, 47708483629056, 47708483637247, +STORE, 47708483584000, 47708483629055, +ERASE, 47708483584000, 47708483629055, +STORE, 47708483584000, 47708483637247, +ERASE, 47708483584000, 47708483637247, +STORE, 47708483584000, 47708483629055, +STORE, 47708483629056, 47708483637247, +ERASE, 47708483629056, 47708483637247, +STORE, 47708483629056, 47708483637247, +STORE, 47708483637248, 47708486688767, +STORE, 47708484182016, 
47708486688767, +STORE, 47708483637248, 47708484182015, +ERASE, 47708484182016, 47708486688767, +STORE, 47708484182016, 47708486467583, +STORE, 47708486467584, 47708486688767, +STORE, 47708485877760, 47708486467583, +STORE, 47708484182016, 47708485877759, +ERASE, 47708484182016, 47708485877759, +STORE, 47708484182016, 47708485877759, +STORE, 47708486463488, 47708486467583, +STORE, 47708485877760, 47708486463487, +ERASE, 47708485877760, 47708486463487, +STORE, 47708485877760, 47708486463487, +STORE, 47708486672384, 47708486688767, +STORE, 47708486467584, 47708486672383, +ERASE, 47708486467584, 47708486672383, +STORE, 47708486467584, 47708486672383, +ERASE, 47708486672384, 47708486688767, +STORE, 47708486672384, 47708486688767, +STORE, 47708486688768, 47708488527871, +STORE, 47708486828032, 47708488527871, +STORE, 47708486688768, 47708486828031, +ERASE, 47708486828032, 47708488527871, +STORE, 47708486828032, 47708488486911, +STORE, 47708488486912, 47708488527871, +STORE, 47708488171520, 47708488486911, +STORE, 47708486828032, 47708488171519, +ERASE, 47708486828032, 47708488171519, +STORE, 47708486828032, 47708488171519, +STORE, 47708488482816, 47708488486911, +STORE, 47708488171520, 47708488482815, +ERASE, 47708488171520, 47708488482815, +STORE, 47708488171520, 47708488482815, +STORE, 47708488511488, 47708488527871, +STORE, 47708488486912, 47708488511487, +ERASE, 47708488486912, 47708488511487, +STORE, 47708488486912, 47708488511487, +ERASE, 47708488511488, 47708488527871, +STORE, 47708488511488, 47708488527871, +STORE, 47708488527872, 47708488663039, +ERASE, 47708488527872, 47708488663039, +STORE, 47708488527872, 47708488552447, +STORE, 47708488552448, 47708488663039, +STORE, 47708488613888, 47708488663039, +STORE, 47708488552448, 47708488613887, +ERASE, 47708488552448, 47708488613887, +STORE, 47708488552448, 47708488613887, +STORE, 47708488638464, 47708488663039, +STORE, 47708488613888, 47708488638463, +ERASE, 47708488613888, 47708488638463, +STORE, 47708488613888, 
47708488663039, +ERASE, 47708488613888, 47708488663039, +STORE, 47708488613888, 47708488638463, +STORE, 47708488638464, 47708488663039, +STORE, 47708488646656, 47708488663039, +STORE, 47708488638464, 47708488646655, +ERASE, 47708488638464, 47708488646655, +STORE, 47708488638464, 47708488646655, +ERASE, 47708488646656, 47708488663039, +STORE, 47708488646656, 47708488663039, +STORE, 47708488663040, 47708488683519, +ERASE, 47708488663040, 47708488683519, +STORE, 47708488663040, 47708488667135, +STORE, 47708488667136, 47708488683519, +STORE, 47708488671232, 47708488683519, +STORE, 47708488667136, 47708488671231, +ERASE, 47708488667136, 47708488671231, +STORE, 47708488667136, 47708488671231, +STORE, 47708488675328, 47708488683519, +STORE, 47708488671232, 47708488675327, +ERASE, 47708488671232, 47708488675327, +STORE, 47708488671232, 47708488683519, +ERASE, 47708488671232, 47708488683519, +STORE, 47708488671232, 47708488675327, +STORE, 47708488675328, 47708488683519, +ERASE, 47708488675328, 47708488683519, +STORE, 47708488675328, 47708488683519, +STORE, 47708488683520, 47708488691711, +ERASE, 47708488486912, 47708488511487, +STORE, 47708488486912, 47708488503295, +STORE, 47708488503296, 47708488511487, +ERASE, 47708488675328, 47708488683519, +STORE, 47708488675328, 47708488679423, +STORE, 47708488679424, 47708488683519, +ERASE, 47708488638464, 47708488646655, +STORE, 47708488638464, 47708488642559, +STORE, 47708488642560, 47708488646655, + }; + + unsigned long set9[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140736427839488, 140737488351231, +ERASE, 140736427839488, 140736427839488, +STORE, 140736427839488, 140736427843583, +STORE, 94071213395968, 94071213567999, +ERASE, 94071213395968, 94071213395968, +STORE, 94071213395968, 94071213412351, +STORE, 94071213412352, 94071213567999, +ERASE, 94071213412352, 94071213412352, +STORE, 94071213412352, 94071213514751, +STORE, 94071213514752, 94071213555711, +STORE, 94071213555712, 94071213567999, +STORE, 
139968410644480, 139968410816511, +ERASE, 139968410644480, 139968410644480, +STORE, 139968410644480, 139968410648575, +STORE, 139968410648576, 139968410816511, +ERASE, 139968410648576, 139968410648576, +STORE, 139968410648576, 139968410771455, +STORE, 139968410771456, 139968410804223, +STORE, 139968410804224, 139968410812415, +STORE, 139968410812416, 139968410816511, +STORE, 140736429277184, 140736429281279, +STORE, 140736429264896, 140736429277183, +STORE, 47664384352256, 47664384360447, +STORE, 47664384360448, 47664384368639, +STORE, 47664384368640, 47664384532479, +ERASE, 47664384368640, 47664384368640, +STORE, 47664384368640, 47664384380927, +STORE, 47664384380928, 47664384532479, +STORE, 47664384479232, 47664384532479, +STORE, 47664384380928, 47664384479231, +ERASE, 47664384380928, 47664384380928, +STORE, 47664384380928, 47664384479231, +STORE, 47664384524288, 47664384532479, +STORE, 47664384479232, 47664384524287, +ERASE, 47664384479232, 47664384479232, +STORE, 47664384479232, 47664384532479, +ERASE, 47664384479232, 47664384479232, +STORE, 47664384479232, 47664384524287, +STORE, 47664384524288, 47664384532479, +ERASE, 47664384524288, 47664384524288, +STORE, 47664384524288, 47664384532479, +STORE, 47664384532480, 47664387583999, +STORE, 47664385077248, 47664387583999, +STORE, 47664384532480, 47664385077247, +ERASE, 47664385077248, 47664385077248, +STORE, 47664385077248, 47664387362815, +STORE, 47664387362816, 47664387583999, +STORE, 47664386772992, 47664387362815, +STORE, 47664385077248, 47664386772991, +ERASE, 47664385077248, 47664385077248, +STORE, 47664385077248, 47664386772991, +STORE, 47664387358720, 47664387362815, +STORE, 47664386772992, 47664387358719, +ERASE, 47664386772992, 47664386772992, +STORE, 47664386772992, 47664387358719, +STORE, 47664387567616, 47664387583999, +STORE, 47664387362816, 47664387567615, +ERASE, 47664387362816, 47664387362816, +STORE, 47664387362816, 47664387567615, +ERASE, 47664387567616, 47664387567616, +STORE, 47664387567616, 
47664387583999, +STORE, 47664387584000, 47664389423103, +STORE, 47664387723264, 47664389423103, +STORE, 47664387584000, 47664387723263, +ERASE, 47664387723264, 47664387723264, +STORE, 47664387723264, 47664389382143, +STORE, 47664389382144, 47664389423103, +STORE, 47664389066752, 47664389382143, +STORE, 47664387723264, 47664389066751, +ERASE, 47664387723264, 47664387723264, +STORE, 47664387723264, 47664389066751, +STORE, 47664389378048, 47664389382143, +STORE, 47664389066752, 47664389378047, +ERASE, 47664389066752, 47664389066752, +STORE, 47664389066752, 47664389378047, +STORE, 47664389406720, 47664389423103, +STORE, 47664389382144, 47664389406719, +ERASE, 47664389382144, 47664389382144, +STORE, 47664389382144, 47664389406719, +ERASE, 47664389406720, 47664389406720, +STORE, 47664389406720, 47664389423103, +STORE, 47664389423104, 47664389558271, +ERASE, 47664389423104, 47664389423104, +STORE, 47664389423104, 47664389447679, +STORE, 47664389447680, 47664389558271, +STORE, 47664389509120, 47664389558271, +STORE, 47664389447680, 47664389509119, +ERASE, 47664389447680, 47664389447680, +STORE, 47664389447680, 47664389509119, +STORE, 47664389533696, 47664389558271, +STORE, 47664389509120, 47664389533695, +ERASE, 47664389509120, 47664389509120, +STORE, 47664389509120, 47664389558271, +ERASE, 47664389509120, 47664389509120, +STORE, 47664389509120, 47664389533695, +STORE, 47664389533696, 47664389558271, +STORE, 47664389541888, 47664389558271, +STORE, 47664389533696, 47664389541887, +ERASE, 47664389533696, 47664389533696, +STORE, 47664389533696, 47664389541887, +ERASE, 47664389541888, 47664389541888, +STORE, 47664389541888, 47664389558271, +STORE, 47664389558272, 47664389578751, +ERASE, 47664389558272, 47664389558272, +STORE, 47664389558272, 47664389562367, +STORE, 47664389562368, 47664389578751, +STORE, 47664389566464, 47664389578751, +STORE, 47664389562368, 47664389566463, +ERASE, 47664389562368, 47664389562368, +STORE, 47664389562368, 47664389566463, +STORE, 47664389570560, 
47664389578751, +STORE, 47664389566464, 47664389570559, +ERASE, 47664389566464, 47664389566464, +STORE, 47664389566464, 47664389578751, +ERASE, 47664389566464, 47664389566464, +STORE, 47664389566464, 47664389570559, +STORE, 47664389570560, 47664389578751, +ERASE, 47664389570560, 47664389570560, +STORE, 47664389570560, 47664389578751, +STORE, 47664389578752, 47664389586943, +ERASE, 47664389382144, 47664389382144, +STORE, 47664389382144, 47664389398527, +STORE, 47664389398528, 47664389406719, +ERASE, 47664389570560, 47664389570560, +STORE, 47664389570560, 47664389574655, +STORE, 47664389574656, 47664389578751, +ERASE, 47664389533696, 47664389533696, +STORE, 47664389533696, 47664389537791, +STORE, 47664389537792, 47664389541887, +ERASE, 47664387362816, 47664387362816, +STORE, 47664387362816, 47664387559423, +STORE, 47664387559424, 47664387567615, +ERASE, 47664384524288, 47664384524288, +STORE, 47664384524288, 47664384528383, +STORE, 47664384528384, 47664384532479, +ERASE, 94071213555712, 94071213555712, +STORE, 94071213555712, 94071213563903, +STORE, 94071213563904, 94071213567999, +ERASE, 139968410804224, 139968410804224, +STORE, 139968410804224, 139968410808319, +STORE, 139968410808320, 139968410812415, +ERASE, 47664384352256, 47664384352256, +STORE, 94071244402688, 94071244537855, +STORE, 140737488347136, 140737488351231, +STORE, 140728271503360, 140737488351231, +ERASE, 140728271503360, 140728271503360, +STORE, 140728271503360, 140728271507455, +STORE, 94410361982976, 94410362155007, +ERASE, 94410361982976, 94410361982976, +STORE, 94410361982976, 94410361999359, +STORE, 94410361999360, 94410362155007, +ERASE, 94410361999360, 94410361999360, +STORE, 94410361999360, 94410362101759, +STORE, 94410362101760, 94410362142719, +STORE, 94410362142720, 94410362155007, +STORE, 140351953997824, 140351954169855, +ERASE, 140351953997824, 140351953997824, +STORE, 140351953997824, 140351954001919, +STORE, 140351954001920, 140351954169855, +ERASE, 140351954001920, 140351954001920, 
+STORE, 140351954001920, 140351954124799, +STORE, 140351954124800, 140351954157567, +STORE, 140351954157568, 140351954165759, +STORE, 140351954165760, 140351954169855, +STORE, 140728272429056, 140728272433151, +STORE, 140728272416768, 140728272429055, +STORE, 47280840998912, 47280841007103, +STORE, 47280841007104, 47280841015295, +STORE, 47280841015296, 47280841179135, +ERASE, 47280841015296, 47280841015296, +STORE, 47280841015296, 47280841027583, +STORE, 47280841027584, 47280841179135, +STORE, 47280841125888, 47280841179135, +STORE, 47280841027584, 47280841125887, +ERASE, 47280841027584, 47280841027584, +STORE, 47280841027584, 47280841125887, +STORE, 47280841170944, 47280841179135, +STORE, 47280841125888, 47280841170943, +ERASE, 47280841125888, 47280841125888, +STORE, 47280841125888, 47280841179135, +ERASE, 47280841125888, 47280841125888, +STORE, 47280841125888, 47280841170943, +STORE, 47280841170944, 47280841179135, +ERASE, 47280841170944, 47280841170944, +STORE, 47280841170944, 47280841179135, +STORE, 47280841179136, 47280844230655, +STORE, 47280841723904, 47280844230655, +STORE, 47280841179136, 47280841723903, +ERASE, 47280841723904, 47280841723904, +STORE, 47280841723904, 47280844009471, +STORE, 47280844009472, 47280844230655, +STORE, 47280843419648, 47280844009471, +STORE, 47280841723904, 47280843419647, +ERASE, 47280841723904, 47280841723904, +STORE, 47280841723904, 47280843419647, +STORE, 47280844005376, 47280844009471, +STORE, 47280843419648, 47280844005375, +ERASE, 47280843419648, 47280843419648, +STORE, 47280843419648, 47280844005375, +STORE, 47280844214272, 47280844230655, +STORE, 47280844009472, 47280844214271, +ERASE, 47280844009472, 47280844009472, +STORE, 47280844009472, 47280844214271, +ERASE, 47280844214272, 47280844214272, +STORE, 47280844214272, 47280844230655, +STORE, 47280844230656, 47280846069759, +STORE, 47280844369920, 47280846069759, +STORE, 47280844230656, 47280844369919, +ERASE, 47280844369920, 47280844369920, +STORE, 47280844369920, 
47280846028799, +STORE, 47280846028800, 47280846069759, +STORE, 47280845713408, 47280846028799, +STORE, 47280844369920, 47280845713407, +ERASE, 47280844369920, 47280844369920, +STORE, 47280844369920, 47280845713407, +STORE, 47280846024704, 47280846028799, +STORE, 47280845713408, 47280846024703, +ERASE, 47280845713408, 47280845713408, +STORE, 47280845713408, 47280846024703, +STORE, 47280846053376, 47280846069759, +STORE, 47280846028800, 47280846053375, +ERASE, 47280846028800, 47280846028800, +STORE, 47280846028800, 47280846053375, +ERASE, 47280846053376, 47280846053376, +STORE, 47280846053376, 47280846069759, +STORE, 47280846069760, 47280846204927, +ERASE, 47280846069760, 47280846069760, +STORE, 47280846069760, 47280846094335, +STORE, 47280846094336, 47280846204927, +STORE, 47280846155776, 47280846204927, +STORE, 47280846094336, 47280846155775, +ERASE, 47280846094336, 47280846094336, +STORE, 47280846094336, 47280846155775, +STORE, 47280846180352, 47280846204927, +STORE, 47280846155776, 47280846180351, +ERASE, 47280846155776, 47280846155776, +STORE, 47280846155776, 47280846204927, +ERASE, 47280846155776, 47280846155776, +STORE, 47280846155776, 47280846180351, +STORE, 47280846180352, 47280846204927, +STORE, 47280846188544, 47280846204927, +STORE, 47280846180352, 47280846188543, +ERASE, 47280846180352, 47280846180352, +STORE, 47280846180352, 47280846188543, +ERASE, 47280846188544, 47280846188544, +STORE, 47280846188544, 47280846204927, +STORE, 47280846204928, 47280846225407, +ERASE, 47280846204928, 47280846204928, +STORE, 47280846204928, 47280846209023, +STORE, 47280846209024, 47280846225407, +STORE, 47280846213120, 47280846225407, +STORE, 47280846209024, 47280846213119, +ERASE, 47280846209024, 47280846209024, +STORE, 47280846209024, 47280846213119, +STORE, 47280846217216, 47280846225407, +STORE, 47280846213120, 47280846217215, +ERASE, 47280846213120, 47280846213120, +STORE, 47280846213120, 47280846225407, +ERASE, 47280846213120, 47280846213120, +STORE, 47280846213120, 
47280846217215, +STORE, 47280846217216, 47280846225407, +ERASE, 47280846217216, 47280846217216, +STORE, 47280846217216, 47280846225407, +STORE, 47280846225408, 47280846233599, +ERASE, 47280846028800, 47280846028800, +STORE, 47280846028800, 47280846045183, +STORE, 47280846045184, 47280846053375, +ERASE, 47280846217216, 47280846217216, +STORE, 47280846217216, 47280846221311, +STORE, 47280846221312, 47280846225407, +ERASE, 47280846180352, 47280846180352, +STORE, 47280846180352, 47280846184447, +STORE, 47280846184448, 47280846188543, +ERASE, 47280844009472, 47280844009472, +STORE, 47280844009472, 47280844206079, +STORE, 47280844206080, 47280844214271, +ERASE, 47280841170944, 47280841170944, +STORE, 47280841170944, 47280841175039, +STORE, 47280841175040, 47280841179135, +ERASE, 94410362142720, 94410362142720, +STORE, 94410362142720, 94410362150911, +STORE, 94410362150912, 94410362155007, +ERASE, 140351954157568, 140351954157568, +STORE, 140351954157568, 140351954161663, +STORE, 140351954161664, 140351954165759, +ERASE, 47280840998912, 47280840998912, +STORE, 94410379456512, 94410379591679, +STORE, 140737488347136, 140737488351231, +STORE, 140732946362368, 140737488351231, +ERASE, 140732946362368, 140732946362368, +STORE, 140732946362368, 140732946366463, +STORE, 94352937934848, 94352938106879, +ERASE, 94352937934848, 94352937934848, +STORE, 94352937934848, 94352937951231, +STORE, 94352937951232, 94352938106879, +ERASE, 94352937951232, 94352937951232, +STORE, 94352937951232, 94352938053631, +STORE, 94352938053632, 94352938094591, +STORE, 94352938094592, 94352938106879, +STORE, 140595518742528, 140595518914559, +ERASE, 140595518742528, 140595518742528, +STORE, 140595518742528, 140595518746623, +STORE, 140595518746624, 140595518914559, +ERASE, 140595518746624, 140595518746624, +STORE, 140595518746624, 140595518869503, +STORE, 140595518869504, 140595518902271, +STORE, 140595518902272, 140595518910463, +STORE, 140595518910464, 140595518914559, +STORE, 140732947468288, 
140732947472383, +STORE, 140732947456000, 140732947468287, +STORE, 47037276254208, 47037276262399, +STORE, 47037276262400, 47037276270591, +STORE, 47037276270592, 47037276434431, +ERASE, 47037276270592, 47037276270592, +STORE, 47037276270592, 47037276282879, +STORE, 47037276282880, 47037276434431, +STORE, 47037276381184, 47037276434431, +STORE, 47037276282880, 47037276381183, +ERASE, 47037276282880, 47037276282880, +STORE, 47037276282880, 47037276381183, +STORE, 47037276426240, 47037276434431, +STORE, 47037276381184, 47037276426239, +ERASE, 47037276381184, 47037276381184, +STORE, 47037276381184, 47037276434431, +ERASE, 47037276381184, 47037276381184, +STORE, 47037276381184, 47037276426239, +STORE, 47037276426240, 47037276434431, +ERASE, 47037276426240, 47037276426240, +STORE, 47037276426240, 47037276434431, +STORE, 47037276434432, 47037279485951, +STORE, 47037276979200, 47037279485951, +STORE, 47037276434432, 47037276979199, +ERASE, 47037276979200, 47037276979200, +STORE, 47037276979200, 47037279264767, +STORE, 47037279264768, 47037279485951, +STORE, 47037278674944, 47037279264767, +STORE, 47037276979200, 47037278674943, +ERASE, 47037276979200, 47037276979200, +STORE, 47037276979200, 47037278674943, +STORE, 47037279260672, 47037279264767, +STORE, 47037278674944, 47037279260671, +ERASE, 47037278674944, 47037278674944, +STORE, 47037278674944, 47037279260671, +STORE, 47037279469568, 47037279485951, +STORE, 47037279264768, 47037279469567, +ERASE, 47037279264768, 47037279264768, +STORE, 47037279264768, 47037279469567, +ERASE, 47037279469568, 47037279469568, +STORE, 47037279469568, 47037279485951, +STORE, 47037279485952, 47037281325055, +STORE, 47037279625216, 47037281325055, +STORE, 47037279485952, 47037279625215, +ERASE, 47037279625216, 47037279625216, +STORE, 47037279625216, 47037281284095, +STORE, 47037281284096, 47037281325055, +STORE, 47037280968704, 47037281284095, +STORE, 47037279625216, 47037280968703, +ERASE, 47037279625216, 47037279625216, +STORE, 
47037279625216, 47037280968703, +STORE, 47037281280000, 47037281284095, +STORE, 47037280968704, 47037281279999, +ERASE, 47037280968704, 47037280968704, +STORE, 47037280968704, 47037281279999, +STORE, 47037281308672, 47037281325055, +STORE, 47037281284096, 47037281308671, +ERASE, 47037281284096, 47037281284096, +STORE, 47037281284096, 47037281308671, +ERASE, 47037281308672, 47037281308672, +STORE, 47037281308672, 47037281325055, +STORE, 47037281325056, 47037281460223, +ERASE, 47037281325056, 47037281325056, +STORE, 47037281325056, 47037281349631, +STORE, 47037281349632, 47037281460223, +STORE, 47037281411072, 47037281460223, +STORE, 47037281349632, 47037281411071, +ERASE, 47037281349632, 47037281349632, +STORE, 47037281349632, 47037281411071, +STORE, 47037281435648, 47037281460223, +STORE, 47037281411072, 47037281435647, +ERASE, 47037281411072, 47037281411072, +STORE, 47037281411072, 47037281460223, +ERASE, 47037281411072, 47037281411072, +STORE, 47037281411072, 47037281435647, +STORE, 47037281435648, 47037281460223, +STORE, 47037281443840, 47037281460223, +STORE, 47037281435648, 47037281443839, +ERASE, 47037281435648, 47037281435648, +STORE, 47037281435648, 47037281443839, +ERASE, 47037281443840, 47037281443840, +STORE, 47037281443840, 47037281460223, +STORE, 47037281460224, 47037281480703, +ERASE, 47037281460224, 47037281460224, +STORE, 47037281460224, 47037281464319, +STORE, 47037281464320, 47037281480703, +STORE, 47037281468416, 47037281480703, +STORE, 47037281464320, 47037281468415, +ERASE, 47037281464320, 47037281464320, +STORE, 47037281464320, 47037281468415, +STORE, 47037281472512, 47037281480703, +STORE, 47037281468416, 47037281472511, +ERASE, 47037281468416, 47037281468416, +STORE, 47037281468416, 47037281480703, +ERASE, 47037281468416, 47037281468416, +STORE, 47037281468416, 47037281472511, +STORE, 47037281472512, 47037281480703, +ERASE, 47037281472512, 47037281472512, +STORE, 47037281472512, 47037281480703, +STORE, 47037281480704, 47037281488895, +ERASE, 
47037281284096, 47037281284096, +STORE, 47037281284096, 47037281300479, +STORE, 47037281300480, 47037281308671, +ERASE, 47037281472512, 47037281472512, +STORE, 47037281472512, 47037281476607, +STORE, 47037281476608, 47037281480703, +ERASE, 47037281435648, 47037281435648, +STORE, 47037281435648, 47037281439743, +STORE, 47037281439744, 47037281443839, +ERASE, 47037279264768, 47037279264768, +STORE, 47037279264768, 47037279461375, +STORE, 47037279461376, 47037279469567, +ERASE, 47037276426240, 47037276426240, +STORE, 47037276426240, 47037276430335, +STORE, 47037276430336, 47037276434431, +ERASE, 94352938094592, 94352938094592, +STORE, 94352938094592, 94352938102783, +STORE, 94352938102784, 94352938106879, +ERASE, 140595518902272, 140595518902272, +STORE, 140595518902272, 140595518906367, +STORE, 140595518906368, 140595518910463, +ERASE, 47037276254208, 47037276254208, +STORE, 94352938438656, 94352938573823, +STORE, 140737488347136, 140737488351231, +STORE, 140733506027520, 140737488351231, +ERASE, 140733506027520, 140733506027520, +STORE, 140733506027520, 140733506031615, +STORE, 94150123073536, 94150123245567, +ERASE, 94150123073536, 94150123073536, +STORE, 94150123073536, 94150123089919, +STORE, 94150123089920, 94150123245567, +ERASE, 94150123089920, 94150123089920, +STORE, 94150123089920, 94150123192319, +STORE, 94150123192320, 94150123233279, +STORE, 94150123233280, 94150123245567, +STORE, 140081290375168, 140081290547199, +ERASE, 140081290375168, 140081290375168, +STORE, 140081290375168, 140081290379263, +STORE, 140081290379264, 140081290547199, +ERASE, 140081290379264, 140081290379264, +STORE, 140081290379264, 140081290502143, +STORE, 140081290502144, 140081290534911, +STORE, 140081290534912, 140081290543103, +STORE, 140081290543104, 140081290547199, +STORE, 140733506707456, 140733506711551, +STORE, 140733506695168, 140733506707455, +STORE, 47551504621568, 47551504629759, +STORE, 47551504629760, 47551504637951, +STORE, 47551504637952, 47551504801791, +ERASE, 
47551504637952, 47551504637952, +STORE, 47551504637952, 47551504650239, +STORE, 47551504650240, 47551504801791, +STORE, 47551504748544, 47551504801791, +STORE, 47551504650240, 47551504748543, +ERASE, 47551504650240, 47551504650240, +STORE, 47551504650240, 47551504748543, +STORE, 47551504793600, 47551504801791, +STORE, 47551504748544, 47551504793599, +ERASE, 47551504748544, 47551504748544, +STORE, 47551504748544, 47551504801791, +ERASE, 47551504748544, 47551504748544, +STORE, 47551504748544, 47551504793599, +STORE, 47551504793600, 47551504801791, +ERASE, 47551504793600, 47551504793600, +STORE, 47551504793600, 47551504801791, +STORE, 47551504801792, 47551507853311, +STORE, 47551505346560, 47551507853311, +STORE, 47551504801792, 47551505346559, +ERASE, 47551505346560, 47551505346560, +STORE, 47551505346560, 47551507632127, +STORE, 47551507632128, 47551507853311, +STORE, 47551507042304, 47551507632127, +STORE, 47551505346560, 47551507042303, +ERASE, 47551505346560, 47551505346560, +STORE, 47551505346560, 47551507042303, +STORE, 47551507628032, 47551507632127, +STORE, 47551507042304, 47551507628031, +ERASE, 47551507042304, 47551507042304, +STORE, 47551507042304, 47551507628031, +STORE, 47551507836928, 47551507853311, +STORE, 47551507632128, 47551507836927, +ERASE, 47551507632128, 47551507632128, +STORE, 47551507632128, 47551507836927, +ERASE, 47551507836928, 47551507836928, +STORE, 47551507836928, 47551507853311, +STORE, 47551507853312, 47551509692415, +STORE, 47551507992576, 47551509692415, +STORE, 47551507853312, 47551507992575, +ERASE, 47551507992576, 47551507992576, +STORE, 47551507992576, 47551509651455, +STORE, 47551509651456, 47551509692415, +STORE, 47551509336064, 47551509651455, +STORE, 47551507992576, 47551509336063, +ERASE, 47551507992576, 47551507992576, +STORE, 47551507992576, 47551509336063, +STORE, 47551509647360, 47551509651455, +STORE, 47551509336064, 47551509647359, +ERASE, 47551509336064, 47551509336064, +STORE, 47551509336064, 47551509647359, +STORE, 
47551509676032, 47551509692415, +STORE, 47551509651456, 47551509676031, +ERASE, 47551509651456, 47551509651456, +STORE, 47551509651456, 47551509676031, +ERASE, 47551509676032, 47551509676032, +STORE, 47551509676032, 47551509692415, +STORE, 47551509692416, 47551509827583, +ERASE, 47551509692416, 47551509692416, +STORE, 47551509692416, 47551509716991, +STORE, 47551509716992, 47551509827583, +STORE, 47551509778432, 47551509827583, +STORE, 47551509716992, 47551509778431, +ERASE, 47551509716992, 47551509716992, +STORE, 47551509716992, 47551509778431, +STORE, 47551509803008, 47551509827583, +STORE, 47551509778432, 47551509803007, +ERASE, 47551509778432, 47551509778432, +STORE, 47551509778432, 47551509827583, +ERASE, 47551509778432, 47551509778432, +STORE, 47551509778432, 47551509803007, +STORE, 47551509803008, 47551509827583, +STORE, 47551509811200, 47551509827583, +STORE, 47551509803008, 47551509811199, +ERASE, 47551509803008, 47551509803008, +STORE, 47551509803008, 47551509811199, +ERASE, 47551509811200, 47551509811200, +STORE, 47551509811200, 47551509827583, +STORE, 47551509827584, 47551509848063, +ERASE, 47551509827584, 47551509827584, +STORE, 47551509827584, 47551509831679, +STORE, 47551509831680, 47551509848063, +STORE, 47551509835776, 47551509848063, +STORE, 47551509831680, 47551509835775, +ERASE, 47551509831680, 47551509831680, +STORE, 47551509831680, 47551509835775, +STORE, 47551509839872, 47551509848063, +STORE, 47551509835776, 47551509839871, +ERASE, 47551509835776, 47551509835776, +STORE, 47551509835776, 47551509848063, +ERASE, 47551509835776, 47551509835776, +STORE, 47551509835776, 47551509839871, +STORE, 47551509839872, 47551509848063, +ERASE, 47551509839872, 47551509839872, +STORE, 47551509839872, 47551509848063, +STORE, 47551509848064, 47551509856255, +ERASE, 47551509651456, 47551509651456, +STORE, 47551509651456, 47551509667839, +STORE, 47551509667840, 47551509676031, +ERASE, 47551509839872, 47551509839872, +STORE, 47551509839872, 47551509843967, +STORE, 
47551509843968, 47551509848063, +ERASE, 47551509803008, 47551509803008, +STORE, 47551509803008, 47551509807103, +STORE, 47551509807104, 47551509811199, +ERASE, 47551507632128, 47551507632128, +STORE, 47551507632128, 47551507828735, +STORE, 47551507828736, 47551507836927, +ERASE, 47551504793600, 47551504793600, +STORE, 47551504793600, 47551504797695, +STORE, 47551504797696, 47551504801791, +ERASE, 94150123233280, 94150123233280, +STORE, 94150123233280, 94150123241471, +STORE, 94150123241472, 94150123245567, +ERASE, 140081290534912, 140081290534912, +STORE, 140081290534912, 140081290539007, +STORE, 140081290539008, 140081290543103, +ERASE, 47551504621568, 47551504621568, +STORE, 94150148112384, 94150148247551, +STORE, 140737488347136, 140737488351231, +STORE, 140734389334016, 140737488351231, +ERASE, 140734389334016, 140734389334016, +STORE, 140734389334016, 140734389338111, +STORE, 94844636606464, 94844636778495, +ERASE, 94844636606464, 94844636606464, +STORE, 94844636606464, 94844636622847, +STORE, 94844636622848, 94844636778495, +ERASE, 94844636622848, 94844636622848, +STORE, 94844636622848, 94844636725247, +STORE, 94844636725248, 94844636766207, +STORE, 94844636766208, 94844636778495, +STORE, 139922765217792, 139922765389823, +ERASE, 139922765217792, 139922765217792, +STORE, 139922765217792, 139922765221887, +STORE, 139922765221888, 139922765389823, +ERASE, 139922765221888, 139922765221888, +STORE, 139922765221888, 139922765344767, +STORE, 139922765344768, 139922765377535, +STORE, 139922765377536, 139922765385727, +STORE, 139922765385728, 139922765389823, +STORE, 140734389678080, 140734389682175, +STORE, 140734389665792, 140734389678079, +STORE, 47710029778944, 47710029787135, +STORE, 47710029787136, 47710029795327, +STORE, 47710029795328, 47710029959167, +ERASE, 47710029795328, 47710029795328, +STORE, 47710029795328, 47710029807615, +STORE, 47710029807616, 47710029959167, +STORE, 47710029905920, 47710029959167, +STORE, 47710029807616, 47710029905919, +ERASE, 
47710029807616, 47710029807616, +STORE, 47710029807616, 47710029905919, +STORE, 47710029950976, 47710029959167, +STORE, 47710029905920, 47710029950975, +ERASE, 47710029905920, 47710029905920, +STORE, 47710029905920, 47710029959167, +ERASE, 47710029905920, 47710029905920, +STORE, 47710029905920, 47710029950975, +STORE, 47710029950976, 47710029959167, +ERASE, 47710029950976, 47710029950976, +STORE, 47710029950976, 47710029959167, +STORE, 47710029959168, 47710033010687, +STORE, 47710030503936, 47710033010687, +STORE, 47710029959168, 47710030503935, +ERASE, 47710030503936, 47710030503936, +STORE, 47710030503936, 47710032789503, +STORE, 47710032789504, 47710033010687, +STORE, 47710032199680, 47710032789503, +STORE, 47710030503936, 47710032199679, +ERASE, 47710030503936, 47710030503936, +STORE, 47710030503936, 47710032199679, +STORE, 47710032785408, 47710032789503, +STORE, 47710032199680, 47710032785407, +ERASE, 47710032199680, 47710032199680, +STORE, 47710032199680, 47710032785407, +STORE, 47710032994304, 47710033010687, +STORE, 47710032789504, 47710032994303, +ERASE, 47710032789504, 47710032789504, +STORE, 47710032789504, 47710032994303, +ERASE, 47710032994304, 47710032994304, +STORE, 47710032994304, 47710033010687, +STORE, 47710033010688, 47710034849791, +STORE, 47710033149952, 47710034849791, +STORE, 47710033010688, 47710033149951, +ERASE, 47710033149952, 47710033149952, +STORE, 47710033149952, 47710034808831, +STORE, 47710034808832, 47710034849791, +STORE, 47710034493440, 47710034808831, +STORE, 47710033149952, 47710034493439, +ERASE, 47710033149952, 47710033149952, +STORE, 47710033149952, 47710034493439, +STORE, 47710034804736, 47710034808831, +STORE, 47710034493440, 47710034804735, +ERASE, 47710034493440, 47710034493440, +STORE, 47710034493440, 47710034804735, +STORE, 47710034833408, 47710034849791, +STORE, 47710034808832, 47710034833407, +ERASE, 47710034808832, 47710034808832, +STORE, 47710034808832, 47710034833407, +ERASE, 47710034833408, 47710034833408, +STORE, 
47710034833408, 47710034849791, +STORE, 47710034849792, 47710034984959, +ERASE, 47710034849792, 47710034849792, +STORE, 47710034849792, 47710034874367, +STORE, 47710034874368, 47710034984959, +STORE, 47710034935808, 47710034984959, +STORE, 47710034874368, 47710034935807, +ERASE, 47710034874368, 47710034874368, +STORE, 47710034874368, 47710034935807, +STORE, 47710034960384, 47710034984959, +STORE, 47710034935808, 47710034960383, +ERASE, 47710034935808, 47710034935808, +STORE, 47710034935808, 47710034984959, +ERASE, 47710034935808, 47710034935808, +STORE, 47710034935808, 47710034960383, +STORE, 47710034960384, 47710034984959, +STORE, 47710034968576, 47710034984959, +STORE, 47710034960384, 47710034968575, +ERASE, 47710034960384, 47710034960384, +STORE, 47710034960384, 47710034968575, +ERASE, 47710034968576, 47710034968576, +STORE, 47710034968576, 47710034984959, +STORE, 47710034984960, 47710035005439, +ERASE, 47710034984960, 47710034984960, +STORE, 47710034984960, 47710034989055, +STORE, 47710034989056, 47710035005439, +STORE, 47710034993152, 47710035005439, +STORE, 47710034989056, 47710034993151, +ERASE, 47710034989056, 47710034989056, +STORE, 47710034989056, 47710034993151, +STORE, 47710034997248, 47710035005439, +STORE, 47710034993152, 47710034997247, +ERASE, 47710034993152, 47710034993152, +STORE, 47710034993152, 47710035005439, +ERASE, 47710034993152, 47710034993152, +STORE, 47710034993152, 47710034997247, +STORE, 47710034997248, 47710035005439, +ERASE, 47710034997248, 47710034997248, +STORE, 47710034997248, 47710035005439, +STORE, 47710035005440, 47710035013631, +ERASE, 47710034808832, 47710034808832, +STORE, 47710034808832, 47710034825215, +STORE, 47710034825216, 47710034833407, +ERASE, 47710034997248, 47710034997248, +STORE, 47710034997248, 47710035001343, +STORE, 47710035001344, 47710035005439, +ERASE, 47710034960384, 47710034960384, +STORE, 47710034960384, 47710034964479, +STORE, 47710034964480, 47710034968575, +ERASE, 47710032789504, 47710032789504, +STORE, 
47710032789504, 47710032986111, +STORE, 47710032986112, 47710032994303, +ERASE, 47710029950976, 47710029950976, +STORE, 47710029950976, 47710029955071, +STORE, 47710029955072, 47710029959167, +ERASE, 94844636766208, 94844636766208, +STORE, 94844636766208, 94844636774399, +STORE, 94844636774400, 94844636778495, +ERASE, 139922765377536, 139922765377536, +STORE, 139922765377536, 139922765381631, +STORE, 139922765381632, 139922765385727, +ERASE, 47710029778944, 47710029778944, +STORE, 94844641775616, 94844641910783, +STORE, 140737488347136, 140737488351231, +STORE, 140732213886976, 140737488351231, +ERASE, 140732213886976, 140732213886976, +STORE, 140732213886976, 140732213891071, +STORE, 94240508887040, 94240509059071, +ERASE, 94240508887040, 94240508887040, +STORE, 94240508887040, 94240508903423, +STORE, 94240508903424, 94240509059071, +ERASE, 94240508903424, 94240508903424, +STORE, 94240508903424, 94240509005823, +STORE, 94240509005824, 94240509046783, +STORE, 94240509046784, 94240509059071, +STORE, 140275106516992, 140275106689023, +ERASE, 140275106516992, 140275106516992, +STORE, 140275106516992, 140275106521087, +STORE, 140275106521088, 140275106689023, +ERASE, 140275106521088, 140275106521088, +STORE, 140275106521088, 140275106643967, +STORE, 140275106643968, 140275106676735, +STORE, 140275106676736, 140275106684927, +STORE, 140275106684928, 140275106689023, +STORE, 140732213977088, 140732213981183, +STORE, 140732213964800, 140732213977087, +STORE, 47357688479744, 47357688487935, +STORE, 47357688487936, 47357688496127, +STORE, 47357688496128, 47357688659967, +ERASE, 47357688496128, 47357688496128, +STORE, 47357688496128, 47357688508415, +STORE, 47357688508416, 47357688659967, +STORE, 47357688606720, 47357688659967, +STORE, 47357688508416, 47357688606719, +ERASE, 47357688508416, 47357688508416, +STORE, 47357688508416, 47357688606719, +STORE, 47357688651776, 47357688659967, +STORE, 47357688606720, 47357688651775, +ERASE, 47357688606720, 47357688606720, +STORE, 
47357688606720, 47357688659967, +ERASE, 47357688606720, 47357688606720, +STORE, 47357688606720, 47357688651775, +STORE, 47357688651776, 47357688659967, +ERASE, 47357688651776, 47357688651776, +STORE, 47357688651776, 47357688659967, +STORE, 47357688659968, 47357691711487, +STORE, 47357689204736, 47357691711487, +STORE, 47357688659968, 47357689204735, +ERASE, 47357689204736, 47357689204736, +STORE, 47357689204736, 47357691490303, +STORE, 47357691490304, 47357691711487, +STORE, 47357690900480, 47357691490303, +STORE, 47357689204736, 47357690900479, +ERASE, 47357689204736, 47357689204736, +STORE, 47357689204736, 47357690900479, +STORE, 47357691486208, 47357691490303, +STORE, 47357690900480, 47357691486207, +ERASE, 47357690900480, 47357690900480, +STORE, 47357690900480, 47357691486207, +STORE, 47357691695104, 47357691711487, +STORE, 47357691490304, 47357691695103, +ERASE, 47357691490304, 47357691490304, +STORE, 47357691490304, 47357691695103, +ERASE, 47357691695104, 47357691695104, +STORE, 47357691695104, 47357691711487, +STORE, 47357691711488, 47357693550591, +STORE, 47357691850752, 47357693550591, +STORE, 47357691711488, 47357691850751, +ERASE, 47357691850752, 47357691850752, +STORE, 47357691850752, 47357693509631, +STORE, 47357693509632, 47357693550591, +STORE, 47357693194240, 47357693509631, +STORE, 47357691850752, 47357693194239, +ERASE, 47357691850752, 47357691850752, +STORE, 47357691850752, 47357693194239, +STORE, 47357693505536, 47357693509631, +STORE, 47357693194240, 47357693505535, +ERASE, 47357693194240, 47357693194240, +STORE, 47357693194240, 47357693505535, +STORE, 47357693534208, 47357693550591, +STORE, 47357693509632, 47357693534207, +ERASE, 47357693509632, 47357693509632, +STORE, 47357693509632, 47357693534207, +ERASE, 47357693534208, 47357693534208, +STORE, 47357693534208, 47357693550591, +STORE, 47357693550592, 47357693685759, +ERASE, 47357693550592, 47357693550592, +STORE, 47357693550592, 47357693575167, +STORE, 47357693575168, 47357693685759, +STORE, 
47357693636608, 47357693685759, +STORE, 47357693575168, 47357693636607, +ERASE, 47357693575168, 47357693575168, +STORE, 47357693575168, 47357693636607, +STORE, 47357693661184, 47357693685759, +STORE, 47357693636608, 47357693661183, +ERASE, 47357693636608, 47357693636608, +STORE, 47357693636608, 47357693685759, +ERASE, 47357693636608, 47357693636608, +STORE, 47357693636608, 47357693661183, +STORE, 47357693661184, 47357693685759, +STORE, 47357693669376, 47357693685759, +STORE, 47357693661184, 47357693669375, +ERASE, 47357693661184, 47357693661184, +STORE, 47357693661184, 47357693669375, +ERASE, 47357693669376, 47357693669376, +STORE, 47357693669376, 47357693685759, +STORE, 47357693685760, 47357693706239, +ERASE, 47357693685760, 47357693685760, +STORE, 47357693685760, 47357693689855, +STORE, 47357693689856, 47357693706239, +STORE, 47357693693952, 47357693706239, +STORE, 47357693689856, 47357693693951, +ERASE, 47357693689856, 47357693689856, +STORE, 47357693689856, 47357693693951, +STORE, 47357693698048, 47357693706239, +STORE, 47357693693952, 47357693698047, +ERASE, 47357693693952, 47357693693952, +STORE, 47357693693952, 47357693706239, +ERASE, 47357693693952, 47357693693952, +STORE, 47357693693952, 47357693698047, +STORE, 47357693698048, 47357693706239, +ERASE, 47357693698048, 47357693698048, +STORE, 47357693698048, 47357693706239, +STORE, 47357693706240, 47357693714431, +ERASE, 47357693509632, 47357693509632, +STORE, 47357693509632, 47357693526015, +STORE, 47357693526016, 47357693534207, +ERASE, 47357693698048, 47357693698048, +STORE, 47357693698048, 47357693702143, +STORE, 47357693702144, 47357693706239, +ERASE, 47357693661184, 47357693661184, +STORE, 47357693661184, 47357693665279, +STORE, 47357693665280, 47357693669375, +ERASE, 47357691490304, 47357691490304, +STORE, 47357691490304, 47357691686911, +STORE, 47357691686912, 47357691695103, +ERASE, 47357688651776, 47357688651776, +STORE, 47357688651776, 47357688655871, +STORE, 47357688655872, 47357688659967, +ERASE, 
94240509046784, 94240509046784, +STORE, 94240509046784, 94240509054975, +STORE, 94240509054976, 94240509059071, +ERASE, 140275106676736, 140275106676736, +STORE, 140275106676736, 140275106680831, +STORE, 140275106680832, 140275106684927, +ERASE, 47357688479744, 47357688479744, +STORE, 94240518361088, 94240518496255, +STORE, 140737488347136, 140737488351231, +STORE, 140732688277504, 140737488351231, +ERASE, 140732688277504, 140732688277504, +STORE, 140732688277504, 140732688281599, +STORE, 94629171351552, 94629172064255, +ERASE, 94629171351552, 94629171351552, +STORE, 94629171351552, 94629171400703, +STORE, 94629171400704, 94629172064255, +ERASE, 94629171400704, 94629171400704, +STORE, 94629171400704, 94629171945471, +STORE, 94629171945472, 94629172043775, +STORE, 94629172043776, 94629172064255, +STORE, 139770707644416, 139770707816447, +ERASE, 139770707644416, 139770707644416, +STORE, 139770707644416, 139770707648511, +STORE, 139770707648512, 139770707816447, +ERASE, 139770707648512, 139770707648512, +STORE, 139770707648512, 139770707771391, +STORE, 139770707771392, 139770707804159, +STORE, 139770707804160, 139770707812351, +STORE, 139770707812352, 139770707816447, +STORE, 140732689121280, 140732689125375, +STORE, 140732689108992, 140732689121279, +STORE, 47862087352320, 47862087360511, +STORE, 47862087360512, 47862087368703, +STORE, 47862087368704, 47862087475199, +STORE, 47862087385088, 47862087475199, +STORE, 47862087368704, 47862087385087, +ERASE, 47862087385088, 47862087385088, +STORE, 47862087385088, 47862087458815, +STORE, 47862087458816, 47862087475199, +STORE, 47862087438336, 47862087458815, +STORE, 47862087385088, 47862087438335, +ERASE, 47862087385088, 47862087385088, +STORE, 47862087385088, 47862087438335, +STORE, 47862087454720, 47862087458815, +STORE, 47862087438336, 47862087454719, +ERASE, 47862087438336, 47862087438336, +STORE, 47862087438336, 47862087454719, +STORE, 47862087467008, 47862087475199, +STORE, 47862087458816, 47862087467007, +ERASE, 
47862087458816, 47862087458816, +STORE, 47862087458816, 47862087467007, +ERASE, 47862087467008, 47862087467008, +STORE, 47862087467008, 47862087475199, +STORE, 47862087475200, 47862089314303, +STORE, 47862087614464, 47862089314303, +STORE, 47862087475200, 47862087614463, +ERASE, 47862087614464, 47862087614464, +STORE, 47862087614464, 47862089273343, +STORE, 47862089273344, 47862089314303, +STORE, 47862088957952, 47862089273343, +STORE, 47862087614464, 47862088957951, +ERASE, 47862087614464, 47862087614464, +STORE, 47862087614464, 47862088957951, +STORE, 47862089269248, 47862089273343, +STORE, 47862088957952, 47862089269247, +ERASE, 47862088957952, 47862088957952, +STORE, 47862088957952, 47862089269247, +STORE, 47862089297920, 47862089314303, +STORE, 47862089273344, 47862089297919, +ERASE, 47862089273344, 47862089273344, +STORE, 47862089273344, 47862089297919, +ERASE, 47862089297920, 47862089297920, +STORE, 47862089297920, 47862089314303, +STORE, 47862089297920, 47862089326591, +ERASE, 47862089273344, 47862089273344, +STORE, 47862089273344, 47862089289727, +STORE, 47862089289728, 47862089297919, +ERASE, 47862087458816, 47862087458816, +STORE, 47862087458816, 47862087462911, +STORE, 47862087462912, 47862087467007, +ERASE, 94629172043776, 94629172043776, +STORE, 94629172043776, 94629172060159, +STORE, 94629172060160, 94629172064255, +ERASE, 139770707804160, 139770707804160, +STORE, 139770707804160, 139770707808255, +STORE, 139770707808256, 139770707812351, +ERASE, 47862087352320, 47862087352320, +STORE, 94629197533184, 94629197668351, +STORE, 140737488347136, 140737488351231, +STORE, 140727540711424, 140737488351231, +ERASE, 140727540711424, 140727540711424, +STORE, 140727540711424, 140727540715519, +STORE, 94299865313280, 94299866025983, +ERASE, 94299865313280, 94299865313280, +STORE, 94299865313280, 94299865362431, +STORE, 94299865362432, 94299866025983, +ERASE, 94299865362432, 94299865362432, +STORE, 94299865362432, 94299865907199, +STORE, 94299865907200, 
94299866005503, +STORE, 94299866005504, 94299866025983, +STORE, 140680268763136, 140680268935167, +ERASE, 140680268763136, 140680268763136, +STORE, 140680268763136, 140680268767231, +STORE, 140680268767232, 140680268935167, +ERASE, 140680268767232, 140680268767232, +STORE, 140680268767232, 140680268890111, +STORE, 140680268890112, 140680268922879, +STORE, 140680268922880, 140680268931071, +STORE, 140680268931072, 140680268935167, +STORE, 140727541424128, 140727541428223, +STORE, 140727541411840, 140727541424127, +STORE, 46952526233600, 46952526241791, +STORE, 46952526241792, 46952526249983, +STORE, 46952526249984, 46952526356479, +STORE, 46952526266368, 46952526356479, +STORE, 46952526249984, 46952526266367, +ERASE, 46952526266368, 46952526266368, +STORE, 46952526266368, 46952526340095, +STORE, 46952526340096, 46952526356479, +STORE, 46952526319616, 46952526340095, +STORE, 46952526266368, 46952526319615, +ERASE, 46952526266368, 46952526266368, +STORE, 46952526266368, 46952526319615, +STORE, 46952526336000, 46952526340095, +STORE, 46952526319616, 46952526335999, +ERASE, 46952526319616, 46952526319616, +STORE, 46952526319616, 46952526335999, +STORE, 46952526348288, 46952526356479, +STORE, 46952526340096, 46952526348287, +ERASE, 46952526340096, 46952526340096, +STORE, 46952526340096, 46952526348287, +ERASE, 46952526348288, 46952526348288, +STORE, 46952526348288, 46952526356479, +STORE, 46952526356480, 46952528195583, +STORE, 46952526495744, 46952528195583, +STORE, 46952526356480, 46952526495743, +ERASE, 46952526495744, 46952526495744, +STORE, 46952526495744, 46952528154623, +STORE, 46952528154624, 46952528195583, +STORE, 46952527839232, 46952528154623, +STORE, 46952526495744, 46952527839231, +ERASE, 46952526495744, 46952526495744, +STORE, 46952526495744, 46952527839231, +STORE, 46952528150528, 46952528154623, +STORE, 46952527839232, 46952528150527, +ERASE, 46952527839232, 46952527839232, +STORE, 46952527839232, 46952528150527, +STORE, 46952528179200, 46952528195583, 
+STORE, 46952528154624, 46952528179199, +ERASE, 46952528154624, 46952528154624, +STORE, 46952528154624, 46952528179199, +ERASE, 46952528179200, 46952528179200, +STORE, 46952528179200, 46952528195583, +STORE, 46952528179200, 46952528207871, +ERASE, 46952528154624, 46952528154624, +STORE, 46952528154624, 46952528171007, +STORE, 46952528171008, 46952528179199, +ERASE, 46952526340096, 46952526340096, +STORE, 46952526340096, 46952526344191, +STORE, 46952526344192, 46952526348287, +ERASE, 94299866005504, 94299866005504, +STORE, 94299866005504, 94299866021887, +STORE, 94299866021888, 94299866025983, +ERASE, 140680268922880, 140680268922880, +STORE, 140680268922880, 140680268926975, +STORE, 140680268926976, 140680268931071, +ERASE, 46952526233600, 46952526233600, +STORE, 140737488347136, 140737488351231, +STORE, 140722874793984, 140737488351231, +ERASE, 140722874793984, 140722874793984, +STORE, 140722874793984, 140722874798079, +STORE, 94448916213760, 94448916926463, +ERASE, 94448916213760, 94448916213760, +STORE, 94448916213760, 94448916262911, +STORE, 94448916262912, 94448916926463, +ERASE, 94448916262912, 94448916262912, +STORE, 94448916262912, 94448916807679, +STORE, 94448916807680, 94448916905983, +STORE, 94448916905984, 94448916926463, +STORE, 140389117046784, 140389117218815, +ERASE, 140389117046784, 140389117046784, +STORE, 140389117046784, 140389117050879, +STORE, 140389117050880, 140389117218815, +ERASE, 140389117050880, 140389117050880, +STORE, 140389117050880, 140389117173759, +STORE, 140389117173760, 140389117206527, +STORE, 140389117206528, 140389117214719, +STORE, 140389117214720, 140389117218815, +STORE, 140722875297792, 140722875301887, +STORE, 140722875285504, 140722875297791, +STORE, 47243677949952, 47243677958143, +STORE, 47243677958144, 47243677966335, +STORE, 47243677966336, 47243678072831, +STORE, 47243677982720, 47243678072831, +STORE, 47243677966336, 47243677982719, +ERASE, 47243677982720, 47243677982720, +STORE, 47243677982720, 47243678056447, 
+STORE, 47243678056448, 47243678072831, +STORE, 47243678035968, 47243678056447, +STORE, 47243677982720, 47243678035967, +ERASE, 47243677982720, 47243677982720, +STORE, 47243677982720, 47243678035967, +STORE, 47243678052352, 47243678056447, +STORE, 47243678035968, 47243678052351, +ERASE, 47243678035968, 47243678035968, +STORE, 47243678035968, 47243678052351, +STORE, 47243678064640, 47243678072831, +STORE, 47243678056448, 47243678064639, +ERASE, 47243678056448, 47243678056448, +STORE, 47243678056448, 47243678064639, +ERASE, 47243678064640, 47243678064640, +STORE, 47243678064640, 47243678072831, +STORE, 47243678072832, 47243679911935, +STORE, 47243678212096, 47243679911935, +STORE, 47243678072832, 47243678212095, +ERASE, 47243678212096, 47243678212096, +STORE, 47243678212096, 47243679870975, +STORE, 47243679870976, 47243679911935, +STORE, 47243679555584, 47243679870975, +STORE, 47243678212096, 47243679555583, +ERASE, 47243678212096, 47243678212096, +STORE, 47243678212096, 47243679555583, +STORE, 47243679866880, 47243679870975, +STORE, 47243679555584, 47243679866879, +ERASE, 47243679555584, 47243679555584, +STORE, 47243679555584, 47243679866879, +STORE, 47243679895552, 47243679911935, +STORE, 47243679870976, 47243679895551, +ERASE, 47243679870976, 47243679870976, +STORE, 47243679870976, 47243679895551, +ERASE, 47243679895552, 47243679895552, +STORE, 47243679895552, 47243679911935, +STORE, 47243679895552, 47243679924223, +ERASE, 47243679870976, 47243679870976, +STORE, 47243679870976, 47243679887359, +STORE, 47243679887360, 47243679895551, +ERASE, 47243678056448, 47243678056448, +STORE, 47243678056448, 47243678060543, +STORE, 47243678060544, 47243678064639, +ERASE, 94448916905984, 94448916905984, +STORE, 94448916905984, 94448916922367, +STORE, 94448916922368, 94448916926463, +ERASE, 140389117206528, 140389117206528, +STORE, 140389117206528, 140389117210623, +STORE, 140389117210624, 140389117214719, +ERASE, 47243677949952, 47243677949952, +STORE, 140737488347136, 
140737488351231, +STORE, 140733068505088, 140737488351231, +ERASE, 140733068505088, 140733068505088, +STORE, 140733068505088, 140733068509183, +STORE, 94207145750528, 94207146463231, +ERASE, 94207145750528, 94207145750528, +STORE, 94207145750528, 94207145799679, +STORE, 94207145799680, 94207146463231, +ERASE, 94207145799680, 94207145799680, +STORE, 94207145799680, 94207146344447, +STORE, 94207146344448, 94207146442751, +STORE, 94207146442752, 94207146463231, +STORE, 140684504911872, 140684505083903, +ERASE, 140684504911872, 140684504911872, +STORE, 140684504911872, 140684504915967, +STORE, 140684504915968, 140684505083903, +ERASE, 140684504915968, 140684504915968, +STORE, 140684504915968, 140684505038847, +STORE, 140684505038848, 140684505071615, +STORE, 140684505071616, 140684505079807, +STORE, 140684505079808, 140684505083903, +STORE, 140733068607488, 140733068611583, +STORE, 140733068595200, 140733068607487, +STORE, 46948290084864, 46948290093055, +STORE, 46948290093056, 46948290101247, +STORE, 46948290101248, 46948290207743, +STORE, 46948290117632, 46948290207743, +STORE, 46948290101248, 46948290117631, +ERASE, 46948290117632, 46948290117632, +STORE, 46948290117632, 46948290191359, +STORE, 46948290191360, 46948290207743, +STORE, 46948290170880, 46948290191359, +STORE, 46948290117632, 46948290170879, +ERASE, 46948290117632, 46948290117632, +STORE, 46948290117632, 46948290170879, +STORE, 46948290187264, 46948290191359, +STORE, 46948290170880, 46948290187263, +ERASE, 46948290170880, 46948290170880, +STORE, 46948290170880, 46948290187263, +STORE, 46948290199552, 46948290207743, +STORE, 46948290191360, 46948290199551, +ERASE, 46948290191360, 46948290191360, +STORE, 46948290191360, 46948290199551, +ERASE, 46948290199552, 46948290199552, +STORE, 46948290199552, 46948290207743, +STORE, 46948290207744, 46948292046847, +STORE, 46948290347008, 46948292046847, +STORE, 46948290207744, 46948290347007, +ERASE, 46948290347008, 46948290347008, +STORE, 46948290347008, 
46948292005887, +STORE, 46948292005888, 46948292046847, +STORE, 46948291690496, 46948292005887, +STORE, 46948290347008, 46948291690495, +ERASE, 46948290347008, 46948290347008, +STORE, 46948290347008, 46948291690495, +STORE, 46948292001792, 46948292005887, +STORE, 46948291690496, 46948292001791, +ERASE, 46948291690496, 46948291690496, +STORE, 46948291690496, 46948292001791, +STORE, 46948292030464, 46948292046847, +STORE, 46948292005888, 46948292030463, +ERASE, 46948292005888, 46948292005888, +STORE, 46948292005888, 46948292030463, +ERASE, 46948292030464, 46948292030464, +STORE, 46948292030464, 46948292046847, +STORE, 46948292030464, 46948292059135, +ERASE, 46948292005888, 46948292005888, +STORE, 46948292005888, 46948292022271, +STORE, 46948292022272, 46948292030463, +ERASE, 46948290191360, 46948290191360, +STORE, 46948290191360, 46948290195455, +STORE, 46948290195456, 46948290199551, +ERASE, 94207146442752, 94207146442752, +STORE, 94207146442752, 94207146459135, +STORE, 94207146459136, 94207146463231, +ERASE, 140684505071616, 140684505071616, +STORE, 140684505071616, 140684505075711, +STORE, 140684505075712, 140684505079807, +ERASE, 46948290084864, 46948290084864, +STORE, 140737488347136, 140737488351231, +STORE, 140726367158272, 140737488351231, +ERASE, 140726367158272, 140726367158272, +STORE, 140726367158272, 140726367162367, +STORE, 94436124106752, 94436124819455, +ERASE, 94436124106752, 94436124106752, +STORE, 94436124106752, 94436124155903, +STORE, 94436124155904, 94436124819455, +ERASE, 94436124155904, 94436124155904, +STORE, 94436124155904, 94436124700671, +STORE, 94436124700672, 94436124798975, +STORE, 94436124798976, 94436124819455, +STORE, 140049025044480, 140049025216511, +ERASE, 140049025044480, 140049025044480, +STORE, 140049025044480, 140049025048575, +STORE, 140049025048576, 140049025216511, +ERASE, 140049025048576, 140049025048576, +STORE, 140049025048576, 140049025171455, +STORE, 140049025171456, 140049025204223, +STORE, 140049025204224, 
140049025212415, +STORE, 140049025212416, 140049025216511, +STORE, 140726367256576, 140726367260671, +STORE, 140726367244288, 140726367256575, +STORE, 47583769952256, 47583769960447, +STORE, 47583769960448, 47583769968639, +STORE, 47583769968640, 47583770075135, +STORE, 47583769985024, 47583770075135, +STORE, 47583769968640, 47583769985023, +ERASE, 47583769985024, 47583769985024, +STORE, 47583769985024, 47583770058751, +STORE, 47583770058752, 47583770075135, +STORE, 47583770038272, 47583770058751, +STORE, 47583769985024, 47583770038271, +ERASE, 47583769985024, 47583769985024, +STORE, 47583769985024, 47583770038271, +STORE, 47583770054656, 47583770058751, +STORE, 47583770038272, 47583770054655, +ERASE, 47583770038272, 47583770038272, +STORE, 47583770038272, 47583770054655, +STORE, 47583770066944, 47583770075135, +STORE, 47583770058752, 47583770066943, +ERASE, 47583770058752, 47583770058752, +STORE, 47583770058752, 47583770066943, +ERASE, 47583770066944, 47583770066944, +STORE, 47583770066944, 47583770075135, +STORE, 47583770075136, 47583771914239, +STORE, 47583770214400, 47583771914239, +STORE, 47583770075136, 47583770214399, +ERASE, 47583770214400, 47583770214400, +STORE, 47583770214400, 47583771873279, +STORE, 47583771873280, 47583771914239, +STORE, 47583771557888, 47583771873279, +STORE, 47583770214400, 47583771557887, +ERASE, 47583770214400, 47583770214400, +STORE, 47583770214400, 47583771557887, +STORE, 47583771869184, 47583771873279, +STORE, 47583771557888, 47583771869183, +ERASE, 47583771557888, 47583771557888, +STORE, 47583771557888, 47583771869183, +STORE, 47583771897856, 47583771914239, +STORE, 47583771873280, 47583771897855, +ERASE, 47583771873280, 47583771873280, +STORE, 47583771873280, 47583771897855, +ERASE, 47583771897856, 47583771897856, +STORE, 47583771897856, 47583771914239, +STORE, 47583771897856, 47583771926527, +ERASE, 47583771873280, 47583771873280, +STORE, 47583771873280, 47583771889663, +STORE, 47583771889664, 47583771897855, +ERASE, 
47583770058752, 47583770058752, +STORE, 47583770058752, 47583770062847, +STORE, 47583770062848, 47583770066943, +ERASE, 94436124798976, 94436124798976, +STORE, 94436124798976, 94436124815359, +STORE, 94436124815360, 94436124819455, +ERASE, 140049025204224, 140049025204224, +STORE, 140049025204224, 140049025208319, +STORE, 140049025208320, 140049025212415, +ERASE, 47583769952256, 47583769952256, +STORE, 140737488347136, 140737488351231, +STORE, 140727116099584, 140737488351231, +ERASE, 140727116099584, 140727116099584, +STORE, 140727116099584, 140727116103679, +STORE, 94166319734784, 94166320447487, +ERASE, 94166319734784, 94166319734784, +STORE, 94166319734784, 94166319783935, +STORE, 94166319783936, 94166320447487, +ERASE, 94166319783936, 94166319783936, +STORE, 94166319783936, 94166320328703, +STORE, 94166320328704, 94166320427007, +STORE, 94166320427008, 94166320447487, +STORE, 139976559542272, 139976559714303, +ERASE, 139976559542272, 139976559542272, +STORE, 139976559542272, 139976559546367, +STORE, 139976559546368, 139976559714303, +ERASE, 139976559546368, 139976559546368, +STORE, 139976559546368, 139976559669247, +STORE, 139976559669248, 139976559702015, +STORE, 139976559702016, 139976559710207, +STORE, 139976559710208, 139976559714303, +STORE, 140727116222464, 140727116226559, +STORE, 140727116210176, 140727116222463, +STORE, 47656235454464, 47656235462655, +STORE, 47656235462656, 47656235470847, +STORE, 47656235470848, 47656235577343, +STORE, 47656235487232, 47656235577343, +STORE, 47656235470848, 47656235487231, +ERASE, 47656235487232, 47656235487232, +STORE, 47656235487232, 47656235560959, +STORE, 47656235560960, 47656235577343, +STORE, 47656235540480, 47656235560959, +STORE, 47656235487232, 47656235540479, +ERASE, 47656235487232, 47656235487232, +STORE, 47656235487232, 47656235540479, +STORE, 47656235556864, 47656235560959, +STORE, 47656235540480, 47656235556863, +ERASE, 47656235540480, 47656235540480, +STORE, 47656235540480, 47656235556863, +STORE, 
47656235569152, 47656235577343, +STORE, 47656235560960, 47656235569151, +ERASE, 47656235560960, 47656235560960, +STORE, 47656235560960, 47656235569151, +ERASE, 47656235569152, 47656235569152, +STORE, 47656235569152, 47656235577343, +STORE, 47656235577344, 47656237416447, +STORE, 47656235716608, 47656237416447, +STORE, 47656235577344, 47656235716607, +ERASE, 47656235716608, 47656235716608, +STORE, 47656235716608, 47656237375487, +STORE, 47656237375488, 47656237416447, +STORE, 47656237060096, 47656237375487, +STORE, 47656235716608, 47656237060095, +ERASE, 47656235716608, 47656235716608, +STORE, 47656235716608, 47656237060095, +STORE, 47656237371392, 47656237375487, +STORE, 47656237060096, 47656237371391, +ERASE, 47656237060096, 47656237060096, +STORE, 47656237060096, 47656237371391, +STORE, 47656237400064, 47656237416447, +STORE, 47656237375488, 47656237400063, +ERASE, 47656237375488, 47656237375488, +STORE, 47656237375488, 47656237400063, +ERASE, 47656237400064, 47656237400064, +STORE, 47656237400064, 47656237416447, +STORE, 47656237400064, 47656237428735, +ERASE, 47656237375488, 47656237375488, +STORE, 47656237375488, 47656237391871, +STORE, 47656237391872, 47656237400063, +ERASE, 47656235560960, 47656235560960, +STORE, 47656235560960, 47656235565055, +STORE, 47656235565056, 47656235569151, +ERASE, 94166320427008, 94166320427008, +STORE, 94166320427008, 94166320443391, +STORE, 94166320443392, 94166320447487, +ERASE, 139976559702016, 139976559702016, +STORE, 139976559702016, 139976559706111, +STORE, 139976559706112, 139976559710207, +ERASE, 47656235454464, 47656235454464, +STORE, 94166332153856, 94166332289023, +STORE, 140737488347136, 140737488351231, +STORE, 140726412816384, 140737488351231, +ERASE, 140726412816384, 140726412816384, +STORE, 140726412816384, 140726412820479, +STORE, 94094884507648, 94094885220351, +ERASE, 94094884507648, 94094884507648, +STORE, 94094884507648, 94094884556799, +STORE, 94094884556800, 94094885220351, +ERASE, 94094884556800, 
94094884556800, +STORE, 94094884556800, 94094885101567, +STORE, 94094885101568, 94094885199871, +STORE, 94094885199872, 94094885220351, +STORE, 139773773938688, 139773774110719, +ERASE, 139773773938688, 139773773938688, +STORE, 139773773938688, 139773773942783, +STORE, 139773773942784, 139773774110719, +ERASE, 139773773942784, 139773773942784, +STORE, 139773773942784, 139773774065663, +STORE, 139773774065664, 139773774098431, +STORE, 139773774098432, 139773774106623, +STORE, 139773774106624, 139773774110719, +STORE, 140726412963840, 140726412967935, +STORE, 140726412951552, 140726412963839, +STORE, 47859021058048, 47859021066239, +STORE, 47859021066240, 47859021074431, +STORE, 47859021074432, 47859021180927, +STORE, 47859021090816, 47859021180927, +STORE, 47859021074432, 47859021090815, +ERASE, 47859021090816, 47859021090816, +STORE, 47859021090816, 47859021164543, +STORE, 47859021164544, 47859021180927, +STORE, 47859021144064, 47859021164543, +STORE, 47859021090816, 47859021144063, +ERASE, 47859021090816, 47859021090816, +STORE, 47859021090816, 47859021144063, +STORE, 47859021160448, 47859021164543, +STORE, 47859021144064, 47859021160447, +ERASE, 47859021144064, 47859021144064, +STORE, 47859021144064, 47859021160447, +STORE, 47859021172736, 47859021180927, +STORE, 47859021164544, 47859021172735, +ERASE, 47859021164544, 47859021164544, +STORE, 47859021164544, 47859021172735, +ERASE, 47859021172736, 47859021172736, +STORE, 47859021172736, 47859021180927, +STORE, 47859021180928, 47859023020031, +STORE, 47859021320192, 47859023020031, +STORE, 47859021180928, 47859021320191, +ERASE, 47859021320192, 47859021320192, +STORE, 47859021320192, 47859022979071, +STORE, 47859022979072, 47859023020031, +STORE, 47859022663680, 47859022979071, +STORE, 47859021320192, 47859022663679, +ERASE, 47859021320192, 47859021320192, +STORE, 47859021320192, 47859022663679, +STORE, 47859022974976, 47859022979071, +STORE, 47859022663680, 47859022974975, +ERASE, 47859022663680, 47859022663680, 
+STORE, 47859022663680, 47859022974975, +STORE, 47859023003648, 47859023020031, +STORE, 47859022979072, 47859023003647, +ERASE, 47859022979072, 47859022979072, +STORE, 47859022979072, 47859023003647, +ERASE, 47859023003648, 47859023003648, +STORE, 47859023003648, 47859023020031, +STORE, 47859023003648, 47859023032319, +ERASE, 47859022979072, 47859022979072, +STORE, 47859022979072, 47859022995455, +STORE, 47859022995456, 47859023003647, +ERASE, 47859021164544, 47859021164544, +STORE, 47859021164544, 47859021168639, +STORE, 47859021168640, 47859021172735, +ERASE, 94094885199872, 94094885199872, +STORE, 94094885199872, 94094885216255, +STORE, 94094885216256, 94094885220351, +ERASE, 139773774098432, 139773774098432, +STORE, 139773774098432, 139773774102527, +STORE, 139773774102528, 139773774106623, +ERASE, 47859021058048, 47859021058048, +STORE, 94094901108736, 94094901243903, +STORE, 140737488347136, 140737488351231, +STORE, 140736567963648, 140737488351231, +ERASE, 140736567963648, 140736567963648, +STORE, 140736567963648, 140736567967743, +STORE, 94924425748480, 94924426461183, +ERASE, 94924425748480, 94924425748480, +STORE, 94924425748480, 94924425797631, +STORE, 94924425797632, 94924426461183, +ERASE, 94924425797632, 94924425797632, +STORE, 94924425797632, 94924426342399, +STORE, 94924426342400, 94924426440703, +STORE, 94924426440704, 94924426461183, +STORE, 140042126319616, 140042126491647, +ERASE, 140042126319616, 140042126319616, +STORE, 140042126319616, 140042126323711, +STORE, 140042126323712, 140042126491647, +ERASE, 140042126323712, 140042126323712, +STORE, 140042126323712, 140042126446591, +STORE, 140042126446592, 140042126479359, +STORE, 140042126479360, 140042126487551, +STORE, 140042126487552, 140042126491647, +STORE, 140736568672256, 140736568676351, +STORE, 140736568659968, 140736568672255, +STORE, 47590668677120, 47590668685311, +STORE, 47590668685312, 47590668693503, +STORE, 47590668693504, 47590668799999, +STORE, 47590668709888, 47590668799999, 
+STORE, 47590668693504, 47590668709887, +ERASE, 47590668709888, 47590668709888, +STORE, 47590668709888, 47590668783615, +STORE, 47590668783616, 47590668799999, +STORE, 47590668763136, 47590668783615, +STORE, 47590668709888, 47590668763135, +ERASE, 47590668709888, 47590668709888, +STORE, 47590668709888, 47590668763135, +STORE, 47590668779520, 47590668783615, +STORE, 47590668763136, 47590668779519, +ERASE, 47590668763136, 47590668763136, +STORE, 47590668763136, 47590668779519, +STORE, 47590668791808, 47590668799999, +STORE, 47590668783616, 47590668791807, +ERASE, 47590668783616, 47590668783616, +STORE, 47590668783616, 47590668791807, +ERASE, 47590668791808, 47590668791808, +STORE, 47590668791808, 47590668799999, +STORE, 47590668800000, 47590670639103, +STORE, 47590668939264, 47590670639103, +STORE, 47590668800000, 47590668939263, +ERASE, 47590668939264, 47590668939264, +STORE, 47590668939264, 47590670598143, +STORE, 47590670598144, 47590670639103, +STORE, 47590670282752, 47590670598143, +STORE, 47590668939264, 47590670282751, +ERASE, 47590668939264, 47590668939264, +STORE, 47590668939264, 47590670282751, +STORE, 47590670594048, 47590670598143, +STORE, 47590670282752, 47590670594047, +ERASE, 47590670282752, 47590670282752, +STORE, 47590670282752, 47590670594047, +STORE, 47590670622720, 47590670639103, +STORE, 47590670598144, 47590670622719, +ERASE, 47590670598144, 47590670598144, +STORE, 47590670598144, 47590670622719, +ERASE, 47590670622720, 47590670622720, +STORE, 47590670622720, 47590670639103, +STORE, 47590670622720, 47590670651391, +ERASE, 47590670598144, 47590670598144, +STORE, 47590670598144, 47590670614527, +STORE, 47590670614528, 47590670622719, +ERASE, 47590668783616, 47590668783616, +STORE, 47590668783616, 47590668787711, +STORE, 47590668787712, 47590668791807, +ERASE, 94924426440704, 94924426440704, +STORE, 94924426440704, 94924426457087, +STORE, 94924426457088, 94924426461183, +ERASE, 140042126479360, 140042126479360, +STORE, 140042126479360, 
140042126483455, +STORE, 140042126483456, 140042126487551, +ERASE, 47590668677120, 47590668677120, +STORE, 140737488347136, 140737488351231, +STORE, 140733281439744, 140737488351231, +ERASE, 140733281439744, 140733281439744, +STORE, 140733281439744, 140733281443839, +STORE, 94490667069440, 94490667782143, +ERASE, 94490667069440, 94490667069440, +STORE, 94490667069440, 94490667118591, +STORE, 94490667118592, 94490667782143, +ERASE, 94490667118592, 94490667118592, +STORE, 94490667118592, 94490667663359, +STORE, 94490667663360, 94490667761663, +STORE, 94490667761664, 94490667782143, +STORE, 139878215118848, 139878215290879, +ERASE, 139878215118848, 139878215118848, +STORE, 139878215118848, 139878215122943, +STORE, 139878215122944, 139878215290879, +ERASE, 139878215122944, 139878215122944, +STORE, 139878215122944, 139878215245823, +STORE, 139878215245824, 139878215278591, +STORE, 139878215278592, 139878215286783, +STORE, 139878215286784, 139878215290879, +STORE, 140733281464320, 140733281468415, +STORE, 140733281452032, 140733281464319, +STORE, 47754579877888, 47754579886079, +STORE, 47754579886080, 47754579894271, +STORE, 47754579894272, 47754580000767, +STORE, 47754579910656, 47754580000767, +STORE, 47754579894272, 47754579910655, +ERASE, 47754579910656, 47754579910656, +STORE, 47754579910656, 47754579984383, +STORE, 47754579984384, 47754580000767, +STORE, 47754579963904, 47754579984383, +STORE, 47754579910656, 47754579963903, +ERASE, 47754579910656, 47754579910656, +STORE, 47754579910656, 47754579963903, +STORE, 47754579980288, 47754579984383, +STORE, 47754579963904, 47754579980287, +ERASE, 47754579963904, 47754579963904, +STORE, 47754579963904, 47754579980287, +STORE, 47754579992576, 47754580000767, +STORE, 47754579984384, 47754579992575, +ERASE, 47754579984384, 47754579984384, +STORE, 47754579984384, 47754579992575, +ERASE, 47754579992576, 47754579992576, +STORE, 47754579992576, 47754580000767, +STORE, 47754580000768, 47754581839871, +STORE, 47754580140032, 
47754581839871, +STORE, 47754580000768, 47754580140031, +ERASE, 47754580140032, 47754580140032, +STORE, 47754580140032, 47754581798911, +STORE, 47754581798912, 47754581839871, +STORE, 47754581483520, 47754581798911, +STORE, 47754580140032, 47754581483519, +ERASE, 47754580140032, 47754580140032, +STORE, 47754580140032, 47754581483519, +STORE, 47754581794816, 47754581798911, +STORE, 47754581483520, 47754581794815, +ERASE, 47754581483520, 47754581483520, +STORE, 47754581483520, 47754581794815, +STORE, 47754581823488, 47754581839871, +STORE, 47754581798912, 47754581823487, +ERASE, 47754581798912, 47754581798912, +STORE, 47754581798912, 47754581823487, +ERASE, 47754581823488, 47754581823488, +STORE, 47754581823488, 47754581839871, +STORE, 47754581823488, 47754581852159, +ERASE, 47754581798912, 47754581798912, +STORE, 47754581798912, 47754581815295, +STORE, 47754581815296, 47754581823487, +ERASE, 47754579984384, 47754579984384, +STORE, 47754579984384, 47754579988479, +STORE, 47754579988480, 47754579992575, +ERASE, 94490667761664, 94490667761664, +STORE, 94490667761664, 94490667778047, +STORE, 94490667778048, 94490667782143, +ERASE, 139878215278592, 139878215278592, +STORE, 139878215278592, 139878215282687, +STORE, 139878215282688, 139878215286783, +ERASE, 47754579877888, 47754579877888, +STORE, 94490669649920, 94490669785087, +STORE, 140737488347136, 140737488351231, +STORE, 140735382188032, 140737488351231, +ERASE, 140735382188032, 140735382188032, +STORE, 140735382188032, 140735382192127, +STORE, 94150181302272, 94150182014975, +ERASE, 94150181302272, 94150181302272, +STORE, 94150181302272, 94150181351423, +STORE, 94150181351424, 94150182014975, +ERASE, 94150181351424, 94150181351424, +STORE, 94150181351424, 94150181896191, +STORE, 94150181896192, 94150181994495, +STORE, 94150181994496, 94150182014975, +STORE, 139679752458240, 139679752630271, +ERASE, 139679752458240, 139679752458240, +STORE, 139679752458240, 139679752462335, +STORE, 139679752462336, 139679752630271, 
+ERASE, 139679752462336, 139679752462336, +STORE, 139679752462336, 139679752585215, +STORE, 139679752585216, 139679752617983, +STORE, 139679752617984, 139679752626175, +STORE, 139679752626176, 139679752630271, +STORE, 140735382536192, 140735382540287, +STORE, 140735382523904, 140735382536191, +STORE, 47953042538496, 47953042546687, +STORE, 47953042546688, 47953042554879, +STORE, 47953042554880, 47953042661375, +STORE, 47953042571264, 47953042661375, +STORE, 47953042554880, 47953042571263, +ERASE, 47953042571264, 47953042571264, +STORE, 47953042571264, 47953042644991, +STORE, 47953042644992, 47953042661375, +STORE, 47953042624512, 47953042644991, +STORE, 47953042571264, 47953042624511, +ERASE, 47953042571264, 47953042571264, +STORE, 47953042571264, 47953042624511, +STORE, 47953042640896, 47953042644991, +STORE, 47953042624512, 47953042640895, +ERASE, 47953042624512, 47953042624512, +STORE, 47953042624512, 47953042640895, +STORE, 47953042653184, 47953042661375, +STORE, 47953042644992, 47953042653183, +ERASE, 47953042644992, 47953042644992, +STORE, 47953042644992, 47953042653183, +ERASE, 47953042653184, 47953042653184, +STORE, 47953042653184, 47953042661375, +STORE, 47953042661376, 47953044500479, +STORE, 47953042800640, 47953044500479, +STORE, 47953042661376, 47953042800639, +ERASE, 47953042800640, 47953042800640, +STORE, 47953042800640, 47953044459519, +STORE, 47953044459520, 47953044500479, +STORE, 47953044144128, 47953044459519, +STORE, 47953042800640, 47953044144127, +ERASE, 47953042800640, 47953042800640, +STORE, 47953042800640, 47953044144127, +STORE, 47953044455424, 47953044459519, +STORE, 47953044144128, 47953044455423, +ERASE, 47953044144128, 47953044144128, +STORE, 47953044144128, 47953044455423, +STORE, 47953044484096, 47953044500479, +STORE, 47953044459520, 47953044484095, +ERASE, 47953044459520, 47953044459520, +STORE, 47953044459520, 47953044484095, +ERASE, 47953044484096, 47953044484096, +STORE, 47953044484096, 47953044500479, +STORE, 47953044484096, 
47953044512767, +ERASE, 47953044459520, 47953044459520, +STORE, 47953044459520, 47953044475903, +STORE, 47953044475904, 47953044484095, +ERASE, 47953042644992, 47953042644992, +STORE, 47953042644992, 47953042649087, +STORE, 47953042649088, 47953042653183, +ERASE, 94150181994496, 94150181994496, +STORE, 94150181994496, 94150182010879, +STORE, 94150182010880, 94150182014975, +ERASE, 139679752617984, 139679752617984, +STORE, 139679752617984, 139679752622079, +STORE, 139679752622080, 139679752626175, +ERASE, 47953042538496, 47953042538496, +STORE, 140737488347136, 140737488351231, +STORE, 140737044123648, 140737488351231, +ERASE, 140737044123648, 140737044123648, +STORE, 140737044123648, 140737044127743, +STORE, 94425324294144, 94425325006847, +ERASE, 94425324294144, 94425324294144, +STORE, 94425324294144, 94425324343295, +STORE, 94425324343296, 94425325006847, +ERASE, 94425324343296, 94425324343296, +STORE, 94425324343296, 94425324888063, +STORE, 94425324888064, 94425324986367, +STORE, 94425324986368, 94425325006847, +STORE, 140382015016960, 140382015188991, +ERASE, 140382015016960, 140382015016960, +STORE, 140382015016960, 140382015021055, +STORE, 140382015021056, 140382015188991, +ERASE, 140382015021056, 140382015021056, +STORE, 140382015021056, 140382015143935, +STORE, 140382015143936, 140382015176703, +STORE, 140382015176704, 140382015184895, +STORE, 140382015184896, 140382015188991, +STORE, 140737045585920, 140737045590015, +STORE, 140737045573632, 140737045585919, +STORE, 47250779979776, 47250779987967, +STORE, 47250779987968, 47250779996159, +STORE, 47250779996160, 47250780102655, +STORE, 47250780012544, 47250780102655, +STORE, 47250779996160, 47250780012543, +ERASE, 47250780012544, 47250780012544, +STORE, 47250780012544, 47250780086271, +STORE, 47250780086272, 47250780102655, +STORE, 47250780065792, 47250780086271, +STORE, 47250780012544, 47250780065791, +ERASE, 47250780012544, 47250780012544, +STORE, 47250780012544, 47250780065791, +STORE, 47250780082176, 
47250780086271, +STORE, 47250780065792, 47250780082175, +ERASE, 47250780065792, 47250780065792, +STORE, 47250780065792, 47250780082175, +STORE, 47250780094464, 47250780102655, +STORE, 47250780086272, 47250780094463, +ERASE, 47250780086272, 47250780086272, +STORE, 47250780086272, 47250780094463, +ERASE, 47250780094464, 47250780094464, +STORE, 47250780094464, 47250780102655, +STORE, 47250780102656, 47250781941759, +STORE, 47250780241920, 47250781941759, +STORE, 47250780102656, 47250780241919, +ERASE, 47250780241920, 47250780241920, +STORE, 47250780241920, 47250781900799, +STORE, 47250781900800, 47250781941759, +STORE, 47250781585408, 47250781900799, +STORE, 47250780241920, 47250781585407, +ERASE, 47250780241920, 47250780241920, +STORE, 47250780241920, 47250781585407, +STORE, 47250781896704, 47250781900799, +STORE, 47250781585408, 47250781896703, +ERASE, 47250781585408, 47250781585408, +STORE, 47250781585408, 47250781896703, +STORE, 47250781925376, 47250781941759, +STORE, 47250781900800, 47250781925375, +ERASE, 47250781900800, 47250781900800, +STORE, 47250781900800, 47250781925375, +ERASE, 47250781925376, 47250781925376, +STORE, 47250781925376, 47250781941759, +STORE, 47250781925376, 47250781954047, +ERASE, 47250781900800, 47250781900800, +STORE, 47250781900800, 47250781917183, +STORE, 47250781917184, 47250781925375, +ERASE, 47250780086272, 47250780086272, +STORE, 47250780086272, 47250780090367, +STORE, 47250780090368, 47250780094463, +ERASE, 94425324986368, 94425324986368, +STORE, 94425324986368, 94425325002751, +STORE, 94425325002752, 94425325006847, +ERASE, 140382015176704, 140382015176704, +STORE, 140382015176704, 140382015180799, +STORE, 140382015180800, 140382015184895, +ERASE, 47250779979776, 47250779979776, +STORE, 94425351438336, 94425351573503, +STORE, 140737488347136, 140737488351231, +STORE, 140736801144832, 140737488351231, +ERASE, 140736801144832, 140736801144832, +STORE, 140736801144832, 140736801148927, +STORE, 94629429358592, 94629430071295, +ERASE, 
94629429358592, 94629429358592, +STORE, 94629429358592, 94629429407743, +STORE, 94629429407744, 94629430071295, +ERASE, 94629429407744, 94629429407744, +STORE, 94629429407744, 94629429952511, +STORE, 94629429952512, 94629430050815, +STORE, 94629430050816, 94629430071295, +STORE, 139801685483520, 139801685655551, +ERASE, 139801685483520, 139801685483520, +STORE, 139801685483520, 139801685487615, +STORE, 139801685487616, 139801685655551, +ERASE, 139801685487616, 139801685487616, +STORE, 139801685487616, 139801685610495, +STORE, 139801685610496, 139801685643263, +STORE, 139801685643264, 139801685651455, +STORE, 139801685651456, 139801685655551, +STORE, 140736801198080, 140736801202175, +STORE, 140736801185792, 140736801198079, +STORE, 47831109513216, 47831109521407, +STORE, 47831109521408, 47831109529599, +STORE, 47831109529600, 47831109636095, +STORE, 47831109545984, 47831109636095, +STORE, 47831109529600, 47831109545983, +ERASE, 47831109545984, 47831109545984, +STORE, 47831109545984, 47831109619711, +STORE, 47831109619712, 47831109636095, +STORE, 47831109599232, 47831109619711, +STORE, 47831109545984, 47831109599231, +ERASE, 47831109545984, 47831109545984, +STORE, 47831109545984, 47831109599231, +STORE, 47831109615616, 47831109619711, +STORE, 47831109599232, 47831109615615, +ERASE, 47831109599232, 47831109599232, +STORE, 47831109599232, 47831109615615, +STORE, 47831109627904, 47831109636095, +STORE, 47831109619712, 47831109627903, +ERASE, 47831109619712, 47831109619712, +STORE, 47831109619712, 47831109627903, +ERASE, 47831109627904, 47831109627904, +STORE, 47831109627904, 47831109636095, +STORE, 47831109636096, 47831111475199, +STORE, 47831109775360, 47831111475199, +STORE, 47831109636096, 47831109775359, +ERASE, 47831109775360, 47831109775360, +STORE, 47831109775360, 47831111434239, +STORE, 47831111434240, 47831111475199, +STORE, 47831111118848, 47831111434239, +STORE, 47831109775360, 47831111118847, +ERASE, 47831109775360, 47831109775360, +STORE, 47831109775360, 
47831111118847, +STORE, 47831111430144, 47831111434239, +STORE, 47831111118848, 47831111430143, +ERASE, 47831111118848, 47831111118848, +STORE, 47831111118848, 47831111430143, +STORE, 47831111458816, 47831111475199, +STORE, 47831111434240, 47831111458815, +ERASE, 47831111434240, 47831111434240, +STORE, 47831111434240, 47831111458815, +ERASE, 47831111458816, 47831111458816, +STORE, 47831111458816, 47831111475199, +STORE, 47831111458816, 47831111487487, +ERASE, 47831111434240, 47831111434240, +STORE, 47831111434240, 47831111450623, +STORE, 47831111450624, 47831111458815, +ERASE, 47831109619712, 47831109619712, +STORE, 47831109619712, 47831109623807, +STORE, 47831109623808, 47831109627903, +ERASE, 94629430050816, 94629430050816, +STORE, 94629430050816, 94629430067199, +STORE, 94629430067200, 94629430071295, +ERASE, 139801685643264, 139801685643264, +STORE, 139801685643264, 139801685647359, +STORE, 139801685647360, 139801685651455, +ERASE, 47831109513216, 47831109513216, +STORE, 140737488347136, 140737488351231, +STORE, 140729419612160, 140737488351231, +ERASE, 140729419612160, 140729419612160, +STORE, 140729419612160, 140729419616255, +STORE, 94443354148864, 94443354861567, +ERASE, 94443354148864, 94443354148864, +STORE, 94443354148864, 94443354198015, +STORE, 94443354198016, 94443354861567, +ERASE, 94443354198016, 94443354198016, +STORE, 94443354198016, 94443354742783, +STORE, 94443354742784, 94443354841087, +STORE, 94443354841088, 94443354861567, +STORE, 139741700038656, 139741700210687, +ERASE, 139741700038656, 139741700038656, +STORE, 139741700038656, 139741700042751, +STORE, 139741700042752, 139741700210687, +ERASE, 139741700042752, 139741700042752, +STORE, 139741700042752, 139741700165631, +STORE, 139741700165632, 139741700198399, +STORE, 139741700198400, 139741700206591, +STORE, 139741700206592, 139741700210687, +STORE, 140729420574720, 140729420578815, +STORE, 140729420562432, 140729420574719, +STORE, 47891094958080, 47891094966271, +STORE, 47891094966272, 
47891094974463, +STORE, 47891094974464, 47891095080959, +STORE, 47891094990848, 47891095080959, +STORE, 47891094974464, 47891094990847, +ERASE, 47891094990848, 47891094990848, +STORE, 47891094990848, 47891095064575, +STORE, 47891095064576, 47891095080959, +STORE, 47891095044096, 47891095064575, +STORE, 47891094990848, 47891095044095, +ERASE, 47891094990848, 47891094990848, +STORE, 47891094990848, 47891095044095, +STORE, 47891095060480, 47891095064575, +STORE, 47891095044096, 47891095060479, +ERASE, 47891095044096, 47891095044096, +STORE, 47891095044096, 47891095060479, +STORE, 47891095072768, 47891095080959, +STORE, 47891095064576, 47891095072767, +ERASE, 47891095064576, 47891095064576, +STORE, 47891095064576, 47891095072767, +ERASE, 47891095072768, 47891095072768, +STORE, 47891095072768, 47891095080959, +STORE, 47891095080960, 47891096920063, +STORE, 47891095220224, 47891096920063, +STORE, 47891095080960, 47891095220223, +ERASE, 47891095220224, 47891095220224, +STORE, 47891095220224, 47891096879103, +STORE, 47891096879104, 47891096920063, +STORE, 47891096563712, 47891096879103, +STORE, 47891095220224, 47891096563711, +ERASE, 47891095220224, 47891095220224, +STORE, 47891095220224, 47891096563711, +STORE, 47891096875008, 47891096879103, +STORE, 47891096563712, 47891096875007, +ERASE, 47891096563712, 47891096563712, +STORE, 47891096563712, 47891096875007, +STORE, 47891096903680, 47891096920063, +STORE, 47891096879104, 47891096903679, +ERASE, 47891096879104, 47891096879104, +STORE, 47891096879104, 47891096903679, +ERASE, 47891096903680, 47891096903680, +STORE, 47891096903680, 47891096920063, +STORE, 47891096903680, 47891096932351, +ERASE, 47891096879104, 47891096879104, +STORE, 47891096879104, 47891096895487, +STORE, 47891096895488, 47891096903679, +ERASE, 47891095064576, 47891095064576, +STORE, 47891095064576, 47891095068671, +STORE, 47891095068672, 47891095072767, +ERASE, 94443354841088, 94443354841088, +STORE, 94443354841088, 94443354857471, +STORE, 94443354857472, 
94443354861567, +ERASE, 139741700198400, 139741700198400, +STORE, 139741700198400, 139741700202495, +STORE, 139741700202496, 139741700206591, +ERASE, 47891094958080, 47891094958080, +STORE, 94443360825344, 94443360960511, +STORE, 140737488347136, 140737488351231, +STORE, 140722961661952, 140737488351231, +ERASE, 140722961661952, 140722961661952, +STORE, 140722961661952, 140722961666047, +STORE, 94878388944896, 94878389657599, +ERASE, 94878388944896, 94878388944896, +STORE, 94878388944896, 94878388994047, +STORE, 94878388994048, 94878389657599, +ERASE, 94878388994048, 94878388994048, +STORE, 94878388994048, 94878389538815, +STORE, 94878389538816, 94878389637119, +STORE, 94878389637120, 94878389657599, +STORE, 140210690056192, 140210690228223, +ERASE, 140210690056192, 140210690056192, +STORE, 140210690056192, 140210690060287, +STORE, 140210690060288, 140210690228223, +ERASE, 140210690060288, 140210690060288, +STORE, 140210690060288, 140210690183167, +STORE, 140210690183168, 140210690215935, +STORE, 140210690215936, 140210690224127, +STORE, 140210690224128, 140210690228223, +STORE, 140722963148800, 140722963152895, +STORE, 140722963136512, 140722963148799, +STORE, 47422104940544, 47422104948735, +STORE, 47422104948736, 47422104956927, +STORE, 47422104956928, 47422105063423, +STORE, 47422104973312, 47422105063423, +STORE, 47422104956928, 47422104973311, +ERASE, 47422104973312, 47422104973312, +STORE, 47422104973312, 47422105047039, +STORE, 47422105047040, 47422105063423, +STORE, 47422105026560, 47422105047039, +STORE, 47422104973312, 47422105026559, +ERASE, 47422104973312, 47422104973312, +STORE, 47422104973312, 47422105026559, +STORE, 47422105042944, 47422105047039, +STORE, 47422105026560, 47422105042943, +ERASE, 47422105026560, 47422105026560, +STORE, 47422105026560, 47422105042943, +STORE, 47422105055232, 47422105063423, +STORE, 47422105047040, 47422105055231, +ERASE, 47422105047040, 47422105047040, +STORE, 47422105047040, 47422105055231, +ERASE, 47422105055232, 
47422105055232, +STORE, 47422105055232, 47422105063423, +STORE, 47422105063424, 47422106902527, +STORE, 47422105202688, 47422106902527, +STORE, 47422105063424, 47422105202687, +ERASE, 47422105202688, 47422105202688, +STORE, 47422105202688, 47422106861567, +STORE, 47422106861568, 47422106902527, +STORE, 47422106546176, 47422106861567, +STORE, 47422105202688, 47422106546175, +ERASE, 47422105202688, 47422105202688, +STORE, 47422105202688, 47422106546175, +STORE, 47422106857472, 47422106861567, +STORE, 47422106546176, 47422106857471, +ERASE, 47422106546176, 47422106546176, +STORE, 47422106546176, 47422106857471, +STORE, 47422106886144, 47422106902527, +STORE, 47422106861568, 47422106886143, +ERASE, 47422106861568, 47422106861568, +STORE, 47422106861568, 47422106886143, +ERASE, 47422106886144, 47422106886144, +STORE, 47422106886144, 47422106902527, +STORE, 47422106886144, 47422106914815, +ERASE, 47422106861568, 47422106861568, +STORE, 47422106861568, 47422106877951, +STORE, 47422106877952, 47422106886143, +ERASE, 47422105047040, 47422105047040, +STORE, 47422105047040, 47422105051135, +STORE, 47422105051136, 47422105055231, +ERASE, 94878389637120, 94878389637120, +STORE, 94878389637120, 94878389653503, +STORE, 94878389653504, 94878389657599, +ERASE, 140210690215936, 140210690215936, +STORE, 140210690215936, 140210690220031, +STORE, 140210690220032, 140210690224127, +ERASE, 47422104940544, 47422104940544, +STORE, 140737488347136, 140737488351231, +STORE, 140727690309632, 140737488351231, +ERASE, 140727690309632, 140727690309632, +STORE, 140727690309632, 140727690313727, +STORE, 94121892208640, 94121892921343, +ERASE, 94121892208640, 94121892208640, +STORE, 94121892208640, 94121892257791, +STORE, 94121892257792, 94121892921343, +ERASE, 94121892257792, 94121892257792, +STORE, 94121892257792, 94121892802559, +STORE, 94121892802560, 94121892900863, +STORE, 94121892900864, 94121892921343, +STORE, 140662438326272, 140662438498303, +ERASE, 140662438326272, 140662438326272, 
+STORE, 140662438326272, 140662438330367, +STORE, 140662438330368, 140662438498303, +ERASE, 140662438330368, 140662438330368, +STORE, 140662438330368, 140662438453247, +STORE, 140662438453248, 140662438486015, +STORE, 140662438486016, 140662438494207, +STORE, 140662438494208, 140662438498303, +STORE, 140727690379264, 140727690383359, +STORE, 140727690366976, 140727690379263, +STORE, 46970356670464, 46970356678655, +STORE, 46970356678656, 46970356686847, +STORE, 46970356686848, 46970356793343, +STORE, 46970356703232, 46970356793343, +STORE, 46970356686848, 46970356703231, +ERASE, 46970356703232, 46970356703232, +STORE, 46970356703232, 46970356776959, +STORE, 46970356776960, 46970356793343, +STORE, 46970356756480, 46970356776959, +STORE, 46970356703232, 46970356756479, +ERASE, 46970356703232, 46970356703232, +STORE, 46970356703232, 46970356756479, +STORE, 46970356772864, 46970356776959, +STORE, 46970356756480, 46970356772863, +ERASE, 46970356756480, 46970356756480, +STORE, 46970356756480, 46970356772863, +STORE, 46970356785152, 46970356793343, +STORE, 46970356776960, 46970356785151, +ERASE, 46970356776960, 46970356776960, +STORE, 46970356776960, 46970356785151, +ERASE, 46970356785152, 46970356785152, +STORE, 46970356785152, 46970356793343, +STORE, 46970356793344, 46970358632447, +STORE, 46970356932608, 46970358632447, +STORE, 46970356793344, 46970356932607, +ERASE, 46970356932608, 46970356932608, +STORE, 46970356932608, 46970358591487, +STORE, 46970358591488, 46970358632447, +STORE, 46970358276096, 46970358591487, +STORE, 46970356932608, 46970358276095, +ERASE, 46970356932608, 46970356932608, +STORE, 46970356932608, 46970358276095, +STORE, 46970358587392, 46970358591487, +STORE, 46970358276096, 46970358587391, +ERASE, 46970358276096, 46970358276096, +STORE, 46970358276096, 46970358587391, +STORE, 46970358616064, 46970358632447, +STORE, 46970358591488, 46970358616063, +ERASE, 46970358591488, 46970358591488, +STORE, 46970358591488, 46970358616063, +ERASE, 
46970358616064, 46970358616064, +STORE, 46970358616064, 46970358632447, +STORE, 46970358616064, 46970358644735, +ERASE, 46970358591488, 46970358591488, +STORE, 46970358591488, 46970358607871, +STORE, 46970358607872, 46970358616063, +ERASE, 46970356776960, 46970356776960, +STORE, 46970356776960, 46970356781055, +STORE, 46970356781056, 46970356785151, +ERASE, 94121892900864, 94121892900864, +STORE, 94121892900864, 94121892917247, +STORE, 94121892917248, 94121892921343, +ERASE, 140662438486016, 140662438486016, +STORE, 140662438486016, 140662438490111, +STORE, 140662438490112, 140662438494207, +ERASE, 46970356670464, 46970356670464, +STORE, 94121898610688, 94121898745855, +STORE, 140737488347136, 140737488351231, +STORE, 140737189351424, 140737488351231, +ERASE, 140737189351424, 140737189351424, +STORE, 140737189351424, 140737189355519, +STORE, 93847948832768, 93847949545471, +ERASE, 93847948832768, 93847948832768, +STORE, 93847948832768, 93847948881919, +STORE, 93847948881920, 93847949545471, +ERASE, 93847948881920, 93847948881920, +STORE, 93847948881920, 93847949426687, +STORE, 93847949426688, 93847949524991, +STORE, 93847949524992, 93847949545471, +STORE, 139698989985792, 139698990157823, +ERASE, 139698989985792, 139698989985792, +STORE, 139698989985792, 139698989989887, +STORE, 139698989989888, 139698990157823, +ERASE, 139698989989888, 139698989989888, +STORE, 139698989989888, 139698990112767, +STORE, 139698990112768, 139698990145535, +STORE, 139698990145536, 139698990153727, +STORE, 139698990153728, 139698990157823, +STORE, 140737189744640, 140737189748735, +STORE, 140737189732352, 140737189744639, +STORE, 47933805010944, 47933805019135, +STORE, 47933805019136, 47933805027327, +STORE, 47933805027328, 47933805133823, +STORE, 47933805043712, 47933805133823, +STORE, 47933805027328, 47933805043711, +ERASE, 47933805043712, 47933805043712, +STORE, 47933805043712, 47933805117439, +STORE, 47933805117440, 47933805133823, +STORE, 47933805096960, 47933805117439, +STORE, 
47933805043712, 47933805096959, +ERASE, 47933805043712, 47933805043712, +STORE, 47933805043712, 47933805096959, +STORE, 47933805113344, 47933805117439, +STORE, 47933805096960, 47933805113343, +ERASE, 47933805096960, 47933805096960, +STORE, 47933805096960, 47933805113343, +STORE, 47933805125632, 47933805133823, +STORE, 47933805117440, 47933805125631, +ERASE, 47933805117440, 47933805117440, +STORE, 47933805117440, 47933805125631, +ERASE, 47933805125632, 47933805125632, +STORE, 47933805125632, 47933805133823, +STORE, 47933805133824, 47933806972927, +STORE, 47933805273088, 47933806972927, +STORE, 47933805133824, 47933805273087, +ERASE, 47933805273088, 47933805273088, +STORE, 47933805273088, 47933806931967, +STORE, 47933806931968, 47933806972927, +STORE, 47933806616576, 47933806931967, +STORE, 47933805273088, 47933806616575, +ERASE, 47933805273088, 47933805273088, +STORE, 47933805273088, 47933806616575, +STORE, 47933806927872, 47933806931967, +STORE, 47933806616576, 47933806927871, +ERASE, 47933806616576, 47933806616576, +STORE, 47933806616576, 47933806927871, +STORE, 47933806956544, 47933806972927, +STORE, 47933806931968, 47933806956543, +ERASE, 47933806931968, 47933806931968, +STORE, 47933806931968, 47933806956543, +ERASE, 47933806956544, 47933806956544, +STORE, 47933806956544, 47933806972927, +STORE, 47933806956544, 47933806985215, +ERASE, 47933806931968, 47933806931968, +STORE, 47933806931968, 47933806948351, +STORE, 47933806948352, 47933806956543, +ERASE, 47933805117440, 47933805117440, +STORE, 47933805117440, 47933805121535, +STORE, 47933805121536, 47933805125631, +ERASE, 93847949524992, 93847949524992, +STORE, 93847949524992, 93847949541375, +STORE, 93847949541376, 93847949545471, +ERASE, 139698990145536, 139698990145536, +STORE, 139698990145536, 139698990149631, +STORE, 139698990149632, 139698990153727, +ERASE, 47933805010944, 47933805010944, +STORE, 140737488347136, 140737488351231, +STORE, 140725553991680, 140737488351231, +ERASE, 140725553991680, 
140725553991680, +STORE, 140725553991680, 140725553995775, +STORE, 93980056248320, 93980056961023, +ERASE, 93980056248320, 93980056248320, +STORE, 93980056248320, 93980056297471, +STORE, 93980056297472, 93980056961023, +ERASE, 93980056297472, 93980056297472, +STORE, 93980056297472, 93980056842239, +STORE, 93980056842240, 93980056940543, +STORE, 93980056940544, 93980056961023, +STORE, 140146588971008, 140146589143039, +ERASE, 140146588971008, 140146588971008, +STORE, 140146588971008, 140146588975103, +STORE, 140146588975104, 140146589143039, +ERASE, 140146588975104, 140146588975104, +STORE, 140146588975104, 140146589097983, +STORE, 140146589097984, 140146589130751, +STORE, 140146589130752, 140146589138943, +STORE, 140146589138944, 140146589143039, +STORE, 140725554860032, 140725554864127, +STORE, 140725554847744, 140725554860031, +STORE, 47486206025728, 47486206033919, +STORE, 47486206033920, 47486206042111, +STORE, 47486206042112, 47486206148607, +STORE, 47486206058496, 47486206148607, +STORE, 47486206042112, 47486206058495, +ERASE, 47486206058496, 47486206058496, +STORE, 47486206058496, 47486206132223, +STORE, 47486206132224, 47486206148607, +STORE, 47486206111744, 47486206132223, +STORE, 47486206058496, 47486206111743, +ERASE, 47486206058496, 47486206058496, +STORE, 47486206058496, 47486206111743, +STORE, 47486206128128, 47486206132223, +STORE, 47486206111744, 47486206128127, +ERASE, 47486206111744, 47486206111744, +STORE, 47486206111744, 47486206128127, +STORE, 47486206140416, 47486206148607, +STORE, 47486206132224, 47486206140415, +ERASE, 47486206132224, 47486206132224, +STORE, 47486206132224, 47486206140415, +ERASE, 47486206140416, 47486206140416, +STORE, 47486206140416, 47486206148607, +STORE, 47486206148608, 47486207987711, +STORE, 47486206287872, 47486207987711, +STORE, 47486206148608, 47486206287871, +ERASE, 47486206287872, 47486206287872, +STORE, 47486206287872, 47486207946751, +STORE, 47486207946752, 47486207987711, +STORE, 47486207631360, 
47486207946751, +STORE, 47486206287872, 47486207631359, +ERASE, 47486206287872, 47486206287872, +STORE, 47486206287872, 47486207631359, +STORE, 47486207942656, 47486207946751, +STORE, 47486207631360, 47486207942655, +ERASE, 47486207631360, 47486207631360, +STORE, 47486207631360, 47486207942655, +STORE, 47486207971328, 47486207987711, +STORE, 47486207946752, 47486207971327, +ERASE, 47486207946752, 47486207946752, +STORE, 47486207946752, 47486207971327, +ERASE, 47486207971328, 47486207971328, +STORE, 47486207971328, 47486207987711, +STORE, 47486207971328, 47486207999999, +ERASE, 47486207946752, 47486207946752, +STORE, 47486207946752, 47486207963135, +STORE, 47486207963136, 47486207971327, +ERASE, 47486206132224, 47486206132224, +STORE, 47486206132224, 47486206136319, +STORE, 47486206136320, 47486206140415, +ERASE, 93980056940544, 93980056940544, +STORE, 93980056940544, 93980056956927, +STORE, 93980056956928, 93980056961023, +ERASE, 140146589130752, 140146589130752, +STORE, 140146589130752, 140146589134847, +STORE, 140146589134848, 140146589138943, +ERASE, 47486206025728, 47486206025728, +STORE, 93980070006784, 93980070141951, +STORE, 140737488347136, 140737488351231, +STORE, 140727334776832, 140737488351231, +ERASE, 140727334776832, 140727334776832, +STORE, 140727334776832, 140727334780927, +STORE, 94049747247104, 94049747959807, +ERASE, 94049747247104, 94049747247104, +STORE, 94049747247104, 94049747296255, +STORE, 94049747296256, 94049747959807, +ERASE, 94049747296256, 94049747296256, +STORE, 94049747296256, 94049747841023, +STORE, 94049747841024, 94049747939327, +STORE, 94049747939328, 94049747959807, +STORE, 140227307216896, 140227307388927, +ERASE, 140227307216896, 140227307216896, +STORE, 140227307216896, 140227307220991, +STORE, 140227307220992, 140227307388927, +ERASE, 140227307220992, 140227307220992, +STORE, 140227307220992, 140227307343871, +STORE, 140227307343872, 140227307376639, +STORE, 140227307376640, 140227307384831, +STORE, 140227307384832, 
140227307388927, +STORE, 140727335337984, 140727335342079, +STORE, 140727335325696, 140727335337983, +STORE, 47405487779840, 47405487788031, +STORE, 47405487788032, 47405487796223, +STORE, 47405487796224, 47405487902719, +STORE, 47405487812608, 47405487902719, +STORE, 47405487796224, 47405487812607, +ERASE, 47405487812608, 47405487812608, +STORE, 47405487812608, 47405487886335, +STORE, 47405487886336, 47405487902719, +STORE, 47405487865856, 47405487886335, +STORE, 47405487812608, 47405487865855, +ERASE, 47405487812608, 47405487812608, +STORE, 47405487812608, 47405487865855, +STORE, 47405487882240, 47405487886335, +STORE, 47405487865856, 47405487882239, +ERASE, 47405487865856, 47405487865856, +STORE, 47405487865856, 47405487882239, +STORE, 47405487894528, 47405487902719, +STORE, 47405487886336, 47405487894527, +ERASE, 47405487886336, 47405487886336, +STORE, 47405487886336, 47405487894527, +ERASE, 47405487894528, 47405487894528, +STORE, 47405487894528, 47405487902719, +STORE, 47405487902720, 47405489741823, +STORE, 47405488041984, 47405489741823, +STORE, 47405487902720, 47405488041983, +ERASE, 47405488041984, 47405488041984, +STORE, 47405488041984, 47405489700863, +STORE, 47405489700864, 47405489741823, +STORE, 47405489385472, 47405489700863, +STORE, 47405488041984, 47405489385471, +ERASE, 47405488041984, 47405488041984, +STORE, 47405488041984, 47405489385471, +STORE, 47405489696768, 47405489700863, +STORE, 47405489385472, 47405489696767, +ERASE, 47405489385472, 47405489385472, +STORE, 47405489385472, 47405489696767, +STORE, 47405489725440, 47405489741823, +STORE, 47405489700864, 47405489725439, +ERASE, 47405489700864, 47405489700864, +STORE, 47405489700864, 47405489725439, +ERASE, 47405489725440, 47405489725440, +STORE, 47405489725440, 47405489741823, +STORE, 47405489725440, 47405489754111, +ERASE, 47405489700864, 47405489700864, +STORE, 47405489700864, 47405489717247, +STORE, 47405489717248, 47405489725439, +ERASE, 47405487886336, 47405487886336, +STORE, 
47405487886336, 47405487890431, +STORE, 47405487890432, 47405487894527, +ERASE, 94049747939328, 94049747939328, +STORE, 94049747939328, 94049747955711, +STORE, 94049747955712, 94049747959807, +ERASE, 140227307376640, 140227307376640, +STORE, 140227307376640, 140227307380735, +STORE, 140227307380736, 140227307384831, +ERASE, 47405487779840, 47405487779840, +STORE, 94049758810112, 94049758945279, +STORE, 140737488347136, 140737488351231, +STORE, 140727079718912, 140737488351231, +ERASE, 140727079718912, 140727079718912, +STORE, 140727079718912, 140727079723007, +STORE, 94250996527104, 94250997239807, +ERASE, 94250996527104, 94250996527104, +STORE, 94250996527104, 94250996576255, +STORE, 94250996576256, 94250997239807, +ERASE, 94250996576256, 94250996576256, +STORE, 94250996576256, 94250997121023, +STORE, 94250997121024, 94250997219327, +STORE, 94250997219328, 94250997239807, +STORE, 140060022587392, 140060022759423, +ERASE, 140060022587392, 140060022587392, +STORE, 140060022587392, 140060022591487, +STORE, 140060022591488, 140060022759423, +ERASE, 140060022591488, 140060022591488, +STORE, 140060022591488, 140060022714367, +STORE, 140060022714368, 140060022747135, +STORE, 140060022747136, 140060022755327, +STORE, 140060022755328, 140060022759423, +STORE, 140727079788544, 140727079792639, +STORE, 140727079776256, 140727079788543, +/* this next one caused issues when lowering the efficiency */ +STORE, 47572772409344, 47572772417535, +STORE, 47572772417536, 47572772425727, +STORE, 47572772425728, 47572772532223, +STORE, 47572772442112, 47572772532223, +STORE, 47572772425728, 47572772442111, +ERASE, 47572772442112, 47572772442112, +STORE, 47572772442112, 47572772515839, +STORE, 47572772515840, 47572772532223, +STORE, 47572772495360, 47572772515839, +STORE, 47572772442112, 47572772495359, +ERASE, 47572772442112, 47572772442112, +STORE, 47572772442112, 47572772495359, +STORE, 47572772511744, 47572772515839, +STORE, 47572772495360, 47572772511743, +ERASE, 47572772495360, 
47572772495360, +STORE, 47572772495360, 47572772511743, +STORE, 47572772524032, 47572772532223, +STORE, 47572772515840, 47572772524031, +ERASE, 47572772515840, 47572772515840, +STORE, 47572772515840, 47572772524031, +ERASE, 47572772524032, 47572772524032, +STORE, 47572772524032, 47572772532223, +STORE, 47572772532224, 47572774371327, +STORE, 47572772671488, 47572774371327, +STORE, 47572772532224, 47572772671487, +ERASE, 47572772671488, 47572772671488, +STORE, 47572772671488, 47572774330367, +STORE, 47572774330368, 47572774371327, +STORE, 47572774014976, 47572774330367, +STORE, 47572772671488, 47572774014975, +ERASE, 47572772671488, 47572772671488, +STORE, 47572772671488, 47572774014975, +STORE, 47572774326272, 47572774330367, +STORE, 47572774014976, 47572774326271, +ERASE, 47572774014976, 47572774014976, +STORE, 47572774014976, 47572774326271, +STORE, 47572774354944, 47572774371327, +STORE, 47572774330368, 47572774354943, +ERASE, 47572774330368, 47572774330368, +STORE, 47572774330368, 47572774354943, +ERASE, 47572774354944, 47572774354944, +STORE, 47572774354944, 47572774371327, +STORE, 47572774354944, 47572774383615, +ERASE, 47572774330368, 47572774330368, +STORE, 47572774330368, 47572774346751, +STORE, 47572774346752, 47572774354943, +ERASE, 47572772515840, 47572772515840, +STORE, 47572772515840, 47572772519935, +STORE, 47572772519936, 47572772524031, +ERASE, 94250997219328, 94250997219328, +STORE, 94250997219328, 94250997235711, +STORE, 94250997235712, 94250997239807, +ERASE, 140060022747136, 140060022747136, +STORE, 140060022747136, 140060022751231, +STORE, 140060022751232, 140060022755327, +ERASE, 47572772409344, 47572772409344, +STORE, 94251018305536, 94251018440703, +STORE, 140737488347136, 140737488351231, +STORE, 140730012389376, 140737488351231, +ERASE, 140730012389376, 140730012389376, +STORE, 140730012389376, 140730012393471, +STORE, 94382607675392, 94382607695871, +ERASE, 94382607675392, 94382607675392, +STORE, 94382607675392, 94382607679487, +STORE, 
94382607679488, 94382607695871, +ERASE, 94382607679488, 94382607679488, +STORE, 94382607679488, 94382607683583, +STORE, 94382607683584, 94382607687679, +STORE, 94382607687680, 94382607695871, +STORE, 140252451454976, 140252451627007, +ERASE, 140252451454976, 140252451454976, +STORE, 140252451454976, 140252451459071, +STORE, 140252451459072, 140252451627007, +ERASE, 140252451459072, 140252451459072, +STORE, 140252451459072, 140252451581951, +STORE, 140252451581952, 140252451614719, +STORE, 140252451614720, 140252451622911, +STORE, 140252451622912, 140252451627007, +STORE, 140730013548544, 140730013552639, +STORE, 140730013536256, 140730013548543, +STORE, 47380343541760, 47380343549951, +STORE, 47380343549952, 47380343558143, +STORE, 47380343558144, 47380345397247, +STORE, 47380343697408, 47380345397247, +STORE, 47380343558144, 47380343697407, +ERASE, 47380343697408, 47380343697408, +STORE, 47380343697408, 47380345356287, +STORE, 47380345356288, 47380345397247, +STORE, 47380345040896, 47380345356287, +STORE, 47380343697408, 47380345040895, +ERASE, 47380343697408, 47380343697408, +STORE, 47380343697408, 47380345040895, +STORE, 47380345352192, 47380345356287, +STORE, 47380345040896, 47380345352191, +ERASE, 47380345040896, 47380345040896, +STORE, 47380345040896, 47380345352191, +STORE, 47380345380864, 47380345397247, +STORE, 47380345356288, 47380345380863, +ERASE, 47380345356288, 47380345356288, +STORE, 47380345356288, 47380345380863, +ERASE, 47380345380864, 47380345380864, +STORE, 47380345380864, 47380345397247, +ERASE, 47380345356288, 47380345356288, +STORE, 47380345356288, 47380345372671, +STORE, 47380345372672, 47380345380863, +ERASE, 94382607687680, 94382607687680, +STORE, 94382607687680, 94382607691775, +STORE, 94382607691776, 94382607695871, +ERASE, 140252451614720, 140252451614720, +STORE, 140252451614720, 140252451618815, +STORE, 140252451618816, 140252451622911, +ERASE, 47380343541760, 47380343541760, +STORE, 94382626803712, 94382626938879, +STORE, 
140737488347136, 140737488351231, +STORE, 140730900271104, 140737488351231, +ERASE, 140730900271104, 140730900271104, +STORE, 140730900271104, 140730900275199, +STORE, 93855478120448, 93855478337535, +ERASE, 93855478120448, 93855478120448, +STORE, 93855478120448, 93855478198271, +STORE, 93855478198272, 93855478337535, +ERASE, 93855478198272, 93855478198272, +STORE, 93855478198272, 93855478243327, +STORE, 93855478243328, 93855478288383, +STORE, 93855478288384, 93855478337535, +STORE, 140092686573568, 140092686745599, +ERASE, 140092686573568, 140092686573568, +STORE, 140092686573568, 140092686577663, +STORE, 140092686577664, 140092686745599, +ERASE, 140092686577664, 140092686577664, +STORE, 140092686577664, 140092686700543, +STORE, 140092686700544, 140092686733311, +STORE, 140092686733312, 140092686741503, +STORE, 140092686741504, 140092686745599, +STORE, 140730900537344, 140730900541439, +STORE, 140730900525056, 140730900537343, +STORE, 47540108423168, 47540108431359, +STORE, 47540108431360, 47540108439551, +STORE, 47540108439552, 47540110278655, +STORE, 47540108578816, 47540110278655, +STORE, 47540108439552, 47540108578815, +ERASE, 47540108578816, 47540108578816, +STORE, 47540108578816, 47540110237695, +STORE, 47540110237696, 47540110278655, +STORE, 47540109922304, 47540110237695, +STORE, 47540108578816, 47540109922303, +ERASE, 47540108578816, 47540108578816, +STORE, 47540108578816, 47540109922303, +STORE, 47540110233600, 47540110237695, +STORE, 47540109922304, 47540110233599, +ERASE, 47540109922304, 47540109922304, +STORE, 47540109922304, 47540110233599, +STORE, 47540110262272, 47540110278655, +STORE, 47540110237696, 47540110262271, +ERASE, 47540110237696, 47540110237696, +STORE, 47540110237696, 47540110262271, +ERASE, 47540110262272, 47540110262272, +STORE, 47540110262272, 47540110278655, +ERASE, 47540110237696, 47540110237696, +STORE, 47540110237696, 47540110254079, +STORE, 47540110254080, 47540110262271, +ERASE, 93855478288384, 93855478288384, +STORE, 
93855478288384, 93855478333439, +STORE, 93855478333440, 93855478337535, +ERASE, 140092686733312, 140092686733312, +STORE, 140092686733312, 140092686737407, +STORE, 140092686737408, 140092686741503, +ERASE, 47540108423168, 47540108423168, +STORE, 93855492222976, 93855492358143, +STORE, 93855492222976, 93855492493311, +STORE, 140737488347136, 140737488351231, +STORE, 140733498146816, 140737488351231, +ERASE, 140733498146816, 140733498146816, +STORE, 140733498146816, 140733498150911, +STORE, 94170739654656, 94170740367359, +ERASE, 94170739654656, 94170739654656, +STORE, 94170739654656, 94170739703807, +STORE, 94170739703808, 94170740367359, +ERASE, 94170739703808, 94170739703808, +STORE, 94170739703808, 94170740248575, +STORE, 94170740248576, 94170740346879, +STORE, 94170740346880, 94170740367359, +STORE, 140024788877312, 140024789049343, +ERASE, 140024788877312, 140024788877312, +STORE, 140024788877312, 140024788881407, +STORE, 140024788881408, 140024789049343, +ERASE, 140024788881408, 140024788881408, +STORE, 140024788881408, 140024789004287, +STORE, 140024789004288, 140024789037055, +STORE, 140024789037056, 140024789045247, +STORE, 140024789045248, 140024789049343, +STORE, 140733499023360, 140733499027455, +STORE, 140733499011072, 140733499023359, +STORE, 47608006119424, 47608006127615, +STORE, 47608006127616, 47608006135807, +STORE, 47608006135808, 47608006242303, +STORE, 47608006152192, 47608006242303, +STORE, 47608006135808, 47608006152191, +ERASE, 47608006152192, 47608006152192, +STORE, 47608006152192, 47608006225919, +STORE, 47608006225920, 47608006242303, +STORE, 47608006205440, 47608006225919, +STORE, 47608006152192, 47608006205439, +ERASE, 47608006152192, 47608006152192, +STORE, 47608006152192, 47608006205439, +STORE, 47608006221824, 47608006225919, +STORE, 47608006205440, 47608006221823, +ERASE, 47608006205440, 47608006205440, +STORE, 47608006205440, 47608006221823, +STORE, 47608006234112, 47608006242303, +STORE, 47608006225920, 47608006234111, +ERASE, 
47608006225920, 47608006225920, +STORE, 47608006225920, 47608006234111, +ERASE, 47608006234112, 47608006234112, +STORE, 47608006234112, 47608006242303, +STORE, 47608006242304, 47608008081407, +STORE, 47608006381568, 47608008081407, +STORE, 47608006242304, 47608006381567, +ERASE, 47608006381568, 47608006381568, +STORE, 47608006381568, 47608008040447, +STORE, 47608008040448, 47608008081407, +STORE, 47608007725056, 47608008040447, +STORE, 47608006381568, 47608007725055, +ERASE, 47608006381568, 47608006381568, +STORE, 47608006381568, 47608007725055, +STORE, 47608008036352, 47608008040447, +STORE, 47608007725056, 47608008036351, +ERASE, 47608007725056, 47608007725056, +STORE, 47608007725056, 47608008036351, +STORE, 47608008065024, 47608008081407, +STORE, 47608008040448, 47608008065023, +ERASE, 47608008040448, 47608008040448, +STORE, 47608008040448, 47608008065023, +ERASE, 47608008065024, 47608008065024, +STORE, 47608008065024, 47608008081407, +STORE, 47608008065024, 47608008093695, +ERASE, 47608008040448, 47608008040448, +STORE, 47608008040448, 47608008056831, +STORE, 47608008056832, 47608008065023, +ERASE, 47608006225920, 47608006225920, +STORE, 47608006225920, 47608006230015, +STORE, 47608006230016, 47608006234111, +ERASE, 94170740346880, 94170740346880, +STORE, 94170740346880, 94170740363263, +STORE, 94170740363264, 94170740367359, +ERASE, 140024789037056, 140024789037056, +STORE, 140024789037056, 140024789041151, +STORE, 140024789041152, 140024789045247, +ERASE, 47608006119424, 47608006119424, +STORE, 140737488347136, 140737488351231, +STORE, 140730264326144, 140737488351231, +ERASE, 140730264326144, 140730264326144, +STORE, 140730264326144, 140730264330239, +STORE, 94653216407552, 94653217120255, +ERASE, 94653216407552, 94653216407552, +STORE, 94653216407552, 94653216456703, +STORE, 94653216456704, 94653217120255, +ERASE, 94653216456704, 94653216456704, +STORE, 94653216456704, 94653217001471, +STORE, 94653217001472, 94653217099775, +STORE, 94653217099776, 
94653217120255, +STORE, 140103617011712, 140103617183743, +ERASE, 140103617011712, 140103617011712, +STORE, 140103617011712, 140103617015807, +STORE, 140103617015808, 140103617183743, +ERASE, 140103617015808, 140103617015808, +STORE, 140103617015808, 140103617138687, +STORE, 140103617138688, 140103617171455, +STORE, 140103617171456, 140103617179647, +STORE, 140103617179648, 140103617183743, +STORE, 140730265427968, 140730265432063, +STORE, 140730265415680, 140730265427967, +STORE, 47529177985024, 47529177993215, +STORE, 47529177993216, 47529178001407, +STORE, 47529178001408, 47529178107903, +STORE, 47529178017792, 47529178107903, +STORE, 47529178001408, 47529178017791, +ERASE, 47529178017792, 47529178017792, +STORE, 47529178017792, 47529178091519, +STORE, 47529178091520, 47529178107903, +STORE, 47529178071040, 47529178091519, +STORE, 47529178017792, 47529178071039, +ERASE, 47529178017792, 47529178017792, +STORE, 47529178017792, 47529178071039, +STORE, 47529178087424, 47529178091519, +STORE, 47529178071040, 47529178087423, +ERASE, 47529178071040, 47529178071040, +STORE, 47529178071040, 47529178087423, +STORE, 47529178099712, 47529178107903, +STORE, 47529178091520, 47529178099711, +ERASE, 47529178091520, 47529178091520, +STORE, 47529178091520, 47529178099711, +ERASE, 47529178099712, 47529178099712, +STORE, 47529178099712, 47529178107903, +STORE, 47529178107904, 47529179947007, +STORE, 47529178247168, 47529179947007, +STORE, 47529178107904, 47529178247167, +ERASE, 47529178247168, 47529178247168, +STORE, 47529178247168, 47529179906047, +STORE, 47529179906048, 47529179947007, +STORE, 47529179590656, 47529179906047, +STORE, 47529178247168, 47529179590655, +ERASE, 47529178247168, 47529178247168, +STORE, 47529178247168, 47529179590655, +STORE, 47529179901952, 47529179906047, +STORE, 47529179590656, 47529179901951, +ERASE, 47529179590656, 47529179590656, +STORE, 47529179590656, 47529179901951, +STORE, 47529179930624, 47529179947007, +STORE, 47529179906048, 47529179930623, 
+ERASE, 47529179906048, 47529179906048, +STORE, 47529179906048, 47529179930623, +ERASE, 47529179930624, 47529179930624, +STORE, 47529179930624, 47529179947007, +STORE, 47529179930624, 47529179959295, +ERASE, 47529179906048, 47529179906048, +STORE, 47529179906048, 47529179922431, +STORE, 47529179922432, 47529179930623, +ERASE, 47529178091520, 47529178091520, +STORE, 47529178091520, 47529178095615, +STORE, 47529178095616, 47529178099711, +ERASE, 94653217099776, 94653217099776, +STORE, 94653217099776, 94653217116159, +STORE, 94653217116160, 94653217120255, +ERASE, 140103617171456, 140103617171456, +STORE, 140103617171456, 140103617175551, +STORE, 140103617175552, 140103617179647, +ERASE, 47529177985024, 47529177985024, +STORE, 94653241135104, 94653241270271, +STORE, 140737488347136, 140737488351231, +STORE, 140736284549120, 140737488351231, +ERASE, 140736284549120, 140736284549120, +STORE, 140736284549120, 140736284553215, +STORE, 93963663822848, 93963664506879, +ERASE, 93963663822848, 93963663822848, +STORE, 93963663822848, 93963663884287, +STORE, 93963663884288, 93963664506879, +ERASE, 93963663884288, 93963663884288, +STORE, 93963663884288, 93963664240639, +STORE, 93963664240640, 93963664379903, +STORE, 93963664379904, 93963664506879, +STORE, 140450188439552, 140450188611583, +ERASE, 140450188439552, 140450188439552, +STORE, 140450188439552, 140450188443647, +STORE, 140450188443648, 140450188611583, +ERASE, 140450188443648, 140450188443648, +STORE, 140450188443648, 140450188566527, +STORE, 140450188566528, 140450188599295, +STORE, 140450188599296, 140450188607487, +STORE, 140450188607488, 140450188611583, +STORE, 140736284577792, 140736284581887, +STORE, 140736284565504, 140736284577791, +STORE, 47182606557184, 47182606565375, +STORE, 47182606565376, 47182606573567, +STORE, 47182606573568, 47182608412671, +STORE, 47182606712832, 47182608412671, +STORE, 47182606573568, 47182606712831, +ERASE, 47182606712832, 47182606712832, +STORE, 47182606712832, 47182608371711, 
+STORE, 47182608371712, 47182608412671, +STORE, 47182608056320, 47182608371711, +STORE, 47182606712832, 47182608056319, +ERASE, 47182606712832, 47182606712832, +STORE, 47182606712832, 47182608056319, +STORE, 47182608367616, 47182608371711, +STORE, 47182608056320, 47182608367615, +ERASE, 47182608056320, 47182608056320, +STORE, 47182608056320, 47182608367615, +STORE, 47182608396288, 47182608412671, +STORE, 47182608371712, 47182608396287, +ERASE, 47182608371712, 47182608371712, +STORE, 47182608371712, 47182608396287, +ERASE, 47182608396288, 47182608396288, +STORE, 47182608396288, 47182608412671, +STORE, 47182608412672, 47182608523263, +STORE, 47182608429056, 47182608523263, +STORE, 47182608412672, 47182608429055, +ERASE, 47182608429056, 47182608429056, +STORE, 47182608429056, 47182608515071, +STORE, 47182608515072, 47182608523263, +STORE, 47182608490496, 47182608515071, +STORE, 47182608429056, 47182608490495, +ERASE, 47182608429056, 47182608429056, +STORE, 47182608429056, 47182608490495, +STORE, 47182608510976, 47182608515071, +STORE, 47182608490496, 47182608510975, +ERASE, 47182608490496, 47182608490496, +STORE, 47182608490496, 47182608510975, +ERASE, 47182608515072, 47182608515072, +STORE, 47182608515072, 47182608523263, +STORE, 47182608523264, 47182608568319, +ERASE, 47182608523264, 47182608523264, +STORE, 47182608523264, 47182608531455, +STORE, 47182608531456, 47182608568319, +STORE, 47182608551936, 47182608568319, +STORE, 47182608531456, 47182608551935, +ERASE, 47182608531456, 47182608531456, +STORE, 47182608531456, 47182608551935, +STORE, 47182608560128, 47182608568319, +STORE, 47182608551936, 47182608560127, +ERASE, 47182608551936, 47182608551936, +STORE, 47182608551936, 47182608568319, +ERASE, 47182608551936, 47182608551936, +STORE, 47182608551936, 47182608560127, +STORE, 47182608560128, 47182608568319, +ERASE, 47182608560128, 47182608560128, +STORE, 47182608560128, 47182608568319, +STORE, 47182608568320, 47182608916479, +STORE, 47182608609280, 47182608916479, 
+STORE, 47182608568320, 47182608609279, +ERASE, 47182608609280, 47182608609280, +STORE, 47182608609280, 47182608891903, +STORE, 47182608891904, 47182608916479, +STORE, 47182608822272, 47182608891903, +STORE, 47182608609280, 47182608822271, +ERASE, 47182608609280, 47182608609280, +STORE, 47182608609280, 47182608822271, +STORE, 47182608887808, 47182608891903, +STORE, 47182608822272, 47182608887807, +ERASE, 47182608822272, 47182608822272, +STORE, 47182608822272, 47182608887807, +ERASE, 47182608891904, 47182608891904, +STORE, 47182608891904, 47182608916479, +STORE, 47182608916480, 47182611177471, +STORE, 47182609068032, 47182611177471, +STORE, 47182608916480, 47182609068031, +ERASE, 47182609068032, 47182609068032, +STORE, 47182609068032, 47182611161087, +STORE, 47182611161088, 47182611177471, +STORE, 47182611169280, 47182611177471, +STORE, 47182611161088, 47182611169279, +ERASE, 47182611161088, 47182611161088, +STORE, 47182611161088, 47182611169279, +ERASE, 47182611169280, 47182611169280, +STORE, 47182611169280, 47182611177471, +STORE, 47182611177472, 47182611312639, +ERASE, 47182611177472, 47182611177472, +STORE, 47182611177472, 47182611202047, +STORE, 47182611202048, 47182611312639, +STORE, 47182611263488, 47182611312639, +STORE, 47182611202048, 47182611263487, +ERASE, 47182611202048, 47182611202048, +STORE, 47182611202048, 47182611263487, +STORE, 47182611288064, 47182611312639, +STORE, 47182611263488, 47182611288063, +ERASE, 47182611263488, 47182611263488, +STORE, 47182611263488, 47182611312639, +ERASE, 47182611263488, 47182611263488, +STORE, 47182611263488, 47182611288063, +STORE, 47182611288064, 47182611312639, +STORE, 47182611296256, 47182611312639, +STORE, 47182611288064, 47182611296255, +ERASE, 47182611288064, 47182611288064, +STORE, 47182611288064, 47182611296255, +ERASE, 47182611296256, 47182611296256, +STORE, 47182611296256, 47182611312639, +STORE, 47182611296256, 47182611320831, +STORE, 47182611320832, 47182611484671, +ERASE, 47182611320832, 47182611320832, 
+STORE, 47182611320832, 47182611333119, +STORE, 47182611333120, 47182611484671, +STORE, 47182611431424, 47182611484671, +STORE, 47182611333120, 47182611431423, +ERASE, 47182611333120, 47182611333120, +STORE, 47182611333120, 47182611431423, +STORE, 47182611476480, 47182611484671, +STORE, 47182611431424, 47182611476479, +ERASE, 47182611431424, 47182611431424, +STORE, 47182611431424, 47182611484671, +ERASE, 47182611431424, 47182611431424, +STORE, 47182611431424, 47182611476479, +STORE, 47182611476480, 47182611484671, +ERASE, 47182611476480, 47182611476480, +STORE, 47182611476480, 47182611484671, +STORE, 47182611484672, 47182612082687, +STORE, 47182611603456, 47182612082687, +STORE, 47182611484672, 47182611603455, +ERASE, 47182611603456, 47182611603456, +STORE, 47182611603456, 47182612029439, +STORE, 47182612029440, 47182612082687, +STORE, 47182611918848, 47182612029439, +STORE, 47182611603456, 47182611918847, +ERASE, 47182611603456, 47182611603456, +STORE, 47182611603456, 47182611918847, +STORE, 47182612025344, 47182612029439, +STORE, 47182611918848, 47182612025343, +ERASE, 47182611918848, 47182611918848, +STORE, 47182611918848, 47182612025343, +ERASE, 47182612029440, 47182612029440, +STORE, 47182612029440, 47182612082687, +STORE, 47182612082688, 47182615134207, +STORE, 47182612627456, 47182615134207, +STORE, 47182612082688, 47182612627455, +ERASE, 47182612627456, 47182612627456, +STORE, 47182612627456, 47182614913023, +STORE, 47182614913024, 47182615134207, +STORE, 47182614323200, 47182614913023, +STORE, 47182612627456, 47182614323199, +ERASE, 47182612627456, 47182612627456, +STORE, 47182612627456, 47182614323199, +STORE, 47182614908928, 47182614913023, +STORE, 47182614323200, 47182614908927, +ERASE, 47182614323200, 47182614323200, +STORE, 47182614323200, 47182614908927, +STORE, 47182615117824, 47182615134207, +STORE, 47182614913024, 47182615117823, +ERASE, 47182614913024, 47182614913024, +STORE, 47182614913024, 47182615117823, +ERASE, 47182615117824, 47182615117824, 
+STORE, 47182615117824, 47182615134207, +STORE, 47182615134208, 47182615166975, +ERASE, 47182615134208, 47182615134208, +STORE, 47182615134208, 47182615142399, +STORE, 47182615142400, 47182615166975, +STORE, 47182615154688, 47182615166975, +STORE, 47182615142400, 47182615154687, +ERASE, 47182615142400, 47182615142400, +STORE, 47182615142400, 47182615154687, +STORE, 47182615158784, 47182615166975, +STORE, 47182615154688, 47182615158783, +ERASE, 47182615154688, 47182615154688, +STORE, 47182615154688, 47182615166975, +ERASE, 47182615154688, 47182615154688, +STORE, 47182615154688, 47182615158783, +STORE, 47182615158784, 47182615166975, +ERASE, 47182615158784, 47182615158784, +STORE, 47182615158784, 47182615166975, +STORE, 47182615166976, 47182615203839, +ERASE, 47182615166976, 47182615166976, +STORE, 47182615166976, 47182615175167, +STORE, 47182615175168, 47182615203839, +STORE, 47182615191552, 47182615203839, +STORE, 47182615175168, 47182615191551, +ERASE, 47182615175168, 47182615175168, +STORE, 47182615175168, 47182615191551, +STORE, 47182615195648, 47182615203839, +STORE, 47182615191552, 47182615195647, +ERASE, 47182615191552, 47182615191552, +STORE, 47182615191552, 47182615203839, +ERASE, 47182615191552, 47182615191552, +STORE, 47182615191552, 47182615195647, +STORE, 47182615195648, 47182615203839, +ERASE, 47182615195648, 47182615195648, +STORE, 47182615195648, 47182615203839, +STORE, 47182615203840, 47182615678975, +ERASE, 47182615203840, 47182615203840, +STORE, 47182615203840, 47182615212031, +STORE, 47182615212032, 47182615678975, +STORE, 47182615547904, 47182615678975, +STORE, 47182615212032, 47182615547903, +ERASE, 47182615212032, 47182615212032, +STORE, 47182615212032, 47182615547903, +STORE, 47182615670784, 47182615678975, +STORE, 47182615547904, 47182615670783, +ERASE, 47182615547904, 47182615547904, +STORE, 47182615547904, 47182615678975, +ERASE, 47182615547904, 47182615547904, +STORE, 47182615547904, 47182615670783, +STORE, 47182615670784, 47182615678975, 
+ERASE, 47182615670784, 47182615670784, +STORE, 47182615670784, 47182615678975, +STORE, 47182615678976, 47182615687167, +STORE, 47182615687168, 47182615707647, +ERASE, 47182615687168, 47182615687168, +STORE, 47182615687168, 47182615691263, +STORE, 47182615691264, 47182615707647, +STORE, 47182615695360, 47182615707647, +STORE, 47182615691264, 47182615695359, +ERASE, 47182615691264, 47182615691264, +STORE, 47182615691264, 47182615695359, +STORE, 47182615699456, 47182615707647, +STORE, 47182615695360, 47182615699455, +ERASE, 47182615695360, 47182615695360, +STORE, 47182615695360, 47182615707647, +ERASE, 47182615695360, 47182615695360, +STORE, 47182615695360, 47182615699455, +STORE, 47182615699456, 47182615707647, +ERASE, 47182615699456, 47182615699456, +STORE, 47182615699456, 47182615707647, +STORE, 47182615707648, 47182615715839, +ERASE, 47182608371712, 47182608371712, +STORE, 47182608371712, 47182608388095, +STORE, 47182608388096, 47182608396287, +ERASE, 47182615699456, 47182615699456, +STORE, 47182615699456, 47182615703551, +STORE, 47182615703552, 47182615707647, +ERASE, 47182611288064, 47182611288064, +STORE, 47182611288064, 47182611292159, +STORE, 47182611292160, 47182611296255, +ERASE, 47182615670784, 47182615670784, +STORE, 47182615670784, 47182615674879, +STORE, 47182615674880, 47182615678975, +ERASE, 47182615195648, 47182615195648, +STORE, 47182615195648, 47182615199743, +STORE, 47182615199744, 47182615203839, +ERASE, 47182615158784, 47182615158784, +STORE, 47182615158784, 47182615162879, +STORE, 47182615162880, 47182615166975, +ERASE, 47182614913024, 47182614913024, +STORE, 47182614913024, 47182615109631, +STORE, 47182615109632, 47182615117823, +ERASE, 47182612029440, 47182612029440, +STORE, 47182612029440, 47182612066303, +STORE, 47182612066304, 47182612082687, +ERASE, 47182611476480, 47182611476480, +STORE, 47182611476480, 47182611480575, +STORE, 47182611480576, 47182611484671, +ERASE, 47182611161088, 47182611161088, +STORE, 47182611161088, 47182611165183, 
+STORE, 47182611165184, 47182611169279, +ERASE, 47182608891904, 47182608891904, +STORE, 47182608891904, 47182608912383, +STORE, 47182608912384, 47182608916479, +ERASE, 47182608560128, 47182608560128, +STORE, 47182608560128, 47182608564223, +STORE, 47182608564224, 47182608568319, +ERASE, 47182608515072, 47182608515072, +STORE, 47182608515072, 47182608519167, +STORE, 47182608519168, 47182608523263, +ERASE, 93963664379904, 93963664379904, +STORE, 93963664379904, 93963664502783, +STORE, 93963664502784, 93963664506879, +ERASE, 140450188599296, 140450188599296, +STORE, 140450188599296, 140450188603391, +STORE, 140450188603392, 140450188607487, +ERASE, 47182606557184, 47182606557184, +STORE, 93963694723072, 93963694858239, +STORE, 140737488347136, 140737488351231, +STORE, 140730313261056, 140737488351231, +ERASE, 140730313261056, 140730313261056, +STORE, 140730313261056, 140730313265151, +STORE, 94386579017728, 94386579697663, +ERASE, 94386579017728, 94386579017728, +STORE, 94386579017728, 94386579083263, +STORE, 94386579083264, 94386579697663, +ERASE, 94386579083264, 94386579083264, +STORE, 94386579083264, 94386579431423, +STORE, 94386579431424, 94386579570687, +STORE, 94386579570688, 94386579697663, +STORE, 140124810838016, 140124811010047, +ERASE, 140124810838016, 140124810838016, +STORE, 140124810838016, 140124810842111, +STORE, 140124810842112, 140124811010047, +ERASE, 140124810842112, 140124810842112, +STORE, 140124810842112, 140124810964991, +STORE, 140124810964992, 140124810997759, +STORE, 140124810997760, 140124811005951, +STORE, 140124811005952, 140124811010047, +STORE, 140730313601024, 140730313605119, +STORE, 140730313588736, 140730313601023, +STORE, 47507984158720, 47507984166911, +STORE, 47507984166912, 47507984175103, +STORE, 47507984175104, 47507986014207, +STORE, 47507984314368, 47507986014207, +STORE, 47507984175104, 47507984314367, +ERASE, 47507984314368, 47507984314368, +STORE, 47507984314368, 47507985973247, +STORE, 47507985973248, 47507986014207, 
+STORE, 47507985657856, 47507985973247, +STORE, 47507984314368, 47507985657855, +ERASE, 47507984314368, 47507984314368, +STORE, 47507984314368, 47507985657855, +STORE, 47507985969152, 47507985973247, +STORE, 47507985657856, 47507985969151, +ERASE, 47507985657856, 47507985657856, +STORE, 47507985657856, 47507985969151, +STORE, 47507985997824, 47507986014207, +STORE, 47507985973248, 47507985997823, +ERASE, 47507985973248, 47507985973248, +STORE, 47507985973248, 47507985997823, +ERASE, 47507985997824, 47507985997824, +STORE, 47507985997824, 47507986014207, +STORE, 47507986014208, 47507986124799, +STORE, 47507986030592, 47507986124799, +STORE, 47507986014208, 47507986030591, +ERASE, 47507986030592, 47507986030592, +STORE, 47507986030592, 47507986116607, +STORE, 47507986116608, 47507986124799, +STORE, 47507986092032, 47507986116607, +STORE, 47507986030592, 47507986092031, +ERASE, 47507986030592, 47507986030592, +STORE, 47507986030592, 47507986092031, +STORE, 47507986112512, 47507986116607, +STORE, 47507986092032, 47507986112511, +ERASE, 47507986092032, 47507986092032, +STORE, 47507986092032, 47507986112511, +ERASE, 47507986116608, 47507986116608, +STORE, 47507986116608, 47507986124799, +STORE, 47507986124800, 47507986169855, +ERASE, 47507986124800, 47507986124800, +STORE, 47507986124800, 47507986132991, +STORE, 47507986132992, 47507986169855, +STORE, 47507986153472, 47507986169855, +STORE, 47507986132992, 47507986153471, +ERASE, 47507986132992, 47507986132992, +STORE, 47507986132992, 47507986153471, +STORE, 47507986161664, 47507986169855, +STORE, 47507986153472, 47507986161663, +ERASE, 47507986153472, 47507986153472, +STORE, 47507986153472, 47507986169855, +ERASE, 47507986153472, 47507986153472, +STORE, 47507986153472, 47507986161663, +STORE, 47507986161664, 47507986169855, +ERASE, 47507986161664, 47507986161664, +STORE, 47507986161664, 47507986169855, +STORE, 47507986169856, 47507986518015, +STORE, 47507986210816, 47507986518015, +STORE, 47507986169856, 47507986210815, 
+ERASE, 47507986210816, 47507986210816, +STORE, 47507986210816, 47507986493439, +STORE, 47507986493440, 47507986518015, +STORE, 47507986423808, 47507986493439, +STORE, 47507986210816, 47507986423807, +ERASE, 47507986210816, 47507986210816, +STORE, 47507986210816, 47507986423807, +STORE, 47507986489344, 47507986493439, +STORE, 47507986423808, 47507986489343, +ERASE, 47507986423808, 47507986423808, +STORE, 47507986423808, 47507986489343, +ERASE, 47507986493440, 47507986493440, +STORE, 47507986493440, 47507986518015, +STORE, 47507986518016, 47507988779007, +STORE, 47507986669568, 47507988779007, +STORE, 47507986518016, 47507986669567, +ERASE, 47507986669568, 47507986669568, +STORE, 47507986669568, 47507988762623, +STORE, 47507988762624, 47507988779007, +STORE, 47507988770816, 47507988779007, +STORE, 47507988762624, 47507988770815, +ERASE, 47507988762624, 47507988762624, +STORE, 47507988762624, 47507988770815, +ERASE, 47507988770816, 47507988770816, +STORE, 47507988770816, 47507988779007, +STORE, 47507988779008, 47507988914175, +ERASE, 47507988779008, 47507988779008, +STORE, 47507988779008, 47507988803583, +STORE, 47507988803584, 47507988914175, +STORE, 47507988865024, 47507988914175, +STORE, 47507988803584, 47507988865023, +ERASE, 47507988803584, 47507988803584, +STORE, 47507988803584, 47507988865023, +STORE, 47507988889600, 47507988914175, +STORE, 47507988865024, 47507988889599, +ERASE, 47507988865024, 47507988865024, +STORE, 47507988865024, 47507988914175, +ERASE, 47507988865024, 47507988865024, +STORE, 47507988865024, 47507988889599, +STORE, 47507988889600, 47507988914175, +STORE, 47507988897792, 47507988914175, +STORE, 47507988889600, 47507988897791, +ERASE, 47507988889600, 47507988889600, +STORE, 47507988889600, 47507988897791, +ERASE, 47507988897792, 47507988897792, +STORE, 47507988897792, 47507988914175, +STORE, 47507988897792, 47507988922367, +STORE, 47507988922368, 47507989086207, +ERASE, 47507988922368, 47507988922368, +STORE, 47507988922368, 47507988934655, 
+STORE, 47507988934656, 47507989086207, +STORE, 47507989032960, 47507989086207, +STORE, 47507988934656, 47507989032959, +ERASE, 47507988934656, 47507988934656, +STORE, 47507988934656, 47507989032959, +STORE, 47507989078016, 47507989086207, +STORE, 47507989032960, 47507989078015, +ERASE, 47507989032960, 47507989032960, +STORE, 47507989032960, 47507989086207, +ERASE, 47507989032960, 47507989032960, +STORE, 47507989032960, 47507989078015, +STORE, 47507989078016, 47507989086207, +ERASE, 47507989078016, 47507989078016, +STORE, 47507989078016, 47507989086207, +STORE, 47507989086208, 47507989684223, +STORE, 47507989204992, 47507989684223, +STORE, 47507989086208, 47507989204991, +ERASE, 47507989204992, 47507989204992, +STORE, 47507989204992, 47507989630975, +STORE, 47507989630976, 47507989684223, +STORE, 47507989520384, 47507989630975, +STORE, 47507989204992, 47507989520383, +ERASE, 47507989204992, 47507989204992, +STORE, 47507989204992, 47507989520383, +STORE, 47507989626880, 47507989630975, +STORE, 47507989520384, 47507989626879, +ERASE, 47507989520384, 47507989520384, +STORE, 47507989520384, 47507989626879, +ERASE, 47507989630976, 47507989630976, +STORE, 47507989630976, 47507989684223, +STORE, 47507989684224, 47507992735743, +STORE, 47507990228992, 47507992735743, +STORE, 47507989684224, 47507990228991, +ERASE, 47507990228992, 47507990228992, +STORE, 47507990228992, 47507992514559, +STORE, 47507992514560, 47507992735743, +STORE, 47507991924736, 47507992514559, +STORE, 47507990228992, 47507991924735, +ERASE, 47507990228992, 47507990228992, +STORE, 47507990228992, 47507991924735, +STORE, 47507992510464, 47507992514559, +STORE, 47507991924736, 47507992510463, +ERASE, 47507991924736, 47507991924736, +STORE, 47507991924736, 47507992510463, +STORE, 47507992719360, 47507992735743, +STORE, 47507992514560, 47507992719359, +ERASE, 47507992514560, 47507992514560, +STORE, 47507992514560, 47507992719359, +ERASE, 47507992719360, 47507992719360, +STORE, 47507992719360, 47507992735743, 
+STORE, 47507992735744, 47507992768511, +ERASE, 47507992735744, 47507992735744, +STORE, 47507992735744, 47507992743935, +STORE, 47507992743936, 47507992768511, +STORE, 47507992756224, 47507992768511, +STORE, 47507992743936, 47507992756223, +ERASE, 47507992743936, 47507992743936, +STORE, 47507992743936, 47507992756223, +STORE, 47507992760320, 47507992768511, +STORE, 47507992756224, 47507992760319, +ERASE, 47507992756224, 47507992756224, +STORE, 47507992756224, 47507992768511, +ERASE, 47507992756224, 47507992756224, +STORE, 47507992756224, 47507992760319, +STORE, 47507992760320, 47507992768511, +ERASE, 47507992760320, 47507992760320, +STORE, 47507992760320, 47507992768511, +STORE, 47507992768512, 47507992805375, +ERASE, 47507992768512, 47507992768512, +STORE, 47507992768512, 47507992776703, +STORE, 47507992776704, 47507992805375, +STORE, 47507992793088, 47507992805375, +STORE, 47507992776704, 47507992793087, +ERASE, 47507992776704, 47507992776704, +STORE, 47507992776704, 47507992793087, +STORE, 47507992797184, 47507992805375, +STORE, 47507992793088, 47507992797183, +ERASE, 47507992793088, 47507992793088, +STORE, 47507992793088, 47507992805375, +ERASE, 47507992793088, 47507992793088, +STORE, 47507992793088, 47507992797183, +STORE, 47507992797184, 47507992805375, +ERASE, 47507992797184, 47507992797184, +STORE, 47507992797184, 47507992805375, +STORE, 47507992805376, 47507993280511, +ERASE, 47507992805376, 47507992805376, +STORE, 47507992805376, 47507992813567, +STORE, 47507992813568, 47507993280511, +STORE, 47507993149440, 47507993280511, +STORE, 47507992813568, 47507993149439, +ERASE, 47507992813568, 47507992813568, +STORE, 47507992813568, 47507993149439, +STORE, 47507993272320, 47507993280511, +STORE, 47507993149440, 47507993272319, +ERASE, 47507993149440, 47507993149440, +STORE, 47507993149440, 47507993280511, +ERASE, 47507993149440, 47507993149440, +STORE, 47507993149440, 47507993272319, +STORE, 47507993272320, 47507993280511, +ERASE, 47507993272320, 47507993272320, 
+STORE, 47507993272320, 47507993280511, +STORE, 47507993280512, 47507993288703, +STORE, 47507993288704, 47507993309183, +ERASE, 47507993288704, 47507993288704, +STORE, 47507993288704, 47507993292799, +STORE, 47507993292800, 47507993309183, +STORE, 47507993296896, 47507993309183, +STORE, 47507993292800, 47507993296895, +ERASE, 47507993292800, 47507993292800, +STORE, 47507993292800, 47507993296895, +STORE, 47507993300992, 47507993309183, +STORE, 47507993296896, 47507993300991, +ERASE, 47507993296896, 47507993296896, +STORE, 47507993296896, 47507993309183, +ERASE, 47507993296896, 47507993296896, +STORE, 47507993296896, 47507993300991, +STORE, 47507993300992, 47507993309183, +ERASE, 47507993300992, 47507993300992, +STORE, 47507993300992, 47507993309183, +STORE, 47507993309184, 47507993317375, +ERASE, 47507985973248, 47507985973248, +STORE, 47507985973248, 47507985989631, +STORE, 47507985989632, 47507985997823, +ERASE, 47507993300992, 47507993300992, +STORE, 47507993300992, 47507993305087, +STORE, 47507993305088, 47507993309183, +ERASE, 47507988889600, 47507988889600, +STORE, 47507988889600, 47507988893695, +STORE, 47507988893696, 47507988897791, +ERASE, 47507993272320, 47507993272320, +STORE, 47507993272320, 47507993276415, +STORE, 47507993276416, 47507993280511, +ERASE, 47507992797184, 47507992797184, +STORE, 47507992797184, 47507992801279, +STORE, 47507992801280, 47507992805375, +ERASE, 47507992760320, 47507992760320, +STORE, 47507992760320, 47507992764415, +STORE, 47507992764416, 47507992768511, +ERASE, 47507992514560, 47507992514560, +STORE, 47507992514560, 47507992711167, +STORE, 47507992711168, 47507992719359, +ERASE, 47507989630976, 47507989630976, +STORE, 47507989630976, 47507989667839, +STORE, 47507989667840, 47507989684223, +ERASE, 47507989078016, 47507989078016, +STORE, 47507989078016, 47507989082111, +STORE, 47507989082112, 47507989086207, +ERASE, 47507988762624, 47507988762624, +STORE, 47507988762624, 47507988766719, +STORE, 47507988766720, 47507988770815, 
+ERASE, 47507986493440, 47507986493440, +STORE, 47507986493440, 47507986513919, +STORE, 47507986513920, 47507986518015, +ERASE, 47507986161664, 47507986161664, +STORE, 47507986161664, 47507986165759, +STORE, 47507986165760, 47507986169855, +ERASE, 47507986116608, 47507986116608, +STORE, 47507986116608, 47507986120703, +STORE, 47507986120704, 47507986124799, +ERASE, 94386579570688, 94386579570688, +STORE, 94386579570688, 94386579693567, +STORE, 94386579693568, 94386579697663, +ERASE, 140124810997760, 140124810997760, +STORE, 140124810997760, 140124811001855, +STORE, 140124811001856, 140124811005951, +ERASE, 47507984158720, 47507984158720, +STORE, 94386583982080, 94386584117247, +STORE, 94386583982080, 94386584256511, +ERASE, 94386583982080, 94386583982080, +STORE, 94386583982080, 94386584223743, +STORE, 94386584223744, 94386584256511, +ERASE, 94386584223744, 94386584223744, +STORE, 140737488347136, 140737488351231, +STORE, 140733763395584, 140737488351231, +ERASE, 140733763395584, 140733763395584, +STORE, 140733763395584, 140733763399679, +STORE, 94011546472448, 94011547152383, +ERASE, 94011546472448, 94011546472448, +STORE, 94011546472448, 94011546537983, +STORE, 94011546537984, 94011547152383, +ERASE, 94011546537984, 94011546537984, +STORE, 94011546537984, 94011546886143, +STORE, 94011546886144, 94011547025407, +STORE, 94011547025408, 94011547152383, +STORE, 139757597949952, 139757598121983, +ERASE, 139757597949952, 139757597949952, +STORE, 139757597949952, 139757597954047, +STORE, 139757597954048, 139757598121983, +ERASE, 139757597954048, 139757597954048, +STORE, 139757597954048, 139757598076927, +STORE, 139757598076928, 139757598109695, +STORE, 139757598109696, 139757598117887, +STORE, 139757598117888, 139757598121983, +STORE, 140733763596288, 140733763600383, +STORE, 140733763584000, 140733763596287, +STORE, 47875197046784, 47875197054975, +STORE, 47875197054976, 47875197063167, +STORE, 47875197063168, 47875198902271, +STORE, 47875197202432, 47875198902271, 
+STORE, 47875197063168, 47875197202431, +ERASE, 47875197202432, 47875197202432, +STORE, 47875197202432, 47875198861311, +STORE, 47875198861312, 47875198902271, +STORE, 47875198545920, 47875198861311, +STORE, 47875197202432, 47875198545919, +ERASE, 47875197202432, 47875197202432, +STORE, 47875197202432, 47875198545919, +STORE, 47875198857216, 47875198861311, +STORE, 47875198545920, 47875198857215, +ERASE, 47875198545920, 47875198545920, +STORE, 47875198545920, 47875198857215, +STORE, 47875198885888, 47875198902271, +STORE, 47875198861312, 47875198885887, +ERASE, 47875198861312, 47875198861312, +STORE, 47875198861312, 47875198885887, +ERASE, 47875198885888, 47875198885888, +STORE, 47875198885888, 47875198902271, +STORE, 47875198902272, 47875199012863, +STORE, 47875198918656, 47875199012863, +STORE, 47875198902272, 47875198918655, +ERASE, 47875198918656, 47875198918656, +STORE, 47875198918656, 47875199004671, +STORE, 47875199004672, 47875199012863, +STORE, 47875198980096, 47875199004671, +STORE, 47875198918656, 47875198980095, +ERASE, 47875198918656, 47875198918656, +STORE, 47875198918656, 47875198980095, +STORE, 47875199000576, 47875199004671, +STORE, 47875198980096, 47875199000575, +ERASE, 47875198980096, 47875198980096, +STORE, 47875198980096, 47875199000575, +ERASE, 47875199004672, 47875199004672, +STORE, 47875199004672, 47875199012863, +STORE, 47875199012864, 47875199057919, +ERASE, 47875199012864, 47875199012864, +STORE, 47875199012864, 47875199021055, +STORE, 47875199021056, 47875199057919, +STORE, 47875199041536, 47875199057919, +STORE, 47875199021056, 47875199041535, +ERASE, 47875199021056, 47875199021056, +STORE, 47875199021056, 47875199041535, +STORE, 47875199049728, 47875199057919, +STORE, 47875199041536, 47875199049727, +ERASE, 47875199041536, 47875199041536, +STORE, 47875199041536, 47875199057919, +ERASE, 47875199041536, 47875199041536, +STORE, 47875199041536, 47875199049727, +STORE, 47875199049728, 47875199057919, +ERASE, 47875199049728, 47875199049728, 
+STORE, 47875199049728, 47875199057919, +STORE, 47875199057920, 47875199406079, +STORE, 47875199098880, 47875199406079, +STORE, 47875199057920, 47875199098879, +ERASE, 47875199098880, 47875199098880, +STORE, 47875199098880, 47875199381503, +STORE, 47875199381504, 47875199406079, +STORE, 47875199311872, 47875199381503, +STORE, 47875199098880, 47875199311871, +ERASE, 47875199098880, 47875199098880, +STORE, 47875199098880, 47875199311871, +STORE, 47875199377408, 47875199381503, +STORE, 47875199311872, 47875199377407, +ERASE, 47875199311872, 47875199311872, +STORE, 47875199311872, 47875199377407, +ERASE, 47875199381504, 47875199381504, +STORE, 47875199381504, 47875199406079, +STORE, 47875199406080, 47875201667071, +STORE, 47875199557632, 47875201667071, +STORE, 47875199406080, 47875199557631, +ERASE, 47875199557632, 47875199557632, +STORE, 47875199557632, 47875201650687, +STORE, 47875201650688, 47875201667071, +STORE, 47875201658880, 47875201667071, +STORE, 47875201650688, 47875201658879, +ERASE, 47875201650688, 47875201650688, +STORE, 47875201650688, 47875201658879, +ERASE, 47875201658880, 47875201658880, +STORE, 47875201658880, 47875201667071, +STORE, 47875201667072, 47875201802239, +ERASE, 47875201667072, 47875201667072, +STORE, 47875201667072, 47875201691647, +STORE, 47875201691648, 47875201802239, +STORE, 47875201753088, 47875201802239, +STORE, 47875201691648, 47875201753087, +ERASE, 47875201691648, 47875201691648, +STORE, 47875201691648, 47875201753087, +STORE, 47875201777664, 47875201802239, +STORE, 47875201753088, 47875201777663, +ERASE, 47875201753088, 47875201753088, +STORE, 47875201753088, 47875201802239, +ERASE, 47875201753088, 47875201753088, +STORE, 47875201753088, 47875201777663, +STORE, 47875201777664, 47875201802239, +STORE, 47875201785856, 47875201802239, +STORE, 47875201777664, 47875201785855, +ERASE, 47875201777664, 47875201777664, +STORE, 47875201777664, 47875201785855, +ERASE, 47875201785856, 47875201785856, +STORE, 47875201785856, 47875201802239, 
+STORE, 47875201785856, 47875201810431, +STORE, 47875201810432, 47875201974271, +ERASE, 47875201810432, 47875201810432, +STORE, 47875201810432, 47875201822719, +STORE, 47875201822720, 47875201974271, +STORE, 47875201921024, 47875201974271, +STORE, 47875201822720, 47875201921023, +ERASE, 47875201822720, 47875201822720, +STORE, 47875201822720, 47875201921023, +STORE, 47875201966080, 47875201974271, +STORE, 47875201921024, 47875201966079, +ERASE, 47875201921024, 47875201921024, +STORE, 47875201921024, 47875201974271, +ERASE, 47875201921024, 47875201921024, +STORE, 47875201921024, 47875201966079, +STORE, 47875201966080, 47875201974271, +ERASE, 47875201966080, 47875201966080, +STORE, 47875201966080, 47875201974271, +STORE, 47875201974272, 47875202572287, +STORE, 47875202093056, 47875202572287, +STORE, 47875201974272, 47875202093055, +ERASE, 47875202093056, 47875202093056, +STORE, 47875202093056, 47875202519039, +STORE, 47875202519040, 47875202572287, +STORE, 47875202408448, 47875202519039, +STORE, 47875202093056, 47875202408447, +ERASE, 47875202093056, 47875202093056, +STORE, 47875202093056, 47875202408447, +STORE, 47875202514944, 47875202519039, +STORE, 47875202408448, 47875202514943, +ERASE, 47875202408448, 47875202408448, +STORE, 47875202408448, 47875202514943, +ERASE, 47875202519040, 47875202519040, +STORE, 47875202519040, 47875202572287, +STORE, 47875202572288, 47875205623807, +STORE, 47875203117056, 47875205623807, +STORE, 47875202572288, 47875203117055, +ERASE, 47875203117056, 47875203117056, +STORE, 47875203117056, 47875205402623, +STORE, 47875205402624, 47875205623807, +STORE, 47875204812800, 47875205402623, +STORE, 47875203117056, 47875204812799, +ERASE, 47875203117056, 47875203117056, +STORE, 47875203117056, 47875204812799, +STORE, 47875205398528, 47875205402623, +STORE, 47875204812800, 47875205398527, +ERASE, 47875204812800, 47875204812800, +STORE, 47875204812800, 47875205398527, +STORE, 47875205607424, 47875205623807, +STORE, 47875205402624, 47875205607423, 
+ERASE, 47875205402624, 47875205402624, +STORE, 47875205402624, 47875205607423, +ERASE, 47875205607424, 47875205607424, +STORE, 47875205607424, 47875205623807, +STORE, 47875205623808, 47875205656575, +ERASE, 47875205623808, 47875205623808, +STORE, 47875205623808, 47875205631999, +STORE, 47875205632000, 47875205656575, +STORE, 47875205644288, 47875205656575, +STORE, 47875205632000, 47875205644287, +ERASE, 47875205632000, 47875205632000, +STORE, 47875205632000, 47875205644287, +STORE, 47875205648384, 47875205656575, +STORE, 47875205644288, 47875205648383, +ERASE, 47875205644288, 47875205644288, +STORE, 47875205644288, 47875205656575, +ERASE, 47875205644288, 47875205644288, +STORE, 47875205644288, 47875205648383, +STORE, 47875205648384, 47875205656575, +ERASE, 47875205648384, 47875205648384, +STORE, 47875205648384, 47875205656575, +STORE, 47875205656576, 47875205693439, +ERASE, 47875205656576, 47875205656576, +STORE, 47875205656576, 47875205664767, +STORE, 47875205664768, 47875205693439, +STORE, 47875205681152, 47875205693439, +STORE, 47875205664768, 47875205681151, +ERASE, 47875205664768, 47875205664768, +STORE, 47875205664768, 47875205681151, +STORE, 47875205685248, 47875205693439, +STORE, 47875205681152, 47875205685247, +ERASE, 47875205681152, 47875205681152, +STORE, 47875205681152, 47875205693439, +ERASE, 47875205681152, 47875205681152, +STORE, 47875205681152, 47875205685247, +STORE, 47875205685248, 47875205693439, +ERASE, 47875205685248, 47875205685248, +STORE, 47875205685248, 47875205693439, +STORE, 47875205693440, 47875206168575, +ERASE, 47875205693440, 47875205693440, +STORE, 47875205693440, 47875205701631, +STORE, 47875205701632, 47875206168575, +STORE, 47875206037504, 47875206168575, +STORE, 47875205701632, 47875206037503, +ERASE, 47875205701632, 47875205701632, +STORE, 47875205701632, 47875206037503, +STORE, 47875206160384, 47875206168575, +STORE, 47875206037504, 47875206160383, +ERASE, 47875206037504, 47875206037504, +STORE, 47875206037504, 47875206168575, 
+ERASE, 47875206037504, 47875206037504, +STORE, 47875206037504, 47875206160383, +STORE, 47875206160384, 47875206168575, +ERASE, 47875206160384, 47875206160384, +STORE, 47875206160384, 47875206168575, +STORE, 47875206168576, 47875206176767, +STORE, 47875206176768, 47875206197247, +ERASE, 47875206176768, 47875206176768, +STORE, 47875206176768, 47875206180863, +STORE, 47875206180864, 47875206197247, +STORE, 47875206184960, 47875206197247, +STORE, 47875206180864, 47875206184959, +ERASE, 47875206180864, 47875206180864, +STORE, 47875206180864, 47875206184959, +STORE, 47875206189056, 47875206197247, +STORE, 47875206184960, 47875206189055, +ERASE, 47875206184960, 47875206184960, +STORE, 47875206184960, 47875206197247, +ERASE, 47875206184960, 47875206184960, +STORE, 47875206184960, 47875206189055, +STORE, 47875206189056, 47875206197247, +ERASE, 47875206189056, 47875206189056, +STORE, 47875206189056, 47875206197247, +STORE, 47875206197248, 47875206205439, +ERASE, 47875198861312, 47875198861312, +STORE, 47875198861312, 47875198877695, +STORE, 47875198877696, 47875198885887, +ERASE, 47875206189056, 47875206189056, +STORE, 47875206189056, 47875206193151, +STORE, 47875206193152, 47875206197247, +ERASE, 47875201777664, 47875201777664, +STORE, 47875201777664, 47875201781759, +STORE, 47875201781760, 47875201785855, +ERASE, 47875206160384, 47875206160384, +STORE, 47875206160384, 47875206164479, +STORE, 47875206164480, 47875206168575, +ERASE, 47875205685248, 47875205685248, +STORE, 47875205685248, 47875205689343, +STORE, 47875205689344, 47875205693439, +ERASE, 47875205648384, 47875205648384, +STORE, 47875205648384, 47875205652479, +STORE, 47875205652480, 47875205656575, +ERASE, 47875205402624, 47875205402624, +STORE, 47875205402624, 47875205599231, +STORE, 47875205599232, 47875205607423, +ERASE, 47875202519040, 47875202519040, +STORE, 47875202519040, 47875202555903, +STORE, 47875202555904, 47875202572287, +ERASE, 47875201966080, 47875201966080, +STORE, 47875201966080, 47875201970175, 
+STORE, 47875201970176, 47875201974271, +ERASE, 47875201650688, 47875201650688, +STORE, 47875201650688, 47875201654783, +STORE, 47875201654784, 47875201658879, +ERASE, 47875199381504, 47875199381504, +STORE, 47875199381504, 47875199401983, +STORE, 47875199401984, 47875199406079, +ERASE, 47875199049728, 47875199049728, +STORE, 47875199049728, 47875199053823, +STORE, 47875199053824, 47875199057919, +ERASE, 47875199004672, 47875199004672, +STORE, 47875199004672, 47875199008767, +STORE, 47875199008768, 47875199012863, +ERASE, 94011547025408, 94011547025408, +STORE, 94011547025408, 94011547148287, +STORE, 94011547148288, 94011547152383, +ERASE, 139757598109696, 139757598109696, +STORE, 139757598109696, 139757598113791, +STORE, 139757598113792, 139757598117887, +ERASE, 47875197046784, 47875197046784, +STORE, 94011557584896, 94011557720063, +STORE, 94011557584896, 94011557855231, +ERASE, 94011557584896, 94011557584896, +STORE, 94011557584896, 94011557851135, +STORE, 94011557851136, 94011557855231, +ERASE, 94011557851136, 94011557851136, +ERASE, 94011557584896, 94011557584896, +STORE, 94011557584896, 94011557847039, +STORE, 94011557847040, 94011557851135, +ERASE, 94011557847040, 94011557847040, +STORE, 94011557584896, 94011557982207, +ERASE, 94011557584896, 94011557584896, +STORE, 94011557584896, 94011557978111, +STORE, 94011557978112, 94011557982207, +ERASE, 94011557978112, 94011557978112, +ERASE, 94011557584896, 94011557584896, +STORE, 94011557584896, 94011557974015, +STORE, 94011557974016, 94011557978111, +ERASE, 94011557974016, 94011557974016, +STORE, 140737488347136, 140737488351231, +STORE, 140734130360320, 140737488351231, +ERASE, 140734130360320, 140734130360320, +STORE, 140734130360320, 140734130364415, +STORE, 94641232105472, 94641232785407, +ERASE, 94641232105472, 94641232105472, +STORE, 94641232105472, 94641232171007, +STORE, 94641232171008, 94641232785407, +ERASE, 94641232171008, 94641232171008, +STORE, 94641232171008, 94641232519167, +STORE, 94641232519168, 
94641232658431, +STORE, 94641232658432, 94641232785407, +STORE, 139726599516160, 139726599688191, +ERASE, 139726599516160, 139726599516160, +STORE, 139726599516160, 139726599520255, +STORE, 139726599520256, 139726599688191, +ERASE, 139726599520256, 139726599520256, +STORE, 139726599520256, 139726599643135, +STORE, 139726599643136, 139726599675903, +STORE, 139726599675904, 139726599684095, +STORE, 139726599684096, 139726599688191, +STORE, 140734130446336, 140734130450431, +STORE, 140734130434048, 140734130446335, +STORE, 47906195480576, 47906195488767, +STORE, 47906195488768, 47906195496959, +STORE, 47906195496960, 47906197336063, +STORE, 47906195636224, 47906197336063, +STORE, 47906195496960, 47906195636223, +ERASE, 47906195636224, 47906195636224, +STORE, 47906195636224, 47906197295103, +STORE, 47906197295104, 47906197336063, +STORE, 47906196979712, 47906197295103, +STORE, 47906195636224, 47906196979711, +ERASE, 47906195636224, 47906195636224, +STORE, 47906195636224, 47906196979711, +STORE, 47906197291008, 47906197295103, +STORE, 47906196979712, 47906197291007, +ERASE, 47906196979712, 47906196979712, +STORE, 47906196979712, 47906197291007, +STORE, 47906197319680, 47906197336063, +STORE, 47906197295104, 47906197319679, +ERASE, 47906197295104, 47906197295104, +STORE, 47906197295104, 47906197319679, +ERASE, 47906197319680, 47906197319680, +STORE, 47906197319680, 47906197336063, +STORE, 47906197336064, 47906197446655, +STORE, 47906197352448, 47906197446655, +STORE, 47906197336064, 47906197352447, +ERASE, 47906197352448, 47906197352448, +STORE, 47906197352448, 47906197438463, +STORE, 47906197438464, 47906197446655, +STORE, 47906197413888, 47906197438463, +STORE, 47906197352448, 47906197413887, +ERASE, 47906197352448, 47906197352448, +STORE, 47906197352448, 47906197413887, +STORE, 47906197434368, 47906197438463, +STORE, 47906197413888, 47906197434367, +ERASE, 47906197413888, 47906197413888, +STORE, 47906197413888, 47906197434367, +ERASE, 47906197438464, 47906197438464, 
+STORE, 47906197438464, 47906197446655, +STORE, 47906197446656, 47906197491711, +ERASE, 47906197446656, 47906197446656, +STORE, 47906197446656, 47906197454847, +STORE, 47906197454848, 47906197491711, +STORE, 47906197475328, 47906197491711, +STORE, 47906197454848, 47906197475327, +ERASE, 47906197454848, 47906197454848, +STORE, 47906197454848, 47906197475327, +STORE, 47906197483520, 47906197491711, +STORE, 47906197475328, 47906197483519, +ERASE, 47906197475328, 47906197475328, +STORE, 47906197475328, 47906197491711, +ERASE, 47906197475328, 47906197475328, +STORE, 47906197475328, 47906197483519, +STORE, 47906197483520, 47906197491711, +ERASE, 47906197483520, 47906197483520, +STORE, 47906197483520, 47906197491711, +STORE, 47906197491712, 47906197839871, +STORE, 47906197532672, 47906197839871, +STORE, 47906197491712, 47906197532671, +ERASE, 47906197532672, 47906197532672, +STORE, 47906197532672, 47906197815295, +STORE, 47906197815296, 47906197839871, +STORE, 47906197745664, 47906197815295, +STORE, 47906197532672, 47906197745663, +ERASE, 47906197532672, 47906197532672, +STORE, 47906197532672, 47906197745663, +STORE, 47906197811200, 47906197815295, +STORE, 47906197745664, 47906197811199, +ERASE, 47906197745664, 47906197745664, +STORE, 47906197745664, 47906197811199, +ERASE, 47906197815296, 47906197815296, +STORE, 47906197815296, 47906197839871, +STORE, 47906197839872, 47906200100863, +STORE, 47906197991424, 47906200100863, +STORE, 47906197839872, 47906197991423, +ERASE, 47906197991424, 47906197991424, +STORE, 47906197991424, 47906200084479, +STORE, 47906200084480, 47906200100863, +STORE, 47906200092672, 47906200100863, +STORE, 47906200084480, 47906200092671, +ERASE, 47906200084480, 47906200084480, +STORE, 47906200084480, 47906200092671, +ERASE, 47906200092672, 47906200092672, +STORE, 47906200092672, 47906200100863, +STORE, 47906200100864, 47906200236031, +ERASE, 47906200100864, 47906200100864, +STORE, 47906200100864, 47906200125439, +STORE, 47906200125440, 47906200236031, 
+STORE, 47906200186880, 47906200236031, +STORE, 47906200125440, 47906200186879, +ERASE, 47906200125440, 47906200125440, +STORE, 47906200125440, 47906200186879, +STORE, 47906200211456, 47906200236031, +STORE, 47906200186880, 47906200211455, +ERASE, 47906200186880, 47906200186880, +STORE, 47906200186880, 47906200236031, +ERASE, 47906200186880, 47906200186880, +STORE, 47906200186880, 47906200211455, +STORE, 47906200211456, 47906200236031, +STORE, 47906200219648, 47906200236031, +STORE, 47906200211456, 47906200219647, +ERASE, 47906200211456, 47906200211456, +STORE, 47906200211456, 47906200219647, +ERASE, 47906200219648, 47906200219648, +STORE, 47906200219648, 47906200236031, +STORE, 47906200219648, 47906200244223, +STORE, 47906200244224, 47906200408063, +ERASE, 47906200244224, 47906200244224, +STORE, 47906200244224, 47906200256511, +STORE, 47906200256512, 47906200408063, +STORE, 47906200354816, 47906200408063, +STORE, 47906200256512, 47906200354815, +ERASE, 47906200256512, 47906200256512, +STORE, 47906200256512, 47906200354815, +STORE, 47906200399872, 47906200408063, +STORE, 47906200354816, 47906200399871, +ERASE, 47906200354816, 47906200354816, +STORE, 47906200354816, 47906200408063, +ERASE, 47906200354816, 47906200354816, +STORE, 47906200354816, 47906200399871, +STORE, 47906200399872, 47906200408063, +ERASE, 47906200399872, 47906200399872, +STORE, 47906200399872, 47906200408063, +STORE, 47906200408064, 47906201006079, +STORE, 47906200526848, 47906201006079, +STORE, 47906200408064, 47906200526847, +ERASE, 47906200526848, 47906200526848, +STORE, 47906200526848, 47906200952831, +STORE, 47906200952832, 47906201006079, +STORE, 47906200842240, 47906200952831, +STORE, 47906200526848, 47906200842239, +ERASE, 47906200526848, 47906200526848, +STORE, 47906200526848, 47906200842239, +STORE, 47906200948736, 47906200952831, +STORE, 47906200842240, 47906200948735, +ERASE, 47906200842240, 47906200842240, +STORE, 47906200842240, 47906200948735, +ERASE, 47906200952832, 47906200952832, 
+STORE, 47906200952832, 47906201006079, +STORE, 47906201006080, 47906204057599, +STORE, 47906201550848, 47906204057599, +STORE, 47906201006080, 47906201550847, +ERASE, 47906201550848, 47906201550848, +STORE, 47906201550848, 47906203836415, +STORE, 47906203836416, 47906204057599, +STORE, 47906203246592, 47906203836415, +STORE, 47906201550848, 47906203246591, +ERASE, 47906201550848, 47906201550848, +STORE, 47906201550848, 47906203246591, +STORE, 47906203832320, 47906203836415, +STORE, 47906203246592, 47906203832319, +ERASE, 47906203246592, 47906203246592, +STORE, 47906203246592, 47906203832319, +STORE, 47906204041216, 47906204057599, +STORE, 47906203836416, 47906204041215, +ERASE, 47906203836416, 47906203836416, +STORE, 47906203836416, 47906204041215, +ERASE, 47906204041216, 47906204041216, +STORE, 47906204041216, 47906204057599, +STORE, 47906204057600, 47906204090367, +ERASE, 47906204057600, 47906204057600, +STORE, 47906204057600, 47906204065791, +STORE, 47906204065792, 47906204090367, +STORE, 47906204078080, 47906204090367, +STORE, 47906204065792, 47906204078079, +ERASE, 47906204065792, 47906204065792, +STORE, 47906204065792, 47906204078079, +STORE, 47906204082176, 47906204090367, +STORE, 47906204078080, 47906204082175, +ERASE, 47906204078080, 47906204078080, +STORE, 47906204078080, 47906204090367, +ERASE, 47906204078080, 47906204078080, +STORE, 47906204078080, 47906204082175, +STORE, 47906204082176, 47906204090367, +ERASE, 47906204082176, 47906204082176, +STORE, 47906204082176, 47906204090367, +STORE, 47906204090368, 47906204127231, +ERASE, 47906204090368, 47906204090368, +STORE, 47906204090368, 47906204098559, +STORE, 47906204098560, 47906204127231, +STORE, 47906204114944, 47906204127231, +STORE, 47906204098560, 47906204114943, +ERASE, 47906204098560, 47906204098560, +STORE, 47906204098560, 47906204114943, +STORE, 47906204119040, 47906204127231, +STORE, 47906204114944, 47906204119039, +ERASE, 47906204114944, 47906204114944, +STORE, 47906204114944, 47906204127231, 
+ERASE, 47906204114944, 47906204114944, +STORE, 47906204114944, 47906204119039, +STORE, 47906204119040, 47906204127231, +ERASE, 47906204119040, 47906204119040, +STORE, 47906204119040, 47906204127231, +STORE, 47906204127232, 47906204602367, +ERASE, 47906204127232, 47906204127232, +STORE, 47906204127232, 47906204135423, +STORE, 47906204135424, 47906204602367, +STORE, 47906204471296, 47906204602367, +STORE, 47906204135424, 47906204471295, +ERASE, 47906204135424, 47906204135424, +STORE, 47906204135424, 47906204471295, +STORE, 47906204594176, 47906204602367, +STORE, 47906204471296, 47906204594175, +ERASE, 47906204471296, 47906204471296, +STORE, 47906204471296, 47906204602367, +ERASE, 47906204471296, 47906204471296, +STORE, 47906204471296, 47906204594175, +STORE, 47906204594176, 47906204602367, +ERASE, 47906204594176, 47906204594176, +STORE, 47906204594176, 47906204602367, +STORE, 47906204602368, 47906204610559, +STORE, 47906204610560, 47906204631039, +ERASE, 47906204610560, 47906204610560, +STORE, 47906204610560, 47906204614655, +STORE, 47906204614656, 47906204631039, +STORE, 47906204618752, 47906204631039, +STORE, 47906204614656, 47906204618751, +ERASE, 47906204614656, 47906204614656, +STORE, 47906204614656, 47906204618751, +STORE, 47906204622848, 47906204631039, +STORE, 47906204618752, 47906204622847, +ERASE, 47906204618752, 47906204618752, +STORE, 47906204618752, 47906204631039, +ERASE, 47906204618752, 47906204618752, +STORE, 47906204618752, 47906204622847, +STORE, 47906204622848, 47906204631039, +ERASE, 47906204622848, 47906204622848, +STORE, 47906204622848, 47906204631039, +STORE, 47906204631040, 47906204639231, +ERASE, 47906197295104, 47906197295104, +STORE, 47906197295104, 47906197311487, +STORE, 47906197311488, 47906197319679, +ERASE, 47906204622848, 47906204622848, +STORE, 47906204622848, 47906204626943, +STORE, 47906204626944, 47906204631039, +ERASE, 47906200211456, 47906200211456, +STORE, 47906200211456, 47906200215551, +STORE, 47906200215552, 47906200219647, 
+ERASE, 47906204594176, 47906204594176, +STORE, 47906204594176, 47906204598271, +STORE, 47906204598272, 47906204602367, +ERASE, 47906204119040, 47906204119040, +STORE, 47906204119040, 47906204123135, +STORE, 47906204123136, 47906204127231, +ERASE, 47906204082176, 47906204082176, +STORE, 47906204082176, 47906204086271, +STORE, 47906204086272, 47906204090367, +ERASE, 47906203836416, 47906203836416, +STORE, 47906203836416, 47906204033023, +STORE, 47906204033024, 47906204041215, +ERASE, 47906200952832, 47906200952832, +STORE, 47906200952832, 47906200989695, +STORE, 47906200989696, 47906201006079, +ERASE, 47906200399872, 47906200399872, +STORE, 47906200399872, 47906200403967, +STORE, 47906200403968, 47906200408063, +ERASE, 47906200084480, 47906200084480, +STORE, 47906200084480, 47906200088575, +STORE, 47906200088576, 47906200092671, +ERASE, 47906197815296, 47906197815296, +STORE, 47906197815296, 47906197835775, +STORE, 47906197835776, 47906197839871, +ERASE, 47906197483520, 47906197483520, +STORE, 47906197483520, 47906197487615, +STORE, 47906197487616, 47906197491711, +ERASE, 47906197438464, 47906197438464, +STORE, 47906197438464, 47906197442559, +STORE, 47906197442560, 47906197446655, +ERASE, 94641232658432, 94641232658432, +STORE, 94641232658432, 94641232781311, +STORE, 94641232781312, 94641232785407, +ERASE, 139726599675904, 139726599675904, +STORE, 139726599675904, 139726599679999, +STORE, 139726599680000, 139726599684095, +ERASE, 47906195480576, 47906195480576, +STORE, 94641242615808, 94641242750975, + }; + + unsigned long set10[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140736427839488, 140737488351231, +ERASE, 140736427839488, 140736427839488, +STORE, 140736427839488, 140736427843583, +STORE, 94071213395968, 94071213567999, +ERASE, 94071213395968, 94071213395968, +STORE, 94071213395968, 94071213412351, +STORE, 94071213412352, 94071213567999, +ERASE, 94071213412352, 94071213412352, +STORE, 94071213412352, 94071213514751, +STORE, 94071213514752, 
94071213555711, +STORE, 94071213555712, 94071213567999, +STORE, 139968410644480, 139968410816511, +ERASE, 139968410644480, 139968410644480, +STORE, 139968410644480, 139968410648575, +STORE, 139968410648576, 139968410816511, +ERASE, 139968410648576, 139968410648576, +STORE, 139968410648576, 139968410771455, +STORE, 139968410771456, 139968410804223, +STORE, 139968410804224, 139968410812415, +STORE, 139968410812416, 139968410816511, +STORE, 140736429277184, 140736429281279, +STORE, 140736429264896, 140736429277183, +STORE, 47664384352256, 47664384360447, +STORE, 47664384360448, 47664384368639, +STORE, 47664384368640, 47664384532479, +ERASE, 47664384368640, 47664384368640, +STORE, 47664384368640, 47664384380927, +STORE, 47664384380928, 47664384532479, +STORE, 47664384479232, 47664384532479, +STORE, 47664384380928, 47664384479231, +ERASE, 47664384380928, 47664384380928, +STORE, 47664384380928, 47664384479231, +STORE, 47664384524288, 47664384532479, +STORE, 47664384479232, 47664384524287, +ERASE, 47664384479232, 47664384479232, +STORE, 47664384479232, 47664384532479, +ERASE, 47664384479232, 47664384479232, +STORE, 47664384479232, 47664384524287, +STORE, 47664384524288, 47664384532479, +ERASE, 47664384524288, 47664384524288, +STORE, 47664384524288, 47664384532479, +STORE, 47664384532480, 47664387583999, +STORE, 47664385077248, 47664387583999, +STORE, 47664384532480, 47664385077247, +ERASE, 47664385077248, 47664385077248, +STORE, 47664385077248, 47664387362815, +STORE, 47664387362816, 47664387583999, +STORE, 47664386772992, 47664387362815, +STORE, 47664385077248, 47664386772991, +ERASE, 47664385077248, 47664385077248, +STORE, 47664385077248, 47664386772991, +STORE, 47664387358720, 47664387362815, +STORE, 47664386772992, 47664387358719, +ERASE, 47664386772992, 47664386772992, +STORE, 47664386772992, 47664387358719, +STORE, 47664387567616, 47664387583999, +STORE, 47664387362816, 47664387567615, +ERASE, 47664387362816, 47664387362816, +STORE, 47664387362816, 47664387567615, 
+ERASE, 47664387567616, 47664387567616, +STORE, 47664387567616, 47664387583999, +STORE, 47664387584000, 47664389423103, +STORE, 47664387723264, 47664389423103, +STORE, 47664387584000, 47664387723263, +ERASE, 47664387723264, 47664387723264, +STORE, 47664387723264, 47664389382143, +STORE, 47664389382144, 47664389423103, +STORE, 47664389066752, 47664389382143, +STORE, 47664387723264, 47664389066751, +ERASE, 47664387723264, 47664387723264, +STORE, 47664387723264, 47664389066751, +STORE, 47664389378048, 47664389382143, +STORE, 47664389066752, 47664389378047, +ERASE, 47664389066752, 47664389066752, +STORE, 47664389066752, 47664389378047, +STORE, 47664389406720, 47664389423103, +STORE, 47664389382144, 47664389406719, +ERASE, 47664389382144, 47664389382144, +STORE, 47664389382144, 47664389406719, +ERASE, 47664389406720, 47664389406720, +STORE, 47664389406720, 47664389423103, +STORE, 47664389423104, 47664389558271, +ERASE, 47664389423104, 47664389423104, +STORE, 47664389423104, 47664389447679, +STORE, 47664389447680, 47664389558271, +STORE, 47664389509120, 47664389558271, +STORE, 47664389447680, 47664389509119, +ERASE, 47664389447680, 47664389447680, +STORE, 47664389447680, 47664389509119, +STORE, 47664389533696, 47664389558271, +STORE, 47664389509120, 47664389533695, +ERASE, 47664389509120, 47664389509120, +STORE, 47664389509120, 47664389558271, +ERASE, 47664389509120, 47664389509120, +STORE, 47664389509120, 47664389533695, +STORE, 47664389533696, 47664389558271, +STORE, 47664389541888, 47664389558271, +STORE, 47664389533696, 47664389541887, +ERASE, 47664389533696, 47664389533696, +STORE, 47664389533696, 47664389541887, +ERASE, 47664389541888, 47664389541888, +STORE, 47664389541888, 47664389558271, +STORE, 47664389558272, 47664389578751, +ERASE, 47664389558272, 47664389558272, +STORE, 47664389558272, 47664389562367, +STORE, 47664389562368, 47664389578751, +STORE, 47664389566464, 47664389578751, +STORE, 47664389562368, 47664389566463, +ERASE, 47664389562368, 47664389562368, 
+STORE, 47664389562368, 47664389566463, +STORE, 47664389570560, 47664389578751, +STORE, 47664389566464, 47664389570559, +ERASE, 47664389566464, 47664389566464, +STORE, 47664389566464, 47664389578751, +ERASE, 47664389566464, 47664389566464, +STORE, 47664389566464, 47664389570559, +STORE, 47664389570560, 47664389578751, +ERASE, 47664389570560, 47664389570560, +STORE, 47664389570560, 47664389578751, +STORE, 47664389578752, 47664389586943, +ERASE, 47664389382144, 47664389382144, +STORE, 47664389382144, 47664389398527, +STORE, 47664389398528, 47664389406719, +ERASE, 47664389570560, 47664389570560, +STORE, 47664389570560, 47664389574655, +STORE, 47664389574656, 47664389578751, +ERASE, 47664389533696, 47664389533696, +STORE, 47664389533696, 47664389537791, +STORE, 47664389537792, 47664389541887, +ERASE, 47664387362816, 47664387362816, +STORE, 47664387362816, 47664387559423, +STORE, 47664387559424, 47664387567615, +ERASE, 47664384524288, 47664384524288, +STORE, 47664384524288, 47664384528383, +STORE, 47664384528384, 47664384532479, +ERASE, 94071213555712, 94071213555712, +STORE, 94071213555712, 94071213563903, +STORE, 94071213563904, 94071213567999, +ERASE, 139968410804224, 139968410804224, +STORE, 139968410804224, 139968410808319, +STORE, 139968410808320, 139968410812415, +ERASE, 47664384352256, 47664384352256, +STORE, 94071244402688, 94071244537855, +STORE, 140737488347136, 140737488351231, +STORE, 140728271503360, 140737488351231, +ERASE, 140728271503360, 140728271503360, +STORE, 140728271503360, 140728271507455, +STORE, 94410361982976, 94410362155007, +ERASE, 94410361982976, 94410361982976, +STORE, 94410361982976, 94410361999359, +STORE, 94410361999360, 94410362155007, +ERASE, 94410361999360, 94410361999360, +STORE, 94410361999360, 94410362101759, +STORE, 94410362101760, 94410362142719, +STORE, 94410362142720, 94410362155007, +STORE, 140351953997824, 140351954169855, +ERASE, 140351953997824, 140351953997824, +STORE, 140351953997824, 140351954001919, +STORE, 
140351954001920, 140351954169855, +ERASE, 140351954001920, 140351954001920, +STORE, 140351954001920, 140351954124799, +STORE, 140351954124800, 140351954157567, +STORE, 140351954157568, 140351954165759, +STORE, 140351954165760, 140351954169855, +STORE, 140728272429056, 140728272433151, +STORE, 140728272416768, 140728272429055, +STORE, 47280840998912, 47280841007103, +STORE, 47280841007104, 47280841015295, +STORE, 47280841015296, 47280841179135, +ERASE, 47280841015296, 47280841015296, +STORE, 47280841015296, 47280841027583, +STORE, 47280841027584, 47280841179135, +STORE, 47280841125888, 47280841179135, +STORE, 47280841027584, 47280841125887, +ERASE, 47280841027584, 47280841027584, +STORE, 47280841027584, 47280841125887, +STORE, 47280841170944, 47280841179135, +STORE, 47280841125888, 47280841170943, +ERASE, 47280841125888, 47280841125888, +STORE, 47280841125888, 47280841179135, +ERASE, 47280841125888, 47280841125888, +STORE, 47280841125888, 47280841170943, +STORE, 47280841170944, 47280841179135, +ERASE, 47280841170944, 47280841170944, +STORE, 47280841170944, 47280841179135, +STORE, 47280841179136, 47280844230655, +STORE, 47280841723904, 47280844230655, +STORE, 47280841179136, 47280841723903, +ERASE, 47280841723904, 47280841723904, +STORE, 47280841723904, 47280844009471, +STORE, 47280844009472, 47280844230655, +STORE, 47280843419648, 47280844009471, +STORE, 47280841723904, 47280843419647, +ERASE, 47280841723904, 47280841723904, +STORE, 47280841723904, 47280843419647, +STORE, 47280844005376, 47280844009471, +STORE, 47280843419648, 47280844005375, +ERASE, 47280843419648, 47280843419648, +STORE, 47280843419648, 47280844005375, +STORE, 47280844214272, 47280844230655, +STORE, 47280844009472, 47280844214271, +ERASE, 47280844009472, 47280844009472, +STORE, 47280844009472, 47280844214271, +ERASE, 47280844214272, 47280844214272, +STORE, 47280844214272, 47280844230655, +STORE, 47280844230656, 47280846069759, +STORE, 47280844369920, 47280846069759, +STORE, 47280844230656, 
47280844369919, +ERASE, 47280844369920, 47280844369920, +STORE, 47280844369920, 47280846028799, +STORE, 47280846028800, 47280846069759, +STORE, 47280845713408, 47280846028799, +STORE, 47280844369920, 47280845713407, +ERASE, 47280844369920, 47280844369920, +STORE, 47280844369920, 47280845713407, +STORE, 47280846024704, 47280846028799, +STORE, 47280845713408, 47280846024703, +ERASE, 47280845713408, 47280845713408, +STORE, 47280845713408, 47280846024703, +STORE, 47280846053376, 47280846069759, +STORE, 47280846028800, 47280846053375, +ERASE, 47280846028800, 47280846028800, +STORE, 47280846028800, 47280846053375, +ERASE, 47280846053376, 47280846053376, +STORE, 47280846053376, 47280846069759, +STORE, 47280846069760, 47280846204927, +ERASE, 47280846069760, 47280846069760, +STORE, 47280846069760, 47280846094335, +STORE, 47280846094336, 47280846204927, +STORE, 47280846155776, 47280846204927, +STORE, 47280846094336, 47280846155775, +ERASE, 47280846094336, 47280846094336, +STORE, 47280846094336, 47280846155775, +STORE, 47280846180352, 47280846204927, +STORE, 47280846155776, 47280846180351, +ERASE, 47280846155776, 47280846155776, +STORE, 47280846155776, 47280846204927, +ERASE, 47280846155776, 47280846155776, +STORE, 47280846155776, 47280846180351, +STORE, 47280846180352, 47280846204927, +STORE, 47280846188544, 47280846204927, +STORE, 47280846180352, 47280846188543, +ERASE, 47280846180352, 47280846180352, +STORE, 47280846180352, 47280846188543, +ERASE, 47280846188544, 47280846188544, +STORE, 47280846188544, 47280846204927, +STORE, 47280846204928, 47280846225407, +ERASE, 47280846204928, 47280846204928, +STORE, 47280846204928, 47280846209023, +STORE, 47280846209024, 47280846225407, +STORE, 47280846213120, 47280846225407, +STORE, 47280846209024, 47280846213119, +ERASE, 47280846209024, 47280846209024, +STORE, 47280846209024, 47280846213119, +STORE, 47280846217216, 47280846225407, +STORE, 47280846213120, 47280846217215, +ERASE, 47280846213120, 47280846213120, +STORE, 47280846213120, 
47280846225407, +ERASE, 47280846213120, 47280846213120, +STORE, 47280846213120, 47280846217215, +STORE, 47280846217216, 47280846225407, +ERASE, 47280846217216, 47280846217216, +STORE, 47280846217216, 47280846225407, +STORE, 47280846225408, 47280846233599, +ERASE, 47280846028800, 47280846028800, +STORE, 47280846028800, 47280846045183, +STORE, 47280846045184, 47280846053375, +ERASE, 47280846217216, 47280846217216, +STORE, 47280846217216, 47280846221311, +STORE, 47280846221312, 47280846225407, +ERASE, 47280846180352, 47280846180352, +STORE, 47280846180352, 47280846184447, +STORE, 47280846184448, 47280846188543, +ERASE, 47280844009472, 47280844009472, +STORE, 47280844009472, 47280844206079, +STORE, 47280844206080, 47280844214271, +ERASE, 47280841170944, 47280841170944, +STORE, 47280841170944, 47280841175039, +STORE, 47280841175040, 47280841179135, +ERASE, 94410362142720, 94410362142720, +STORE, 94410362142720, 94410362150911, +STORE, 94410362150912, 94410362155007, +ERASE, 140351954157568, 140351954157568, +STORE, 140351954157568, 140351954161663, +STORE, 140351954161664, 140351954165759, +ERASE, 47280840998912, 47280840998912, +STORE, 94410379456512, 94410379591679, +STORE, 140737488347136, 140737488351231, +STORE, 140732946362368, 140737488351231, +ERASE, 140732946362368, 140732946362368, +STORE, 140732946362368, 140732946366463, +STORE, 94352937934848, 94352938106879, +ERASE, 94352937934848, 94352937934848, +STORE, 94352937934848, 94352937951231, +STORE, 94352937951232, 94352938106879, +ERASE, 94352937951232, 94352937951232, +STORE, 94352937951232, 94352938053631, +STORE, 94352938053632, 94352938094591, +STORE, 94352938094592, 94352938106879, +STORE, 140595518742528, 140595518914559, +ERASE, 140595518742528, 140595518742528, +STORE, 140595518742528, 140595518746623, +STORE, 140595518746624, 140595518914559, +ERASE, 140595518746624, 140595518746624, +STORE, 140595518746624, 140595518869503, +STORE, 140595518869504, 140595518902271, +STORE, 140595518902272, 
140595518910463, +STORE, 140595518910464, 140595518914559, +STORE, 140732947468288, 140732947472383, +STORE, 140732947456000, 140732947468287, +STORE, 47037276254208, 47037276262399, +STORE, 47037276262400, 47037276270591, +STORE, 47037276270592, 47037276434431, +ERASE, 47037276270592, 47037276270592, +STORE, 47037276270592, 47037276282879, +STORE, 47037276282880, 47037276434431, +STORE, 47037276381184, 47037276434431, +STORE, 47037276282880, 47037276381183, +ERASE, 47037276282880, 47037276282880, +STORE, 47037276282880, 47037276381183, +STORE, 47037276426240, 47037276434431, +STORE, 47037276381184, 47037276426239, +ERASE, 47037276381184, 47037276381184, +STORE, 47037276381184, 47037276434431, +ERASE, 47037276381184, 47037276381184, +STORE, 47037276381184, 47037276426239, +STORE, 47037276426240, 47037276434431, +ERASE, 47037276426240, 47037276426240, +STORE, 47037276426240, 47037276434431, +STORE, 47037276434432, 47037279485951, +STORE, 47037276979200, 47037279485951, +STORE, 47037276434432, 47037276979199, +ERASE, 47037276979200, 47037276979200, +STORE, 47037276979200, 47037279264767, +STORE, 47037279264768, 47037279485951, +STORE, 47037278674944, 47037279264767, +STORE, 47037276979200, 47037278674943, +ERASE, 47037276979200, 47037276979200, +STORE, 47037276979200, 47037278674943, +STORE, 47037279260672, 47037279264767, +STORE, 47037278674944, 47037279260671, +ERASE, 47037278674944, 47037278674944, +STORE, 47037278674944, 47037279260671, +STORE, 47037279469568, 47037279485951, +STORE, 47037279264768, 47037279469567, +ERASE, 47037279264768, 47037279264768, +STORE, 47037279264768, 47037279469567, +ERASE, 47037279469568, 47037279469568, +STORE, 47037279469568, 47037279485951, +STORE, 47037279485952, 47037281325055, +STORE, 47037279625216, 47037281325055, +STORE, 47037279485952, 47037279625215, +ERASE, 47037279625216, 47037279625216, +STORE, 47037279625216, 47037281284095, +STORE, 47037281284096, 47037281325055, +STORE, 47037280968704, 47037281284095, +STORE, 
47037279625216, 47037280968703, +ERASE, 47037279625216, 47037279625216, +STORE, 47037279625216, 47037280968703, +STORE, 47037281280000, 47037281284095, +STORE, 47037280968704, 47037281279999, +ERASE, 47037280968704, 47037280968704, +STORE, 47037280968704, 47037281279999, +STORE, 47037281308672, 47037281325055, +STORE, 47037281284096, 47037281308671, +ERASE, 47037281284096, 47037281284096, +STORE, 47037281284096, 47037281308671, +ERASE, 47037281308672, 47037281308672, +STORE, 47037281308672, 47037281325055, +STORE, 47037281325056, 47037281460223, +ERASE, 47037281325056, 47037281325056, +STORE, 47037281325056, 47037281349631, +STORE, 47037281349632, 47037281460223, +STORE, 47037281411072, 47037281460223, +STORE, 47037281349632, 47037281411071, +ERASE, 47037281349632, 47037281349632, +STORE, 47037281349632, 47037281411071, +STORE, 47037281435648, 47037281460223, +STORE, 47037281411072, 47037281435647, +ERASE, 47037281411072, 47037281411072, +STORE, 47037281411072, 47037281460223, +ERASE, 47037281411072, 47037281411072, +STORE, 47037281411072, 47037281435647, +STORE, 47037281435648, 47037281460223, +STORE, 47037281443840, 47037281460223, +STORE, 47037281435648, 47037281443839, +ERASE, 47037281435648, 47037281435648, +STORE, 47037281435648, 47037281443839, +ERASE, 47037281443840, 47037281443840, +STORE, 47037281443840, 47037281460223, +STORE, 47037281460224, 47037281480703, +ERASE, 47037281460224, 47037281460224, +STORE, 47037281460224, 47037281464319, +STORE, 47037281464320, 47037281480703, +STORE, 47037281468416, 47037281480703, +STORE, 47037281464320, 47037281468415, +ERASE, 47037281464320, 47037281464320, +STORE, 47037281464320, 47037281468415, +STORE, 47037281472512, 47037281480703, +STORE, 47037281468416, 47037281472511, +ERASE, 47037281468416, 47037281468416, +STORE, 47037281468416, 47037281480703, +ERASE, 47037281468416, 47037281468416, +STORE, 47037281468416, 47037281472511, +STORE, 47037281472512, 47037281480703, +ERASE, 47037281472512, 47037281472512, +STORE, 
47037281472512, 47037281480703, +STORE, 47037281480704, 47037281488895, +ERASE, 47037281284096, 47037281284096, +STORE, 47037281284096, 47037281300479, +STORE, 47037281300480, 47037281308671, +ERASE, 47037281472512, 47037281472512, +STORE, 47037281472512, 47037281476607, +STORE, 47037281476608, 47037281480703, +ERASE, 47037281435648, 47037281435648, +STORE, 47037281435648, 47037281439743, +STORE, 47037281439744, 47037281443839, +ERASE, 47037279264768, 47037279264768, +STORE, 47037279264768, 47037279461375, +STORE, 47037279461376, 47037279469567, +ERASE, 47037276426240, 47037276426240, +STORE, 47037276426240, 47037276430335, +STORE, 47037276430336, 47037276434431, +ERASE, 94352938094592, 94352938094592, +STORE, 94352938094592, 94352938102783, +STORE, 94352938102784, 94352938106879, +ERASE, 140595518902272, 140595518902272, +STORE, 140595518902272, 140595518906367, +STORE, 140595518906368, 140595518910463, +ERASE, 47037276254208, 47037276254208, +STORE, 94352938438656, 94352938573823, +STORE, 140737488347136, 140737488351231, +STORE, 140733506027520, 140737488351231, +ERASE, 140733506027520, 140733506027520, +STORE, 140733506027520, 140733506031615, +STORE, 94150123073536, 94150123245567, +ERASE, 94150123073536, 94150123073536, +STORE, 94150123073536, 94150123089919, +STORE, 94150123089920, 94150123245567, +ERASE, 94150123089920, 94150123089920, +STORE, 94150123089920, 94150123192319, +STORE, 94150123192320, 94150123233279, +STORE, 94150123233280, 94150123245567, +STORE, 140081290375168, 140081290547199, +ERASE, 140081290375168, 140081290375168, +STORE, 140081290375168, 140081290379263, +STORE, 140081290379264, 140081290547199, +ERASE, 140081290379264, 140081290379264, +STORE, 140081290379264, 140081290502143, +STORE, 140081290502144, 140081290534911, +STORE, 140081290534912, 140081290543103, +STORE, 140081290543104, 140081290547199, +STORE, 140733506707456, 140733506711551, +STORE, 140733506695168, 140733506707455, +STORE, 47551504621568, 47551504629759, +STORE, 
47551504629760, 47551504637951, +STORE, 47551504637952, 47551504801791, +ERASE, 47551504637952, 47551504637952, +STORE, 47551504637952, 47551504650239, +STORE, 47551504650240, 47551504801791, +STORE, 47551504748544, 47551504801791, +STORE, 47551504650240, 47551504748543, +ERASE, 47551504650240, 47551504650240, +STORE, 47551504650240, 47551504748543, +STORE, 47551504793600, 47551504801791, +STORE, 47551504748544, 47551504793599, +ERASE, 47551504748544, 47551504748544, +STORE, 47551504748544, 47551504801791, +ERASE, 47551504748544, 47551504748544, +STORE, 47551504748544, 47551504793599, +STORE, 47551504793600, 47551504801791, +ERASE, 47551504793600, 47551504793600, +STORE, 47551504793600, 47551504801791, +STORE, 47551504801792, 47551507853311, +STORE, 47551505346560, 47551507853311, +STORE, 47551504801792, 47551505346559, +ERASE, 47551505346560, 47551505346560, +STORE, 47551505346560, 47551507632127, +STORE, 47551507632128, 47551507853311, +STORE, 47551507042304, 47551507632127, +STORE, 47551505346560, 47551507042303, +ERASE, 47551505346560, 47551505346560, +STORE, 47551505346560, 47551507042303, +STORE, 47551507628032, 47551507632127, +STORE, 47551507042304, 47551507628031, +ERASE, 47551507042304, 47551507042304, +STORE, 47551507042304, 47551507628031, +STORE, 47551507836928, 47551507853311, +STORE, 47551507632128, 47551507836927, +ERASE, 47551507632128, 47551507632128, +STORE, 47551507632128, 47551507836927, +ERASE, 47551507836928, 47551507836928, +STORE, 47551507836928, 47551507853311, +STORE, 47551507853312, 47551509692415, +STORE, 47551507992576, 47551509692415, +STORE, 47551507853312, 47551507992575, +ERASE, 47551507992576, 47551507992576, +STORE, 47551507992576, 47551509651455, +STORE, 47551509651456, 47551509692415, +STORE, 47551509336064, 47551509651455, +STORE, 47551507992576, 47551509336063, +ERASE, 47551507992576, 47551507992576, +STORE, 47551507992576, 47551509336063, +STORE, 47551509647360, 47551509651455, +STORE, 47551509336064, 47551509647359, +ERASE, 
47551509336064, 47551509336064, +STORE, 47551509336064, 47551509647359, +STORE, 47551509676032, 47551509692415, +STORE, 47551509651456, 47551509676031, +ERASE, 47551509651456, 47551509651456, +STORE, 47551509651456, 47551509676031, +ERASE, 47551509676032, 47551509676032, +STORE, 47551509676032, 47551509692415, +STORE, 47551509692416, 47551509827583, +ERASE, 47551509692416, 47551509692416, +STORE, 47551509692416, 47551509716991, +STORE, 47551509716992, 47551509827583, +STORE, 47551509778432, 47551509827583, +STORE, 47551509716992, 47551509778431, +ERASE, 47551509716992, 47551509716992, +STORE, 47551509716992, 47551509778431, +STORE, 47551509803008, 47551509827583, +STORE, 47551509778432, 47551509803007, +ERASE, 47551509778432, 47551509778432, +STORE, 47551509778432, 47551509827583, +ERASE, 47551509778432, 47551509778432, +STORE, 47551509778432, 47551509803007, +STORE, 47551509803008, 47551509827583, +STORE, 47551509811200, 47551509827583, +STORE, 47551509803008, 47551509811199, +ERASE, 47551509803008, 47551509803008, +STORE, 47551509803008, 47551509811199, +ERASE, 47551509811200, 47551509811200, +STORE, 47551509811200, 47551509827583, +STORE, 47551509827584, 47551509848063, +ERASE, 47551509827584, 47551509827584, +STORE, 47551509827584, 47551509831679, +STORE, 47551509831680, 47551509848063, +STORE, 47551509835776, 47551509848063, +STORE, 47551509831680, 47551509835775, +ERASE, 47551509831680, 47551509831680, +STORE, 47551509831680, 47551509835775, +STORE, 47551509839872, 47551509848063, +STORE, 47551509835776, 47551509839871, +ERASE, 47551509835776, 47551509835776, +STORE, 47551509835776, 47551509848063, +ERASE, 47551509835776, 47551509835776, +STORE, 47551509835776, 47551509839871, +STORE, 47551509839872, 47551509848063, +ERASE, 47551509839872, 47551509839872, +STORE, 47551509839872, 47551509848063, +STORE, 47551509848064, 47551509856255, +ERASE, 47551509651456, 47551509651456, +STORE, 47551509651456, 47551509667839, +STORE, 47551509667840, 47551509676031, +ERASE, 
47551509839872, 47551509839872, +STORE, 47551509839872, 47551509843967, +STORE, 47551509843968, 47551509848063, +ERASE, 47551509803008, 47551509803008, +STORE, 47551509803008, 47551509807103, +STORE, 47551509807104, 47551509811199, +ERASE, 47551507632128, 47551507632128, +STORE, 47551507632128, 47551507828735, +STORE, 47551507828736, 47551507836927, +ERASE, 47551504793600, 47551504793600, +STORE, 47551504793600, 47551504797695, +STORE, 47551504797696, 47551504801791, +ERASE, 94150123233280, 94150123233280, +STORE, 94150123233280, 94150123241471, +STORE, 94150123241472, 94150123245567, +ERASE, 140081290534912, 140081290534912, +STORE, 140081290534912, 140081290539007, +STORE, 140081290539008, 140081290543103, +ERASE, 47551504621568, 47551504621568, +STORE, 94150148112384, 94150148247551, +STORE, 140737488347136, 140737488351231, +STORE, 140734389334016, 140737488351231, +ERASE, 140734389334016, 140734389334016, +STORE, 140734389334016, 140734389338111, +STORE, 94844636606464, 94844636778495, +ERASE, 94844636606464, 94844636606464, +STORE, 94844636606464, 94844636622847, +STORE, 94844636622848, 94844636778495, +ERASE, 94844636622848, 94844636622848, +STORE, 94844636622848, 94844636725247, +STORE, 94844636725248, 94844636766207, +STORE, 94844636766208, 94844636778495, +STORE, 139922765217792, 139922765389823, +ERASE, 139922765217792, 139922765217792, +STORE, 139922765217792, 139922765221887, +STORE, 139922765221888, 139922765389823, +ERASE, 139922765221888, 139922765221888, +STORE, 139922765221888, 139922765344767, +STORE, 139922765344768, 139922765377535, +STORE, 139922765377536, 139922765385727, +STORE, 139922765385728, 139922765389823, +STORE, 140734389678080, 140734389682175, +STORE, 140734389665792, 140734389678079, +STORE, 47710029778944, 47710029787135, +STORE, 47710029787136, 47710029795327, +STORE, 47710029795328, 47710029959167, +ERASE, 47710029795328, 47710029795328, +STORE, 47710029795328, 47710029807615, +STORE, 47710029807616, 47710029959167, +STORE, 
47710029905920, 47710029959167, +STORE, 47710029807616, 47710029905919, +ERASE, 47710029807616, 47710029807616, +STORE, 47710029807616, 47710029905919, +STORE, 47710029950976, 47710029959167, +STORE, 47710029905920, 47710029950975, +ERASE, 47710029905920, 47710029905920, +STORE, 47710029905920, 47710029959167, +ERASE, 47710029905920, 47710029905920, +STORE, 47710029905920, 47710029950975, +STORE, 47710029950976, 47710029959167, +ERASE, 47710029950976, 47710029950976, +STORE, 47710029950976, 47710029959167, +STORE, 47710029959168, 47710033010687, +STORE, 47710030503936, 47710033010687, +STORE, 47710029959168, 47710030503935, +ERASE, 47710030503936, 47710030503936, +STORE, 47710030503936, 47710032789503, +STORE, 47710032789504, 47710033010687, +STORE, 47710032199680, 47710032789503, +STORE, 47710030503936, 47710032199679, +ERASE, 47710030503936, 47710030503936, +STORE, 47710030503936, 47710032199679, +STORE, 47710032785408, 47710032789503, +STORE, 47710032199680, 47710032785407, +ERASE, 47710032199680, 47710032199680, +STORE, 47710032199680, 47710032785407, +STORE, 47710032994304, 47710033010687, +STORE, 47710032789504, 47710032994303, +ERASE, 47710032789504, 47710032789504, +STORE, 47710032789504, 47710032994303, +ERASE, 47710032994304, 47710032994304, +STORE, 47710032994304, 47710033010687, +STORE, 47710033010688, 47710034849791, +STORE, 47710033149952, 47710034849791, +STORE, 47710033010688, 47710033149951, +ERASE, 47710033149952, 47710033149952, +STORE, 47710033149952, 47710034808831, +STORE, 47710034808832, 47710034849791, +STORE, 47710034493440, 47710034808831, +STORE, 47710033149952, 47710034493439, +ERASE, 47710033149952, 47710033149952, +STORE, 47710033149952, 47710034493439, +STORE, 47710034804736, 47710034808831, +STORE, 47710034493440, 47710034804735, +ERASE, 47710034493440, 47710034493440, +STORE, 47710034493440, 47710034804735, +STORE, 47710034833408, 47710034849791, +STORE, 47710034808832, 47710034833407, +ERASE, 47710034808832, 47710034808832, +STORE, 
47710034808832, 47710034833407, +ERASE, 47710034833408, 47710034833408, +STORE, 47710034833408, 47710034849791, +STORE, 47710034849792, 47710034984959, +ERASE, 47710034849792, 47710034849792, +STORE, 47710034849792, 47710034874367, +STORE, 47710034874368, 47710034984959, +STORE, 47710034935808, 47710034984959, +STORE, 47710034874368, 47710034935807, +ERASE, 47710034874368, 47710034874368, +STORE, 47710034874368, 47710034935807, +STORE, 47710034960384, 47710034984959, +STORE, 47710034935808, 47710034960383, +ERASE, 47710034935808, 47710034935808, +STORE, 47710034935808, 47710034984959, +ERASE, 47710034935808, 47710034935808, +STORE, 47710034935808, 47710034960383, +STORE, 47710034960384, 47710034984959, +STORE, 47710034968576, 47710034984959, +STORE, 47710034960384, 47710034968575, +ERASE, 47710034960384, 47710034960384, +STORE, 47710034960384, 47710034968575, +ERASE, 47710034968576, 47710034968576, +STORE, 47710034968576, 47710034984959, +STORE, 47710034984960, 47710035005439, +ERASE, 47710034984960, 47710034984960, +STORE, 47710034984960, 47710034989055, +STORE, 47710034989056, 47710035005439, +STORE, 47710034993152, 47710035005439, +STORE, 47710034989056, 47710034993151, +ERASE, 47710034989056, 47710034989056, +STORE, 47710034989056, 47710034993151, +STORE, 47710034997248, 47710035005439, +STORE, 47710034993152, 47710034997247, +ERASE, 47710034993152, 47710034993152, +STORE, 47710034993152, 47710035005439, +ERASE, 47710034993152, 47710034993152, +STORE, 47710034993152, 47710034997247, +STORE, 47710034997248, 47710035005439, +ERASE, 47710034997248, 47710034997248, +STORE, 47710034997248, 47710035005439, +STORE, 47710035005440, 47710035013631, +ERASE, 47710034808832, 47710034808832, +STORE, 47710034808832, 47710034825215, +STORE, 47710034825216, 47710034833407, +ERASE, 47710034997248, 47710034997248, +STORE, 47710034997248, 47710035001343, +STORE, 47710035001344, 47710035005439, +ERASE, 47710034960384, 47710034960384, +STORE, 47710034960384, 47710034964479, +STORE, 
47710034964480, 47710034968575, +ERASE, 47710032789504, 47710032789504, +STORE, 47710032789504, 47710032986111, +STORE, 47710032986112, 47710032994303, +ERASE, 47710029950976, 47710029950976, +STORE, 47710029950976, 47710029955071, +STORE, 47710029955072, 47710029959167, +ERASE, 94844636766208, 94844636766208, +STORE, 94844636766208, 94844636774399, +STORE, 94844636774400, 94844636778495, +ERASE, 139922765377536, 139922765377536, +STORE, 139922765377536, 139922765381631, +STORE, 139922765381632, 139922765385727, +ERASE, 47710029778944, 47710029778944, +STORE, 94844641775616, 94844641910783, +STORE, 140737488347136, 140737488351231, +STORE, 140732213886976, 140737488351231, +ERASE, 140732213886976, 140732213886976, +STORE, 140732213886976, 140732213891071, +STORE, 94240508887040, 94240509059071, +ERASE, 94240508887040, 94240508887040, +STORE, 94240508887040, 94240508903423, +STORE, 94240508903424, 94240509059071, +ERASE, 94240508903424, 94240508903424, +STORE, 94240508903424, 94240509005823, +STORE, 94240509005824, 94240509046783, +STORE, 94240509046784, 94240509059071, +STORE, 140275106516992, 140275106689023, +ERASE, 140275106516992, 140275106516992, +STORE, 140275106516992, 140275106521087, +STORE, 140275106521088, 140275106689023, +ERASE, 140275106521088, 140275106521088, +STORE, 140275106521088, 140275106643967, +STORE, 140275106643968, 140275106676735, +STORE, 140275106676736, 140275106684927, +STORE, 140275106684928, 140275106689023, +STORE, 140732213977088, 140732213981183, +STORE, 140732213964800, 140732213977087, +STORE, 47357688479744, 47357688487935, +STORE, 47357688487936, 47357688496127, +STORE, 47357688496128, 47357688659967, +ERASE, 47357688496128, 47357688496128, +STORE, 47357688496128, 47357688508415, +STORE, 47357688508416, 47357688659967, +STORE, 47357688606720, 47357688659967, +STORE, 47357688508416, 47357688606719, +ERASE, 47357688508416, 47357688508416, +STORE, 47357688508416, 47357688606719, +STORE, 47357688651776, 47357688659967, +STORE, 
47357688606720, 47357688651775, +ERASE, 47357688606720, 47357688606720, +STORE, 47357688606720, 47357688659967, +ERASE, 47357688606720, 47357688606720, +STORE, 47357688606720, 47357688651775, +STORE, 47357688651776, 47357688659967, +ERASE, 47357688651776, 47357688651776, +STORE, 47357688651776, 47357688659967, +STORE, 47357688659968, 47357691711487, +STORE, 47357689204736, 47357691711487, +STORE, 47357688659968, 47357689204735, +ERASE, 47357689204736, 47357689204736, +STORE, 47357689204736, 47357691490303, +STORE, 47357691490304, 47357691711487, +STORE, 47357690900480, 47357691490303, +STORE, 47357689204736, 47357690900479, +ERASE, 47357689204736, 47357689204736, +STORE, 47357689204736, 47357690900479, +STORE, 47357691486208, 47357691490303, +STORE, 47357690900480, 47357691486207, +ERASE, 47357690900480, 47357690900480, +STORE, 47357690900480, 47357691486207, +STORE, 47357691695104, 47357691711487, +STORE, 47357691490304, 47357691695103, +ERASE, 47357691490304, 47357691490304, +STORE, 47357691490304, 47357691695103, +ERASE, 47357691695104, 47357691695104, +STORE, 47357691695104, 47357691711487, +STORE, 47357691711488, 47357693550591, +STORE, 47357691850752, 47357693550591, +STORE, 47357691711488, 47357691850751, +ERASE, 47357691850752, 47357691850752, +STORE, 47357691850752, 47357693509631, +STORE, 47357693509632, 47357693550591, +STORE, 47357693194240, 47357693509631, +STORE, 47357691850752, 47357693194239, +ERASE, 47357691850752, 47357691850752, +STORE, 47357691850752, 47357693194239, +STORE, 47357693505536, 47357693509631, +STORE, 47357693194240, 47357693505535, +ERASE, 47357693194240, 47357693194240, +STORE, 47357693194240, 47357693505535, +STORE, 47357693534208, 47357693550591, +STORE, 47357693509632, 47357693534207, +ERASE, 47357693509632, 47357693509632, +STORE, 47357693509632, 47357693534207, +ERASE, 47357693534208, 47357693534208, +STORE, 47357693534208, 47357693550591, +STORE, 47357693550592, 47357693685759, +ERASE, 47357693550592, 47357693550592, +STORE, 
47357693550592, 47357693575167, +STORE, 47357693575168, 47357693685759, +STORE, 47357693636608, 47357693685759, +STORE, 47357693575168, 47357693636607, +ERASE, 47357693575168, 47357693575168, +STORE, 47357693575168, 47357693636607, +STORE, 47357693661184, 47357693685759, +STORE, 47357693636608, 47357693661183, +ERASE, 47357693636608, 47357693636608, +STORE, 47357693636608, 47357693685759, +ERASE, 47357693636608, 47357693636608, +STORE, 47357693636608, 47357693661183, +STORE, 47357693661184, 47357693685759, +STORE, 47357693669376, 47357693685759, +STORE, 47357693661184, 47357693669375, +ERASE, 47357693661184, 47357693661184, +STORE, 47357693661184, 47357693669375, +ERASE, 47357693669376, 47357693669376, +STORE, 47357693669376, 47357693685759, +STORE, 47357693685760, 47357693706239, +ERASE, 47357693685760, 47357693685760, +STORE, 47357693685760, 47357693689855, +STORE, 47357693689856, 47357693706239, +STORE, 47357693693952, 47357693706239, +STORE, 47357693689856, 47357693693951, +ERASE, 47357693689856, 47357693689856, +STORE, 47357693689856, 47357693693951, +STORE, 47357693698048, 47357693706239, +STORE, 47357693693952, 47357693698047, +ERASE, 47357693693952, 47357693693952, +STORE, 47357693693952, 47357693706239, +ERASE, 47357693693952, 47357693693952, +STORE, 47357693693952, 47357693698047, +STORE, 47357693698048, 47357693706239, +ERASE, 47357693698048, 47357693698048, +STORE, 47357693698048, 47357693706239, +STORE, 47357693706240, 47357693714431, +ERASE, 47357693509632, 47357693509632, +STORE, 47357693509632, 47357693526015, +STORE, 47357693526016, 47357693534207, +ERASE, 47357693698048, 47357693698048, +STORE, 47357693698048, 47357693702143, +STORE, 47357693702144, 47357693706239, +ERASE, 47357693661184, 47357693661184, +STORE, 47357693661184, 47357693665279, +STORE, 47357693665280, 47357693669375, +ERASE, 47357691490304, 47357691490304, +STORE, 47357691490304, 47357691686911, +STORE, 47357691686912, 47357691695103, +ERASE, 47357688651776, 47357688651776, +STORE, 
47357688651776, 47357688655871, +STORE, 47357688655872, 47357688659967, +ERASE, 94240509046784, 94240509046784, +STORE, 94240509046784, 94240509054975, +STORE, 94240509054976, 94240509059071, +ERASE, 140275106676736, 140275106676736, +STORE, 140275106676736, 140275106680831, +STORE, 140275106680832, 140275106684927, +ERASE, 47357688479744, 47357688479744, +STORE, 94240518361088, 94240518496255, +STORE, 140737488347136, 140737488351231, +STORE, 140732688277504, 140737488351231, +ERASE, 140732688277504, 140732688277504, +STORE, 140732688277504, 140732688281599, +STORE, 94629171351552, 94629172064255, +ERASE, 94629171351552, 94629171351552, +STORE, 94629171351552, 94629171400703, +STORE, 94629171400704, 94629172064255, +ERASE, 94629171400704, 94629171400704, +STORE, 94629171400704, 94629171945471, +STORE, 94629171945472, 94629172043775, +STORE, 94629172043776, 94629172064255, +STORE, 139770707644416, 139770707816447, +ERASE, 139770707644416, 139770707644416, +STORE, 139770707644416, 139770707648511, +STORE, 139770707648512, 139770707816447, +ERASE, 139770707648512, 139770707648512, +STORE, 139770707648512, 139770707771391, +STORE, 139770707771392, 139770707804159, +STORE, 139770707804160, 139770707812351, +STORE, 139770707812352, 139770707816447, +STORE, 140732689121280, 140732689125375, +STORE, 140732689108992, 140732689121279, +STORE, 47862087352320, 47862087360511, +STORE, 47862087360512, 47862087368703, +STORE, 47862087368704, 47862087475199, +STORE, 47862087385088, 47862087475199, +STORE, 47862087368704, 47862087385087, +ERASE, 47862087385088, 47862087385088, +STORE, 47862087385088, 47862087458815, +STORE, 47862087458816, 47862087475199, +STORE, 47862087438336, 47862087458815, +STORE, 47862087385088, 47862087438335, +ERASE, 47862087385088, 47862087385088, +STORE, 47862087385088, 47862087438335, +STORE, 47862087454720, 47862087458815, +STORE, 47862087438336, 47862087454719, +ERASE, 47862087438336, 47862087438336, +STORE, 47862087438336, 47862087454719, +STORE, 
47862087467008, 47862087475199, +STORE, 47862087458816, 47862087467007, +ERASE, 47862087458816, 47862087458816, +STORE, 47862087458816, 47862087467007, +ERASE, 47862087467008, 47862087467008, +STORE, 47862087467008, 47862087475199, +STORE, 47862087475200, 47862089314303, +STORE, 47862087614464, 47862089314303, +STORE, 47862087475200, 47862087614463, +ERASE, 47862087614464, 47862087614464, +STORE, 47862087614464, 47862089273343, +STORE, 47862089273344, 47862089314303, +STORE, 47862088957952, 47862089273343, +STORE, 47862087614464, 47862088957951, +ERASE, 47862087614464, 47862087614464, +STORE, 47862087614464, 47862088957951, +STORE, 47862089269248, 47862089273343, +STORE, 47862088957952, 47862089269247, +ERASE, 47862088957952, 47862088957952, +STORE, 47862088957952, 47862089269247, +STORE, 47862089297920, 47862089314303, +STORE, 47862089273344, 47862089297919, +ERASE, 47862089273344, 47862089273344, +STORE, 47862089273344, 47862089297919, +ERASE, 47862089297920, 47862089297920, +STORE, 47862089297920, 47862089314303, +STORE, 47862089297920, 47862089326591, +ERASE, 47862089273344, 47862089273344, +STORE, 47862089273344, 47862089289727, +STORE, 47862089289728, 47862089297919, +ERASE, 47862087458816, 47862087458816, +STORE, 47862087458816, 47862087462911, +STORE, 47862087462912, 47862087467007, +ERASE, 94629172043776, 94629172043776, +STORE, 94629172043776, 94629172060159, +STORE, 94629172060160, 94629172064255, +ERASE, 139770707804160, 139770707804160, +STORE, 139770707804160, 139770707808255, +STORE, 139770707808256, 139770707812351, +ERASE, 47862087352320, 47862087352320, +STORE, 94629197533184, 94629197668351, +STORE, 140737488347136, 140737488351231, +STORE, 140727540711424, 140737488351231, +ERASE, 140727540711424, 140727540711424, +STORE, 140727540711424, 140727540715519, +STORE, 94299865313280, 94299866025983, +ERASE, 94299865313280, 94299865313280, +STORE, 94299865313280, 94299865362431, +STORE, 94299865362432, 94299866025983, +ERASE, 94299865362432, 
94299865362432, +STORE, 94299865362432, 94299865907199, +STORE, 94299865907200, 94299866005503, +STORE, 94299866005504, 94299866025983, +STORE, 140680268763136, 140680268935167, +ERASE, 140680268763136, 140680268763136, +STORE, 140680268763136, 140680268767231, +STORE, 140680268767232, 140680268935167, +ERASE, 140680268767232, 140680268767232, +STORE, 140680268767232, 140680268890111, +STORE, 140680268890112, 140680268922879, +STORE, 140680268922880, 140680268931071, +STORE, 140680268931072, 140680268935167, +STORE, 140727541424128, 140727541428223, +STORE, 140727541411840, 140727541424127, +STORE, 46952526233600, 46952526241791, +STORE, 46952526241792, 46952526249983, +STORE, 46952526249984, 46952526356479, +STORE, 46952526266368, 46952526356479, +STORE, 46952526249984, 46952526266367, +ERASE, 46952526266368, 46952526266368, +STORE, 46952526266368, 46952526340095, +STORE, 46952526340096, 46952526356479, +STORE, 46952526319616, 46952526340095, +STORE, 46952526266368, 46952526319615, +ERASE, 46952526266368, 46952526266368, +STORE, 46952526266368, 46952526319615, +STORE, 46952526336000, 46952526340095, +STORE, 46952526319616, 46952526335999, +ERASE, 46952526319616, 46952526319616, +STORE, 46952526319616, 46952526335999, +STORE, 46952526348288, 46952526356479, +STORE, 46952526340096, 46952526348287, +ERASE, 46952526340096, 46952526340096, +STORE, 46952526340096, 46952526348287, +ERASE, 46952526348288, 46952526348288, +STORE, 46952526348288, 46952526356479, +STORE, 46952526356480, 46952528195583, +STORE, 46952526495744, 46952528195583, +STORE, 46952526356480, 46952526495743, +ERASE, 46952526495744, 46952526495744, +STORE, 46952526495744, 46952528154623, +STORE, 46952528154624, 46952528195583, +STORE, 46952527839232, 46952528154623, +STORE, 46952526495744, 46952527839231, +ERASE, 46952526495744, 46952526495744, +STORE, 46952526495744, 46952527839231, +STORE, 46952528150528, 46952528154623, +STORE, 46952527839232, 46952528150527, +ERASE, 46952527839232, 46952527839232, 
+STORE, 46952527839232, 46952528150527, +STORE, 46952528179200, 46952528195583, +STORE, 46952528154624, 46952528179199, +ERASE, 46952528154624, 46952528154624, +STORE, 46952528154624, 46952528179199, +ERASE, 46952528179200, 46952528179200, +STORE, 46952528179200, 46952528195583, +STORE, 46952528179200, 46952528207871, +ERASE, 46952528154624, 46952528154624, +STORE, 46952528154624, 46952528171007, +STORE, 46952528171008, 46952528179199, +ERASE, 46952526340096, 46952526340096, +STORE, 46952526340096, 46952526344191, +STORE, 46952526344192, 46952526348287, +ERASE, 94299866005504, 94299866005504, +STORE, 94299866005504, 94299866021887, +STORE, 94299866021888, 94299866025983, +ERASE, 140680268922880, 140680268922880, +STORE, 140680268922880, 140680268926975, +STORE, 140680268926976, 140680268931071, +ERASE, 46952526233600, 46952526233600, +STORE, 140737488347136, 140737488351231, +STORE, 140722874793984, 140737488351231, +ERASE, 140722874793984, 140722874793984, +STORE, 140722874793984, 140722874798079, +STORE, 94448916213760, 94448916926463, +ERASE, 94448916213760, 94448916213760, +STORE, 94448916213760, 94448916262911, +STORE, 94448916262912, 94448916926463, +ERASE, 94448916262912, 94448916262912, +STORE, 94448916262912, 94448916807679, +STORE, 94448916807680, 94448916905983, +STORE, 94448916905984, 94448916926463, +STORE, 140389117046784, 140389117218815, +ERASE, 140389117046784, 140389117046784, +STORE, 140389117046784, 140389117050879, +STORE, 140389117050880, 140389117218815, +ERASE, 140389117050880, 140389117050880, +STORE, 140389117050880, 140389117173759, +STORE, 140389117173760, 140389117206527, +STORE, 140389117206528, 140389117214719, +STORE, 140389117214720, 140389117218815, +STORE, 140722875297792, 140722875301887, +STORE, 140722875285504, 140722875297791, +STORE, 47243677949952, 47243677958143, +STORE, 47243677958144, 47243677966335, +STORE, 47243677966336, 47243678072831, +STORE, 47243677982720, 47243678072831, +STORE, 47243677966336, 47243677982719, 
+ERASE, 47243677982720, 47243677982720, +STORE, 47243677982720, 47243678056447, +STORE, 47243678056448, 47243678072831, +STORE, 47243678035968, 47243678056447, +STORE, 47243677982720, 47243678035967, +ERASE, 47243677982720, 47243677982720, +STORE, 47243677982720, 47243678035967, +STORE, 47243678052352, 47243678056447, +STORE, 47243678035968, 47243678052351, +ERASE, 47243678035968, 47243678035968, +STORE, 47243678035968, 47243678052351, +STORE, 47243678064640, 47243678072831, +STORE, 47243678056448, 47243678064639, +ERASE, 47243678056448, 47243678056448, +STORE, 47243678056448, 47243678064639, +ERASE, 47243678064640, 47243678064640, +STORE, 47243678064640, 47243678072831, +STORE, 47243678072832, 47243679911935, +STORE, 47243678212096, 47243679911935, +STORE, 47243678072832, 47243678212095, +ERASE, 47243678212096, 47243678212096, +STORE, 47243678212096, 47243679870975, +STORE, 47243679870976, 47243679911935, +STORE, 47243679555584, 47243679870975, +STORE, 47243678212096, 47243679555583, +ERASE, 47243678212096, 47243678212096, +STORE, 47243678212096, 47243679555583, +STORE, 47243679866880, 47243679870975, +STORE, 47243679555584, 47243679866879, +ERASE, 47243679555584, 47243679555584, +STORE, 47243679555584, 47243679866879, +STORE, 47243679895552, 47243679911935, +STORE, 47243679870976, 47243679895551, +ERASE, 47243679870976, 47243679870976, +STORE, 47243679870976, 47243679895551, +ERASE, 47243679895552, 47243679895552, +STORE, 47243679895552, 47243679911935, +STORE, 47243679895552, 47243679924223, +ERASE, 47243679870976, 47243679870976, +STORE, 47243679870976, 47243679887359, +STORE, 47243679887360, 47243679895551, +ERASE, 47243678056448, 47243678056448, +STORE, 47243678056448, 47243678060543, +STORE, 47243678060544, 47243678064639, +ERASE, 94448916905984, 94448916905984, +STORE, 94448916905984, 94448916922367, +STORE, 94448916922368, 94448916926463, +ERASE, 140389117206528, 140389117206528, +STORE, 140389117206528, 140389117210623, +STORE, 140389117210624, 
140389117214719, +ERASE, 47243677949952, 47243677949952, +STORE, 140737488347136, 140737488351231, +STORE, 140733068505088, 140737488351231, +ERASE, 140733068505088, 140733068505088, +STORE, 140733068505088, 140733068509183, +STORE, 94207145750528, 94207146463231, +ERASE, 94207145750528, 94207145750528, +STORE, 94207145750528, 94207145799679, +STORE, 94207145799680, 94207146463231, +ERASE, 94207145799680, 94207145799680, +STORE, 94207145799680, 94207146344447, +STORE, 94207146344448, 94207146442751, +STORE, 94207146442752, 94207146463231, +STORE, 140684504911872, 140684505083903, +ERASE, 140684504911872, 140684504911872, +STORE, 140684504911872, 140684504915967, +STORE, 140684504915968, 140684505083903, +ERASE, 140684504915968, 140684504915968, +STORE, 140684504915968, 140684505038847, +STORE, 140684505038848, 140684505071615, +STORE, 140684505071616, 140684505079807, +STORE, 140684505079808, 140684505083903, +STORE, 140733068607488, 140733068611583, +STORE, 140733068595200, 140733068607487, +STORE, 46948290084864, 46948290093055, +STORE, 46948290093056, 46948290101247, +STORE, 46948290101248, 46948290207743, +STORE, 46948290117632, 46948290207743, +STORE, 46948290101248, 46948290117631, +ERASE, 46948290117632, 46948290117632, +STORE, 46948290117632, 46948290191359, +STORE, 46948290191360, 46948290207743, +STORE, 46948290170880, 46948290191359, +STORE, 46948290117632, 46948290170879, +ERASE, 46948290117632, 46948290117632, +STORE, 46948290117632, 46948290170879, +STORE, 46948290187264, 46948290191359, +STORE, 46948290170880, 46948290187263, +ERASE, 46948290170880, 46948290170880, +STORE, 46948290170880, 46948290187263, +STORE, 46948290199552, 46948290207743, +STORE, 46948290191360, 46948290199551, +ERASE, 46948290191360, 46948290191360, +STORE, 46948290191360, 46948290199551, +ERASE, 46948290199552, 46948290199552, +STORE, 46948290199552, 46948290207743, +STORE, 46948290207744, 46948292046847, +STORE, 46948290347008, 46948292046847, +STORE, 46948290207744, 
46948290347007, +ERASE, 46948290347008, 46948290347008, +STORE, 46948290347008, 46948292005887, +STORE, 46948292005888, 46948292046847, +STORE, 46948291690496, 46948292005887, +STORE, 46948290347008, 46948291690495, +ERASE, 46948290347008, 46948290347008, +STORE, 46948290347008, 46948291690495, +STORE, 46948292001792, 46948292005887, +STORE, 46948291690496, 46948292001791, +ERASE, 46948291690496, 46948291690496, +STORE, 46948291690496, 46948292001791, +STORE, 46948292030464, 46948292046847, +STORE, 46948292005888, 46948292030463, +ERASE, 46948292005888, 46948292005888, +STORE, 46948292005888, 46948292030463, +ERASE, 46948292030464, 46948292030464, +STORE, 46948292030464, 46948292046847, +STORE, 46948292030464, 46948292059135, +ERASE, 46948292005888, 46948292005888, +STORE, 46948292005888, 46948292022271, +STORE, 46948292022272, 46948292030463, +ERASE, 46948290191360, 46948290191360, +STORE, 46948290191360, 46948290195455, +STORE, 46948290195456, 46948290199551, +ERASE, 94207146442752, 94207146442752, +STORE, 94207146442752, 94207146459135, +STORE, 94207146459136, 94207146463231, +ERASE, 140684505071616, 140684505071616, +STORE, 140684505071616, 140684505075711, +STORE, 140684505075712, 140684505079807, +ERASE, 46948290084864, 46948290084864, +STORE, 140737488347136, 140737488351231, +STORE, 140726367158272, 140737488351231, +ERASE, 140726367158272, 140726367158272, +STORE, 140726367158272, 140726367162367, +STORE, 94436124106752, 94436124819455, +ERASE, 94436124106752, 94436124106752, +STORE, 94436124106752, 94436124155903, +STORE, 94436124155904, 94436124819455, +ERASE, 94436124155904, 94436124155904, +STORE, 94436124155904, 94436124700671, +STORE, 94436124700672, 94436124798975, +STORE, 94436124798976, 94436124819455, +STORE, 140049025044480, 140049025216511, +ERASE, 140049025044480, 140049025044480, +STORE, 140049025044480, 140049025048575, +STORE, 140049025048576, 140049025216511, +ERASE, 140049025048576, 140049025048576, +STORE, 140049025048576, 
140049025171455, +STORE, 140049025171456, 140049025204223, +STORE, 140049025204224, 140049025212415, +STORE, 140049025212416, 140049025216511, +STORE, 140726367256576, 140726367260671, +STORE, 140726367244288, 140726367256575, +STORE, 47583769952256, 47583769960447, +STORE, 47583769960448, 47583769968639, +STORE, 47583769968640, 47583770075135, +STORE, 47583769985024, 47583770075135, +STORE, 47583769968640, 47583769985023, +ERASE, 47583769985024, 47583769985024, +STORE, 47583769985024, 47583770058751, +STORE, 47583770058752, 47583770075135, +STORE, 47583770038272, 47583770058751, +STORE, 47583769985024, 47583770038271, +ERASE, 47583769985024, 47583769985024, +STORE, 47583769985024, 47583770038271, +STORE, 47583770054656, 47583770058751, +STORE, 47583770038272, 47583770054655, +ERASE, 47583770038272, 47583770038272, +STORE, 47583770038272, 47583770054655, +STORE, 47583770066944, 47583770075135, +STORE, 47583770058752, 47583770066943, +ERASE, 47583770058752, 47583770058752, +STORE, 47583770058752, 47583770066943, +ERASE, 47583770066944, 47583770066944, +STORE, 47583770066944, 47583770075135, +STORE, 47583770075136, 47583771914239, +STORE, 47583770214400, 47583771914239, +STORE, 47583770075136, 47583770214399, +ERASE, 47583770214400, 47583770214400, +STORE, 47583770214400, 47583771873279, +STORE, 47583771873280, 47583771914239, +STORE, 47583771557888, 47583771873279, +STORE, 47583770214400, 47583771557887, +ERASE, 47583770214400, 47583770214400, +STORE, 47583770214400, 47583771557887, +STORE, 47583771869184, 47583771873279, +STORE, 47583771557888, 47583771869183, +ERASE, 47583771557888, 47583771557888, +STORE, 47583771557888, 47583771869183, +STORE, 47583771897856, 47583771914239, +STORE, 47583771873280, 47583771897855, +ERASE, 47583771873280, 47583771873280, +STORE, 47583771873280, 47583771897855, +ERASE, 47583771897856, 47583771897856, +STORE, 47583771897856, 47583771914239, +STORE, 47583771897856, 47583771926527, +ERASE, 47583771873280, 47583771873280, +STORE, 
47583771873280, 47583771889663, +STORE, 47583771889664, 47583771897855, +ERASE, 47583770058752, 47583770058752, +STORE, 47583770058752, 47583770062847, +STORE, 47583770062848, 47583770066943, +ERASE, 94436124798976, 94436124798976, +STORE, 94436124798976, 94436124815359, +STORE, 94436124815360, 94436124819455, +ERASE, 140049025204224, 140049025204224, +STORE, 140049025204224, 140049025208319, +STORE, 140049025208320, 140049025212415, +ERASE, 47583769952256, 47583769952256, +STORE, 140737488347136, 140737488351231, +STORE, 140727116099584, 140737488351231, +ERASE, 140727116099584, 140727116099584, +STORE, 140727116099584, 140727116103679, +STORE, 94166319734784, 94166320447487, +ERASE, 94166319734784, 94166319734784, +STORE, 94166319734784, 94166319783935, +STORE, 94166319783936, 94166320447487, +ERASE, 94166319783936, 94166319783936, +STORE, 94166319783936, 94166320328703, +STORE, 94166320328704, 94166320427007, +STORE, 94166320427008, 94166320447487, +STORE, 139976559542272, 139976559714303, +ERASE, 139976559542272, 139976559542272, +STORE, 139976559542272, 139976559546367, +STORE, 139976559546368, 139976559714303, +ERASE, 139976559546368, 139976559546368, +STORE, 139976559546368, 139976559669247, +STORE, 139976559669248, 139976559702015, +STORE, 139976559702016, 139976559710207, +STORE, 139976559710208, 139976559714303, +STORE, 140727116222464, 140727116226559, +STORE, 140727116210176, 140727116222463, +STORE, 47656235454464, 47656235462655, +STORE, 47656235462656, 47656235470847, +STORE, 47656235470848, 47656235577343, +STORE, 47656235487232, 47656235577343, +STORE, 47656235470848, 47656235487231, +ERASE, 47656235487232, 47656235487232, +STORE, 47656235487232, 47656235560959, +STORE, 47656235560960, 47656235577343, +STORE, 47656235540480, 47656235560959, +STORE, 47656235487232, 47656235540479, +ERASE, 47656235487232, 47656235487232, +STORE, 47656235487232, 47656235540479, +STORE, 47656235556864, 47656235560959, +STORE, 47656235540480, 47656235556863, +ERASE, 
47656235540480, 47656235540480, +STORE, 47656235540480, 47656235556863, +STORE, 47656235569152, 47656235577343, +STORE, 47656235560960, 47656235569151, +ERASE, 47656235560960, 47656235560960, +STORE, 47656235560960, 47656235569151, +ERASE, 47656235569152, 47656235569152, +STORE, 47656235569152, 47656235577343, +STORE, 47656235577344, 47656237416447, +STORE, 47656235716608, 47656237416447, +STORE, 47656235577344, 47656235716607, +ERASE, 47656235716608, 47656235716608, +STORE, 47656235716608, 47656237375487, +STORE, 47656237375488, 47656237416447, +STORE, 47656237060096, 47656237375487, +STORE, 47656235716608, 47656237060095, +ERASE, 47656235716608, 47656235716608, +STORE, 47656235716608, 47656237060095, +STORE, 47656237371392, 47656237375487, +STORE, 47656237060096, 47656237371391, +ERASE, 47656237060096, 47656237060096, +STORE, 47656237060096, 47656237371391, +STORE, 47656237400064, 47656237416447, +STORE, 47656237375488, 47656237400063, +ERASE, 47656237375488, 47656237375488, +STORE, 47656237375488, 47656237400063, +ERASE, 47656237400064, 47656237400064, +STORE, 47656237400064, 47656237416447, +STORE, 47656237400064, 47656237428735, +ERASE, 47656237375488, 47656237375488, +STORE, 47656237375488, 47656237391871, +STORE, 47656237391872, 47656237400063, +ERASE, 47656235560960, 47656235560960, +STORE, 47656235560960, 47656235565055, +STORE, 47656235565056, 47656235569151, +ERASE, 94166320427008, 94166320427008, +STORE, 94166320427008, 94166320443391, +STORE, 94166320443392, 94166320447487, +ERASE, 139976559702016, 139976559702016, +STORE, 139976559702016, 139976559706111, +STORE, 139976559706112, 139976559710207, +ERASE, 47656235454464, 47656235454464, +STORE, 94166332153856, 94166332289023, +STORE, 140737488347136, 140737488351231, +STORE, 140726412816384, 140737488351231, +ERASE, 140726412816384, 140726412816384, +STORE, 140726412816384, 140726412820479, +STORE, 94094884507648, 94094885220351, +ERASE, 94094884507648, 94094884507648, +STORE, 94094884507648, 
94094884556799, +STORE, 94094884556800, 94094885220351, +ERASE, 94094884556800, 94094884556800, +STORE, 94094884556800, 94094885101567, +STORE, 94094885101568, 94094885199871, +STORE, 94094885199872, 94094885220351, +STORE, 139773773938688, 139773774110719, +ERASE, 139773773938688, 139773773938688, +STORE, 139773773938688, 139773773942783, +STORE, 139773773942784, 139773774110719, +ERASE, 139773773942784, 139773773942784, +STORE, 139773773942784, 139773774065663, +STORE, 139773774065664, 139773774098431, +STORE, 139773774098432, 139773774106623, +STORE, 139773774106624, 139773774110719, +STORE, 140726412963840, 140726412967935, +STORE, 140726412951552, 140726412963839, +STORE, 47859021058048, 47859021066239, +STORE, 47859021066240, 47859021074431, +STORE, 47859021074432, 47859021180927, +STORE, 47859021090816, 47859021180927, +STORE, 47859021074432, 47859021090815, +ERASE, 47859021090816, 47859021090816, +STORE, 47859021090816, 47859021164543, +STORE, 47859021164544, 47859021180927, +STORE, 47859021144064, 47859021164543, +STORE, 47859021090816, 47859021144063, +ERASE, 47859021090816, 47859021090816, +STORE, 47859021090816, 47859021144063, +STORE, 47859021160448, 47859021164543, +STORE, 47859021144064, 47859021160447, +ERASE, 47859021144064, 47859021144064, +STORE, 47859021144064, 47859021160447, +STORE, 47859021172736, 47859021180927, +STORE, 47859021164544, 47859021172735, +ERASE, 47859021164544, 47859021164544, +STORE, 47859021164544, 47859021172735, +ERASE, 47859021172736, 47859021172736, +STORE, 47859021172736, 47859021180927, +STORE, 47859021180928, 47859023020031, +STORE, 47859021320192, 47859023020031, +STORE, 47859021180928, 47859021320191, +ERASE, 47859021320192, 47859021320192, +STORE, 47859021320192, 47859022979071, +STORE, 47859022979072, 47859023020031, +STORE, 47859022663680, 47859022979071, +STORE, 47859021320192, 47859022663679, +ERASE, 47859021320192, 47859021320192, +STORE, 47859021320192, 47859022663679, +STORE, 47859022974976, 47859022979071, 
+STORE, 47859022663680, 47859022974975, +ERASE, 47859022663680, 47859022663680, +STORE, 47859022663680, 47859022974975, +STORE, 47859023003648, 47859023020031, +STORE, 47859022979072, 47859023003647, +ERASE, 47859022979072, 47859022979072, +STORE, 47859022979072, 47859023003647, +ERASE, 47859023003648, 47859023003648, +STORE, 47859023003648, 47859023020031, +STORE, 47859023003648, 47859023032319, +ERASE, 47859022979072, 47859022979072, +STORE, 47859022979072, 47859022995455, +STORE, 47859022995456, 47859023003647, +ERASE, 47859021164544, 47859021164544, +STORE, 47859021164544, 47859021168639, +STORE, 47859021168640, 47859021172735, +ERASE, 94094885199872, 94094885199872, +STORE, 94094885199872, 94094885216255, +STORE, 94094885216256, 94094885220351, +ERASE, 139773774098432, 139773774098432, +STORE, 139773774098432, 139773774102527, +STORE, 139773774102528, 139773774106623, +ERASE, 47859021058048, 47859021058048, +STORE, 94094901108736, 94094901243903, +STORE, 140737488347136, 140737488351231, +STORE, 140736567963648, 140737488351231, +ERASE, 140736567963648, 140736567963648, +STORE, 140736567963648, 140736567967743, +STORE, 94924425748480, 94924426461183, +ERASE, 94924425748480, 94924425748480, +STORE, 94924425748480, 94924425797631, +STORE, 94924425797632, 94924426461183, +ERASE, 94924425797632, 94924425797632, +STORE, 94924425797632, 94924426342399, +STORE, 94924426342400, 94924426440703, +STORE, 94924426440704, 94924426461183, +STORE, 140042126319616, 140042126491647, +ERASE, 140042126319616, 140042126319616, +STORE, 140042126319616, 140042126323711, +STORE, 140042126323712, 140042126491647, +ERASE, 140042126323712, 140042126323712, +STORE, 140042126323712, 140042126446591, +STORE, 140042126446592, 140042126479359, +STORE, 140042126479360, 140042126487551, +STORE, 140042126487552, 140042126491647, +STORE, 140736568672256, 140736568676351, +STORE, 140736568659968, 140736568672255, +STORE, 47590668677120, 47590668685311, +STORE, 47590668685312, 47590668693503, 
+STORE, 47590668693504, 47590668799999, +STORE, 47590668709888, 47590668799999, +STORE, 47590668693504, 47590668709887, +ERASE, 47590668709888, 47590668709888, +STORE, 47590668709888, 47590668783615, +STORE, 47590668783616, 47590668799999, +STORE, 47590668763136, 47590668783615, +STORE, 47590668709888, 47590668763135, +ERASE, 47590668709888, 47590668709888, +STORE, 47590668709888, 47590668763135, +STORE, 47590668779520, 47590668783615, +STORE, 47590668763136, 47590668779519, +ERASE, 47590668763136, 47590668763136, +STORE, 47590668763136, 47590668779519, +STORE, 47590668791808, 47590668799999, +STORE, 47590668783616, 47590668791807, +ERASE, 47590668783616, 47590668783616, +STORE, 47590668783616, 47590668791807, +ERASE, 47590668791808, 47590668791808, +STORE, 47590668791808, 47590668799999, +STORE, 47590668800000, 47590670639103, +STORE, 47590668939264, 47590670639103, +STORE, 47590668800000, 47590668939263, +ERASE, 47590668939264, 47590668939264, +STORE, 47590668939264, 47590670598143, +STORE, 47590670598144, 47590670639103, +STORE, 47590670282752, 47590670598143, +STORE, 47590668939264, 47590670282751, +ERASE, 47590668939264, 47590668939264, +STORE, 47590668939264, 47590670282751, +STORE, 47590670594048, 47590670598143, +STORE, 47590670282752, 47590670594047, +ERASE, 47590670282752, 47590670282752, +STORE, 47590670282752, 47590670594047, +STORE, 47590670622720, 47590670639103, +STORE, 47590670598144, 47590670622719, +ERASE, 47590670598144, 47590670598144, +STORE, 47590670598144, 47590670622719, +ERASE, 47590670622720, 47590670622720, +STORE, 47590670622720, 47590670639103, +STORE, 47590670622720, 47590670651391, +ERASE, 47590670598144, 47590670598144, +STORE, 47590670598144, 47590670614527, +STORE, 47590670614528, 47590670622719, +ERASE, 47590668783616, 47590668783616, +STORE, 47590668783616, 47590668787711, +STORE, 47590668787712, 47590668791807, +ERASE, 94924426440704, 94924426440704, +STORE, 94924426440704, 94924426457087, +STORE, 94924426457088, 94924426461183, 
+ERASE, 140042126479360, 140042126479360, +STORE, 140042126479360, 140042126483455, +STORE, 140042126483456, 140042126487551, +ERASE, 47590668677120, 47590668677120, +STORE, 140737488347136, 140737488351231, +STORE, 140733281439744, 140737488351231, +ERASE, 140733281439744, 140733281439744, +STORE, 140733281439744, 140733281443839, +STORE, 94490667069440, 94490667782143, +ERASE, 94490667069440, 94490667069440, +STORE, 94490667069440, 94490667118591, +STORE, 94490667118592, 94490667782143, +ERASE, 94490667118592, 94490667118592, +STORE, 94490667118592, 94490667663359, +STORE, 94490667663360, 94490667761663, +STORE, 94490667761664, 94490667782143, +STORE, 139878215118848, 139878215290879, +ERASE, 139878215118848, 139878215118848, +STORE, 139878215118848, 139878215122943, +STORE, 139878215122944, 139878215290879, +ERASE, 139878215122944, 139878215122944, +STORE, 139878215122944, 139878215245823, +STORE, 139878215245824, 139878215278591, +STORE, 139878215278592, 139878215286783, +STORE, 139878215286784, 139878215290879, +STORE, 140733281464320, 140733281468415, +STORE, 140733281452032, 140733281464319, +STORE, 47754579877888, 47754579886079, +STORE, 47754579886080, 47754579894271, +STORE, 47754579894272, 47754580000767, +STORE, 47754579910656, 47754580000767, +STORE, 47754579894272, 47754579910655, +ERASE, 47754579910656, 47754579910656, +STORE, 47754579910656, 47754579984383, +STORE, 47754579984384, 47754580000767, +STORE, 47754579963904, 47754579984383, +STORE, 47754579910656, 47754579963903, +ERASE, 47754579910656, 47754579910656, +STORE, 47754579910656, 47754579963903, +STORE, 47754579980288, 47754579984383, +STORE, 47754579963904, 47754579980287, +ERASE, 47754579963904, 47754579963904, +STORE, 47754579963904, 47754579980287, +STORE, 47754579992576, 47754580000767, +STORE, 47754579984384, 47754579992575, +ERASE, 47754579984384, 47754579984384, +STORE, 47754579984384, 47754579992575, +ERASE, 47754579992576, 47754579992576, +STORE, 47754579992576, 47754580000767, 
+STORE, 47754580000768, 47754581839871, +STORE, 47754580140032, 47754581839871, +STORE, 47754580000768, 47754580140031, +ERASE, 47754580140032, 47754580140032, +STORE, 47754580140032, 47754581798911, +STORE, 47754581798912, 47754581839871, +STORE, 47754581483520, 47754581798911, +STORE, 47754580140032, 47754581483519, +ERASE, 47754580140032, 47754580140032, +STORE, 47754580140032, 47754581483519, +STORE, 47754581794816, 47754581798911, +STORE, 47754581483520, 47754581794815, +ERASE, 47754581483520, 47754581483520, +STORE, 47754581483520, 47754581794815, +STORE, 47754581823488, 47754581839871, +STORE, 47754581798912, 47754581823487, +ERASE, 47754581798912, 47754581798912, +STORE, 47754581798912, 47754581823487, +ERASE, 47754581823488, 47754581823488, +STORE, 47754581823488, 47754581839871, +STORE, 47754581823488, 47754581852159, +ERASE, 47754581798912, 47754581798912, +STORE, 47754581798912, 47754581815295, +STORE, 47754581815296, 47754581823487, +ERASE, 47754579984384, 47754579984384, +STORE, 47754579984384, 47754579988479, +STORE, 47754579988480, 47754579992575, +ERASE, 94490667761664, 94490667761664, +STORE, 94490667761664, 94490667778047, +STORE, 94490667778048, 94490667782143, +ERASE, 139878215278592, 139878215278592, +STORE, 139878215278592, 139878215282687, +STORE, 139878215282688, 139878215286783, +ERASE, 47754579877888, 47754579877888, +STORE, 94490669649920, 94490669785087, +STORE, 140737488347136, 140737488351231, +STORE, 140735382188032, 140737488351231, +ERASE, 140735382188032, 140735382188032, +STORE, 140735382188032, 140735382192127, +STORE, 94150181302272, 94150182014975, +ERASE, 94150181302272, 94150181302272, +STORE, 94150181302272, 94150181351423, +STORE, 94150181351424, 94150182014975, +ERASE, 94150181351424, 94150181351424, +STORE, 94150181351424, 94150181896191, +STORE, 94150181896192, 94150181994495, +STORE, 94150181994496, 94150182014975, +STORE, 139679752458240, 139679752630271, +ERASE, 139679752458240, 139679752458240, +STORE, 
139679752458240, 139679752462335, +STORE, 139679752462336, 139679752630271, +ERASE, 139679752462336, 139679752462336, +STORE, 139679752462336, 139679752585215, +STORE, 139679752585216, 139679752617983, +STORE, 139679752617984, 139679752626175, +STORE, 139679752626176, 139679752630271, +STORE, 140735382536192, 140735382540287, +STORE, 140735382523904, 140735382536191, +STORE, 47953042538496, 47953042546687, +STORE, 47953042546688, 47953042554879, +STORE, 47953042554880, 47953042661375, +STORE, 47953042571264, 47953042661375, +STORE, 47953042554880, 47953042571263, +ERASE, 47953042571264, 47953042571264, +STORE, 47953042571264, 47953042644991, +STORE, 47953042644992, 47953042661375, +STORE, 47953042624512, 47953042644991, +STORE, 47953042571264, 47953042624511, +ERASE, 47953042571264, 47953042571264, +STORE, 47953042571264, 47953042624511, +STORE, 47953042640896, 47953042644991, +STORE, 47953042624512, 47953042640895, +ERASE, 47953042624512, 47953042624512, +STORE, 47953042624512, 47953042640895, +STORE, 47953042653184, 47953042661375, +STORE, 47953042644992, 47953042653183, +ERASE, 47953042644992, 47953042644992, +STORE, 47953042644992, 47953042653183, +ERASE, 47953042653184, 47953042653184, +STORE, 47953042653184, 47953042661375, +STORE, 47953042661376, 47953044500479, +STORE, 47953042800640, 47953044500479, +STORE, 47953042661376, 47953042800639, +ERASE, 47953042800640, 47953042800640, +STORE, 47953042800640, 47953044459519, +STORE, 47953044459520, 47953044500479, +STORE, 47953044144128, 47953044459519, +STORE, 47953042800640, 47953044144127, +ERASE, 47953042800640, 47953042800640, +STORE, 47953042800640, 47953044144127, +STORE, 47953044455424, 47953044459519, +STORE, 47953044144128, 47953044455423, +ERASE, 47953044144128, 47953044144128, +STORE, 47953044144128, 47953044455423, +STORE, 47953044484096, 47953044500479, +STORE, 47953044459520, 47953044484095, +ERASE, 47953044459520, 47953044459520, +STORE, 47953044459520, 47953044484095, +ERASE, 47953044484096, 
47953044484096, +STORE, 47953044484096, 47953044500479, +STORE, 47953044484096, 47953044512767, +ERASE, 47953044459520, 47953044459520, +STORE, 47953044459520, 47953044475903, +STORE, 47953044475904, 47953044484095, +ERASE, 47953042644992, 47953042644992, +STORE, 47953042644992, 47953042649087, +STORE, 47953042649088, 47953042653183, +ERASE, 94150181994496, 94150181994496, +STORE, 94150181994496, 94150182010879, +STORE, 94150182010880, 94150182014975, +ERASE, 139679752617984, 139679752617984, +STORE, 139679752617984, 139679752622079, +STORE, 139679752622080, 139679752626175, +ERASE, 47953042538496, 47953042538496, +STORE, 140737488347136, 140737488351231, +STORE, 140737044123648, 140737488351231, +ERASE, 140737044123648, 140737044123648, +STORE, 140737044123648, 140737044127743, +STORE, 94425324294144, 94425325006847, +ERASE, 94425324294144, 94425324294144, +STORE, 94425324294144, 94425324343295, +STORE, 94425324343296, 94425325006847, +ERASE, 94425324343296, 94425324343296, +STORE, 94425324343296, 94425324888063, +STORE, 94425324888064, 94425324986367, +STORE, 94425324986368, 94425325006847, +STORE, 140382015016960, 140382015188991, +ERASE, 140382015016960, 140382015016960, +STORE, 140382015016960, 140382015021055, +STORE, 140382015021056, 140382015188991, +ERASE, 140382015021056, 140382015021056, +STORE, 140382015021056, 140382015143935, +STORE, 140382015143936, 140382015176703, +STORE, 140382015176704, 140382015184895, +STORE, 140382015184896, 140382015188991, +STORE, 140737045585920, 140737045590015, +STORE, 140737045573632, 140737045585919, +STORE, 47250779979776, 47250779987967, +STORE, 47250779987968, 47250779996159, +STORE, 47250779996160, 47250780102655, +STORE, 47250780012544, 47250780102655, +STORE, 47250779996160, 47250780012543, +ERASE, 47250780012544, 47250780012544, +STORE, 47250780012544, 47250780086271, +STORE, 47250780086272, 47250780102655, +STORE, 47250780065792, 47250780086271, +STORE, 47250780012544, 47250780065791, +ERASE, 47250780012544, 
47250780012544, +STORE, 47250780012544, 47250780065791, +STORE, 47250780082176, 47250780086271, +STORE, 47250780065792, 47250780082175, +ERASE, 47250780065792, 47250780065792, +STORE, 47250780065792, 47250780082175, +STORE, 47250780094464, 47250780102655, +STORE, 47250780086272, 47250780094463, +ERASE, 47250780086272, 47250780086272, +STORE, 47250780086272, 47250780094463, +ERASE, 47250780094464, 47250780094464, +STORE, 47250780094464, 47250780102655, +STORE, 47250780102656, 47250781941759, +STORE, 47250780241920, 47250781941759, +STORE, 47250780102656, 47250780241919, +ERASE, 47250780241920, 47250780241920, +STORE, 47250780241920, 47250781900799, +STORE, 47250781900800, 47250781941759, +STORE, 47250781585408, 47250781900799, +STORE, 47250780241920, 47250781585407, +ERASE, 47250780241920, 47250780241920, +STORE, 47250780241920, 47250781585407, +STORE, 47250781896704, 47250781900799, +STORE, 47250781585408, 47250781896703, +ERASE, 47250781585408, 47250781585408, +STORE, 47250781585408, 47250781896703, +STORE, 47250781925376, 47250781941759, +STORE, 47250781900800, 47250781925375, +ERASE, 47250781900800, 47250781900800, +STORE, 47250781900800, 47250781925375, +ERASE, 47250781925376, 47250781925376, +STORE, 47250781925376, 47250781941759, +STORE, 47250781925376, 47250781954047, +ERASE, 47250781900800, 47250781900800, +STORE, 47250781900800, 47250781917183, +STORE, 47250781917184, 47250781925375, +ERASE, 47250780086272, 47250780086272, +STORE, 47250780086272, 47250780090367, +STORE, 47250780090368, 47250780094463, +ERASE, 94425324986368, 94425324986368, +STORE, 94425324986368, 94425325002751, +STORE, 94425325002752, 94425325006847, +ERASE, 140382015176704, 140382015176704, +STORE, 140382015176704, 140382015180799, +STORE, 140382015180800, 140382015184895, +ERASE, 47250779979776, 47250779979776, +STORE, 94425351438336, 94425351573503, +STORE, 140737488347136, 140737488351231, +STORE, 140736801144832, 140737488351231, +ERASE, 140736801144832, 140736801144832, +STORE, 
140736801144832, 140736801148927, +STORE, 94629429358592, 94629430071295, +ERASE, 94629429358592, 94629429358592, +STORE, 94629429358592, 94629429407743, +STORE, 94629429407744, 94629430071295, +ERASE, 94629429407744, 94629429407744, +STORE, 94629429407744, 94629429952511, +STORE, 94629429952512, 94629430050815, +STORE, 94629430050816, 94629430071295, +STORE, 139801685483520, 139801685655551, +ERASE, 139801685483520, 139801685483520, +STORE, 139801685483520, 139801685487615, +STORE, 139801685487616, 139801685655551, +ERASE, 139801685487616, 139801685487616, +STORE, 139801685487616, 139801685610495, +STORE, 139801685610496, 139801685643263, +STORE, 139801685643264, 139801685651455, +STORE, 139801685651456, 139801685655551, +STORE, 140736801198080, 140736801202175, +STORE, 140736801185792, 140736801198079, +STORE, 47831109513216, 47831109521407, +STORE, 47831109521408, 47831109529599, +STORE, 47831109529600, 47831109636095, +STORE, 47831109545984, 47831109636095, +STORE, 47831109529600, 47831109545983, +ERASE, 47831109545984, 47831109545984, +STORE, 47831109545984, 47831109619711, +STORE, 47831109619712, 47831109636095, +STORE, 47831109599232, 47831109619711, +STORE, 47831109545984, 47831109599231, +ERASE, 47831109545984, 47831109545984, +STORE, 47831109545984, 47831109599231, +STORE, 47831109615616, 47831109619711, +STORE, 47831109599232, 47831109615615, +ERASE, 47831109599232, 47831109599232, +STORE, 47831109599232, 47831109615615, +STORE, 47831109627904, 47831109636095, +STORE, 47831109619712, 47831109627903, +ERASE, 47831109619712, 47831109619712, +STORE, 47831109619712, 47831109627903, +ERASE, 47831109627904, 47831109627904, +STORE, 47831109627904, 47831109636095, +STORE, 47831109636096, 47831111475199, +STORE, 47831109775360, 47831111475199, +STORE, 47831109636096, 47831109775359, +ERASE, 47831109775360, 47831109775360, +STORE, 47831109775360, 47831111434239, +STORE, 47831111434240, 47831111475199, +STORE, 47831111118848, 47831111434239, +STORE, 47831109775360, 
47831111118847, +ERASE, 47831109775360, 47831109775360, +STORE, 47831109775360, 47831111118847, +STORE, 47831111430144, 47831111434239, +STORE, 47831111118848, 47831111430143, +ERASE, 47831111118848, 47831111118848, +STORE, 47831111118848, 47831111430143, +STORE, 47831111458816, 47831111475199, +STORE, 47831111434240, 47831111458815, +ERASE, 47831111434240, 47831111434240, +STORE, 47831111434240, 47831111458815, +ERASE, 47831111458816, 47831111458816, +STORE, 47831111458816, 47831111475199, +STORE, 47831111458816, 47831111487487, +ERASE, 47831111434240, 47831111434240, +STORE, 47831111434240, 47831111450623, +STORE, 47831111450624, 47831111458815, +ERASE, 47831109619712, 47831109619712, +STORE, 47831109619712, 47831109623807, +STORE, 47831109623808, 47831109627903, +ERASE, 94629430050816, 94629430050816, +STORE, 94629430050816, 94629430067199, +STORE, 94629430067200, 94629430071295, +ERASE, 139801685643264, 139801685643264, +STORE, 139801685643264, 139801685647359, +STORE, 139801685647360, 139801685651455, +ERASE, 47831109513216, 47831109513216, +STORE, 140737488347136, 140737488351231, +STORE, 140729419612160, 140737488351231, +ERASE, 140729419612160, 140729419612160, +STORE, 140729419612160, 140729419616255, +STORE, 94443354148864, 94443354861567, +ERASE, 94443354148864, 94443354148864, +STORE, 94443354148864, 94443354198015, +STORE, 94443354198016, 94443354861567, +ERASE, 94443354198016, 94443354198016, +STORE, 94443354198016, 94443354742783, +STORE, 94443354742784, 94443354841087, +STORE, 94443354841088, 94443354861567, +STORE, 139741700038656, 139741700210687, +ERASE, 139741700038656, 139741700038656, +STORE, 139741700038656, 139741700042751, +STORE, 139741700042752, 139741700210687, +ERASE, 139741700042752, 139741700042752, +STORE, 139741700042752, 139741700165631, +STORE, 139741700165632, 139741700198399, +STORE, 139741700198400, 139741700206591, +STORE, 139741700206592, 139741700210687, +STORE, 140729420574720, 140729420578815, +STORE, 140729420562432, 
140729420574719, +STORE, 47891094958080, 47891094966271, +STORE, 47891094966272, 47891094974463, +STORE, 47891094974464, 47891095080959, +STORE, 47891094990848, 47891095080959, +STORE, 47891094974464, 47891094990847, +ERASE, 47891094990848, 47891094990848, +STORE, 47891094990848, 47891095064575, +STORE, 47891095064576, 47891095080959, +STORE, 47891095044096, 47891095064575, +STORE, 47891094990848, 47891095044095, +ERASE, 47891094990848, 47891094990848, +STORE, 47891094990848, 47891095044095, +STORE, 47891095060480, 47891095064575, +STORE, 47891095044096, 47891095060479, +ERASE, 47891095044096, 47891095044096, +STORE, 47891095044096, 47891095060479, +STORE, 47891095072768, 47891095080959, +STORE, 47891095064576, 47891095072767, +ERASE, 47891095064576, 47891095064576, +STORE, 47891095064576, 47891095072767, +ERASE, 47891095072768, 47891095072768, +STORE, 47891095072768, 47891095080959, +STORE, 47891095080960, 47891096920063, +STORE, 47891095220224, 47891096920063, +STORE, 47891095080960, 47891095220223, +ERASE, 47891095220224, 47891095220224, +STORE, 47891095220224, 47891096879103, +STORE, 47891096879104, 47891096920063, +STORE, 47891096563712, 47891096879103, +STORE, 47891095220224, 47891096563711, +ERASE, 47891095220224, 47891095220224, +STORE, 47891095220224, 47891096563711, +STORE, 47891096875008, 47891096879103, +STORE, 47891096563712, 47891096875007, +ERASE, 47891096563712, 47891096563712, +STORE, 47891096563712, 47891096875007, +STORE, 47891096903680, 47891096920063, +STORE, 47891096879104, 47891096903679, +ERASE, 47891096879104, 47891096879104, +STORE, 47891096879104, 47891096903679, +ERASE, 47891096903680, 47891096903680, +STORE, 47891096903680, 47891096920063, +STORE, 47891096903680, 47891096932351, +ERASE, 47891096879104, 47891096879104, +STORE, 47891096879104, 47891096895487, +STORE, 47891096895488, 47891096903679, +ERASE, 47891095064576, 47891095064576, +STORE, 47891095064576, 47891095068671, +STORE, 47891095068672, 47891095072767, +ERASE, 
94443354841088, 94443354841088, +STORE, 94443354841088, 94443354857471, +STORE, 94443354857472, 94443354861567, +ERASE, 139741700198400, 139741700198400, +STORE, 139741700198400, 139741700202495, +STORE, 139741700202496, 139741700206591, +ERASE, 47891094958080, 47891094958080, +STORE, 94443360825344, 94443360960511, +STORE, 140737488347136, 140737488351231, +STORE, 140722961661952, 140737488351231, +ERASE, 140722961661952, 140722961661952, +STORE, 140722961661952, 140722961666047, +STORE, 94878388944896, 94878389657599, +ERASE, 94878388944896, 94878388944896, +STORE, 94878388944896, 94878388994047, +STORE, 94878388994048, 94878389657599, +ERASE, 94878388994048, 94878388994048, +STORE, 94878388994048, 94878389538815, +STORE, 94878389538816, 94878389637119, +STORE, 94878389637120, 94878389657599, +STORE, 140210690056192, 140210690228223, +ERASE, 140210690056192, 140210690056192, +STORE, 140210690056192, 140210690060287, +STORE, 140210690060288, 140210690228223, +ERASE, 140210690060288, 140210690060288, +STORE, 140210690060288, 140210690183167, +STORE, 140210690183168, 140210690215935, +STORE, 140210690215936, 140210690224127, +STORE, 140210690224128, 140210690228223, +STORE, 140722963148800, 140722963152895, +STORE, 140722963136512, 140722963148799, +STORE, 47422104940544, 47422104948735, +STORE, 47422104948736, 47422104956927, +STORE, 47422104956928, 47422105063423, +STORE, 47422104973312, 47422105063423, +STORE, 47422104956928, 47422104973311, +ERASE, 47422104973312, 47422104973312, +STORE, 47422104973312, 47422105047039, +STORE, 47422105047040, 47422105063423, +STORE, 47422105026560, 47422105047039, +STORE, 47422104973312, 47422105026559, +ERASE, 47422104973312, 47422104973312, +STORE, 47422104973312, 47422105026559, +STORE, 47422105042944, 47422105047039, +STORE, 47422105026560, 47422105042943, +ERASE, 47422105026560, 47422105026560, +STORE, 47422105026560, 47422105042943, +STORE, 47422105055232, 47422105063423, +STORE, 47422105047040, 47422105055231, +ERASE, 
47422105047040, 47422105047040, +STORE, 47422105047040, 47422105055231, +ERASE, 47422105055232, 47422105055232, +STORE, 47422105055232, 47422105063423, +STORE, 47422105063424, 47422106902527, +STORE, 47422105202688, 47422106902527, +STORE, 47422105063424, 47422105202687, +ERASE, 47422105202688, 47422105202688, +STORE, 47422105202688, 47422106861567, +STORE, 47422106861568, 47422106902527, +STORE, 47422106546176, 47422106861567, +STORE, 47422105202688, 47422106546175, +ERASE, 47422105202688, 47422105202688, +STORE, 47422105202688, 47422106546175, +STORE, 47422106857472, 47422106861567, +STORE, 47422106546176, 47422106857471, +ERASE, 47422106546176, 47422106546176, +STORE, 47422106546176, 47422106857471, +STORE, 47422106886144, 47422106902527, +STORE, 47422106861568, 47422106886143, +ERASE, 47422106861568, 47422106861568, +STORE, 47422106861568, 47422106886143, +ERASE, 47422106886144, 47422106886144, +STORE, 47422106886144, 47422106902527, +STORE, 47422106886144, 47422106914815, +ERASE, 47422106861568, 47422106861568, +STORE, 47422106861568, 47422106877951, +STORE, 47422106877952, 47422106886143, +ERASE, 47422105047040, 47422105047040, +STORE, 47422105047040, 47422105051135, +STORE, 47422105051136, 47422105055231, +ERASE, 94878389637120, 94878389637120, +STORE, 94878389637120, 94878389653503, +STORE, 94878389653504, 94878389657599, +ERASE, 140210690215936, 140210690215936, +STORE, 140210690215936, 140210690220031, +STORE, 140210690220032, 140210690224127, +ERASE, 47422104940544, 47422104940544, +STORE, 140737488347136, 140737488351231, +STORE, 140727690309632, 140737488351231, +ERASE, 140727690309632, 140727690309632, +STORE, 140727690309632, 140727690313727, +STORE, 94121892208640, 94121892921343, +ERASE, 94121892208640, 94121892208640, +STORE, 94121892208640, 94121892257791, +STORE, 94121892257792, 94121892921343, +ERASE, 94121892257792, 94121892257792, +STORE, 94121892257792, 94121892802559, +STORE, 94121892802560, 94121892900863, +STORE, 94121892900864, 
94121892921343, +STORE, 140662438326272, 140662438498303, +ERASE, 140662438326272, 140662438326272, +STORE, 140662438326272, 140662438330367, +STORE, 140662438330368, 140662438498303, +ERASE, 140662438330368, 140662438330368, +STORE, 140662438330368, 140662438453247, +STORE, 140662438453248, 140662438486015, +STORE, 140662438486016, 140662438494207, +STORE, 140662438494208, 140662438498303, +STORE, 140727690379264, 140727690383359, +STORE, 140727690366976, 140727690379263, +STORE, 46970356670464, 46970356678655, +STORE, 46970356678656, 46970356686847, +STORE, 46970356686848, 46970356793343, +STORE, 46970356703232, 46970356793343, +STORE, 46970356686848, 46970356703231, +ERASE, 46970356703232, 46970356703232, +STORE, 46970356703232, 46970356776959, +STORE, 46970356776960, 46970356793343, +STORE, 46970356756480, 46970356776959, +STORE, 46970356703232, 46970356756479, +ERASE, 46970356703232, 46970356703232, +STORE, 46970356703232, 46970356756479, +STORE, 46970356772864, 46970356776959, +STORE, 46970356756480, 46970356772863, +ERASE, 46970356756480, 46970356756480, +STORE, 46970356756480, 46970356772863, +STORE, 46970356785152, 46970356793343, +STORE, 46970356776960, 46970356785151, +ERASE, 46970356776960, 46970356776960, +STORE, 46970356776960, 46970356785151, +ERASE, 46970356785152, 46970356785152, +STORE, 46970356785152, 46970356793343, +STORE, 46970356793344, 46970358632447, +STORE, 46970356932608, 46970358632447, +STORE, 46970356793344, 46970356932607, +ERASE, 46970356932608, 46970356932608, +STORE, 46970356932608, 46970358591487, +STORE, 46970358591488, 46970358632447, +STORE, 46970358276096, 46970358591487, +STORE, 46970356932608, 46970358276095, +ERASE, 46970356932608, 46970356932608, +STORE, 46970356932608, 46970358276095, +STORE, 46970358587392, 46970358591487, +STORE, 46970358276096, 46970358587391, +ERASE, 46970358276096, 46970358276096, +STORE, 46970358276096, 46970358587391, +STORE, 46970358616064, 46970358632447, +STORE, 46970358591488, 46970358616063, 
+ERASE, 46970358591488, 46970358591488, +STORE, 46970358591488, 46970358616063, +ERASE, 46970358616064, 46970358616064, +STORE, 46970358616064, 46970358632447, +STORE, 46970358616064, 46970358644735, +ERASE, 46970358591488, 46970358591488, +STORE, 46970358591488, 46970358607871, +STORE, 46970358607872, 46970358616063, +ERASE, 46970356776960, 46970356776960, +STORE, 46970356776960, 46970356781055, +STORE, 46970356781056, 46970356785151, +ERASE, 94121892900864, 94121892900864, +STORE, 94121892900864, 94121892917247, +STORE, 94121892917248, 94121892921343, +ERASE, 140662438486016, 140662438486016, +STORE, 140662438486016, 140662438490111, +STORE, 140662438490112, 140662438494207, +ERASE, 46970356670464, 46970356670464, +STORE, 94121898610688, 94121898745855, +STORE, 140737488347136, 140737488351231, +STORE, 140737189351424, 140737488351231, +ERASE, 140737189351424, 140737189351424, +STORE, 140737189351424, 140737189355519, +STORE, 93847948832768, 93847949545471, +ERASE, 93847948832768, 93847948832768, +STORE, 93847948832768, 93847948881919, +STORE, 93847948881920, 93847949545471, +ERASE, 93847948881920, 93847948881920, +STORE, 93847948881920, 93847949426687, +STORE, 93847949426688, 93847949524991, +STORE, 93847949524992, 93847949545471, +STORE, 139698989985792, 139698990157823, +ERASE, 139698989985792, 139698989985792, +STORE, 139698989985792, 139698989989887, +STORE, 139698989989888, 139698990157823, +ERASE, 139698989989888, 139698989989888, +STORE, 139698989989888, 139698990112767, +STORE, 139698990112768, 139698990145535, +STORE, 139698990145536, 139698990153727, +STORE, 139698990153728, 139698990157823, +STORE, 140737189744640, 140737189748735, +STORE, 140737189732352, 140737189744639, +STORE, 47933805010944, 47933805019135, +STORE, 47933805019136, 47933805027327, +STORE, 47933805027328, 47933805133823, +STORE, 47933805043712, 47933805133823, +STORE, 47933805027328, 47933805043711, +ERASE, 47933805043712, 47933805043712, +STORE, 47933805043712, 47933805117439, 
+STORE, 47933805117440, 47933805133823, +STORE, 47933805096960, 47933805117439, +STORE, 47933805043712, 47933805096959, +ERASE, 47933805043712, 47933805043712, +STORE, 47933805043712, 47933805096959, +STORE, 47933805113344, 47933805117439, +STORE, 47933805096960, 47933805113343, +ERASE, 47933805096960, 47933805096960, +STORE, 47933805096960, 47933805113343, +STORE, 47933805125632, 47933805133823, +STORE, 47933805117440, 47933805125631, +ERASE, 47933805117440, 47933805117440, +STORE, 47933805117440, 47933805125631, +ERASE, 47933805125632, 47933805125632, +STORE, 47933805125632, 47933805133823, +STORE, 47933805133824, 47933806972927, +STORE, 47933805273088, 47933806972927, +STORE, 47933805133824, 47933805273087, +ERASE, 47933805273088, 47933805273088, +STORE, 47933805273088, 47933806931967, +STORE, 47933806931968, 47933806972927, +STORE, 47933806616576, 47933806931967, +STORE, 47933805273088, 47933806616575, +ERASE, 47933805273088, 47933805273088, +STORE, 47933805273088, 47933806616575, +STORE, 47933806927872, 47933806931967, +STORE, 47933806616576, 47933806927871, +ERASE, 47933806616576, 47933806616576, +STORE, 47933806616576, 47933806927871, +STORE, 47933806956544, 47933806972927, +STORE, 47933806931968, 47933806956543, +ERASE, 47933806931968, 47933806931968, +STORE, 47933806931968, 47933806956543, +ERASE, 47933806956544, 47933806956544, +STORE, 47933806956544, 47933806972927, +STORE, 47933806956544, 47933806985215, +ERASE, 47933806931968, 47933806931968, +STORE, 47933806931968, 47933806948351, +STORE, 47933806948352, 47933806956543, +ERASE, 47933805117440, 47933805117440, +STORE, 47933805117440, 47933805121535, +STORE, 47933805121536, 47933805125631, +ERASE, 93847949524992, 93847949524992, +STORE, 93847949524992, 93847949541375, +STORE, 93847949541376, 93847949545471, +ERASE, 139698990145536, 139698990145536, +STORE, 139698990145536, 139698990149631, +STORE, 139698990149632, 139698990153727, +ERASE, 47933805010944, 47933805010944, +STORE, 140737488347136, 
140737488351231, +STORE, 140725553991680, 140737488351231, +ERASE, 140725553991680, 140725553991680, +STORE, 140725553991680, 140725553995775, +STORE, 93980056248320, 93980056961023, +ERASE, 93980056248320, 93980056248320, +STORE, 93980056248320, 93980056297471, +STORE, 93980056297472, 93980056961023, +ERASE, 93980056297472, 93980056297472, +STORE, 93980056297472, 93980056842239, +STORE, 93980056842240, 93980056940543, +STORE, 93980056940544, 93980056961023, +STORE, 140146588971008, 140146589143039, +ERASE, 140146588971008, 140146588971008, +STORE, 140146588971008, 140146588975103, +STORE, 140146588975104, 140146589143039, +ERASE, 140146588975104, 140146588975104, +STORE, 140146588975104, 140146589097983, +STORE, 140146589097984, 140146589130751, +STORE, 140146589130752, 140146589138943, +STORE, 140146589138944, 140146589143039, +STORE, 140725554860032, 140725554864127, +STORE, 140725554847744, 140725554860031, +STORE, 47486206025728, 47486206033919, +STORE, 47486206033920, 47486206042111, +STORE, 47486206042112, 47486206148607, +STORE, 47486206058496, 47486206148607, +STORE, 47486206042112, 47486206058495, +ERASE, 47486206058496, 47486206058496, +STORE, 47486206058496, 47486206132223, +STORE, 47486206132224, 47486206148607, +STORE, 47486206111744, 47486206132223, +STORE, 47486206058496, 47486206111743, +ERASE, 47486206058496, 47486206058496, +STORE, 47486206058496, 47486206111743, +STORE, 47486206128128, 47486206132223, +STORE, 47486206111744, 47486206128127, +ERASE, 47486206111744, 47486206111744, +STORE, 47486206111744, 47486206128127, +STORE, 47486206140416, 47486206148607, +STORE, 47486206132224, 47486206140415, +ERASE, 47486206132224, 47486206132224, +STORE, 47486206132224, 47486206140415, +ERASE, 47486206140416, 47486206140416, +STORE, 47486206140416, 47486206148607, +STORE, 47486206148608, 47486207987711, +STORE, 47486206287872, 47486207987711, +STORE, 47486206148608, 47486206287871, +ERASE, 47486206287872, 47486206287872, +STORE, 47486206287872, 
47486207946751, +STORE, 47486207946752, 47486207987711, +STORE, 47486207631360, 47486207946751, +STORE, 47486206287872, 47486207631359, +ERASE, 47486206287872, 47486206287872, +STORE, 47486206287872, 47486207631359, +STORE, 47486207942656, 47486207946751, +STORE, 47486207631360, 47486207942655, +ERASE, 47486207631360, 47486207631360, +STORE, 47486207631360, 47486207942655, +STORE, 47486207971328, 47486207987711, +STORE, 47486207946752, 47486207971327, +ERASE, 47486207946752, 47486207946752, +STORE, 47486207946752, 47486207971327, +ERASE, 47486207971328, 47486207971328, +STORE, 47486207971328, 47486207987711, +STORE, 47486207971328, 47486207999999, +ERASE, 47486207946752, 47486207946752, +STORE, 47486207946752, 47486207963135, +STORE, 47486207963136, 47486207971327, +ERASE, 47486206132224, 47486206132224, +STORE, 47486206132224, 47486206136319, +STORE, 47486206136320, 47486206140415, +ERASE, 93980056940544, 93980056940544, +STORE, 93980056940544, 93980056956927, +STORE, 93980056956928, 93980056961023, +ERASE, 140146589130752, 140146589130752, +STORE, 140146589130752, 140146589134847, +STORE, 140146589134848, 140146589138943, +ERASE, 47486206025728, 47486206025728, +STORE, 93980070006784, 93980070141951, +STORE, 140737488347136, 140737488351231, +STORE, 140727334776832, 140737488351231, +ERASE, 140727334776832, 140727334776832, +STORE, 140727334776832, 140727334780927, +STORE, 94049747247104, 94049747959807, +ERASE, 94049747247104, 94049747247104, +STORE, 94049747247104, 94049747296255, +STORE, 94049747296256, 94049747959807, +ERASE, 94049747296256, 94049747296256, +STORE, 94049747296256, 94049747841023, +STORE, 94049747841024, 94049747939327, +STORE, 94049747939328, 94049747959807, +STORE, 140227307216896, 140227307388927, +ERASE, 140227307216896, 140227307216896, +STORE, 140227307216896, 140227307220991, +STORE, 140227307220992, 140227307388927, +ERASE, 140227307220992, 140227307220992, +STORE, 140227307220992, 140227307343871, +STORE, 140227307343872, 
140227307376639, +STORE, 140227307376640, 140227307384831, +STORE, 140227307384832, 140227307388927, +STORE, 140727335337984, 140727335342079, +STORE, 140727335325696, 140727335337983, +STORE, 47405487779840, 47405487788031, +STORE, 47405487788032, 47405487796223, +STORE, 47405487796224, 47405487902719, +STORE, 47405487812608, 47405487902719, +STORE, 47405487796224, 47405487812607, +ERASE, 47405487812608, 47405487812608, +STORE, 47405487812608, 47405487886335, +STORE, 47405487886336, 47405487902719, +STORE, 47405487865856, 47405487886335, +STORE, 47405487812608, 47405487865855, +ERASE, 47405487812608, 47405487812608, +STORE, 47405487812608, 47405487865855, +STORE, 47405487882240, 47405487886335, +STORE, 47405487865856, 47405487882239, +ERASE, 47405487865856, 47405487865856, +STORE, 47405487865856, 47405487882239, +STORE, 47405487894528, 47405487902719, +STORE, 47405487886336, 47405487894527, +ERASE, 47405487886336, 47405487886336, +STORE, 47405487886336, 47405487894527, +ERASE, 47405487894528, 47405487894528, +STORE, 47405487894528, 47405487902719, +STORE, 47405487902720, 47405489741823, +STORE, 47405488041984, 47405489741823, +STORE, 47405487902720, 47405488041983, +ERASE, 47405488041984, 47405488041984, +STORE, 47405488041984, 47405489700863, +STORE, 47405489700864, 47405489741823, +STORE, 47405489385472, 47405489700863, +STORE, 47405488041984, 47405489385471, +ERASE, 47405488041984, 47405488041984, +STORE, 47405488041984, 47405489385471, +STORE, 47405489696768, 47405489700863, +STORE, 47405489385472, 47405489696767, +ERASE, 47405489385472, 47405489385472, +STORE, 47405489385472, 47405489696767, +STORE, 47405489725440, 47405489741823, +STORE, 47405489700864, 47405489725439, +ERASE, 47405489700864, 47405489700864, +STORE, 47405489700864, 47405489725439, +ERASE, 47405489725440, 47405489725440, +STORE, 47405489725440, 47405489741823, +STORE, 47405489725440, 47405489754111, +ERASE, 47405489700864, 47405489700864, +STORE, 47405489700864, 47405489717247, +STORE, 
47405489717248, 47405489725439, +ERASE, 47405487886336, 47405487886336, +STORE, 47405487886336, 47405487890431, +STORE, 47405487890432, 47405487894527, +ERASE, 94049747939328, 94049747939328, +STORE, 94049747939328, 94049747955711, +STORE, 94049747955712, 94049747959807, +ERASE, 140227307376640, 140227307376640, +STORE, 140227307376640, 140227307380735, +STORE, 140227307380736, 140227307384831, +ERASE, 47405487779840, 47405487779840, +STORE, 94049758810112, 94049758945279, +STORE, 140737488347136, 140737488351231, +STORE, 140727079718912, 140737488351231, +ERASE, 140727079718912, 140727079718912, +STORE, 140727079718912, 140727079723007, +STORE, 94250996527104, 94250997239807, +ERASE, 94250996527104, 94250996527104, +STORE, 94250996527104, 94250996576255, +STORE, 94250996576256, 94250997239807, +ERASE, 94250996576256, 94250996576256, +STORE, 94250996576256, 94250997121023, +STORE, 94250997121024, 94250997219327, +STORE, 94250997219328, 94250997239807, +STORE, 140060022587392, 140060022759423, +ERASE, 140060022587392, 140060022587392, +STORE, 140060022587392, 140060022591487, +STORE, 140060022591488, 140060022759423, +ERASE, 140060022591488, 140060022591488, +STORE, 140060022591488, 140060022714367, +STORE, 140060022714368, 140060022747135, +STORE, 140060022747136, 140060022755327, +STORE, 140060022755328, 140060022759423, +STORE, 140727079788544, 140727079792639, +STORE, 140727079776256, 140727079788543, +STORE, 47572772409344, 47572772417535, +STORE, 47572772417536, 47572772425727, +STORE, 47572772425728, 47572772532223, +STORE, 47572772442112, 47572772532223, +STORE, 47572772425728, 47572772442111, +ERASE, 47572772442112, 47572772442112, +STORE, 47572772442112, 47572772515839, +STORE, 47572772515840, 47572772532223, +STORE, 47572772495360, 47572772515839, +STORE, 47572772442112, 47572772495359, +ERASE, 47572772442112, 47572772442112, +STORE, 47572772442112, 47572772495359, +STORE, 47572772511744, 47572772515839, +STORE, 47572772495360, 47572772511743, +ERASE, 
47572772495360, 47572772495360, +STORE, 47572772495360, 47572772511743, +STORE, 47572772524032, 47572772532223, +STORE, 47572772515840, 47572772524031, +ERASE, 47572772515840, 47572772515840, +STORE, 47572772515840, 47572772524031, +ERASE, 47572772524032, 47572772524032, +STORE, 47572772524032, 47572772532223, +STORE, 47572772532224, 47572774371327, +STORE, 47572772671488, 47572774371327, +STORE, 47572772532224, 47572772671487, +ERASE, 47572772671488, 47572772671488, +STORE, 47572772671488, 47572774330367, +STORE, 47572774330368, 47572774371327, +STORE, 47572774014976, 47572774330367, +STORE, 47572772671488, 47572774014975, +ERASE, 47572772671488, 47572772671488, +STORE, 47572772671488, 47572774014975, +STORE, 47572774326272, 47572774330367, +STORE, 47572774014976, 47572774326271, +ERASE, 47572774014976, 47572774014976, +STORE, 47572774014976, 47572774326271, +STORE, 47572774354944, 47572774371327, +STORE, 47572774330368, 47572774354943, +ERASE, 47572774330368, 47572774330368, +STORE, 47572774330368, 47572774354943, +ERASE, 47572774354944, 47572774354944, +STORE, 47572774354944, 47572774371327, +STORE, 47572774354944, 47572774383615, +ERASE, 47572774330368, 47572774330368, +STORE, 47572774330368, 47572774346751, +STORE, 47572774346752, 47572774354943, +ERASE, 47572772515840, 47572772515840, +STORE, 47572772515840, 47572772519935, +STORE, 47572772519936, 47572772524031, +ERASE, 94250997219328, 94250997219328, +STORE, 94250997219328, 94250997235711, +STORE, 94250997235712, 94250997239807, +ERASE, 140060022747136, 140060022747136, +STORE, 140060022747136, 140060022751231, +STORE, 140060022751232, 140060022755327, +ERASE, 47572772409344, 47572772409344, +STORE, 94251018305536, 94251018440703, +STORE, 140737488347136, 140737488351231, +STORE, 140730012389376, 140737488351231, +ERASE, 140730012389376, 140730012389376, +STORE, 140730012389376, 140730012393471, +STORE, 94382607675392, 94382607695871, +ERASE, 94382607675392, 94382607675392, +STORE, 94382607675392, 
94382607679487, +STORE, 94382607679488, 94382607695871, +ERASE, 94382607679488, 94382607679488, +STORE, 94382607679488, 94382607683583, +STORE, 94382607683584, 94382607687679, +STORE, 94382607687680, 94382607695871, +STORE, 140252451454976, 140252451627007, +ERASE, 140252451454976, 140252451454976, +STORE, 140252451454976, 140252451459071, +STORE, 140252451459072, 140252451627007, +ERASE, 140252451459072, 140252451459072, +STORE, 140252451459072, 140252451581951, +STORE, 140252451581952, 140252451614719, +STORE, 140252451614720, 140252451622911, +STORE, 140252451622912, 140252451627007, +STORE, 140730013548544, 140730013552639, +STORE, 140730013536256, 140730013548543, +STORE, 47380343541760, 47380343549951, +STORE, 47380343549952, 47380343558143, +STORE, 47380343558144, 47380345397247, +STORE, 47380343697408, 47380345397247, +STORE, 47380343558144, 47380343697407, +ERASE, 47380343697408, 47380343697408, +STORE, 47380343697408, 47380345356287, +STORE, 47380345356288, 47380345397247, +STORE, 47380345040896, 47380345356287, +STORE, 47380343697408, 47380345040895, +ERASE, 47380343697408, 47380343697408, +STORE, 47380343697408, 47380345040895, +STORE, 47380345352192, 47380345356287, +STORE, 47380345040896, 47380345352191, +ERASE, 47380345040896, 47380345040896, +STORE, 47380345040896, 47380345352191, +STORE, 47380345380864, 47380345397247, +STORE, 47380345356288, 47380345380863, +ERASE, 47380345356288, 47380345356288, +STORE, 47380345356288, 47380345380863, +ERASE, 47380345380864, 47380345380864, +STORE, 47380345380864, 47380345397247, +ERASE, 47380345356288, 47380345356288, +STORE, 47380345356288, 47380345372671, +STORE, 47380345372672, 47380345380863, +ERASE, 94382607687680, 94382607687680, +STORE, 94382607687680, 94382607691775, +STORE, 94382607691776, 94382607695871, +ERASE, 140252451614720, 140252451614720, +STORE, 140252451614720, 140252451618815, +STORE, 140252451618816, 140252451622911, +ERASE, 47380343541760, 47380343541760, +STORE, 94382626803712, 
94382626938879, +STORE, 140737488347136, 140737488351231, +STORE, 140730900271104, 140737488351231, +ERASE, 140730900271104, 140730900271104, +STORE, 140730900271104, 140730900275199, +STORE, 93855478120448, 93855478337535, +ERASE, 93855478120448, 93855478120448, +STORE, 93855478120448, 93855478198271, +STORE, 93855478198272, 93855478337535, +ERASE, 93855478198272, 93855478198272, +STORE, 93855478198272, 93855478243327, +STORE, 93855478243328, 93855478288383, +STORE, 93855478288384, 93855478337535, +STORE, 140092686573568, 140092686745599, +ERASE, 140092686573568, 140092686573568, +STORE, 140092686573568, 140092686577663, +STORE, 140092686577664, 140092686745599, +ERASE, 140092686577664, 140092686577664, +STORE, 140092686577664, 140092686700543, +STORE, 140092686700544, 140092686733311, +STORE, 140092686733312, 140092686741503, +STORE, 140092686741504, 140092686745599, +STORE, 140730900537344, 140730900541439, +STORE, 140730900525056, 140730900537343, +STORE, 47540108423168, 47540108431359, +STORE, 47540108431360, 47540108439551, +STORE, 47540108439552, 47540110278655, +STORE, 47540108578816, 47540110278655, +STORE, 47540108439552, 47540108578815, +ERASE, 47540108578816, 47540108578816, +STORE, 47540108578816, 47540110237695, +STORE, 47540110237696, 47540110278655, +STORE, 47540109922304, 47540110237695, +STORE, 47540108578816, 47540109922303, +ERASE, 47540108578816, 47540108578816, +STORE, 47540108578816, 47540109922303, +STORE, 47540110233600, 47540110237695, +STORE, 47540109922304, 47540110233599, +ERASE, 47540109922304, 47540109922304, +STORE, 47540109922304, 47540110233599, +STORE, 47540110262272, 47540110278655, +STORE, 47540110237696, 47540110262271, +ERASE, 47540110237696, 47540110237696, +STORE, 47540110237696, 47540110262271, +ERASE, 47540110262272, 47540110262272, +STORE, 47540110262272, 47540110278655, +ERASE, 47540110237696, 47540110237696, +STORE, 47540110237696, 47540110254079, +STORE, 47540110254080, 47540110262271, +ERASE, 93855478288384, 
93855478288384, +STORE, 93855478288384, 93855478333439, +STORE, 93855478333440, 93855478337535, +ERASE, 140092686733312, 140092686733312, +STORE, 140092686733312, 140092686737407, +STORE, 140092686737408, 140092686741503, +ERASE, 47540108423168, 47540108423168, +STORE, 93855492222976, 93855492358143, +STORE, 93855492222976, 93855492493311, +STORE, 140737488347136, 140737488351231, +STORE, 140733498146816, 140737488351231, +ERASE, 140733498146816, 140733498146816, +STORE, 140733498146816, 140733498150911, +STORE, 94170739654656, 94170740367359, +ERASE, 94170739654656, 94170739654656, +STORE, 94170739654656, 94170739703807, +STORE, 94170739703808, 94170740367359, +ERASE, 94170739703808, 94170739703808, +STORE, 94170739703808, 94170740248575, +STORE, 94170740248576, 94170740346879, +STORE, 94170740346880, 94170740367359, +STORE, 140024788877312, 140024789049343, +ERASE, 140024788877312, 140024788877312, +STORE, 140024788877312, 140024788881407, +STORE, 140024788881408, 140024789049343, +ERASE, 140024788881408, 140024788881408, +STORE, 140024788881408, 140024789004287, +STORE, 140024789004288, 140024789037055, +STORE, 140024789037056, 140024789045247, +STORE, 140024789045248, 140024789049343, +STORE, 140733499023360, 140733499027455, +STORE, 140733499011072, 140733499023359, +STORE, 47608006119424, 47608006127615, +STORE, 47608006127616, 47608006135807, +STORE, 47608006135808, 47608006242303, +STORE, 47608006152192, 47608006242303, +STORE, 47608006135808, 47608006152191, +ERASE, 47608006152192, 47608006152192, +STORE, 47608006152192, 47608006225919, +STORE, 47608006225920, 47608006242303, +STORE, 47608006205440, 47608006225919, +STORE, 47608006152192, 47608006205439, +ERASE, 47608006152192, 47608006152192, +STORE, 47608006152192, 47608006205439, +STORE, 47608006221824, 47608006225919, +STORE, 47608006205440, 47608006221823, +ERASE, 47608006205440, 47608006205440, +STORE, 47608006205440, 47608006221823, +STORE, 47608006234112, 47608006242303, +STORE, 47608006225920, 
47608006234111, +ERASE, 47608006225920, 47608006225920, +STORE, 47608006225920, 47608006234111, +ERASE, 47608006234112, 47608006234112, +STORE, 47608006234112, 47608006242303, +STORE, 47608006242304, 47608008081407, +STORE, 47608006381568, 47608008081407, +STORE, 47608006242304, 47608006381567, +ERASE, 47608006381568, 47608006381568, +STORE, 47608006381568, 47608008040447, +STORE, 47608008040448, 47608008081407, +STORE, 47608007725056, 47608008040447, +STORE, 47608006381568, 47608007725055, +ERASE, 47608006381568, 47608006381568, +STORE, 47608006381568, 47608007725055, +STORE, 47608008036352, 47608008040447, +STORE, 47608007725056, 47608008036351, +ERASE, 47608007725056, 47608007725056, +STORE, 47608007725056, 47608008036351, +STORE, 47608008065024, 47608008081407, +STORE, 47608008040448, 47608008065023, +ERASE, 47608008040448, 47608008040448, +STORE, 47608008040448, 47608008065023, +ERASE, 47608008065024, 47608008065024, +STORE, 47608008065024, 47608008081407, +STORE, 47608008065024, 47608008093695, +ERASE, 47608008040448, 47608008040448, +STORE, 47608008040448, 47608008056831, +STORE, 47608008056832, 47608008065023, +ERASE, 47608006225920, 47608006225920, +STORE, 47608006225920, 47608006230015, +STORE, 47608006230016, 47608006234111, +ERASE, 94170740346880, 94170740346880, +STORE, 94170740346880, 94170740363263, +STORE, 94170740363264, 94170740367359, +ERASE, 140024789037056, 140024789037056, +STORE, 140024789037056, 140024789041151, +STORE, 140024789041152, 140024789045247, +ERASE, 47608006119424, 47608006119424, +STORE, 140737488347136, 140737488351231, +STORE, 140730264326144, 140737488351231, +ERASE, 140730264326144, 140730264326144, +STORE, 140730264326144, 140730264330239, +STORE, 94653216407552, 94653217120255, +ERASE, 94653216407552, 94653216407552, +STORE, 94653216407552, 94653216456703, +STORE, 94653216456704, 94653217120255, +ERASE, 94653216456704, 94653216456704, +STORE, 94653216456704, 94653217001471, +STORE, 94653217001472, 94653217099775, +STORE, 
94653217099776, 94653217120255, +STORE, 140103617011712, 140103617183743, +ERASE, 140103617011712, 140103617011712, +STORE, 140103617011712, 140103617015807, +STORE, 140103617015808, 140103617183743, +ERASE, 140103617015808, 140103617015808, +STORE, 140103617015808, 140103617138687, +STORE, 140103617138688, 140103617171455, +STORE, 140103617171456, 140103617179647, +STORE, 140103617179648, 140103617183743, +STORE, 140730265427968, 140730265432063, +STORE, 140730265415680, 140730265427967, +STORE, 47529177985024, 47529177993215, +STORE, 47529177993216, 47529178001407, +STORE, 47529178001408, 47529178107903, +STORE, 47529178017792, 47529178107903, +STORE, 47529178001408, 47529178017791, +ERASE, 47529178017792, 47529178017792, +STORE, 47529178017792, 47529178091519, +STORE, 47529178091520, 47529178107903, +STORE, 47529178071040, 47529178091519, +STORE, 47529178017792, 47529178071039, +ERASE, 47529178017792, 47529178017792, +STORE, 47529178017792, 47529178071039, +STORE, 47529178087424, 47529178091519, +STORE, 47529178071040, 47529178087423, +ERASE, 47529178071040, 47529178071040, +STORE, 47529178071040, 47529178087423, +STORE, 47529178099712, 47529178107903, +STORE, 47529178091520, 47529178099711, +ERASE, 47529178091520, 47529178091520, +STORE, 47529178091520, 47529178099711, +ERASE, 47529178099712, 47529178099712, +STORE, 47529178099712, 47529178107903, +STORE, 47529178107904, 47529179947007, +STORE, 47529178247168, 47529179947007, +STORE, 47529178107904, 47529178247167, +ERASE, 47529178247168, 47529178247168, +STORE, 47529178247168, 47529179906047, +STORE, 47529179906048, 47529179947007, +STORE, 47529179590656, 47529179906047, +STORE, 47529178247168, 47529179590655, +ERASE, 47529178247168, 47529178247168, +STORE, 47529178247168, 47529179590655, +STORE, 47529179901952, 47529179906047, +STORE, 47529179590656, 47529179901951, +ERASE, 47529179590656, 47529179590656, +STORE, 47529179590656, 47529179901951, +STORE, 47529179930624, 47529179947007, +STORE, 47529179906048, 
47529179930623, +ERASE, 47529179906048, 47529179906048, +STORE, 47529179906048, 47529179930623, +ERASE, 47529179930624, 47529179930624, +STORE, 47529179930624, 47529179947007, +STORE, 47529179930624, 47529179959295, +ERASE, 47529179906048, 47529179906048, +STORE, 47529179906048, 47529179922431, +STORE, 47529179922432, 47529179930623, +ERASE, 47529178091520, 47529178091520, +STORE, 47529178091520, 47529178095615, +STORE, 47529178095616, 47529178099711, +ERASE, 94653217099776, 94653217099776, +STORE, 94653217099776, 94653217116159, +STORE, 94653217116160, 94653217120255, +ERASE, 140103617171456, 140103617171456, +STORE, 140103617171456, 140103617175551, +STORE, 140103617175552, 140103617179647, +ERASE, 47529177985024, 47529177985024, +STORE, 94653241135104, 94653241270271, +STORE, 140737488347136, 140737488351231, +STORE, 140736284549120, 140737488351231, +ERASE, 140736284549120, 140736284549120, +STORE, 140736284549120, 140736284553215, +STORE, 93963663822848, 93963664506879, +ERASE, 93963663822848, 93963663822848, +STORE, 93963663822848, 93963663884287, +STORE, 93963663884288, 93963664506879, +ERASE, 93963663884288, 93963663884288, +STORE, 93963663884288, 93963664240639, +STORE, 93963664240640, 93963664379903, +STORE, 93963664379904, 93963664506879, +STORE, 140450188439552, 140450188611583, +ERASE, 140450188439552, 140450188439552, +STORE, 140450188439552, 140450188443647, +STORE, 140450188443648, 140450188611583, +ERASE, 140450188443648, 140450188443648, +STORE, 140450188443648, 140450188566527, +STORE, 140450188566528, 140450188599295, +STORE, 140450188599296, 140450188607487, +STORE, 140450188607488, 140450188611583, +STORE, 140736284577792, 140736284581887, +STORE, 140736284565504, 140736284577791, +STORE, 47182606557184, 47182606565375, +STORE, 47182606565376, 47182606573567, +STORE, 47182606573568, 47182608412671, +STORE, 47182606712832, 47182608412671, +STORE, 47182606573568, 47182606712831, +ERASE, 47182606712832, 47182606712832, +STORE, 47182606712832, 
47182608371711, +STORE, 47182608371712, 47182608412671, +STORE, 47182608056320, 47182608371711, +STORE, 47182606712832, 47182608056319, +ERASE, 47182606712832, 47182606712832, +STORE, 47182606712832, 47182608056319, +STORE, 47182608367616, 47182608371711, +STORE, 47182608056320, 47182608367615, +ERASE, 47182608056320, 47182608056320, +STORE, 47182608056320, 47182608367615, +STORE, 47182608396288, 47182608412671, +STORE, 47182608371712, 47182608396287, +ERASE, 47182608371712, 47182608371712, +STORE, 47182608371712, 47182608396287, +ERASE, 47182608396288, 47182608396288, +STORE, 47182608396288, 47182608412671, +STORE, 47182608412672, 47182608523263, +STORE, 47182608429056, 47182608523263, +STORE, 47182608412672, 47182608429055, +ERASE, 47182608429056, 47182608429056, +STORE, 47182608429056, 47182608515071, +STORE, 47182608515072, 47182608523263, +STORE, 47182608490496, 47182608515071, +STORE, 47182608429056, 47182608490495, +ERASE, 47182608429056, 47182608429056, +STORE, 47182608429056, 47182608490495, +STORE, 47182608510976, 47182608515071, +STORE, 47182608490496, 47182608510975, +ERASE, 47182608490496, 47182608490496, +STORE, 47182608490496, 47182608510975, +ERASE, 47182608515072, 47182608515072, +STORE, 47182608515072, 47182608523263, +STORE, 47182608523264, 47182608568319, +ERASE, 47182608523264, 47182608523264, +STORE, 47182608523264, 47182608531455, +STORE, 47182608531456, 47182608568319, +STORE, 47182608551936, 47182608568319, +STORE, 47182608531456, 47182608551935, +ERASE, 47182608531456, 47182608531456, +STORE, 47182608531456, 47182608551935, +STORE, 47182608560128, 47182608568319, +STORE, 47182608551936, 47182608560127, +ERASE, 47182608551936, 47182608551936, +STORE, 47182608551936, 47182608568319, +ERASE, 47182608551936, 47182608551936, +STORE, 47182608551936, 47182608560127, +STORE, 47182608560128, 47182608568319, +ERASE, 47182608560128, 47182608560128, +STORE, 47182608560128, 47182608568319, +STORE, 47182608568320, 47182608916479, +STORE, 47182608609280, 
47182608916479, +STORE, 47182608568320, 47182608609279, +ERASE, 47182608609280, 47182608609280, +STORE, 47182608609280, 47182608891903, +STORE, 47182608891904, 47182608916479, +STORE, 47182608822272, 47182608891903, +STORE, 47182608609280, 47182608822271, +ERASE, 47182608609280, 47182608609280, +STORE, 47182608609280, 47182608822271, +STORE, 47182608887808, 47182608891903, +STORE, 47182608822272, 47182608887807, +ERASE, 47182608822272, 47182608822272, +STORE, 47182608822272, 47182608887807, +ERASE, 47182608891904, 47182608891904, +STORE, 47182608891904, 47182608916479, +STORE, 47182608916480, 47182611177471, +STORE, 47182609068032, 47182611177471, +STORE, 47182608916480, 47182609068031, +ERASE, 47182609068032, 47182609068032, +STORE, 47182609068032, 47182611161087, +STORE, 47182611161088, 47182611177471, +STORE, 47182611169280, 47182611177471, +STORE, 47182611161088, 47182611169279, +ERASE, 47182611161088, 47182611161088, +STORE, 47182611161088, 47182611169279, +ERASE, 47182611169280, 47182611169280, +STORE, 47182611169280, 47182611177471, +STORE, 47182611177472, 47182611312639, +ERASE, 47182611177472, 47182611177472, +STORE, 47182611177472, 47182611202047, +STORE, 47182611202048, 47182611312639, +STORE, 47182611263488, 47182611312639, +STORE, 47182611202048, 47182611263487, +ERASE, 47182611202048, 47182611202048, +STORE, 47182611202048, 47182611263487, +STORE, 47182611288064, 47182611312639, +STORE, 47182611263488, 47182611288063, +ERASE, 47182611263488, 47182611263488, +STORE, 47182611263488, 47182611312639, +ERASE, 47182611263488, 47182611263488, +STORE, 47182611263488, 47182611288063, +STORE, 47182611288064, 47182611312639, +STORE, 47182611296256, 47182611312639, +STORE, 47182611288064, 47182611296255, +ERASE, 47182611288064, 47182611288064, +STORE, 47182611288064, 47182611296255, +ERASE, 47182611296256, 47182611296256, +STORE, 47182611296256, 47182611312639, +STORE, 47182611296256, 47182611320831, +STORE, 47182611320832, 47182611484671, +ERASE, 47182611320832, 
47182611320832, +STORE, 47182611320832, 47182611333119, +STORE, 47182611333120, 47182611484671, +STORE, 47182611431424, 47182611484671, +STORE, 47182611333120, 47182611431423, +ERASE, 47182611333120, 47182611333120, +STORE, 47182611333120, 47182611431423, +STORE, 47182611476480, 47182611484671, +STORE, 47182611431424, 47182611476479, +ERASE, 47182611431424, 47182611431424, +STORE, 47182611431424, 47182611484671, +ERASE, 47182611431424, 47182611431424, +STORE, 47182611431424, 47182611476479, +STORE, 47182611476480, 47182611484671, +ERASE, 47182611476480, 47182611476480, +STORE, 47182611476480, 47182611484671, +STORE, 47182611484672, 47182612082687, +STORE, 47182611603456, 47182612082687, +STORE, 47182611484672, 47182611603455, +ERASE, 47182611603456, 47182611603456, +STORE, 47182611603456, 47182612029439, +STORE, 47182612029440, 47182612082687, +STORE, 47182611918848, 47182612029439, +STORE, 47182611603456, 47182611918847, +ERASE, 47182611603456, 47182611603456, +STORE, 47182611603456, 47182611918847, +STORE, 47182612025344, 47182612029439, +STORE, 47182611918848, 47182612025343, +ERASE, 47182611918848, 47182611918848, +STORE, 47182611918848, 47182612025343, +ERASE, 47182612029440, 47182612029440, +STORE, 47182612029440, 47182612082687, +STORE, 47182612082688, 47182615134207, +STORE, 47182612627456, 47182615134207, +STORE, 47182612082688, 47182612627455, +ERASE, 47182612627456, 47182612627456, +STORE, 47182612627456, 47182614913023, +STORE, 47182614913024, 47182615134207, +STORE, 47182614323200, 47182614913023, +STORE, 47182612627456, 47182614323199, +ERASE, 47182612627456, 47182612627456, +STORE, 47182612627456, 47182614323199, +STORE, 47182614908928, 47182614913023, +STORE, 47182614323200, 47182614908927, +ERASE, 47182614323200, 47182614323200, +STORE, 47182614323200, 47182614908927, +STORE, 47182615117824, 47182615134207, +STORE, 47182614913024, 47182615117823, +ERASE, 47182614913024, 47182614913024, +STORE, 47182614913024, 47182615117823, +ERASE, 47182615117824, 
47182615117824, +STORE, 47182615117824, 47182615134207, +STORE, 47182615134208, 47182615166975, +ERASE, 47182615134208, 47182615134208, +STORE, 47182615134208, 47182615142399, +STORE, 47182615142400, 47182615166975, +STORE, 47182615154688, 47182615166975, +STORE, 47182615142400, 47182615154687, +ERASE, 47182615142400, 47182615142400, +STORE, 47182615142400, 47182615154687, +STORE, 47182615158784, 47182615166975, +STORE, 47182615154688, 47182615158783, +ERASE, 47182615154688, 47182615154688, +STORE, 47182615154688, 47182615166975, +ERASE, 47182615154688, 47182615154688, +STORE, 47182615154688, 47182615158783, +STORE, 47182615158784, 47182615166975, +ERASE, 47182615158784, 47182615158784, +STORE, 47182615158784, 47182615166975, +STORE, 47182615166976, 47182615203839, +ERASE, 47182615166976, 47182615166976, +STORE, 47182615166976, 47182615175167, +STORE, 47182615175168, 47182615203839, +STORE, 47182615191552, 47182615203839, +STORE, 47182615175168, 47182615191551, +ERASE, 47182615175168, 47182615175168, +STORE, 47182615175168, 47182615191551, +STORE, 47182615195648, 47182615203839, +STORE, 47182615191552, 47182615195647, +ERASE, 47182615191552, 47182615191552, +STORE, 47182615191552, 47182615203839, +ERASE, 47182615191552, 47182615191552, +STORE, 47182615191552, 47182615195647, +STORE, 47182615195648, 47182615203839, +ERASE, 47182615195648, 47182615195648, +STORE, 47182615195648, 47182615203839, +STORE, 47182615203840, 47182615678975, +ERASE, 47182615203840, 47182615203840, +STORE, 47182615203840, 47182615212031, +STORE, 47182615212032, 47182615678975, +STORE, 47182615547904, 47182615678975, +STORE, 47182615212032, 47182615547903, +ERASE, 47182615212032, 47182615212032, +STORE, 47182615212032, 47182615547903, +STORE, 47182615670784, 47182615678975, +STORE, 47182615547904, 47182615670783, +ERASE, 47182615547904, 47182615547904, +STORE, 47182615547904, 47182615678975, +ERASE, 47182615547904, 47182615547904, +STORE, 47182615547904, 47182615670783, +STORE, 47182615670784, 
47182615678975, +ERASE, 47182615670784, 47182615670784, +STORE, 47182615670784, 47182615678975, +STORE, 47182615678976, 47182615687167, +STORE, 47182615687168, 47182615707647, +ERASE, 47182615687168, 47182615687168, +STORE, 47182615687168, 47182615691263, +STORE, 47182615691264, 47182615707647, +STORE, 47182615695360, 47182615707647, +STORE, 47182615691264, 47182615695359, +ERASE, 47182615691264, 47182615691264, +STORE, 47182615691264, 47182615695359, +STORE, 47182615699456, 47182615707647, +STORE, 47182615695360, 47182615699455, +ERASE, 47182615695360, 47182615695360, +STORE, 47182615695360, 47182615707647, +ERASE, 47182615695360, 47182615695360, +STORE, 47182615695360, 47182615699455, +STORE, 47182615699456, 47182615707647, +ERASE, 47182615699456, 47182615699456, +STORE, 47182615699456, 47182615707647, +STORE, 47182615707648, 47182615715839, +ERASE, 47182608371712, 47182608371712, +STORE, 47182608371712, 47182608388095, +STORE, 47182608388096, 47182608396287, +ERASE, 47182615699456, 47182615699456, +STORE, 47182615699456, 47182615703551, +STORE, 47182615703552, 47182615707647, +ERASE, 47182611288064, 47182611288064, +STORE, 47182611288064, 47182611292159, +STORE, 47182611292160, 47182611296255, +ERASE, 47182615670784, 47182615670784, +STORE, 47182615670784, 47182615674879, +STORE, 47182615674880, 47182615678975, +ERASE, 47182615195648, 47182615195648, +STORE, 47182615195648, 47182615199743, +STORE, 47182615199744, 47182615203839, +ERASE, 47182615158784, 47182615158784, +STORE, 47182615158784, 47182615162879, +STORE, 47182615162880, 47182615166975, +ERASE, 47182614913024, 47182614913024, +STORE, 47182614913024, 47182615109631, +STORE, 47182615109632, 47182615117823, +ERASE, 47182612029440, 47182612029440, +STORE, 47182612029440, 47182612066303, +STORE, 47182612066304, 47182612082687, +ERASE, 47182611476480, 47182611476480, +STORE, 47182611476480, 47182611480575, +STORE, 47182611480576, 47182611484671, +ERASE, 47182611161088, 47182611161088, +STORE, 47182611161088, 
47182611165183, +STORE, 47182611165184, 47182611169279, +ERASE, 47182608891904, 47182608891904, +STORE, 47182608891904, 47182608912383, +STORE, 47182608912384, 47182608916479, +ERASE, 47182608560128, 47182608560128, +STORE, 47182608560128, 47182608564223, +STORE, 47182608564224, 47182608568319, +ERASE, 47182608515072, 47182608515072, +STORE, 47182608515072, 47182608519167, +STORE, 47182608519168, 47182608523263, +ERASE, 93963664379904, 93963664379904, +STORE, 93963664379904, 93963664502783, +STORE, 93963664502784, 93963664506879, +ERASE, 140450188599296, 140450188599296, +STORE, 140450188599296, 140450188603391, +STORE, 140450188603392, 140450188607487, +ERASE, 47182606557184, 47182606557184, +STORE, 93963694723072, 93963694858239, +STORE, 140737488347136, 140737488351231, +STORE, 140730313261056, 140737488351231, +ERASE, 140730313261056, 140730313261056, +STORE, 140730313261056, 140730313265151, +STORE, 94386579017728, 94386579697663, +ERASE, 94386579017728, 94386579017728, +STORE, 94386579017728, 94386579083263, +STORE, 94386579083264, 94386579697663, +ERASE, 94386579083264, 94386579083264, +STORE, 94386579083264, 94386579431423, +STORE, 94386579431424, 94386579570687, +STORE, 94386579570688, 94386579697663, +STORE, 140124810838016, 140124811010047, +ERASE, 140124810838016, 140124810838016, +STORE, 140124810838016, 140124810842111, +STORE, 140124810842112, 140124811010047, +ERASE, 140124810842112, 140124810842112, +STORE, 140124810842112, 140124810964991, +STORE, 140124810964992, 140124810997759, +STORE, 140124810997760, 140124811005951, +STORE, 140124811005952, 140124811010047, +STORE, 140730313601024, 140730313605119, +STORE, 140730313588736, 140730313601023, +STORE, 47507984158720, 47507984166911, +STORE, 47507984166912, 47507984175103, +STORE, 47507984175104, 47507986014207, +STORE, 47507984314368, 47507986014207, +STORE, 47507984175104, 47507984314367, +ERASE, 47507984314368, 47507984314368, +STORE, 47507984314368, 47507985973247, +STORE, 47507985973248, 
47507986014207, +STORE, 47507985657856, 47507985973247, +STORE, 47507984314368, 47507985657855, +ERASE, 47507984314368, 47507984314368, +STORE, 47507984314368, 47507985657855, +STORE, 47507985969152, 47507985973247, +STORE, 47507985657856, 47507985969151, +ERASE, 47507985657856, 47507985657856, +STORE, 47507985657856, 47507985969151, +STORE, 47507985997824, 47507986014207, +STORE, 47507985973248, 47507985997823, +ERASE, 47507985973248, 47507985973248, +STORE, 47507985973248, 47507985997823, +ERASE, 47507985997824, 47507985997824, +STORE, 47507985997824, 47507986014207, +STORE, 47507986014208, 47507986124799, +STORE, 47507986030592, 47507986124799, +STORE, 47507986014208, 47507986030591, +ERASE, 47507986030592, 47507986030592, +STORE, 47507986030592, 47507986116607, +STORE, 47507986116608, 47507986124799, +STORE, 47507986092032, 47507986116607, +STORE, 47507986030592, 47507986092031, +ERASE, 47507986030592, 47507986030592, +STORE, 47507986030592, 47507986092031, +STORE, 47507986112512, 47507986116607, +STORE, 47507986092032, 47507986112511, +ERASE, 47507986092032, 47507986092032, +STORE, 47507986092032, 47507986112511, +ERASE, 47507986116608, 47507986116608, +STORE, 47507986116608, 47507986124799, +STORE, 47507986124800, 47507986169855, +ERASE, 47507986124800, 47507986124800, +STORE, 47507986124800, 47507986132991, +STORE, 47507986132992, 47507986169855, +STORE, 47507986153472, 47507986169855, +STORE, 47507986132992, 47507986153471, +ERASE, 47507986132992, 47507986132992, +STORE, 47507986132992, 47507986153471, +STORE, 47507986161664, 47507986169855, +STORE, 47507986153472, 47507986161663, +ERASE, 47507986153472, 47507986153472, +STORE, 47507986153472, 47507986169855, +ERASE, 47507986153472, 47507986153472, +STORE, 47507986153472, 47507986161663, +STORE, 47507986161664, 47507986169855, +ERASE, 47507986161664, 47507986161664, +STORE, 47507986161664, 47507986169855, +STORE, 47507986169856, 47507986518015, +STORE, 47507986210816, 47507986518015, +STORE, 47507986169856, 
47507986210815, +ERASE, 47507986210816, 47507986210816, +STORE, 47507986210816, 47507986493439, +STORE, 47507986493440, 47507986518015, +STORE, 47507986423808, 47507986493439, +STORE, 47507986210816, 47507986423807, +ERASE, 47507986210816, 47507986210816, +STORE, 47507986210816, 47507986423807, +STORE, 47507986489344, 47507986493439, +STORE, 47507986423808, 47507986489343, +ERASE, 47507986423808, 47507986423808, +STORE, 47507986423808, 47507986489343, +ERASE, 47507986493440, 47507986493440, +STORE, 47507986493440, 47507986518015, +STORE, 47507986518016, 47507988779007, +STORE, 47507986669568, 47507988779007, +STORE, 47507986518016, 47507986669567, +ERASE, 47507986669568, 47507986669568, +STORE, 47507986669568, 47507988762623, +STORE, 47507988762624, 47507988779007, +STORE, 47507988770816, 47507988779007, +STORE, 47507988762624, 47507988770815, +ERASE, 47507988762624, 47507988762624, +STORE, 47507988762624, 47507988770815, +ERASE, 47507988770816, 47507988770816, +STORE, 47507988770816, 47507988779007, +STORE, 47507988779008, 47507988914175, +ERASE, 47507988779008, 47507988779008, +STORE, 47507988779008, 47507988803583, +STORE, 47507988803584, 47507988914175, +STORE, 47507988865024, 47507988914175, +STORE, 47507988803584, 47507988865023, +ERASE, 47507988803584, 47507988803584, +STORE, 47507988803584, 47507988865023, +STORE, 47507988889600, 47507988914175, +STORE, 47507988865024, 47507988889599, +ERASE, 47507988865024, 47507988865024, +STORE, 47507988865024, 47507988914175, +ERASE, 47507988865024, 47507988865024, +STORE, 47507988865024, 47507988889599, +STORE, 47507988889600, 47507988914175, +STORE, 47507988897792, 47507988914175, +STORE, 47507988889600, 47507988897791, +ERASE, 47507988889600, 47507988889600, +STORE, 47507988889600, 47507988897791, +ERASE, 47507988897792, 47507988897792, +STORE, 47507988897792, 47507988914175, +STORE, 47507988897792, 47507988922367, +STORE, 47507988922368, 47507989086207, +ERASE, 47507988922368, 47507988922368, +STORE, 47507988922368, 
47507988934655, +STORE, 47507988934656, 47507989086207, +STORE, 47507989032960, 47507989086207, +STORE, 47507988934656, 47507989032959, +ERASE, 47507988934656, 47507988934656, +STORE, 47507988934656, 47507989032959, +STORE, 47507989078016, 47507989086207, +STORE, 47507989032960, 47507989078015, +ERASE, 47507989032960, 47507989032960, +STORE, 47507989032960, 47507989086207, +ERASE, 47507989032960, 47507989032960, +STORE, 47507989032960, 47507989078015, +STORE, 47507989078016, 47507989086207, +ERASE, 47507989078016, 47507989078016, +STORE, 47507989078016, 47507989086207, +STORE, 47507989086208, 47507989684223, +STORE, 47507989204992, 47507989684223, +STORE, 47507989086208, 47507989204991, +ERASE, 47507989204992, 47507989204992, +STORE, 47507989204992, 47507989630975, +STORE, 47507989630976, 47507989684223, +STORE, 47507989520384, 47507989630975, +STORE, 47507989204992, 47507989520383, +ERASE, 47507989204992, 47507989204992, +STORE, 47507989204992, 47507989520383, +STORE, 47507989626880, 47507989630975, +STORE, 47507989520384, 47507989626879, +ERASE, 47507989520384, 47507989520384, +STORE, 47507989520384, 47507989626879, +ERASE, 47507989630976, 47507989630976, +STORE, 47507989630976, 47507989684223, +STORE, 47507989684224, 47507992735743, +STORE, 47507990228992, 47507992735743, +STORE, 47507989684224, 47507990228991, +ERASE, 47507990228992, 47507990228992, +STORE, 47507990228992, 47507992514559, +STORE, 47507992514560, 47507992735743, +STORE, 47507991924736, 47507992514559, +STORE, 47507990228992, 47507991924735, +ERASE, 47507990228992, 47507990228992, +STORE, 47507990228992, 47507991924735, +STORE, 47507992510464, 47507992514559, +STORE, 47507991924736, 47507992510463, +ERASE, 47507991924736, 47507991924736, +STORE, 47507991924736, 47507992510463, +STORE, 47507992719360, 47507992735743, +STORE, 47507992514560, 47507992719359, +ERASE, 47507992514560, 47507992514560, +STORE, 47507992514560, 47507992719359, +ERASE, 47507992719360, 47507992719360, +STORE, 47507992719360, 
47507992735743, +STORE, 47507992735744, 47507992768511, +ERASE, 47507992735744, 47507992735744, +STORE, 47507992735744, 47507992743935, +STORE, 47507992743936, 47507992768511, +STORE, 47507992756224, 47507992768511, +STORE, 47507992743936, 47507992756223, +ERASE, 47507992743936, 47507992743936, +STORE, 47507992743936, 47507992756223, +STORE, 47507992760320, 47507992768511, +STORE, 47507992756224, 47507992760319, +ERASE, 47507992756224, 47507992756224, +STORE, 47507992756224, 47507992768511, +ERASE, 47507992756224, 47507992756224, +STORE, 47507992756224, 47507992760319, +STORE, 47507992760320, 47507992768511, +ERASE, 47507992760320, 47507992760320, +STORE, 47507992760320, 47507992768511, +STORE, 47507992768512, 47507992805375, +ERASE, 47507992768512, 47507992768512, +STORE, 47507992768512, 47507992776703, +STORE, 47507992776704, 47507992805375, +STORE, 47507992793088, 47507992805375, +STORE, 47507992776704, 47507992793087, +ERASE, 47507992776704, 47507992776704, +STORE, 47507992776704, 47507992793087, +STORE, 47507992797184, 47507992805375, +STORE, 47507992793088, 47507992797183, +ERASE, 47507992793088, 47507992793088, +STORE, 47507992793088, 47507992805375, +ERASE, 47507992793088, 47507992793088, +STORE, 47507992793088, 47507992797183, +STORE, 47507992797184, 47507992805375, +ERASE, 47507992797184, 47507992797184, +STORE, 47507992797184, 47507992805375, +STORE, 47507992805376, 47507993280511, +ERASE, 47507992805376, 47507992805376, +STORE, 47507992805376, 47507992813567, +STORE, 47507992813568, 47507993280511, +STORE, 47507993149440, 47507993280511, +STORE, 47507992813568, 47507993149439, +ERASE, 47507992813568, 47507992813568, +STORE, 47507992813568, 47507993149439, +STORE, 47507993272320, 47507993280511, +STORE, 47507993149440, 47507993272319, +ERASE, 47507993149440, 47507993149440, +STORE, 47507993149440, 47507993280511, +ERASE, 47507993149440, 47507993149440, +STORE, 47507993149440, 47507993272319, +STORE, 47507993272320, 47507993280511, +ERASE, 47507993272320, 
47507993272320, +STORE, 47507993272320, 47507993280511, +STORE, 47507993280512, 47507993288703, +STORE, 47507993288704, 47507993309183, +ERASE, 47507993288704, 47507993288704, +STORE, 47507993288704, 47507993292799, +STORE, 47507993292800, 47507993309183, +STORE, 47507993296896, 47507993309183, +STORE, 47507993292800, 47507993296895, +ERASE, 47507993292800, 47507993292800, +STORE, 47507993292800, 47507993296895, +STORE, 47507993300992, 47507993309183, +STORE, 47507993296896, 47507993300991, +ERASE, 47507993296896, 47507993296896, +STORE, 47507993296896, 47507993309183, +ERASE, 47507993296896, 47507993296896, +STORE, 47507993296896, 47507993300991, +STORE, 47507993300992, 47507993309183, +ERASE, 47507993300992, 47507993300992, +STORE, 47507993300992, 47507993309183, +STORE, 47507993309184, 47507993317375, +ERASE, 47507985973248, 47507985973248, +STORE, 47507985973248, 47507985989631, +STORE, 47507985989632, 47507985997823, +ERASE, 47507993300992, 47507993300992, +STORE, 47507993300992, 47507993305087, +STORE, 47507993305088, 47507993309183, +ERASE, 47507988889600, 47507988889600, +STORE, 47507988889600, 47507988893695, +STORE, 47507988893696, 47507988897791, +ERASE, 47507993272320, 47507993272320, +STORE, 47507993272320, 47507993276415, +STORE, 47507993276416, 47507993280511, +ERASE, 47507992797184, 47507992797184, +STORE, 47507992797184, 47507992801279, +STORE, 47507992801280, 47507992805375, +ERASE, 47507992760320, 47507992760320, +STORE, 47507992760320, 47507992764415, +STORE, 47507992764416, 47507992768511, +ERASE, 47507992514560, 47507992514560, +STORE, 47507992514560, 47507992711167, +STORE, 47507992711168, 47507992719359, +ERASE, 47507989630976, 47507989630976, +STORE, 47507989630976, 47507989667839, +STORE, 47507989667840, 47507989684223, +ERASE, 47507989078016, 47507989078016, +STORE, 47507989078016, 47507989082111, +STORE, 47507989082112, 47507989086207, +ERASE, 47507988762624, 47507988762624, +STORE, 47507988762624, 47507988766719, +STORE, 47507988766720, 
47507988770815, +ERASE, 47507986493440, 47507986493440, +STORE, 47507986493440, 47507986513919, +STORE, 47507986513920, 47507986518015, +ERASE, 47507986161664, 47507986161664, +STORE, 47507986161664, 47507986165759, +STORE, 47507986165760, 47507986169855, +ERASE, 47507986116608, 47507986116608, +STORE, 47507986116608, 47507986120703, +STORE, 47507986120704, 47507986124799, +ERASE, 94386579570688, 94386579570688, +STORE, 94386579570688, 94386579693567, +STORE, 94386579693568, 94386579697663, +ERASE, 140124810997760, 140124810997760, +STORE, 140124810997760, 140124811001855, +STORE, 140124811001856, 140124811005951, +ERASE, 47507984158720, 47507984158720, +STORE, 94386583982080, 94386584117247, +STORE, 94386583982080, 94386584256511, +ERASE, 94386583982080, 94386583982080, +STORE, 94386583982080, 94386584223743, +STORE, 94386584223744, 94386584256511, +ERASE, 94386584223744, 94386584223744, +STORE, 140737488347136, 140737488351231, +STORE, 140733763395584, 140737488351231, +ERASE, 140733763395584, 140733763395584, +STORE, 140733763395584, 140733763399679, +STORE, 94011546472448, 94011547152383, +ERASE, 94011546472448, 94011546472448, +STORE, 94011546472448, 94011546537983, +STORE, 94011546537984, 94011547152383, +ERASE, 94011546537984, 94011546537984, +STORE, 94011546537984, 94011546886143, +STORE, 94011546886144, 94011547025407, +STORE, 94011547025408, 94011547152383, +STORE, 139757597949952, 139757598121983, +ERASE, 139757597949952, 139757597949952, +STORE, 139757597949952, 139757597954047, +STORE, 139757597954048, 139757598121983, +ERASE, 139757597954048, 139757597954048, +STORE, 139757597954048, 139757598076927, +STORE, 139757598076928, 139757598109695, +STORE, 139757598109696, 139757598117887, +STORE, 139757598117888, 139757598121983, +STORE, 140733763596288, 140733763600383, +STORE, 140733763584000, 140733763596287, +STORE, 47875197046784, 47875197054975, +STORE, 47875197054976, 47875197063167, +STORE, 47875197063168, 47875198902271, +STORE, 47875197202432, 
47875198902271, +STORE, 47875197063168, 47875197202431, +ERASE, 47875197202432, 47875197202432, +STORE, 47875197202432, 47875198861311, +STORE, 47875198861312, 47875198902271, +STORE, 47875198545920, 47875198861311, +STORE, 47875197202432, 47875198545919, +ERASE, 47875197202432, 47875197202432, +STORE, 47875197202432, 47875198545919, +STORE, 47875198857216, 47875198861311, +STORE, 47875198545920, 47875198857215, +ERASE, 47875198545920, 47875198545920, +STORE, 47875198545920, 47875198857215, +STORE, 47875198885888, 47875198902271, +STORE, 47875198861312, 47875198885887, +ERASE, 47875198861312, 47875198861312, +STORE, 47875198861312, 47875198885887, +ERASE, 47875198885888, 47875198885888, +STORE, 47875198885888, 47875198902271, +STORE, 47875198902272, 47875199012863, +STORE, 47875198918656, 47875199012863, +STORE, 47875198902272, 47875198918655, +ERASE, 47875198918656, 47875198918656, +STORE, 47875198918656, 47875199004671, +STORE, 47875199004672, 47875199012863, +STORE, 47875198980096, 47875199004671, +STORE, 47875198918656, 47875198980095, +ERASE, 47875198918656, 47875198918656, +STORE, 47875198918656, 47875198980095, +STORE, 47875199000576, 47875199004671, +STORE, 47875198980096, 47875199000575, +ERASE, 47875198980096, 47875198980096, +STORE, 47875198980096, 47875199000575, +ERASE, 47875199004672, 47875199004672, +STORE, 47875199004672, 47875199012863, +STORE, 47875199012864, 47875199057919, +ERASE, 47875199012864, 47875199012864, +STORE, 47875199012864, 47875199021055, +STORE, 47875199021056, 47875199057919, +STORE, 47875199041536, 47875199057919, +STORE, 47875199021056, 47875199041535, +ERASE, 47875199021056, 47875199021056, +STORE, 47875199021056, 47875199041535, +STORE, 47875199049728, 47875199057919, +STORE, 47875199041536, 47875199049727, +ERASE, 47875199041536, 47875199041536, +STORE, 47875199041536, 47875199057919, +ERASE, 47875199041536, 47875199041536, +STORE, 47875199041536, 47875199049727, +STORE, 47875199049728, 47875199057919, +ERASE, 47875199049728, 
47875199049728, +STORE, 47875199049728, 47875199057919, +STORE, 47875199057920, 47875199406079, +STORE, 47875199098880, 47875199406079, +STORE, 47875199057920, 47875199098879, +ERASE, 47875199098880, 47875199098880, +STORE, 47875199098880, 47875199381503, +STORE, 47875199381504, 47875199406079, +STORE, 47875199311872, 47875199381503, +STORE, 47875199098880, 47875199311871, +ERASE, 47875199098880, 47875199098880, +STORE, 47875199098880, 47875199311871, +STORE, 47875199377408, 47875199381503, +STORE, 47875199311872, 47875199377407, +ERASE, 47875199311872, 47875199311872, +STORE, 47875199311872, 47875199377407, +ERASE, 47875199381504, 47875199381504, +STORE, 47875199381504, 47875199406079, +STORE, 47875199406080, 47875201667071, +STORE, 47875199557632, 47875201667071, +STORE, 47875199406080, 47875199557631, +ERASE, 47875199557632, 47875199557632, +STORE, 47875199557632, 47875201650687, +STORE, 47875201650688, 47875201667071, +STORE, 47875201658880, 47875201667071, +STORE, 47875201650688, 47875201658879, +ERASE, 47875201650688, 47875201650688, +STORE, 47875201650688, 47875201658879, +ERASE, 47875201658880, 47875201658880, +STORE, 47875201658880, 47875201667071, +STORE, 47875201667072, 47875201802239, +ERASE, 47875201667072, 47875201667072, +STORE, 47875201667072, 47875201691647, +STORE, 47875201691648, 47875201802239, +STORE, 47875201753088, 47875201802239, +STORE, 47875201691648, 47875201753087, +ERASE, 47875201691648, 47875201691648, +STORE, 47875201691648, 47875201753087, +STORE, 47875201777664, 47875201802239, +STORE, 47875201753088, 47875201777663, +ERASE, 47875201753088, 47875201753088, +STORE, 47875201753088, 47875201802239, +ERASE, 47875201753088, 47875201753088, +STORE, 47875201753088, 47875201777663, +STORE, 47875201777664, 47875201802239, +STORE, 47875201785856, 47875201802239, +STORE, 47875201777664, 47875201785855, +ERASE, 47875201777664, 47875201777664, +STORE, 47875201777664, 47875201785855, +ERASE, 47875201785856, 47875201785856, +STORE, 47875201785856, 
47875201802239, +STORE, 47875201785856, 47875201810431, +STORE, 47875201810432, 47875201974271, +ERASE, 47875201810432, 47875201810432, +STORE, 47875201810432, 47875201822719, +STORE, 47875201822720, 47875201974271, +STORE, 47875201921024, 47875201974271, +STORE, 47875201822720, 47875201921023, +ERASE, 47875201822720, 47875201822720, +STORE, 47875201822720, 47875201921023, +STORE, 47875201966080, 47875201974271, +STORE, 47875201921024, 47875201966079, +ERASE, 47875201921024, 47875201921024, +STORE, 47875201921024, 47875201974271, +ERASE, 47875201921024, 47875201921024, +STORE, 47875201921024, 47875201966079, +STORE, 47875201966080, 47875201974271, +ERASE, 47875201966080, 47875201966080, +STORE, 47875201966080, 47875201974271, +STORE, 47875201974272, 47875202572287, +STORE, 47875202093056, 47875202572287, +STORE, 47875201974272, 47875202093055, +ERASE, 47875202093056, 47875202093056, +STORE, 47875202093056, 47875202519039, +STORE, 47875202519040, 47875202572287, +STORE, 47875202408448, 47875202519039, +STORE, 47875202093056, 47875202408447, +ERASE, 47875202093056, 47875202093056, +STORE, 47875202093056, 47875202408447, +STORE, 47875202514944, 47875202519039, +STORE, 47875202408448, 47875202514943, +ERASE, 47875202408448, 47875202408448, +STORE, 47875202408448, 47875202514943, +ERASE, 47875202519040, 47875202519040, +STORE, 47875202519040, 47875202572287, +STORE, 47875202572288, 47875205623807, +STORE, 47875203117056, 47875205623807, +STORE, 47875202572288, 47875203117055, +ERASE, 47875203117056, 47875203117056, +STORE, 47875203117056, 47875205402623, +STORE, 47875205402624, 47875205623807, +STORE, 47875204812800, 47875205402623, +STORE, 47875203117056, 47875204812799, +ERASE, 47875203117056, 47875203117056, +STORE, 47875203117056, 47875204812799, +STORE, 47875205398528, 47875205402623, +STORE, 47875204812800, 47875205398527, +ERASE, 47875204812800, 47875204812800, +STORE, 47875204812800, 47875205398527, +STORE, 47875205607424, 47875205623807, +STORE, 47875205402624, 
47875205607423, +ERASE, 47875205402624, 47875205402624, +STORE, 47875205402624, 47875205607423, +ERASE, 47875205607424, 47875205607424, +STORE, 47875205607424, 47875205623807, +STORE, 47875205623808, 47875205656575, +ERASE, 47875205623808, 47875205623808, +STORE, 47875205623808, 47875205631999, +STORE, 47875205632000, 47875205656575, +STORE, 47875205644288, 47875205656575, +STORE, 47875205632000, 47875205644287, +ERASE, 47875205632000, 47875205632000, +STORE, 47875205632000, 47875205644287, +STORE, 47875205648384, 47875205656575, +STORE, 47875205644288, 47875205648383, +ERASE, 47875205644288, 47875205644288, +STORE, 47875205644288, 47875205656575, +ERASE, 47875205644288, 47875205644288, +STORE, 47875205644288, 47875205648383, +STORE, 47875205648384, 47875205656575, +ERASE, 47875205648384, 47875205648384, +STORE, 47875205648384, 47875205656575, +STORE, 47875205656576, 47875205693439, +ERASE, 47875205656576, 47875205656576, +STORE, 47875205656576, 47875205664767, +STORE, 47875205664768, 47875205693439, +STORE, 47875205681152, 47875205693439, +STORE, 47875205664768, 47875205681151, +ERASE, 47875205664768, 47875205664768, +STORE, 47875205664768, 47875205681151, +STORE, 47875205685248, 47875205693439, +STORE, 47875205681152, 47875205685247, +ERASE, 47875205681152, 47875205681152, +STORE, 47875205681152, 47875205693439, +ERASE, 47875205681152, 47875205681152, +STORE, 47875205681152, 47875205685247, +STORE, 47875205685248, 47875205693439, +ERASE, 47875205685248, 47875205685248, +STORE, 47875205685248, 47875205693439, +STORE, 47875205693440, 47875206168575, +ERASE, 47875205693440, 47875205693440, +STORE, 47875205693440, 47875205701631, +STORE, 47875205701632, 47875206168575, +STORE, 47875206037504, 47875206168575, +STORE, 47875205701632, 47875206037503, +ERASE, 47875205701632, 47875205701632, +STORE, 47875205701632, 47875206037503, +STORE, 47875206160384, 47875206168575, +STORE, 47875206037504, 47875206160383, +ERASE, 47875206037504, 47875206037504, +STORE, 47875206037504, 
47875206168575, +ERASE, 47875206037504, 47875206037504, +STORE, 47875206037504, 47875206160383, +STORE, 47875206160384, 47875206168575, +ERASE, 47875206160384, 47875206160384, +STORE, 47875206160384, 47875206168575, +STORE, 47875206168576, 47875206176767, +STORE, 47875206176768, 47875206197247, +ERASE, 47875206176768, 47875206176768, +STORE, 47875206176768, 47875206180863, +STORE, 47875206180864, 47875206197247, +STORE, 47875206184960, 47875206197247, +STORE, 47875206180864, 47875206184959, +ERASE, 47875206180864, 47875206180864, +STORE, 47875206180864, 47875206184959, +STORE, 47875206189056, 47875206197247, +STORE, 47875206184960, 47875206189055, +ERASE, 47875206184960, 47875206184960, +STORE, 47875206184960, 47875206197247, +ERASE, 47875206184960, 47875206184960, +STORE, 47875206184960, 47875206189055, +STORE, 47875206189056, 47875206197247, +ERASE, 47875206189056, 47875206189056, +STORE, 47875206189056, 47875206197247, +STORE, 47875206197248, 47875206205439, +ERASE, 47875198861312, 47875198861312, +STORE, 47875198861312, 47875198877695, +STORE, 47875198877696, 47875198885887, +ERASE, 47875206189056, 47875206189056, +STORE, 47875206189056, 47875206193151, +STORE, 47875206193152, 47875206197247, +ERASE, 47875201777664, 47875201777664, +STORE, 47875201777664, 47875201781759, +STORE, 47875201781760, 47875201785855, +ERASE, 47875206160384, 47875206160384, +STORE, 47875206160384, 47875206164479, +STORE, 47875206164480, 47875206168575, +ERASE, 47875205685248, 47875205685248, +STORE, 47875205685248, 47875205689343, +STORE, 47875205689344, 47875205693439, +ERASE, 47875205648384, 47875205648384, +STORE, 47875205648384, 47875205652479, +STORE, 47875205652480, 47875205656575, +ERASE, 47875205402624, 47875205402624, +STORE, 47875205402624, 47875205599231, +STORE, 47875205599232, 47875205607423, +ERASE, 47875202519040, 47875202519040, +STORE, 47875202519040, 47875202555903, +STORE, 47875202555904, 47875202572287, +ERASE, 47875201966080, 47875201966080, +STORE, 47875201966080, 
47875201970175, +STORE, 47875201970176, 47875201974271, +ERASE, 47875201650688, 47875201650688, +STORE, 47875201650688, 47875201654783, +STORE, 47875201654784, 47875201658879, +ERASE, 47875199381504, 47875199381504, +STORE, 47875199381504, 47875199401983, +STORE, 47875199401984, 47875199406079, +ERASE, 47875199049728, 47875199049728, +STORE, 47875199049728, 47875199053823, +STORE, 47875199053824, 47875199057919, +ERASE, 47875199004672, 47875199004672, +STORE, 47875199004672, 47875199008767, +STORE, 47875199008768, 47875199012863, +ERASE, 94011547025408, 94011547025408, +STORE, 94011547025408, 94011547148287, +STORE, 94011547148288, 94011547152383, +ERASE, 139757598109696, 139757598109696, +STORE, 139757598109696, 139757598113791, +STORE, 139757598113792, 139757598117887, +ERASE, 47875197046784, 47875197046784, +STORE, 94011557584896, 94011557720063, +STORE, 94011557584896, 94011557855231, +ERASE, 94011557584896, 94011557584896, +STORE, 94011557584896, 94011557851135, +STORE, 94011557851136, 94011557855231, +ERASE, 94011557851136, 94011557851136, +ERASE, 94011557584896, 94011557584896, +STORE, 94011557584896, 94011557847039, +STORE, 94011557847040, 94011557851135, +ERASE, 94011557847040, 94011557847040, +STORE, 94011557584896, 94011557982207, +ERASE, 94011557584896, 94011557584896, +STORE, 94011557584896, 94011557978111, +STORE, 94011557978112, 94011557982207, +ERASE, 94011557978112, 94011557978112, +ERASE, 94011557584896, 94011557584896, +STORE, 94011557584896, 94011557974015, +STORE, 94011557974016, 94011557978111, +ERASE, 94011557974016, 94011557974016, +STORE, 140737488347136, 140737488351231, +STORE, 140734130360320, 140737488351231, +ERASE, 140734130360320, 140734130360320, +STORE, 140734130360320, 140734130364415, +STORE, 94641232105472, 94641232785407, +ERASE, 94641232105472, 94641232105472, +STORE, 94641232105472, 94641232171007, +STORE, 94641232171008, 94641232785407, +ERASE, 94641232171008, 94641232171008, +STORE, 94641232171008, 94641232519167, +STORE, 
94641232519168, 94641232658431, +STORE, 94641232658432, 94641232785407, +STORE, 139726599516160, 139726599688191, +ERASE, 139726599516160, 139726599516160, +STORE, 139726599516160, 139726599520255, +STORE, 139726599520256, 139726599688191, +ERASE, 139726599520256, 139726599520256, +STORE, 139726599520256, 139726599643135, +STORE, 139726599643136, 139726599675903, +STORE, 139726599675904, 139726599684095, +STORE, 139726599684096, 139726599688191, +STORE, 140734130446336, 140734130450431, +STORE, 140734130434048, 140734130446335, +STORE, 47906195480576, 47906195488767, +STORE, 47906195488768, 47906195496959, +STORE, 47906195496960, 47906197336063, +STORE, 47906195636224, 47906197336063, +STORE, 47906195496960, 47906195636223, +ERASE, 47906195636224, 47906195636224, +STORE, 47906195636224, 47906197295103, +STORE, 47906197295104, 47906197336063, +STORE, 47906196979712, 47906197295103, +STORE, 47906195636224, 47906196979711, +ERASE, 47906195636224, 47906195636224, +STORE, 47906195636224, 47906196979711, +STORE, 47906197291008, 47906197295103, +STORE, 47906196979712, 47906197291007, +ERASE, 47906196979712, 47906196979712, +STORE, 47906196979712, 47906197291007, +STORE, 47906197319680, 47906197336063, +STORE, 47906197295104, 47906197319679, +ERASE, 47906197295104, 47906197295104, +STORE, 47906197295104, 47906197319679, +ERASE, 47906197319680, 47906197319680, +STORE, 47906197319680, 47906197336063, +STORE, 47906197336064, 47906197446655, +STORE, 47906197352448, 47906197446655, +STORE, 47906197336064, 47906197352447, +ERASE, 47906197352448, 47906197352448, +STORE, 47906197352448, 47906197438463, +STORE, 47906197438464, 47906197446655, +STORE, 47906197413888, 47906197438463, +STORE, 47906197352448, 47906197413887, +ERASE, 47906197352448, 47906197352448, +STORE, 47906197352448, 47906197413887, +STORE, 47906197434368, 47906197438463, +STORE, 47906197413888, 47906197434367, +ERASE, 47906197413888, 47906197413888, +STORE, 47906197413888, 47906197434367, +ERASE, 47906197438464, 
47906197438464, +STORE, 47906197438464, 47906197446655, +STORE, 47906197446656, 47906197491711, +ERASE, 47906197446656, 47906197446656, +STORE, 47906197446656, 47906197454847, +STORE, 47906197454848, 47906197491711, +STORE, 47906197475328, 47906197491711, +STORE, 47906197454848, 47906197475327, +ERASE, 47906197454848, 47906197454848, +STORE, 47906197454848, 47906197475327, +STORE, 47906197483520, 47906197491711, +STORE, 47906197475328, 47906197483519, +ERASE, 47906197475328, 47906197475328, +STORE, 47906197475328, 47906197491711, +ERASE, 47906197475328, 47906197475328, +STORE, 47906197475328, 47906197483519, +STORE, 47906197483520, 47906197491711, +ERASE, 47906197483520, 47906197483520, +STORE, 47906197483520, 47906197491711, +STORE, 47906197491712, 47906197839871, +STORE, 47906197532672, 47906197839871, +STORE, 47906197491712, 47906197532671, +ERASE, 47906197532672, 47906197532672, +STORE, 47906197532672, 47906197815295, +STORE, 47906197815296, 47906197839871, +STORE, 47906197745664, 47906197815295, +STORE, 47906197532672, 47906197745663, +ERASE, 47906197532672, 47906197532672, +STORE, 47906197532672, 47906197745663, +STORE, 47906197811200, 47906197815295, +STORE, 47906197745664, 47906197811199, +ERASE, 47906197745664, 47906197745664, +STORE, 47906197745664, 47906197811199, +ERASE, 47906197815296, 47906197815296, +STORE, 47906197815296, 47906197839871, +STORE, 47906197839872, 47906200100863, +STORE, 47906197991424, 47906200100863, +STORE, 47906197839872, 47906197991423, +ERASE, 47906197991424, 47906197991424, +STORE, 47906197991424, 47906200084479, +STORE, 47906200084480, 47906200100863, +STORE, 47906200092672, 47906200100863, +STORE, 47906200084480, 47906200092671, +ERASE, 47906200084480, 47906200084480, +STORE, 47906200084480, 47906200092671, +ERASE, 47906200092672, 47906200092672, +STORE, 47906200092672, 47906200100863, +STORE, 47906200100864, 47906200236031, +ERASE, 47906200100864, 47906200100864, +STORE, 47906200100864, 47906200125439, +STORE, 47906200125440, 
47906200236031, +STORE, 47906200186880, 47906200236031, +STORE, 47906200125440, 47906200186879, +ERASE, 47906200125440, 47906200125440, +STORE, 47906200125440, 47906200186879, +STORE, 47906200211456, 47906200236031, +STORE, 47906200186880, 47906200211455, +ERASE, 47906200186880, 47906200186880, +STORE, 47906200186880, 47906200236031, +ERASE, 47906200186880, 47906200186880, +STORE, 47906200186880, 47906200211455, +STORE, 47906200211456, 47906200236031, +STORE, 47906200219648, 47906200236031, +STORE, 47906200211456, 47906200219647, +ERASE, 47906200211456, 47906200211456, +STORE, 47906200211456, 47906200219647, +ERASE, 47906200219648, 47906200219648, +STORE, 47906200219648, 47906200236031, +STORE, 47906200219648, 47906200244223, +STORE, 47906200244224, 47906200408063, +ERASE, 47906200244224, 47906200244224, +STORE, 47906200244224, 47906200256511, +STORE, 47906200256512, 47906200408063, +STORE, 47906200354816, 47906200408063, +STORE, 47906200256512, 47906200354815, +ERASE, 47906200256512, 47906200256512, +STORE, 47906200256512, 47906200354815, +STORE, 47906200399872, 47906200408063, +STORE, 47906200354816, 47906200399871, +ERASE, 47906200354816, 47906200354816, +STORE, 47906200354816, 47906200408063, +ERASE, 47906200354816, 47906200354816, +STORE, 47906200354816, 47906200399871, +STORE, 47906200399872, 47906200408063, +ERASE, 47906200399872, 47906200399872, +STORE, 47906200399872, 47906200408063, +STORE, 47906200408064, 47906201006079, +STORE, 47906200526848, 47906201006079, +STORE, 47906200408064, 47906200526847, +ERASE, 47906200526848, 47906200526848, +STORE, 47906200526848, 47906200952831, +STORE, 47906200952832, 47906201006079, +STORE, 47906200842240, 47906200952831, +STORE, 47906200526848, 47906200842239, +ERASE, 47906200526848, 47906200526848, +STORE, 47906200526848, 47906200842239, +STORE, 47906200948736, 47906200952831, +STORE, 47906200842240, 47906200948735, +ERASE, 47906200842240, 47906200842240, +STORE, 47906200842240, 47906200948735, +ERASE, 47906200952832, 
47906200952832, +STORE, 47906200952832, 47906201006079, +STORE, 47906201006080, 47906204057599, +STORE, 47906201550848, 47906204057599, +STORE, 47906201006080, 47906201550847, +ERASE, 47906201550848, 47906201550848, +STORE, 47906201550848, 47906203836415, +STORE, 47906203836416, 47906204057599, +STORE, 47906203246592, 47906203836415, +STORE, 47906201550848, 47906203246591, +ERASE, 47906201550848, 47906201550848, +STORE, 47906201550848, 47906203246591, +STORE, 47906203832320, 47906203836415, +STORE, 47906203246592, 47906203832319, +ERASE, 47906203246592, 47906203246592, +STORE, 47906203246592, 47906203832319, +STORE, 47906204041216, 47906204057599, +STORE, 47906203836416, 47906204041215, +ERASE, 47906203836416, 47906203836416, +STORE, 47906203836416, 47906204041215, +ERASE, 47906204041216, 47906204041216, +STORE, 47906204041216, 47906204057599, +STORE, 47906204057600, 47906204090367, +ERASE, 47906204057600, 47906204057600, +STORE, 47906204057600, 47906204065791, +STORE, 47906204065792, 47906204090367, +STORE, 47906204078080, 47906204090367, +STORE, 47906204065792, 47906204078079, +ERASE, 47906204065792, 47906204065792, +STORE, 47906204065792, 47906204078079, +STORE, 47906204082176, 47906204090367, +STORE, 47906204078080, 47906204082175, +ERASE, 47906204078080, 47906204078080, +STORE, 47906204078080, 47906204090367, +ERASE, 47906204078080, 47906204078080, +STORE, 47906204078080, 47906204082175, +STORE, 47906204082176, 47906204090367, +ERASE, 47906204082176, 47906204082176, +STORE, 47906204082176, 47906204090367, +STORE, 47906204090368, 47906204127231, +ERASE, 47906204090368, 47906204090368, +STORE, 47906204090368, 47906204098559, +STORE, 47906204098560, 47906204127231, +STORE, 47906204114944, 47906204127231, +STORE, 47906204098560, 47906204114943, +ERASE, 47906204098560, 47906204098560, +STORE, 47906204098560, 47906204114943, +STORE, 47906204119040, 47906204127231, +STORE, 47906204114944, 47906204119039, +ERASE, 47906204114944, 47906204114944, +STORE, 47906204114944, 
47906204127231, +ERASE, 47906204114944, 47906204114944, +STORE, 47906204114944, 47906204119039, +STORE, 47906204119040, 47906204127231, +ERASE, 47906204119040, 47906204119040, +STORE, 47906204119040, 47906204127231, +STORE, 47906204127232, 47906204602367, +ERASE, 47906204127232, 47906204127232, +STORE, 47906204127232, 47906204135423, +STORE, 47906204135424, 47906204602367, +STORE, 47906204471296, 47906204602367, +STORE, 47906204135424, 47906204471295, +ERASE, 47906204135424, 47906204135424, +STORE, 47906204135424, 47906204471295, +STORE, 47906204594176, 47906204602367, +STORE, 47906204471296, 47906204594175, +ERASE, 47906204471296, 47906204471296, +STORE, 47906204471296, 47906204602367, +ERASE, 47906204471296, 47906204471296, +STORE, 47906204471296, 47906204594175, +STORE, 47906204594176, 47906204602367, +ERASE, 47906204594176, 47906204594176, +STORE, 47906204594176, 47906204602367, +STORE, 47906204602368, 47906204610559, +STORE, 47906204610560, 47906204631039, +ERASE, 47906204610560, 47906204610560, +STORE, 47906204610560, 47906204614655, +STORE, 47906204614656, 47906204631039, +STORE, 47906204618752, 47906204631039, +STORE, 47906204614656, 47906204618751, +ERASE, 47906204614656, 47906204614656, +STORE, 47906204614656, 47906204618751, +STORE, 47906204622848, 47906204631039, +STORE, 47906204618752, 47906204622847, +ERASE, 47906204618752, 47906204618752, +STORE, 47906204618752, 47906204631039, +ERASE, 47906204618752, 47906204618752, +STORE, 47906204618752, 47906204622847, +STORE, 47906204622848, 47906204631039, +ERASE, 47906204622848, 47906204622848, +STORE, 47906204622848, 47906204631039, +STORE, 47906204631040, 47906204639231, +ERASE, 47906197295104, 47906197295104, +STORE, 47906197295104, 47906197311487, +STORE, 47906197311488, 47906197319679, +ERASE, 47906204622848, 47906204622848, +STORE, 47906204622848, 47906204626943, +STORE, 47906204626944, 47906204631039, +ERASE, 47906200211456, 47906200211456, +STORE, 47906200211456, 47906200215551, +STORE, 47906200215552, 
47906200219647, +ERASE, 47906204594176, 47906204594176, +STORE, 47906204594176, 47906204598271, +STORE, 47906204598272, 47906204602367, +ERASE, 47906204119040, 47906204119040, +STORE, 47906204119040, 47906204123135, +STORE, 47906204123136, 47906204127231, +ERASE, 47906204082176, 47906204082176, +STORE, 47906204082176, 47906204086271, +STORE, 47906204086272, 47906204090367, +ERASE, 47906203836416, 47906203836416, +STORE, 47906203836416, 47906204033023, +STORE, 47906204033024, 47906204041215, +ERASE, 47906200952832, 47906200952832, +STORE, 47906200952832, 47906200989695, +STORE, 47906200989696, 47906201006079, +ERASE, 47906200399872, 47906200399872, +STORE, 47906200399872, 47906200403967, +STORE, 47906200403968, 47906200408063, +ERASE, 47906200084480, 47906200084480, +STORE, 47906200084480, 47906200088575, +STORE, 47906200088576, 47906200092671, +ERASE, 47906197815296, 47906197815296, +STORE, 47906197815296, 47906197835775, +STORE, 47906197835776, 47906197839871, +ERASE, 47906197483520, 47906197483520, +STORE, 47906197483520, 47906197487615, +STORE, 47906197487616, 47906197491711, +ERASE, 47906197438464, 47906197438464, +STORE, 47906197438464, 47906197442559, +STORE, 47906197442560, 47906197446655, +ERASE, 94641232658432, 94641232658432, +STORE, 94641232658432, 94641232781311, +STORE, 94641232781312, 94641232785407, +ERASE, 139726599675904, 139726599675904, +STORE, 139726599675904, 139726599679999, +STORE, 139726599680000, 139726599684095, +ERASE, 47906195480576, 47906195480576, +STORE, 94641242615808, 94641242750975, + }; + unsigned long set11[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140732658499584, 140737488351231, +ERASE, 140732658499584, 140732658499584, +STORE, 140732658499584, 140732658503679, +STORE, 94029856579584, 94029856751615, +ERASE, 94029856579584, 94029856579584, +STORE, 94029856579584, 94029856595967, +STORE, 94029856595968, 94029856751615, +ERASE, 94029856595968, 94029856595968, +STORE, 94029856595968, 94029856698367, +STORE, 
94029856698368, 94029856739327, +STORE, 94029856739328, 94029856751615, +STORE, 140014592573440, 140014592745471, +ERASE, 140014592573440, 140014592573440, +STORE, 140014592573440, 140014592577535, +STORE, 140014592577536, 140014592745471, +ERASE, 140014592577536, 140014592577536, +STORE, 140014592577536, 140014592700415, +STORE, 140014592700416, 140014592733183, +STORE, 140014592733184, 140014592741375, +STORE, 140014592741376, 140014592745471, +STORE, 140732658565120, 140732658569215, +STORE, 140732658552832, 140732658565119, + }; + + unsigned long set12[] = { /* contains 12 values. */ +STORE, 140737488347136, 140737488351231, +STORE, 140732658499584, 140737488351231, +ERASE, 140732658499584, 140732658499584, +STORE, 140732658499584, 140732658503679, +STORE, 94029856579584, 94029856751615, +ERASE, 94029856579584, 94029856579584, +STORE, 94029856579584, 94029856595967, +STORE, 94029856595968, 94029856751615, +ERASE, 94029856595968, 94029856595968, +STORE, 94029856595968, 94029856698367, +STORE, 94029856698368, 94029856739327, +STORE, 94029856739328, 94029856751615, +STORE, 140014592573440, 140014592745471, +ERASE, 140014592573440, 140014592573440, +STORE, 140014592573440, 140014592577535, +STORE, 140014592577536, 140014592745471, +ERASE, 140014592577536, 140014592577536, +STORE, 140014592577536, 140014592700415, +STORE, 140014592700416, 140014592733183, +STORE, 140014592733184, 140014592741375, +STORE, 140014592741376, 140014592745471, +STORE, 140732658565120, 140732658569215, +STORE, 140732658552832, 140732658565119, +STORE, 140014592741375, 140014592741375, /* contrived */ +STORE, 140014592733184, 140014592741376, /* creates first entry retry. 
*/ + }; + unsigned long set13[] = { +STORE, 140373516247040, 140373516251135,/*: ffffa2e7b0e10d80 */ +STORE, 140373516251136, 140373516255231,/*: ffffa2e7b1195d80 */ +STORE, 140373516255232, 140373516443647,/*: ffffa2e7b0e109c0 */ +STORE, 140373516443648, 140373516587007,/*: ffffa2e7b05fecc0 */ +STORE, 140373516963840, 140373518647295,/*: ffffa2e7bfbdcc00 */ +STORE, 140373518647296, 140373518663679,/*: ffffa2e7bf5d59c0 */ +STORE, 140373518663680, 140373518684159,/*: deleted (257) */ +STORE, 140373518680064, 140373518684159,/*: ffffa2e7b0e1cb40 */ +STORE, 140373518684160, 140373518688254,/*: ffffa2e7b05fec00 */ +STORE, 140373518688256, 140373518692351,/*: ffffa2e7bfbdcd80 */ +STORE, 140373518692352, 140373518696447,/*: ffffa2e7b0749e40 */ + }; + unsigned long set14[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140731667996672, 140737488351231, +SNULL, 140731668000767, 140737488351231, +STORE, 140731667996672, 140731668000767, +STORE, 140731667865600, 140731668000767, +STORE, 94077521272832, 94077521313791, +SNULL, 94077521301503, 94077521313791, +STORE, 94077521272832, 94077521301503, +STORE, 94077521301504, 94077521313791, +ERASE, 94077521301504, 94077521313791, +STORE, 94077521305600, 94077521313791, +STORE, 139826134630400, 139826136883199, +SNULL, 139826134773759, 139826136883199, +STORE, 139826134630400, 139826134773759, +STORE, 139826134773760, 139826136883199, +ERASE, 139826134773760, 139826136883199, +STORE, 139826136870912, 139826136879103, +STORE, 139826136879104, 139826136883199, +STORE, 140731668013056, 140731668017151, +STORE, 140731668000768, 140731668013055, +STORE, 139826136862720, 139826136870911, +STORE, 139826132406272, 139826134630399, +SNULL, 139826134056959, 139826134630399, +STORE, 139826132406272, 139826134056959, +STORE, 139826134056960, 139826134630399, +SNULL, 139826134056960, 139826134626303, +STORE, 139826134626304, 139826134630399, +STORE, 139826134056960, 139826134626303, +ERASE, 139826134056960, 139826134626303, +STORE, 
139826134056960, 139826134626303, +ERASE, 139826134626304, 139826134630399, +STORE, 139826134626304, 139826134630399, +STORE, 139826136842240, 139826136862719, +STORE, 139826130022400, 139826132406271, +SNULL, 139826130022400, 139826130288639, +STORE, 139826130288640, 139826132406271, +STORE, 139826130022400, 139826130288639, +SNULL, 139826132381695, 139826132406271, +STORE, 139826130288640, 139826132381695, +STORE, 139826132381696, 139826132406271, +SNULL, 139826132381696, 139826132402175, +STORE, 139826132402176, 139826132406271, +STORE, 139826132381696, 139826132402175, +ERASE, 139826132381696, 139826132402175, +STORE, 139826132381696, 139826132402175, +ERASE, 139826132402176, 139826132406271, +STORE, 139826132402176, 139826132406271, +STORE, 139826127806464, 139826130022399, +SNULL, 139826127806464, 139826127904767, +STORE, 139826127904768, 139826130022399, +STORE, 139826127806464, 139826127904767, +SNULL, 139826129997823, 139826130022399, +STORE, 139826127904768, 139826129997823, +STORE, 139826129997824, 139826130022399, +SNULL, 139826129997824, 139826130006015, +STORE, 139826130006016, 139826130022399, +STORE, 139826129997824, 139826130006015, +ERASE, 139826129997824, 139826130006015, +STORE, 139826129997824, 139826130006015, +ERASE, 139826130006016, 139826130022399, +STORE, 139826130006016, 139826130022399, +STORE, 139826124009472, 139826127806463, +SNULL, 139826124009472, 139826125668351, +STORE, 139826125668352, 139826127806463, +STORE, 139826124009472, 139826125668351, +SNULL, 139826127765503, 139826127806463, +STORE, 139826125668352, 139826127765503, +STORE, 139826127765504, 139826127806463, +SNULL, 139826127765504, 139826127790079, +STORE, 139826127790080, 139826127806463, +STORE, 139826127765504, 139826127790079, +ERASE, 139826127765504, 139826127790079, +STORE, 139826127765504, 139826127790079, +ERASE, 139826127790080, 139826127806463, +STORE, 139826127790080, 139826127806463, +STORE, 139826121748480, 139826124009471, +SNULL, 139826121748480, 
139826121900031, +STORE, 139826121900032, 139826124009471, +STORE, 139826121748480, 139826121900031, +SNULL, 139826123993087, 139826124009471, +STORE, 139826121900032, 139826123993087, +STORE, 139826123993088, 139826124009471, +SNULL, 139826123993088, 139826124001279, +STORE, 139826124001280, 139826124009471, +STORE, 139826123993088, 139826124001279, +ERASE, 139826123993088, 139826124001279, +STORE, 139826123993088, 139826124001279, +ERASE, 139826124001280, 139826124009471, +STORE, 139826124001280, 139826124009471, +STORE, 139826119626752, 139826121748479, +SNULL, 139826119626752, 139826119643135, +STORE, 139826119643136, 139826121748479, +STORE, 139826119626752, 139826119643135, +SNULL, 139826121740287, 139826121748479, +STORE, 139826119643136, 139826121740287, +STORE, 139826121740288, 139826121748479, +ERASE, 139826121740288, 139826121748479, +STORE, 139826121740288, 139826121748479, +STORE, 139826136834048, 139826136842239, +STORE, 139826117496832, 139826119626751, +SNULL, 139826117496832, 139826117525503, +STORE, 139826117525504, 139826119626751, +STORE, 139826117496832, 139826117525503, +SNULL, 139826119618559, 139826119626751, +STORE, 139826117525504, 139826119618559, +STORE, 139826119618560, 139826119626751, +ERASE, 139826119618560, 139826119626751, +STORE, 139826119618560, 139826119626751, +STORE, 139826115244032, 139826117496831, +SNULL, 139826115244032, 139826115395583, +STORE, 139826115395584, 139826117496831, +STORE, 139826115244032, 139826115395583, +SNULL, 139826117488639, 139826117496831, +STORE, 139826115395584, 139826117488639, +STORE, 139826117488640, 139826117496831, +ERASE, 139826117488640, 139826117496831, +STORE, 139826117488640, 139826117496831, +STORE, 139826113073152, 139826115244031, +SNULL, 139826113073152, 139826113142783, +STORE, 139826113142784, 139826115244031, +STORE, 139826113073152, 139826113142783, +SNULL, 139826115235839, 139826115244031, +STORE, 139826113142784, 139826115235839, +STORE, 139826115235840, 139826115244031, +ERASE, 
139826115235840, 139826115244031, +STORE, 139826115235840, 139826115244031, +STORE, 139826109861888, 139826113073151, +SNULL, 139826109861888, 139826110939135, +STORE, 139826110939136, 139826113073151, +STORE, 139826109861888, 139826110939135, +SNULL, 139826113036287, 139826113073151, +STORE, 139826110939136, 139826113036287, +STORE, 139826113036288, 139826113073151, +ERASE, 139826113036288, 139826113073151, +STORE, 139826113036288, 139826113073151, +STORE, 139826107727872, 139826109861887, +SNULL, 139826107727872, 139826107756543, +STORE, 139826107756544, 139826109861887, +STORE, 139826107727872, 139826107756543, +SNULL, 139826109853695, 139826109861887, +STORE, 139826107756544, 139826109853695, +STORE, 139826109853696, 139826109861887, +ERASE, 139826109853696, 139826109861887, +STORE, 139826109853696, 139826109861887, +STORE, 139826105417728, 139826107727871, +SNULL, 139826105417728, 139826105622527, +STORE, 139826105622528, 139826107727871, +STORE, 139826105417728, 139826105622527, +SNULL, 139826107719679, 139826107727871, +STORE, 139826105622528, 139826107719679, +STORE, 139826107719680, 139826107727871, +ERASE, 139826107719680, 139826107727871, +STORE, 139826107719680, 139826107727871, +STORE, 139826136825856, 139826136842239, +STORE, 139826103033856, 139826105417727, +SNULL, 139826103033856, 139826103226367, +STORE, 139826103226368, 139826105417727, +STORE, 139826103033856, 139826103226367, +SNULL, 139826105319423, 139826105417727, +STORE, 139826103226368, 139826105319423, +STORE, 139826105319424, 139826105417727, +ERASE, 139826105319424, 139826105417727, +STORE, 139826105319424, 139826105417727, +STORE, 139826100916224, 139826103033855, +SNULL, 139826100916224, 139826100932607, +STORE, 139826100932608, 139826103033855, +STORE, 139826100916224, 139826100932607, +SNULL, 139826103025663, 139826103033855, +STORE, 139826100932608, 139826103025663, +STORE, 139826103025664, 139826103033855, +ERASE, 139826103025664, 139826103033855, +STORE, 139826103025664, 
139826103033855, +STORE, 139826098348032, 139826100916223, +SNULL, 139826098348032, 139826098814975, +STORE, 139826098814976, 139826100916223, +STORE, 139826098348032, 139826098814975, +SNULL, 139826100908031, 139826100916223, +STORE, 139826098814976, 139826100908031, +STORE, 139826100908032, 139826100916223, +ERASE, 139826100908032, 139826100916223, +STORE, 139826100908032, 139826100916223, +STORE, 139826096234496, 139826098348031, +SNULL, 139826096234496, 139826096246783, +STORE, 139826096246784, 139826098348031, +STORE, 139826096234496, 139826096246783, +SNULL, 139826098339839, 139826098348031, +STORE, 139826096246784, 139826098339839, +STORE, 139826098339840, 139826098348031, +ERASE, 139826098339840, 139826098348031, +STORE, 139826098339840, 139826098348031, +STORE, 139826094055424, 139826096234495, +SNULL, 139826094055424, 139826094133247, +STORE, 139826094133248, 139826096234495, +STORE, 139826094055424, 139826094133247, +SNULL, 139826096226303, 139826096234495, +STORE, 139826094133248, 139826096226303, +STORE, 139826096226304, 139826096234495, +ERASE, 139826096226304, 139826096234495, +STORE, 139826096226304, 139826096234495, +STORE, 139826136817664, 139826136842239, +STORE, 139826091937792, 139826094055423, +SNULL, 139826091937792, 139826091954175, +STORE, 139826091954176, 139826094055423, +STORE, 139826091937792, 139826091954175, +SNULL, 139826094047231, 139826094055423, +STORE, 139826091954176, 139826094047231, +STORE, 139826094047232, 139826094055423, +ERASE, 139826094047232, 139826094055423, +STORE, 139826094047232, 139826094055423, +STORE, 139826136809472, 139826136842239, +SNULL, 139826127781887, 139826127790079, +STORE, 139826127765504, 139826127781887, +STORE, 139826127781888, 139826127790079, +SNULL, 139826094051327, 139826094055423, +STORE, 139826094047232, 139826094051327, +STORE, 139826094051328, 139826094055423, +SNULL, 139826096230399, 139826096234495, +STORE, 139826096226304, 139826096230399, +STORE, 139826096230400, 139826096234495, +SNULL, 
139826098343935, 139826098348031, +STORE, 139826098339840, 139826098343935, +STORE, 139826098343936, 139826098348031, +SNULL, 139826130001919, 139826130006015, +STORE, 139826129997824, 139826130001919, +STORE, 139826130001920, 139826130006015, +SNULL, 139826100912127, 139826100916223, +STORE, 139826100908032, 139826100912127, +STORE, 139826100912128, 139826100916223, +SNULL, 139826103029759, 139826103033855, +STORE, 139826103025664, 139826103029759, +STORE, 139826103029760, 139826103033855, +SNULL, 139826105413631, 139826105417727, +STORE, 139826105319424, 139826105413631, +STORE, 139826105413632, 139826105417727, +SNULL, 139826107723775, 139826107727871, +STORE, 139826107719680, 139826107723775, +STORE, 139826107723776, 139826107727871, +SNULL, 139826109857791, 139826109861887, +STORE, 139826109853696, 139826109857791, +STORE, 139826109857792, 139826109861887, +SNULL, 139826113044479, 139826113073151, +STORE, 139826113036288, 139826113044479, +STORE, 139826113044480, 139826113073151, +SNULL, 139826115239935, 139826115244031, +STORE, 139826115235840, 139826115239935, +STORE, 139826115239936, 139826115244031, +SNULL, 139826117492735, 139826117496831, +STORE, 139826117488640, 139826117492735, +STORE, 139826117492736, 139826117496831, +SNULL, 139826119622655, 139826119626751, +STORE, 139826119618560, 139826119622655, +STORE, 139826119622656, 139826119626751, +SNULL, 139826121744383, 139826121748479, +STORE, 139826121740288, 139826121744383, +STORE, 139826121744384, 139826121748479, +SNULL, 139826123997183, 139826124001279, +STORE, 139826123993088, 139826123997183, +STORE, 139826123997184, 139826124001279, +SNULL, 139826132398079, 139826132402175, +STORE, 139826132381696, 139826132398079, +STORE, 139826132398080, 139826132402175, +SNULL, 139826134622207, 139826134626303, +STORE, 139826134056960, 139826134622207, +STORE, 139826134622208, 139826134626303, +SNULL, 94077521309695, 94077521313791, +STORE, 94077521305600, 94077521309695, +STORE, 94077521309696, 
94077521313791, +SNULL, 139826136875007, 139826136879103, +STORE, 139826136870912, 139826136875007, +STORE, 139826136875008, 139826136879103, +ERASE, 139826136842240, 139826136862719, +STORE, 94077554049024, 94077554184191, +STORE, 139826136543232, 139826136842239, +STORE, 139826136276992, 139826136842239, +STORE, 139826136010752, 139826136842239, +STORE, 139826135744512, 139826136842239, +SNULL, 139826136543231, 139826136842239, +STORE, 139826135744512, 139826136543231, +STORE, 139826136543232, 139826136842239, +SNULL, 139826136543232, 139826136809471, +STORE, 139826136809472, 139826136842239, +STORE, 139826136543232, 139826136809471, + }; + unsigned long set15[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140722061451264, 140737488351231, +SNULL, 140722061455359, 140737488351231, +STORE, 140722061451264, 140722061455359, +STORE, 140722061320192, 140722061455359, +STORE, 94728600248320, 94728600289279, +SNULL, 94728600276991, 94728600289279, +STORE, 94728600248320, 94728600276991, +STORE, 94728600276992, 94728600289279, +ERASE, 94728600276992, 94728600289279, +STORE, 94728600281088, 94728600289279, +STORE, 139906806779904, 139906809032703, +SNULL, 139906806923263, 139906809032703, +STORE, 139906806779904, 139906806923263, +STORE, 139906806923264, 139906809032703, +ERASE, 139906806923264, 139906809032703, +STORE, 139906809020416, 139906809028607, +STORE, 139906809028608, 139906809032703, +STORE, 140722061692928, 140722061697023, +STORE, 140722061680640, 140722061692927, +STORE, 139906809012224, 139906809020415, +STORE, 139906804555776, 139906806779903, +SNULL, 139906806206463, 139906806779903, +STORE, 139906804555776, 139906806206463, +STORE, 139906806206464, 139906806779903, +SNULL, 139906806206464, 139906806775807, +STORE, 139906806775808, 139906806779903, +STORE, 139906806206464, 139906806775807, +ERASE, 139906806206464, 139906806775807, +STORE, 139906806206464, 139906806775807, +ERASE, 139906806775808, 139906806779903, +STORE, 139906806775808, 
139906806779903, +STORE, 139906808991744, 139906809012223, +STORE, 139906802171904, 139906804555775, +SNULL, 139906802171904, 139906802438143, +STORE, 139906802438144, 139906804555775, +STORE, 139906802171904, 139906802438143, +SNULL, 139906804531199, 139906804555775, +STORE, 139906802438144, 139906804531199, +STORE, 139906804531200, 139906804555775, +SNULL, 139906804531200, 139906804551679, +STORE, 139906804551680, 139906804555775, +STORE, 139906804531200, 139906804551679, +ERASE, 139906804531200, 139906804551679, +STORE, 139906804531200, 139906804551679, +ERASE, 139906804551680, 139906804555775, +STORE, 139906804551680, 139906804555775, +STORE, 139906799955968, 139906802171903, +SNULL, 139906799955968, 139906800054271, +STORE, 139906800054272, 139906802171903, +STORE, 139906799955968, 139906800054271, +SNULL, 139906802147327, 139906802171903, +STORE, 139906800054272, 139906802147327, +STORE, 139906802147328, 139906802171903, +SNULL, 139906802147328, 139906802155519, +STORE, 139906802155520, 139906802171903, +STORE, 139906802147328, 139906802155519, +ERASE, 139906802147328, 139906802155519, +STORE, 139906802147328, 139906802155519, +ERASE, 139906802155520, 139906802171903, +STORE, 139906802155520, 139906802171903, +STORE, 139906796158976, 139906799955967, +SNULL, 139906796158976, 139906797817855, +STORE, 139906797817856, 139906799955967, +STORE, 139906796158976, 139906797817855, +SNULL, 139906799915007, 139906799955967, +STORE, 139906797817856, 139906799915007, +STORE, 139906799915008, 139906799955967, +SNULL, 139906799915008, 139906799939583, +STORE, 139906799939584, 139906799955967, +STORE, 139906799915008, 139906799939583, +ERASE, 139906799915008, 139906799939583, +STORE, 139906799915008, 139906799939583, +ERASE, 139906799939584, 139906799955967, +STORE, 139906799939584, 139906799955967, +STORE, 139906793897984, 139906796158975, +SNULL, 139906793897984, 139906794049535, +STORE, 139906794049536, 139906796158975, +STORE, 139906793897984, 139906794049535, +SNULL, 
139906796142591, 139906796158975, +STORE, 139906794049536, 139906796142591, +STORE, 139906796142592, 139906796158975, +SNULL, 139906796142592, 139906796150783, +STORE, 139906796150784, 139906796158975, +STORE, 139906796142592, 139906796150783, +ERASE, 139906796142592, 139906796150783, +STORE, 139906796142592, 139906796150783, +ERASE, 139906796150784, 139906796158975, +STORE, 139906796150784, 139906796158975, +STORE, 139906791776256, 139906793897983, +SNULL, 139906791776256, 139906791792639, +STORE, 139906791792640, 139906793897983, +STORE, 139906791776256, 139906791792639, +SNULL, 139906793889791, 139906793897983, +STORE, 139906791792640, 139906793889791, +STORE, 139906793889792, 139906793897983, +ERASE, 139906793889792, 139906793897983, +STORE, 139906793889792, 139906793897983, +STORE, 139906808983552, 139906808991743, +STORE, 139906789646336, 139906791776255, +SNULL, 139906789646336, 139906789675007, +STORE, 139906789675008, 139906791776255, +STORE, 139906789646336, 139906789675007, +SNULL, 139906791768063, 139906791776255, +STORE, 139906789675008, 139906791768063, +STORE, 139906791768064, 139906791776255, +ERASE, 139906791768064, 139906791776255, +STORE, 139906791768064, 139906791776255, +STORE, 139906787393536, 139906789646335, +SNULL, 139906787393536, 139906787545087, +STORE, 139906787545088, 139906789646335, +STORE, 139906787393536, 139906787545087, +SNULL, 139906789638143, 139906789646335, +STORE, 139906787545088, 139906789638143, +STORE, 139906789638144, 139906789646335, +ERASE, 139906789638144, 139906789646335, +STORE, 139906789638144, 139906789646335, +STORE, 139906785222656, 139906787393535, +SNULL, 139906785222656, 139906785292287, +STORE, 139906785292288, 139906787393535, +STORE, 139906785222656, 139906785292287, +SNULL, 139906787385343, 139906787393535, +STORE, 139906785292288, 139906787385343, +STORE, 139906787385344, 139906787393535, +ERASE, 139906787385344, 139906787393535, +STORE, 139906787385344, 139906787393535, +STORE, 139906782011392, 
139906785222655, +SNULL, 139906782011392, 139906783088639, +STORE, 139906783088640, 139906785222655, +STORE, 139906782011392, 139906783088639, +SNULL, 139906785185791, 139906785222655, +STORE, 139906783088640, 139906785185791, +STORE, 139906785185792, 139906785222655, +ERASE, 139906785185792, 139906785222655, +STORE, 139906785185792, 139906785222655, +STORE, 139906779877376, 139906782011391, +SNULL, 139906779877376, 139906779906047, +STORE, 139906779906048, 139906782011391, +STORE, 139906779877376, 139906779906047, +SNULL, 139906782003199, 139906782011391, +STORE, 139906779906048, 139906782003199, +STORE, 139906782003200, 139906782011391, +ERASE, 139906782003200, 139906782011391, +STORE, 139906782003200, 139906782011391, +STORE, 139906777567232, 139906779877375, +SNULL, 139906777567232, 139906777772031, +STORE, 139906777772032, 139906779877375, +STORE, 139906777567232, 139906777772031, +SNULL, 139906779869183, 139906779877375, +STORE, 139906777772032, 139906779869183, +STORE, 139906779869184, 139906779877375, +ERASE, 139906779869184, 139906779877375, +STORE, 139906779869184, 139906779877375, +STORE, 139906808975360, 139906808991743, +STORE, 139906775183360, 139906777567231, +SNULL, 139906775183360, 139906775375871, +STORE, 139906775375872, 139906777567231, +STORE, 139906775183360, 139906775375871, +SNULL, 139906777468927, 139906777567231, +STORE, 139906775375872, 139906777468927, +STORE, 139906777468928, 139906777567231, +ERASE, 139906777468928, 139906777567231, +STORE, 139906777468928, 139906777567231, +STORE, 139906773065728, 139906775183359, +SNULL, 139906773065728, 139906773082111, +STORE, 139906773082112, 139906775183359, +STORE, 139906773065728, 139906773082111, +SNULL, 139906775175167, 139906775183359, +STORE, 139906773082112, 139906775175167, +STORE, 139906775175168, 139906775183359, +ERASE, 139906775175168, 139906775183359, +STORE, 139906775175168, 139906775183359, +STORE, 139906770497536, 139906773065727, +SNULL, 139906770497536, 139906770964479, +STORE, 
139906770964480, 139906773065727, +STORE, 139906770497536, 139906770964479, +SNULL, 139906773057535, 139906773065727, +STORE, 139906770964480, 139906773057535, +STORE, 139906773057536, 139906773065727, +ERASE, 139906773057536, 139906773065727, +STORE, 139906773057536, 139906773065727, +STORE, 139906768384000, 139906770497535, +SNULL, 139906768384000, 139906768396287, +STORE, 139906768396288, 139906770497535, +STORE, 139906768384000, 139906768396287, +SNULL, 139906770489343, 139906770497535, +STORE, 139906768396288, 139906770489343, +STORE, 139906770489344, 139906770497535, +ERASE, 139906770489344, 139906770497535, +STORE, 139906770489344, 139906770497535, +STORE, 139906766204928, 139906768383999, +SNULL, 139906766204928, 139906766282751, +STORE, 139906766282752, 139906768383999, +STORE, 139906766204928, 139906766282751, +SNULL, 139906768375807, 139906768383999, +STORE, 139906766282752, 139906768375807, +STORE, 139906768375808, 139906768383999, +ERASE, 139906768375808, 139906768383999, +STORE, 139906768375808, 139906768383999, +STORE, 139906808967168, 139906808991743, +STORE, 139906764087296, 139906766204927, +SNULL, 139906764087296, 139906764103679, +STORE, 139906764103680, 139906766204927, +STORE, 139906764087296, 139906764103679, +SNULL, 139906766196735, 139906766204927, +STORE, 139906764103680, 139906766196735, +STORE, 139906766196736, 139906766204927, +ERASE, 139906766196736, 139906766204927, +STORE, 139906766196736, 139906766204927, +STORE, 139906808958976, 139906808991743, +SNULL, 139906799931391, 139906799939583, +STORE, 139906799915008, 139906799931391, +STORE, 139906799931392, 139906799939583, +SNULL, 139906766200831, 139906766204927, +STORE, 139906766196736, 139906766200831, +STORE, 139906766200832, 139906766204927, +SNULL, 139906768379903, 139906768383999, +STORE, 139906768375808, 139906768379903, +STORE, 139906768379904, 139906768383999, +SNULL, 139906770493439, 139906770497535, +STORE, 139906770489344, 139906770493439, +STORE, 139906770493440, 
139906770497535, +SNULL, 139906802151423, 139906802155519, +STORE, 139906802147328, 139906802151423, +STORE, 139906802151424, 139906802155519, +SNULL, 139906773061631, 139906773065727, +STORE, 139906773057536, 139906773061631, +STORE, 139906773061632, 139906773065727, +SNULL, 139906775179263, 139906775183359, +STORE, 139906775175168, 139906775179263, +STORE, 139906775179264, 139906775183359, +SNULL, 139906777563135, 139906777567231, +STORE, 139906777468928, 139906777563135, +STORE, 139906777563136, 139906777567231, +SNULL, 139906779873279, 139906779877375, +STORE, 139906779869184, 139906779873279, +STORE, 139906779873280, 139906779877375, +SNULL, 139906782007295, 139906782011391, +STORE, 139906782003200, 139906782007295, +STORE, 139906782007296, 139906782011391, +SNULL, 139906785193983, 139906785222655, +STORE, 139906785185792, 139906785193983, +STORE, 139906785193984, 139906785222655, +SNULL, 139906787389439, 139906787393535, +STORE, 139906787385344, 139906787389439, +STORE, 139906787389440, 139906787393535, +SNULL, 139906789642239, 139906789646335, +STORE, 139906789638144, 139906789642239, +STORE, 139906789642240, 139906789646335, +SNULL, 139906791772159, 139906791776255, +STORE, 139906791768064, 139906791772159, +STORE, 139906791772160, 139906791776255, +SNULL, 139906793893887, 139906793897983, +STORE, 139906793889792, 139906793893887, +STORE, 139906793893888, 139906793897983, +SNULL, 139906796146687, 139906796150783, +STORE, 139906796142592, 139906796146687, +STORE, 139906796146688, 139906796150783, +SNULL, 139906804547583, 139906804551679, +STORE, 139906804531200, 139906804547583, +STORE, 139906804547584, 139906804551679, +SNULL, 139906806771711, 139906806775807, +STORE, 139906806206464, 139906806771711, +STORE, 139906806771712, 139906806775807, +SNULL, 94728600285183, 94728600289279, +STORE, 94728600281088, 94728600285183, +STORE, 94728600285184, 94728600289279, +SNULL, 139906809024511, 139906809028607, +STORE, 139906809020416, 139906809024511, +STORE, 
139906809024512, 139906809028607, +ERASE, 139906808991744, 139906809012223, +STORE, 94728620138496, 94728620273663, +STORE, 139906808692736, 139906808991743, +STORE, 139906808426496, 139906808991743, +STORE, 139906808160256, 139906808991743, +STORE, 139906807894016, 139906808991743, +SNULL, 139906808692735, 139906808991743, +STORE, 139906807894016, 139906808692735, +STORE, 139906808692736, 139906808991743, +SNULL, 139906808692736, 139906808958975, +STORE, 139906808958976, 139906808991743, +STORE, 139906808692736, 139906808958975, + }; + + unsigned long set16[] = { +STORE, 94174808662016, 94174809321471, +STORE, 94174811414528, 94174811426815, +STORE, 94174811426816, 94174811430911, +STORE, 94174811430912, 94174811443199, +STORE, 94174841700352, 94174841835519, +STORE, 140173257838592, 140173259497471, +STORE, 140173259497472, 140173261594623, +STORE, 140173261594624, 140173261611007, +STORE, 140173261611008, 140173261619199, +STORE, 140173261619200, 140173261635583, +STORE, 140173261635584, 140173261778943, +STORE, 140173263863808, 140173263871999, +STORE, 140173263876096, 140173263880191, +STORE, 140173263880192, 140173263884287, +STORE, 140173263884288, 140173263888383, +STORE, 140729801007104, 140729801142271, +STORE, 140729801617408, 140729801629695, +STORE, 140729801629696, 140729801633791, +STORE, 140737488347136, 140737488351231, +STORE, 140728166858752, 140737488351231, +SNULL, 140728166862847, 140737488351231, +STORE, 140728166858752, 140728166862847, +STORE, 140728166727680, 140728166862847, +STORE, 93912949866496, 93912950337535, +SNULL, 93912950288383, 93912950337535, +STORE, 93912949866496, 93912950288383, +STORE, 93912950288384, 93912950337535, +ERASE, 93912950288384, 93912950337535, +STORE, 93912950292480, 93912950337535, +STORE, 139921863385088, 139921865637887, +SNULL, 139921863528447, 139921865637887, +STORE, 139921863385088, 139921863528447, +STORE, 139921863528448, 139921865637887, +ERASE, 139921863528448, 139921865637887, +STORE, 
139921865625600, 139921865633791, +STORE, 139921865633792, 139921865637887, +STORE, 140728167899136, 140728167903231, +STORE, 140728167886848, 140728167899135, +STORE, 139921865601024, 139921865625599, +STORE, 139921865592832, 139921865601023, +STORE, 139921861251072, 139921863385087, +SNULL, 139921861251072, 139921861279743, +STORE, 139921861279744, 139921863385087, +STORE, 139921861251072, 139921861279743, +SNULL, 139921863376895, 139921863385087, +STORE, 139921861279744, 139921863376895, +STORE, 139921863376896, 139921863385087, +ERASE, 139921863376896, 139921863385087, +STORE, 139921863376896, 139921863385087, +STORE, 139921858867200, 139921861251071, +SNULL, 139921858867200, 139921859133439, +STORE, 139921859133440, 139921861251071, +STORE, 139921858867200, 139921859133439, +SNULL, 139921861226495, 139921861251071, +STORE, 139921859133440, 139921861226495, +STORE, 139921861226496, 139921861251071, +SNULL, 139921861226496, 139921861246975, +STORE, 139921861246976, 139921861251071, +STORE, 139921861226496, 139921861246975, +ERASE, 139921861226496, 139921861246975, +STORE, 139921861226496, 139921861246975, +ERASE, 139921861246976, 139921861251071, +STORE, 139921861246976, 139921861251071, +STORE, 139921856675840, 139921858867199, +SNULL, 139921856675840, 139921856765951, +STORE, 139921856765952, 139921858867199, +STORE, 139921856675840, 139921856765951, +SNULL, 139921858859007, 139921858867199, +STORE, 139921856765952, 139921858859007, +STORE, 139921858859008, 139921858867199, +ERASE, 139921858859008, 139921858867199, +STORE, 139921858859008, 139921858867199, +STORE, 139921854414848, 139921856675839, +SNULL, 139921854414848, 139921854566399, +STORE, 139921854566400, 139921856675839, +STORE, 139921854414848, 139921854566399, +SNULL, 139921856659455, 139921856675839, +STORE, 139921854566400, 139921856659455, +STORE, 139921856659456, 139921856675839, +SNULL, 139921856659456, 139921856667647, +STORE, 139921856667648, 139921856675839, +STORE, 139921856659456, 
139921856667647, +ERASE, 139921856659456, 139921856667647, +STORE, 139921856659456, 139921856667647, +ERASE, 139921856667648, 139921856675839, +STORE, 139921856667648, 139921856675839, +STORE, 139921852284928, 139921854414847, +SNULL, 139921852284928, 139921852313599, +STORE, 139921852313600, 139921854414847, +STORE, 139921852284928, 139921852313599, +SNULL, 139921854406655, 139921854414847, +STORE, 139921852313600, 139921854406655, +STORE, 139921854406656, 139921854414847, +ERASE, 139921854406656, 139921854414847, +STORE, 139921854406656, 139921854414847, +STORE, 139921850068992, 139921852284927, +SNULL, 139921850068992, 139921850167295, +STORE, 139921850167296, 139921852284927, +STORE, 139921850068992, 139921850167295, +SNULL, 139921852260351, 139921852284927, +STORE, 139921850167296, 139921852260351, +STORE, 139921852260352, 139921852284927, +SNULL, 139921852260352, 139921852268543, +STORE, 139921852268544, 139921852284927, +STORE, 139921852260352, 139921852268543, +ERASE, 139921852260352, 139921852268543, +STORE, 139921852260352, 139921852268543, +ERASE, 139921852268544, 139921852284927, +STORE, 139921852268544, 139921852284927, +STORE, 139921865584640, 139921865601023, +STORE, 139921846272000, 139921850068991, +SNULL, 139921846272000, 139921847930879, +STORE, 139921847930880, 139921850068991, +STORE, 139921846272000, 139921847930879, +SNULL, 139921850028031, 139921850068991, +STORE, 139921847930880, 139921850028031, +STORE, 139921850028032, 139921850068991, +SNULL, 139921850028032, 139921850052607, +STORE, 139921850052608, 139921850068991, +STORE, 139921850028032, 139921850052607, +ERASE, 139921850028032, 139921850052607, +STORE, 139921850028032, 139921850052607, +ERASE, 139921850052608, 139921850068991, +STORE, 139921850052608, 139921850068991, +STORE, 139921844154368, 139921846271999, +SNULL, 139921844154368, 139921844170751, +STORE, 139921844170752, 139921846271999, +STORE, 139921844154368, 139921844170751, +SNULL, 139921846263807, 139921846271999, +STORE, 
139921844170752, 139921846263807, +STORE, 139921846263808, 139921846271999, +ERASE, 139921846263808, 139921846271999, +STORE, 139921846263808, 139921846271999, +STORE, 139921842036736, 139921844154367, +SNULL, 139921842036736, 139921842053119, +STORE, 139921842053120, 139921844154367, +STORE, 139921842036736, 139921842053119, +SNULL, 139921844146175, 139921844154367, +STORE, 139921842053120, 139921844146175, +STORE, 139921844146176, 139921844154367, +ERASE, 139921844146176, 139921844154367, +STORE, 139921844146176, 139921844154367, +STORE, 139921839468544, 139921842036735, +SNULL, 139921839468544, 139921839935487, +STORE, 139921839935488, 139921842036735, +STORE, 139921839468544, 139921839935487, +SNULL, 139921842028543, 139921842036735, +STORE, 139921839935488, 139921842028543, +STORE, 139921842028544, 139921842036735, +ERASE, 139921842028544, 139921842036735, +STORE, 139921842028544, 139921842036735, +STORE, 139921837355008, 139921839468543, +SNULL, 139921837355008, 139921837367295, +STORE, 139921837367296, 139921839468543, +STORE, 139921837355008, 139921837367295, +SNULL, 139921839460351, 139921839468543, +STORE, 139921837367296, 139921839460351, +STORE, 139921839460352, 139921839468543, +ERASE, 139921839460352, 139921839468543, +STORE, 139921839460352, 139921839468543, +STORE, 139921865576448, 139921865601023, +STORE, 139921865564160, 139921865601023, +SNULL, 139921850044415, 139921850052607, +STORE, 139921850028032, 139921850044415, +STORE, 139921850044416, 139921850052607, +SNULL, 139921839464447, 139921839468543, +STORE, 139921839460352, 139921839464447, +STORE, 139921839464448, 139921839468543, +SNULL, 139921852264447, 139921852268543, +STORE, 139921852260352, 139921852264447, +STORE, 139921852264448, 139921852268543, +SNULL, 139921842032639, 139921842036735, +STORE, 139921842028544, 139921842032639, +STORE, 139921842032640, 139921842036735, +SNULL, 139921844150271, 139921844154367, +STORE, 139921844146176, 139921844150271, +STORE, 139921844150272, 
139921844154367, +SNULL, 139921846267903, 139921846271999, +STORE, 139921846263808, 139921846267903, +STORE, 139921846267904, 139921846271999, +SNULL, 139921854410751, 139921854414847, +STORE, 139921854406656, 139921854410751, +STORE, 139921854410752, 139921854414847, +SNULL, 139921856663551, 139921856667647, +STORE, 139921856659456, 139921856663551, +STORE, 139921856663552, 139921856667647, +SNULL, 139921858863103, 139921858867199, +STORE, 139921858859008, 139921858863103, +STORE, 139921858863104, 139921858867199, +SNULL, 139921861242879, 139921861246975, +STORE, 139921861226496, 139921861242879, +STORE, 139921861242880, 139921861246975, +SNULL, 139921863380991, 139921863385087, +STORE, 139921863376896, 139921863380991, +STORE, 139921863380992, 139921863385087, +SNULL, 93912950333439, 93912950337535, +STORE, 93912950292480, 93912950333439, +STORE, 93912950333440, 93912950337535, +SNULL, 139921865629695, 139921865633791, +STORE, 139921865625600, 139921865629695, +STORE, 139921865629696, 139921865633791, +ERASE, 139921865601024, 139921865625599, +STORE, 93912968110080, 93912968245247, +STORE, 139921828913152, 139921837355007, +STORE, 139921865621504, 139921865625599, +STORE, 139921865617408, 139921865621503, +STORE, 139921865613312, 139921865617407, +STORE, 139921865547776, 139921865564159, + }; + + unsigned long set17[] = { +STORE, 94397057224704, 94397057646591, +STORE, 94397057650688, 94397057691647, +STORE, 94397057691648, 94397057695743, +STORE, 94397075271680, 94397075406847, +STORE, 139953169051648, 139953169063935, +STORE, 139953169063936, 139953171156991, +STORE, 139953171156992, 139953171161087, +STORE, 139953171161088, 139953171165183, +STORE, 139953171165184, 139953171632127, +STORE, 139953171632128, 139953173725183, +STORE, 139953173725184, 139953173729279, +STORE, 139953173729280, 139953173733375, +STORE, 139953173733376, 139953173749759, +STORE, 139953173749760, 139953175842815, +STORE, 139953175842816, 139953175846911, +STORE, 139953175846912, 
139953175851007, +STORE, 139953175851008, 139953175867391, +STORE, 139953175867392, 139953177960447, +STORE, 139953177960448, 139953177964543, +STORE, 139953177964544, 139953177968639, +STORE, 139953177968640, 139953179627519, +STORE, 139953179627520, 139953181724671, +STORE, 139953181724672, 139953181741055, +STORE, 139953181741056, 139953181749247, +STORE, 139953181749248, 139953181765631, +STORE, 139953181765632, 139953181863935, +STORE, 139953181863936, 139953183956991, +STORE, 139953183956992, 139953183961087, +STORE, 139953183961088, 139953183965183, +STORE, 139953183965184, 139953183981567, +STORE, 139953183981568, 139953184010239, +STORE, 139953184010240, 139953186103295, +STORE, 139953186103296, 139953186107391, +STORE, 139953186107392, 139953186111487, +STORE, 139953186111488, 139953186263039, +STORE, 139953186263040, 139953188356095, +STORE, 139953188356096, 139953188360191, +STORE, 139953188360192, 139953188364287, +STORE, 139953188364288, 139953188372479, +STORE, 139953188372480, 139953188462591, +STORE, 139953188462592, 139953190555647, +STORE, 139953190555648, 139953190559743, +STORE, 139953190559744, 139953190563839, +STORE, 139953190563840, 139953190830079, +STORE, 139953190830080, 139953192923135, +STORE, 139953192923136, 139953192939519, +STORE, 139953192939520, 139953192943615, +STORE, 139953192943616, 139953192947711, +STORE, 139953192947712, 139953192976383, +STORE, 139953192976384, 139953195073535, +STORE, 139953195073536, 139953195077631, +STORE, 139953195077632, 139953195081727, +STORE, 139953195081728, 139953195225087, +STORE, 139953197281280, 139953197318143, +STORE, 139953197322240, 139953197326335, +STORE, 139953197326336, 139953197330431, +STORE, 139953197330432, 139953197334527, +STORE, 140720477511680, 140720477646847, +STORE, 140720478302208, 140720478314495, +STORE, 140720478314496, 140720478318591, + }; + unsigned long set18[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140724953673728, 140737488351231, +SNULL, 
140724953677823, 140737488351231, +STORE, 140724953673728, 140724953677823, +STORE, 140724953542656, 140724953677823, +STORE, 94675199266816, 94675199311871, +SNULL, 94675199303679, 94675199311871, +STORE, 94675199266816, 94675199303679, +STORE, 94675199303680, 94675199311871, +ERASE, 94675199303680, 94675199311871, +STORE, 94675199303680, 94675199311871, +STORE, 140222970605568, 140222972858367, +SNULL, 140222970748927, 140222972858367, +STORE, 140222970605568, 140222970748927, +STORE, 140222970748928, 140222972858367, +ERASE, 140222970748928, 140222972858367, +STORE, 140222972846080, 140222972854271, +STORE, 140222972854272, 140222972858367, +STORE, 140724954365952, 140724954370047, +STORE, 140724954353664, 140724954365951, +STORE, 140222972841984, 140222972846079, +STORE, 140222972833792, 140222972841983, +STORE, 140222968475648, 140222970605567, +SNULL, 140222968475648, 140222968504319, +STORE, 140222968504320, 140222970605567, +STORE, 140222968475648, 140222968504319, +SNULL, 140222970597375, 140222970605567, +STORE, 140222968504320, 140222970597375, +STORE, 140222970597376, 140222970605567, +ERASE, 140222970597376, 140222970605567, +STORE, 140222970597376, 140222970605567, + }; + unsigned long set19[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140725182459904, 140737488351231, +SNULL, 140725182463999, 140737488351231, +STORE, 140725182459904, 140725182463999, +STORE, 140725182328832, 140725182463999, +STORE, 94730166636544, 94730166763519, +SNULL, 94730166747135, 94730166763519, +STORE, 94730166636544, 94730166747135, +STORE, 94730166747136, 94730166763519, +ERASE, 94730166747136, 94730166763519, +STORE, 94730166751232, 94730166763519, +STORE, 140656834555904, 140656836808703, +SNULL, 140656834699263, 140656836808703, +STORE, 140656834555904, 140656834699263, +STORE, 140656834699264, 140656836808703, +ERASE, 140656834699264, 140656836808703, +STORE, 140656836796416, 140656836804607, +STORE, 140656836804608, 140656836808703, +STORE, 140725183389696, 
140725183393791, +STORE, 140725183377408, 140725183389695, +STORE, 140656836788224, 140656836796415, +STORE, 140656832331776, 140656834555903, +SNULL, 140656833982463, 140656834555903, +STORE, 140656832331776, 140656833982463, +STORE, 140656833982464, 140656834555903, +SNULL, 140656833982464, 140656834551807, +STORE, 140656834551808, 140656834555903, +STORE, 140656833982464, 140656834551807, +ERASE, 140656833982464, 140656834551807, +STORE, 140656833982464, 140656834551807, +ERASE, 140656834551808, 140656834555903, +STORE, 140656834551808, 140656834555903, +STORE, 140656836763648, 140656836788223, +STORE, 140656830070784, 140656832331775, +SNULL, 140656830070784, 140656830222335, +STORE, 140656830222336, 140656832331775, +STORE, 140656830070784, 140656830222335, +SNULL, 140656832315391, 140656832331775, +STORE, 140656830222336, 140656832315391, +STORE, 140656832315392, 140656832331775, +SNULL, 140656832315392, 140656832323583, +STORE, 140656832323584, 140656832331775, +STORE, 140656832315392, 140656832323583, +ERASE, 140656832315392, 140656832323583, +STORE, 140656832315392, 140656832323583, +ERASE, 140656832323584, 140656832331775, +STORE, 140656832323584, 140656832331775, +STORE, 140656827940864, 140656830070783, +SNULL, 140656827940864, 140656827969535, +STORE, 140656827969536, 140656830070783, +STORE, 140656827940864, 140656827969535, +SNULL, 140656830062591, 140656830070783, +STORE, 140656827969536, 140656830062591, +STORE, 140656830062592, 140656830070783, +ERASE, 140656830062592, 140656830070783, +STORE, 140656830062592, 140656830070783, +STORE, 140656825724928, 140656827940863, +SNULL, 140656825724928, 140656825823231, +STORE, 140656825823232, 140656827940863, +STORE, 140656825724928, 140656825823231, +SNULL, 140656827916287, 140656827940863, +STORE, 140656825823232, 140656827916287, +STORE, 140656827916288, 140656827940863, +SNULL, 140656827916288, 140656827924479, +STORE, 140656827924480, 140656827940863, +STORE, 140656827916288, 140656827924479, +ERASE, 
140656827916288, 140656827924479, +STORE, 140656827916288, 140656827924479, +ERASE, 140656827924480, 140656827940863, +STORE, 140656827924480, 140656827940863, +STORE, 140656821927936, 140656825724927, +SNULL, 140656821927936, 140656823586815, +STORE, 140656823586816, 140656825724927, +STORE, 140656821927936, 140656823586815, +SNULL, 140656825683967, 140656825724927, +STORE, 140656823586816, 140656825683967, +STORE, 140656825683968, 140656825724927, +SNULL, 140656825683968, 140656825708543, +STORE, 140656825708544, 140656825724927, +STORE, 140656825683968, 140656825708543, +ERASE, 140656825683968, 140656825708543, +STORE, 140656825683968, 140656825708543, +ERASE, 140656825708544, 140656825724927, +STORE, 140656825708544, 140656825724927, +STORE, 140656819806208, 140656821927935, +SNULL, 140656819806208, 140656819822591, +STORE, 140656819822592, 140656821927935, +STORE, 140656819806208, 140656819822591, +SNULL, 140656821919743, 140656821927935, +STORE, 140656819822592, 140656821919743, +STORE, 140656821919744, 140656821927935, +ERASE, 140656821919744, 140656821927935, +STORE, 140656821919744, 140656821927935, +STORE, 140656836755456, 140656836763647, +STORE, 140656817553408, 140656819806207, +SNULL, 140656817553408, 140656817704959, +STORE, 140656817704960, 140656819806207, +STORE, 140656817553408, 140656817704959, +SNULL, 140656819798015, 140656819806207, +STORE, 140656817704960, 140656819798015, +STORE, 140656819798016, 140656819806207, +ERASE, 140656819798016, 140656819806207, +STORE, 140656819798016, 140656819806207, +STORE, 140656815382528, 140656817553407, +SNULL, 140656815382528, 140656815452159, +STORE, 140656815452160, 140656817553407, +STORE, 140656815382528, 140656815452159, +SNULL, 140656817545215, 140656817553407, +STORE, 140656815452160, 140656817545215, +STORE, 140656817545216, 140656817553407, +ERASE, 140656817545216, 140656817553407, +STORE, 140656817545216, 140656817553407, +STORE, 140656812171264, 140656815382527, +SNULL, 140656812171264, 
140656813248511, +STORE, 140656813248512, 140656815382527, +STORE, 140656812171264, 140656813248511, +SNULL, 140656815345663, 140656815382527, +STORE, 140656813248512, 140656815345663, +STORE, 140656815345664, 140656815382527, +ERASE, 140656815345664, 140656815382527, +STORE, 140656815345664, 140656815382527, +STORE, 140656810037248, 140656812171263, +SNULL, 140656810037248, 140656810065919, +STORE, 140656810065920, 140656812171263, +STORE, 140656810037248, 140656810065919, +SNULL, 140656812163071, 140656812171263, +STORE, 140656810065920, 140656812163071, +STORE, 140656812163072, 140656812171263, +ERASE, 140656812163072, 140656812171263, +STORE, 140656812163072, 140656812171263, +STORE, 140656807727104, 140656810037247, +SNULL, 140656807727104, 140656807931903, +STORE, 140656807931904, 140656810037247, +STORE, 140656807727104, 140656807931903, +SNULL, 140656810029055, 140656810037247, +STORE, 140656807931904, 140656810029055, +STORE, 140656810029056, 140656810037247, +ERASE, 140656810029056, 140656810037247, +STORE, 140656810029056, 140656810037247, +STORE, 140656805343232, 140656807727103, +SNULL, 140656805343232, 140656805535743, +STORE, 140656805535744, 140656807727103, +STORE, 140656805343232, 140656805535743, +SNULL, 140656807628799, 140656807727103, +STORE, 140656805535744, 140656807628799, +STORE, 140656807628800, 140656807727103, +ERASE, 140656807628800, 140656807727103, +STORE, 140656807628800, 140656807727103, +STORE, 140656836747264, 140656836763647, +STORE, 140656802775040, 140656805343231, +SNULL, 140656802775040, 140656803241983, +STORE, 140656803241984, 140656805343231, +STORE, 140656802775040, 140656803241983, +SNULL, 140656805335039, 140656805343231, +STORE, 140656803241984, 140656805335039, +STORE, 140656805335040, 140656805343231, +ERASE, 140656805335040, 140656805343231, +STORE, 140656805335040, 140656805343231, +STORE, 140656800661504, 140656802775039, +SNULL, 140656800661504, 140656800673791, +STORE, 140656800673792, 140656802775039, +STORE, 
140656800661504, 140656800673791, +SNULL, 140656802766847, 140656802775039, +STORE, 140656800673792, 140656802766847, +STORE, 140656802766848, 140656802775039, +ERASE, 140656802766848, 140656802775039, +STORE, 140656802766848, 140656802775039, +STORE, 140656798482432, 140656800661503, +SNULL, 140656798482432, 140656798560255, +STORE, 140656798560256, 140656800661503, +STORE, 140656798482432, 140656798560255, +SNULL, 140656800653311, 140656800661503, +STORE, 140656798560256, 140656800653311, +STORE, 140656800653312, 140656800661503, +ERASE, 140656800653312, 140656800661503, +STORE, 140656800653312, 140656800661503, +STORE, 140656796364800, 140656798482431, +SNULL, 140656796364800, 140656796381183, +STORE, 140656796381184, 140656798482431, +STORE, 140656796364800, 140656796381183, +SNULL, 140656798474239, 140656798482431, +STORE, 140656796381184, 140656798474239, +STORE, 140656798474240, 140656798482431, +ERASE, 140656798474240, 140656798482431, +STORE, 140656798474240, 140656798482431, +STORE, 140656836739072, 140656836763647, +STORE, 140656836726784, 140656836763647, +SNULL, 140656825700351, 140656825708543, +STORE, 140656825683968, 140656825700351, +STORE, 140656825700352, 140656825708543, +SNULL, 140656798478335, 140656798482431, +STORE, 140656798474240, 140656798478335, +STORE, 140656798478336, 140656798482431, +SNULL, 140656800657407, 140656800661503, +STORE, 140656800653312, 140656800657407, +STORE, 140656800657408, 140656800661503, +SNULL, 140656802770943, 140656802775039, +STORE, 140656802766848, 140656802770943, +STORE, 140656802770944, 140656802775039, +SNULL, 140656827920383, 140656827924479, +STORE, 140656827916288, 140656827920383, +STORE, 140656827920384, 140656827924479, +SNULL, 140656805339135, 140656805343231, +STORE, 140656805335040, 140656805339135, +STORE, 140656805339136, 140656805343231, +SNULL, 140656807723007, 140656807727103, +STORE, 140656807628800, 140656807723007, +STORE, 140656807723008, 140656807727103, +SNULL, 140656810033151, 
140656810037247, +STORE, 140656810029056, 140656810033151, +STORE, 140656810033152, 140656810037247, +SNULL, 140656812167167, 140656812171263, +STORE, 140656812163072, 140656812167167, +STORE, 140656812167168, 140656812171263, +SNULL, 140656815353855, 140656815382527, +STORE, 140656815345664, 140656815353855, +STORE, 140656815353856, 140656815382527, +SNULL, 140656817549311, 140656817553407, +STORE, 140656817545216, 140656817549311, +STORE, 140656817549312, 140656817553407, +SNULL, 140656819802111, 140656819806207, +STORE, 140656819798016, 140656819802111, +STORE, 140656819802112, 140656819806207, +SNULL, 140656821923839, 140656821927935, +STORE, 140656821919744, 140656821923839, +STORE, 140656821923840, 140656821927935, +SNULL, 140656830066687, 140656830070783, +STORE, 140656830062592, 140656830066687, +STORE, 140656830066688, 140656830070783, +SNULL, 140656832319487, 140656832323583, +STORE, 140656832315392, 140656832319487, +STORE, 140656832319488, 140656832323583, +SNULL, 140656834547711, 140656834551807, +STORE, 140656833982464, 140656834547711, +STORE, 140656834547712, 140656834551807, +SNULL, 94730166759423, 94730166763519, +STORE, 94730166751232, 94730166759423, +STORE, 94730166759424, 94730166763519, +SNULL, 140656836800511, 140656836804607, +STORE, 140656836796416, 140656836800511, +STORE, 140656836800512, 140656836804607, +ERASE, 140656836763648, 140656836788223, +STORE, 94730171318272, 94730171453439, +STORE, 140656836784128, 140656836788223, +STORE, 140656836780032, 140656836784127, +STORE, 140656791920640, 140656796364799, +STORE, 140656836775936, 140656836780031, +STORE, 140656787476480, 140656791920639, +STORE, 140656779083776, 140656787476479, +SNULL, 140656779087871, 140656787476479, +STORE, 140656779083776, 140656779087871, +STORE, 140656779087872, 140656787476479, +STORE, 140656836771840, 140656836775935, +STORE, 140656774639616, 140656779083775, +STORE, 140656766246912, 140656774639615, +SNULL, 140656766251007, 140656774639615, +STORE, 
140656766246912, 140656766251007, +STORE, 140656766251008, 140656774639615, +ERASE, 140656791920640, 140656796364799, +ERASE, 140656836780032, 140656836784127, +ERASE, 140656787476480, 140656791920639, +ERASE, 140656836775936, 140656836780031, +STORE, 140656836780032, 140656836784127, +STORE, 140656791920640, 140656796364799, +STORE, 140656836775936, 140656836780031, +STORE, 140656787476480, 140656791920639, +ERASE, 140656774639616, 140656779083775, + }; + unsigned long set20[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140735952392192, 140737488351231, +SNULL, 140735952396287, 140737488351231, +STORE, 140735952392192, 140735952396287, +STORE, 140735952261120, 140735952396287, +STORE, 94849008947200, 94849009414143, +SNULL, 94849009364991, 94849009414143, +STORE, 94849008947200, 94849009364991, +STORE, 94849009364992, 94849009414143, +ERASE, 94849009364992, 94849009414143, +STORE, 94849009364992, 94849009414143, +STORE, 140590397943808, 140590400196607, +SNULL, 140590398087167, 140590400196607, +STORE, 140590397943808, 140590398087167, +STORE, 140590398087168, 140590400196607, +ERASE, 140590398087168, 140590400196607, +STORE, 140590400184320, 140590400192511, +STORE, 140590400192512, 140590400196607, +STORE, 140735952850944, 140735952855039, +STORE, 140735952838656, 140735952850943, +STORE, 140590400180224, 140590400184319, +STORE, 140590400172032, 140590400180223, +STORE, 140590395809792, 140590397943807, +SNULL, 140590395809792, 140590395838463, +STORE, 140590395838464, 140590397943807, +STORE, 140590395809792, 140590395838463, +SNULL, 140590397935615, 140590397943807, +STORE, 140590395838464, 140590397935615, +STORE, 140590397935616, 140590397943807, +ERASE, 140590397935616, 140590397943807, +STORE, 140590397935616, 140590397943807, +STORE, 140590393425920, 140590395809791, +SNULL, 140590393425920, 140590393692159, +STORE, 140590393692160, 140590395809791, +STORE, 140590393425920, 140590393692159, +SNULL, 140590395785215, 140590395809791, +STORE, 
140590393692160, 140590395785215, +STORE, 140590395785216, 140590395809791, +SNULL, 140590395785216, 140590395805695, +STORE, 140590395805696, 140590395809791, +STORE, 140590395785216, 140590395805695, +ERASE, 140590395785216, 140590395805695, +STORE, 140590395785216, 140590395805695, +ERASE, 140590395805696, 140590395809791, +STORE, 140590395805696, 140590395809791, +STORE, 140590391234560, 140590393425919, +SNULL, 140590391234560, 140590391324671, +STORE, 140590391324672, 140590393425919, +STORE, 140590391234560, 140590391324671, +SNULL, 140590393417727, 140590393425919, +STORE, 140590391324672, 140590393417727, +STORE, 140590393417728, 140590393425919, +ERASE, 140590393417728, 140590393425919, +STORE, 140590393417728, 140590393425919, +STORE, 140590388973568, 140590391234559, +SNULL, 140590388973568, 140590389125119, +STORE, 140590389125120, 140590391234559, +STORE, 140590388973568, 140590389125119, +SNULL, 140590391218175, 140590391234559, +STORE, 140590389125120, 140590391218175, +STORE, 140590391218176, 140590391234559, +SNULL, 140590391218176, 140590391226367, +STORE, 140590391226368, 140590391234559, +STORE, 140590391218176, 140590391226367, +ERASE, 140590391218176, 140590391226367, +STORE, 140590391218176, 140590391226367, +ERASE, 140590391226368, 140590391234559, +STORE, 140590391226368, 140590391234559, +STORE, 140590386843648, 140590388973567, +SNULL, 140590386843648, 140590386872319, +STORE, 140590386872320, 140590388973567, +STORE, 140590386843648, 140590386872319, +SNULL, 140590388965375, 140590388973567, +STORE, 140590386872320, 140590388965375, +STORE, 140590388965376, 140590388973567, +ERASE, 140590388965376, 140590388973567, +STORE, 140590388965376, 140590388973567, +STORE, 140590384627712, 140590386843647, +SNULL, 140590384627712, 140590384726015, +STORE, 140590384726016, 140590386843647, +STORE, 140590384627712, 140590384726015, +SNULL, 140590386819071, 140590386843647, +STORE, 140590384726016, 140590386819071, +STORE, 140590386819072, 
140590386843647, +SNULL, 140590386819072, 140590386827263, +STORE, 140590386827264, 140590386843647, +STORE, 140590386819072, 140590386827263, +ERASE, 140590386819072, 140590386827263, +STORE, 140590386819072, 140590386827263, +ERASE, 140590386827264, 140590386843647, +STORE, 140590386827264, 140590386843647, +STORE, 140590400163840, 140590400180223, +STORE, 140590380830720, 140590384627711, +SNULL, 140590380830720, 140590382489599, +STORE, 140590382489600, 140590384627711, +STORE, 140590380830720, 140590382489599, +SNULL, 140590384586751, 140590384627711, +STORE, 140590382489600, 140590384586751, +STORE, 140590384586752, 140590384627711, +SNULL, 140590384586752, 140590384611327, +STORE, 140590384611328, 140590384627711, +STORE, 140590384586752, 140590384611327, +ERASE, 140590384586752, 140590384611327, +STORE, 140590384586752, 140590384611327, +ERASE, 140590384611328, 140590384627711, +STORE, 140590384611328, 140590384627711, +STORE, 140590378713088, 140590380830719, +SNULL, 140590378713088, 140590378729471, +STORE, 140590378729472, 140590380830719, +STORE, 140590378713088, 140590378729471, +SNULL, 140590380822527, 140590380830719, +STORE, 140590378729472, 140590380822527, +STORE, 140590380822528, 140590380830719, +ERASE, 140590380822528, 140590380830719, +STORE, 140590380822528, 140590380830719, +STORE, 140590376595456, 140590378713087, +SNULL, 140590376595456, 140590376611839, +STORE, 140590376611840, 140590378713087, +STORE, 140590376595456, 140590376611839, +SNULL, 140590378704895, 140590378713087, +STORE, 140590376611840, 140590378704895, +STORE, 140590378704896, 140590378713087, +ERASE, 140590378704896, 140590378713087, +STORE, 140590378704896, 140590378713087, +STORE, 140590374027264, 140590376595455, +SNULL, 140590374027264, 140590374494207, +STORE, 140590374494208, 140590376595455, +STORE, 140590374027264, 140590374494207, +SNULL, 140590376587263, 140590376595455, +STORE, 140590374494208, 140590376587263, +STORE, 140590376587264, 140590376595455, +ERASE, 
140590376587264, 140590376595455, +STORE, 140590376587264, 140590376595455, +STORE, 140590371913728, 140590374027263, +SNULL, 140590371913728, 140590371926015, +STORE, 140590371926016, 140590374027263, +STORE, 140590371913728, 140590371926015, +SNULL, 140590374019071, 140590374027263, +STORE, 140590371926016, 140590374019071, +STORE, 140590374019072, 140590374027263, +ERASE, 140590374019072, 140590374027263, +STORE, 140590374019072, 140590374027263, +STORE, 140590400155648, 140590400180223, +STORE, 140590400143360, 140590400180223, +SNULL, 140590384603135, 140590384611327, +STORE, 140590384586752, 140590384603135, +STORE, 140590384603136, 140590384611327, +SNULL, 140590374023167, 140590374027263, +STORE, 140590374019072, 140590374023167, +STORE, 140590374023168, 140590374027263, +SNULL, 140590386823167, 140590386827263, +STORE, 140590386819072, 140590386823167, +STORE, 140590386823168, 140590386827263, +SNULL, 140590376591359, 140590376595455, + }; + unsigned long set21[] = { +STORE, 93874710941696, 93874711363583, +STORE, 93874711367680, 93874711408639, +STORE, 93874711408640, 93874711412735, +STORE, 93874720989184, 93874721124351, +STORE, 140708365086720, 140708365099007, +STORE, 140708365099008, 140708367192063, +STORE, 140708367192064, 140708367196159, +STORE, 140708367196160, 140708367200255, +STORE, 140708367200256, 140708367667199, +STORE, 140708367667200, 140708369760255, +STORE, 140708369760256, 140708369764351, +STORE, 140708369764352, 140708369768447, +STORE, 140708369768448, 140708369784831, +STORE, 140708369784832, 140708371877887, +STORE, 140708371877888, 140708371881983, +STORE, 140708371881984, 140708371886079, +STORE, 140708371886080, 140708371902463, +STORE, 140708371902464, 140708373995519, +STORE, 140708373995520, 140708373999615, +STORE, 140708373999616, 140708374003711, +STORE, 140708374003712, 140708375662591, +STORE, 140708375662592, 140708377759743, +STORE, 140708377759744, 140708377776127, +STORE, 140708377776128, 140708377784319, +STORE, 
140708377784320, 140708377800703, +STORE, 140708377800704, 140708377899007, +STORE, 140708377899008, 140708379992063, +STORE, 140708379992064, 140708379996159, +STORE, 140708379996160, 140708380000255, +STORE, 140708380000256, 140708380016639, +STORE, 140708380016640, 140708380045311, +STORE, 140708380045312, 140708382138367, +STORE, 140708382138368, 140708382142463, +STORE, 140708382142464, 140708382146559, +STORE, 140708382146560, 140708382298111, +STORE, 140708382298112, 140708384391167, +STORE, 140708384391168, 140708384395263, +STORE, 140708384395264, 140708384399359, +STORE, 140708384399360, 140708384407551, +STORE, 140708384407552, 140708384497663, +STORE, 140708384497664, 140708386590719, +STORE, 140708386590720, 140708386594815, +STORE, 140708386594816, 140708386598911, +STORE, 140708386598912, 140708386865151, +STORE, 140708386865152, 140708388958207, +STORE, 140708388958208, 140708388974591, +STORE, 140708388974592, 140708388978687, +STORE, 140708388978688, 140708388982783, +STORE, 140708388982784, 140708389011455, +STORE, 140708389011456, 140708391108607, +STORE, 140708391108608, 140708391112703, +STORE, 140708391112704, 140708391116799, +STORE, 140708391116800, 140708391260159, +STORE, 140708393291776, 140708393308159, +STORE, 140708393308160, 140708393312255, +STORE, 140708393312256, 140708393316351, +STORE, 140708393316352, 140708393353215, +STORE, 140708393353216, 140708393357311, +STORE, 140708393357312, 140708393361407, +STORE, 140708393361408, 140708393365503, +STORE, 140708393365504, 140708393369599, +STORE, 140730557042688, 140730557177855, +STORE, 140730557235200, 140730557247487, +STORE, 140730557247488, 140730557251583, +ERASE, 140708393353216, 140708393357311, +ERASE, 140708393312256, 140708393316351, +ERASE, 140708393308160, 140708393312255, +ERASE, 140708393291776, 140708393308159, + }; + unsigned long set22[] = { +STORE, 93951397134336, 93951397183487, +STORE, 93951397183488, 93951397728255, +STORE, 93951397728256, 93951397826559, 
+STORE, 93951397826560, 93951397842943, +STORE, 93951397842944, 93951397847039, +STORE, 93951425974272, 93951426109439, +STORE, 140685152665600, 140685152677887, +STORE, 140685152677888, 140685152829439, +STORE, 140685152829440, 140685154181119, +STORE, 140685154181120, 140685154484223, +STORE, 140685154484224, 140685154496511, +STORE, 140685154496512, 140685154508799, +STORE, 140685154508800, 140685154525183, +STORE, 140685154525184, 140685154541567, +STORE, 140685154541568, 140685154590719, +STORE, 140685154590720, 140685154603007, +STORE, 140685154603008, 140685154607103, +STORE, 140685154607104, 140685154611199, +STORE, 140685154611200, 140685154615295, +STORE, 140685154615296, 140685154631679, +STORE, 140685154639872, 140685154643967, +STORE, 140685154643968, 140685154766847, +STORE, 140685154766848, 140685154799615, +STORE, 140685154803712, 140685154807807, +STORE, 140685154807808, 140685154811903, +STORE, 140685154811904, 140685154815999, +STORE, 140722188902400, 140722189037567, +STORE, 140722189512704, 140722189524991, +STORE, 140722189524992, 140722189529087, +STORE, 140737488347136, 140737488351231, +STORE, 140733429354496, 140737488351231, +SNULL, 140733429358591, 140737488351231, +STORE, 140733429354496, 140733429358591, +STORE, 140733429223424, 140733429358591, +STORE, 94526683537408, 94526683660287, +SNULL, 94526683553791, 94526683660287, +STORE, 94526683537408, 94526683553791, +STORE, 94526683553792, 94526683660287, +ERASE, 94526683553792, 94526683660287, +STORE, 94526683553792, 94526683623423, +STORE, 94526683623424, 94526683647999, +STORE, 94526683652096, 94526683660287, +STORE, 140551363747840, 140551363923967, +SNULL, 140551363751935, 140551363923967, +STORE, 140551363747840, 140551363751935, +STORE, 140551363751936, 140551363923967, +ERASE, 140551363751936, 140551363923967, +STORE, 140551363751936, 140551363874815, +STORE, 140551363874816, 140551363907583, +STORE, 140551363911680, 140551363919871, +STORE, 140551363919872, 140551363923967, 
+STORE, 140733429690368, 140733429694463, +STORE, 140733429678080, 140733429690367, +STORE, 140551363739648, 140551363747839, +STORE, 140551363731456, 140551363739647, +STORE, 140551363379200, 140551363731455, +SNULL, 140551363379200, 140551363420159, +STORE, 140551363420160, 140551363731455, +STORE, 140551363379200, 140551363420159, +SNULL, 140551363706879, 140551363731455, +STORE, 140551363420160, 140551363706879, +STORE, 140551363706880, 140551363731455, +SNULL, 140551363420160, 140551363637247, +STORE, 140551363637248, 140551363706879, +STORE, 140551363420160, 140551363637247, +ERASE, 140551363420160, 140551363637247, +STORE, 140551363420160, 140551363637247, +SNULL, 140551363637248, 140551363702783, +STORE, 140551363702784, 140551363706879, +STORE, 140551363637248, 140551363702783, +ERASE, 140551363637248, 140551363702783, +STORE, 140551363637248, 140551363702783, +ERASE, 140551363706880, 140551363731455, +STORE, 140551363706880, 140551363731455, +STORE, 140551361531904, 140551363379199, +SNULL, 140551361683455, 140551363379199, +STORE, 140551361531904, 140551361683455, +STORE, 140551361683456, 140551363379199, +SNULL, 140551361683456, 140551363035135, +STORE, 140551363035136, 140551363379199, +STORE, 140551361683456, 140551363035135, +ERASE, 140551361683456, 140551363035135, +STORE, 140551361683456, 140551363035135, +SNULL, 140551363035136, 140551363338239, +STORE, 140551363338240, 140551363379199, +STORE, 140551363035136, 140551363338239, +ERASE, 140551363035136, 140551363338239, +STORE, 140551363035136, 140551363379199, +SNULL, 140551363338239, 140551363379199, +STORE, 140551363035136, 140551363338239, +STORE, 140551363338240, 140551363379199, +SNULL, 140551363338240, 140551363362815, +STORE, 140551363362816, 140551363379199, +STORE, 140551363338240, 140551363362815, +ERASE, 140551363338240, 140551363362815, +STORE, 140551363338240, 140551363362815, +ERASE, 140551363362816, 140551363379199, +STORE, 140551363362816, 140551363379199, +STORE, 140551361519616, 
140551361531903, +SNULL, 140551363350527, 140551363362815, +STORE, 140551363338240, 140551363350527, +STORE, 140551363350528, 140551363362815, +SNULL, 140551363727359, 140551363731455, +STORE, 140551363706880, 140551363727359, +STORE, 140551363727360, 140551363731455, +SNULL, 94526683656191, 94526683660287, +STORE, 94526683652096, 94526683656191, +STORE, 94526683656192, 94526683660287, +SNULL, 140551363915775, 140551363919871, +STORE, 140551363911680, 140551363915775, +STORE, 140551363915776, 140551363919871, +ERASE, 140551363739648, 140551363747839, +STORE, 94526715490304, 94526715625471, +STORE, 140551361253376, 140551361531903, +STORE, 140551360987136, 140551361531903, +STORE, 140551360720896, 140551361531903, +STORE, 140551360454656, 140551361531903, +SNULL, 140551361253375, 140551361531903, +STORE, 140551360454656, 140551361253375, +STORE, 140551361253376, 140551361531903, +SNULL, 140551361253376, 140551361519615, +STORE, 140551361519616, 140551361531903, +STORE, 140551361253376, 140551361519615, +ERASE, 140551361253376, 140551361519615, + }; + + unsigned long set23[] = { +STORE, 94014447943680, 94014448156671, +STORE, 94014450253824, 94014450257919, +STORE, 94014450257920, 94014450266111, +STORE, 94014450266112, 94014450278399, +STORE, 94014464225280, 94014464630783, +STORE, 139761764306944, 139761765965823, +STORE, 139761765965824, 139761768062975, +STORE, 139761768062976, 139761768079359, +STORE, 139761768079360, 139761768087551, +STORE, 139761768087552, 139761768103935, +STORE, 139761768103936, 139761768116223, +STORE, 139761768116224, 139761770209279, +STORE, 139761770209280, 139761770213375, +STORE, 139761770213376, 139761770217471, +STORE, 139761770217472, 139761770360831, +STORE, 139761770729472, 139761772412927, +STORE, 139761772412928, 139761772429311, +STORE, 139761772457984, 139761772462079, +STORE, 139761772462080, 139761772466175, +STORE, 139761772466176, 139761772470271, +STORE, 140724336517120, 140724336652287, +STORE, 140724336955392, 
140724336967679, +STORE, 140724336967680, 140724336971775, +STORE, 140737488347136, 140737488351231, +STORE, 140721840295936, 140737488351231, +SNULL, 140721840300031, 140737488351231, +STORE, 140721840295936, 140721840300031, +STORE, 140721840164864, 140721840300031, +STORE, 93937913667584, 93937915830271, +SNULL, 93937913729023, 93937915830271, +STORE, 93937913667584, 93937913729023, +STORE, 93937913729024, 93937915830271, +ERASE, 93937913729024, 93937915830271, +STORE, 93937915822080, 93937915830271, +STORE, 140598835335168, 140598837587967, +SNULL, 140598835478527, 140598837587967, +STORE, 140598835335168, 140598835478527, +STORE, 140598835478528, 140598837587967, +ERASE, 140598835478528, 140598837587967, +STORE, 140598837575680, 140598837583871, +STORE, 140598837583872, 140598837587967, +STORE, 140721841086464, 140721841090559, +STORE, 140721841074176, 140721841086463, +STORE, 140598837547008, 140598837575679, +STORE, 140598837538816, 140598837547007, +STORE, 140598831538176, 140598835335167, +SNULL, 140598831538176, 140598833197055, +STORE, 140598833197056, 140598835335167, +STORE, 140598831538176, 140598833197055, +SNULL, 140598835294207, 140598835335167, +STORE, 140598833197056, 140598835294207, +STORE, 140598835294208, 140598835335167, +SNULL, 140598835294208, 140598835318783, +STORE, 140598835318784, 140598835335167, +STORE, 140598835294208, 140598835318783, +ERASE, 140598835294208, 140598835318783, +STORE, 140598835294208, 140598835318783, +ERASE, 140598835318784, 140598835335167, +STORE, 140598835318784, 140598835335167, +SNULL, 140598835310591, 140598835318783, +STORE, 140598835294208, 140598835310591, +STORE, 140598835310592, 140598835318783, +SNULL, 93937915826175, 93937915830271, +STORE, 93937915822080, 93937915826175, +STORE, 93937915826176, 93937915830271, +SNULL, 140598837579775, 140598837583871, +STORE, 140598837575680, 140598837579775, +STORE, 140598837579776, 140598837583871, +ERASE, 140598837547008, 140598837575679, +STORE, 93937929179136, 
93937929314303, +STORE, 140598835855360, 140598837538815, +STORE, 140737488347136, 140737488351231, +STORE, 140728187723776, 140737488351231, +SNULL, 140728187727871, 140737488351231, +STORE, 140728187723776, 140728187727871, +STORE, 140728187592704, 140728187727871, +STORE, 4194304, 5128191, +STORE, 7221248, 7241727, +STORE, 7241728, 7249919, +STORE, 140583951437824, 140583953690623, +SNULL, 140583951581183, 140583953690623, +STORE, 140583951437824, 140583951581183, +STORE, 140583951581184, 140583953690623, +ERASE, 140583951581184, 140583953690623, +STORE, 140583953678336, 140583953686527, +STORE, 140583953686528, 140583953690623, +STORE, 140728189116416, 140728189120511, +STORE, 140728189104128, 140728189116415, +STORE, 140583953649664, 140583953678335, +STORE, 140583953641472, 140583953649663, +STORE, 140583948275712, 140583951437823, +SNULL, 140583948275712, 140583949336575, +STORE, 140583949336576, 140583951437823, +STORE, 140583948275712, 140583949336575, +SNULL, 140583951429631, 140583951437823, +STORE, 140583949336576, 140583951429631, +STORE, 140583951429632, 140583951437823, +ERASE, 140583951429632, 140583951437823, +STORE, 140583951429632, 140583951437823, +STORE, 140583944478720, 140583948275711, +SNULL, 140583944478720, 140583946137599, +STORE, 140583946137600, 140583948275711, +STORE, 140583944478720, 140583946137599, +SNULL, 140583948234751, 140583948275711, +STORE, 140583946137600, 140583948234751, +STORE, 140583948234752, 140583948275711, +SNULL, 140583948234752, 140583948259327, +STORE, 140583948259328, 140583948275711, +STORE, 140583948234752, 140583948259327, +ERASE, 140583948234752, 140583948259327, +STORE, 140583948234752, 140583948259327, +ERASE, 140583948259328, 140583948275711, +STORE, 140583948259328, 140583948275711, +STORE, 140583953629184, 140583953649663, +SNULL, 140583948251135, 140583948259327, +STORE, 140583948234752, 140583948251135, +STORE, 140583948251136, 140583948259327, +SNULL, 140583951433727, 140583951437823, +STORE, 
140583951429632, 140583951433727, +STORE, 140583951433728, 140583951437823, +SNULL, 7233535, 7241727, +STORE, 7221248, 7233535, +STORE, 7233536, 7241727, +SNULL, 140583953682431, 140583953686527, +STORE, 140583953678336, 140583953682431, +STORE, 140583953682432, 140583953686527, +ERASE, 140583953649664, 140583953678335, +STORE, 17821696, 17956863, +STORE, 17821696, 18104319, +STORE, 140583951945728, 140583953629183, +STORE, 94014447943680, 94014448156671, +STORE, 94014450253824, 94014450257919, +STORE, 94014450257920, 94014450266111, +STORE, 94014450266112, 94014450278399, +STORE, 94014464225280, 94014465196031, +STORE, 139761764306944, 139761765965823, +STORE, 139761765965824, 139761768062975, +STORE, 139761768062976, 139761768079359, +STORE, 139761768079360, 139761768087551, +STORE, 139761768087552, 139761768103935, +STORE, 139761768103936, 139761768116223, +STORE, 139761768116224, 139761770209279, +STORE, 139761770209280, 139761770213375, +STORE, 139761770213376, 139761770217471, +STORE, 139761770217472, 139761770360831, +STORE, 139761770729472, 139761772412927, +STORE, 139761772412928, 139761772429311, +STORE, 139761772457984, 139761772462079, +STORE, 139761772462080, 139761772466175, +STORE, 139761772466176, 139761772470271, +STORE, 140724336517120, 140724336652287, +STORE, 140724336955392, 140724336967679, +STORE, 140724336967680, 140724336971775, +STORE, 140737488347136, 140737488351231, +STORE, 140726063296512, 140737488351231, +SNULL, 140726063300607, 140737488351231, +STORE, 140726063296512, 140726063300607, +STORE, 140726063165440, 140726063300607, +STORE, 94016795934720, 94016798158847, +SNULL, 94016796045311, 94016798158847, +STORE, 94016795934720, 94016796045311, +STORE, 94016796045312, 94016798158847, +ERASE, 94016796045312, 94016798158847, +STORE, 94016798138368, 94016798150655, +STORE, 94016798150656, 94016798158847, +STORE, 139975915966464, 139975918219263, +SNULL, 139975916109823, 139975918219263, +STORE, 139975915966464, 139975916109823, +STORE, 
139975916109824, 139975918219263, +ERASE, 139975916109824, 139975918219263, +STORE, 139975918206976, 139975918215167, +STORE, 139975918215168, 139975918219263, +STORE, 140726064541696, 140726064545791, +STORE, 140726064529408, 140726064541695, +STORE, 139975918178304, 139975918206975, +STORE, 139975918170112, 139975918178303, +STORE, 139975912169472, 139975915966463, +SNULL, 139975912169472, 139975913828351, +STORE, 139975913828352, 139975915966463, +STORE, 139975912169472, 139975913828351, +SNULL, 139975915925503, 139975915966463, +STORE, 139975913828352, 139975915925503, +STORE, 139975915925504, 139975915966463, +SNULL, 139975915925504, 139975915950079, +STORE, 139975915950080, 139975915966463, +STORE, 139975915925504, 139975915950079, +ERASE, 139975915925504, 139975915950079, +STORE, 139975915925504, 139975915950079, +ERASE, 139975915950080, 139975915966463, +STORE, 139975915950080, 139975915966463, +SNULL, 139975915941887, 139975915950079, +STORE, 139975915925504, 139975915941887, +STORE, 139975915941888, 139975915950079, +SNULL, 94016798146559, 94016798150655, +STORE, 94016798138368, 94016798146559, +STORE, 94016798146560, 94016798150655, +SNULL, 139975918211071, 139975918215167, +STORE, 139975918206976, 139975918211071, +STORE, 139975918211072, 139975918215167, +ERASE, 139975918178304, 139975918206975, +STORE, 94016804925440, 94016805060607, +STORE, 94596177661952, 94596177772543, +STORE, 94596179865600, 94596179873791, +STORE, 94596179873792, 94596179877887, +STORE, 94596179877888, 94596179886079, +STORE, 94596211597312, 94596211863551, +STORE, 140127351840768, 140127353499647, +STORE, 140127353499648, 140127355596799, +STORE, 140127355596800, 140127355613183, +STORE, 140127355613184, 140127355621375, +STORE, 140127355621376, 140127355637759, +STORE, 140127355637760, 140127355781119, +STORE, 140127357841408, 140127357849599, +STORE, 140127357878272, 140127357882367, +STORE, 140127357882368, 140127357886463, +STORE, 140127357886464, 140127357890559, +STORE, 
140726167252992, 140726167392255, +STORE, 140726167838720, 140726167851007, +STORE, 140726167851008, 140726167855103, +STORE, 140737488347136, 140737488351231, +STORE, 140731874017280, 140737488351231, +SNULL, 140731874021375, 140737488351231, +STORE, 140731874017280, 140731874021375, +STORE, 140731873886208, 140731874021375, +STORE, 94178682265600, 94178684489727, +SNULL, 94178682376191, 94178684489727, +STORE, 94178682265600, 94178682376191, +STORE, 94178682376192, 94178684489727, +ERASE, 94178682376192, 94178684489727, +STORE, 94178684469248, 94178684481535, +STORE, 94178684481536, 94178684489727, +STORE, 140460853403648, 140460855656447, +SNULL, 140460853547007, 140460855656447, +STORE, 140460853403648, 140460853547007, +STORE, 140460853547008, 140460855656447, +ERASE, 140460853547008, 140460855656447, +STORE, 140460855644160, 140460855652351, +STORE, 140460855652352, 140460855656447, +STORE, 140731874103296, 140731874107391, +STORE, 140731874091008, 140731874103295, +STORE, 140460855615488, 140460855644159, +STORE, 140460855607296, 140460855615487, +STORE, 140460849606656, 140460853403647, +SNULL, 140460849606656, 140460851265535, +STORE, 140460851265536, 140460853403647, +STORE, 140460849606656, 140460851265535, +SNULL, 140460853362687, 140460853403647, +STORE, 140460851265536, 140460853362687, +STORE, 140460853362688, 140460853403647, +SNULL, 140460853362688, 140460853387263, +STORE, 140460853387264, 140460853403647, +STORE, 140460853362688, 140460853387263, +ERASE, 140460853362688, 140460853387263, +STORE, 140460853362688, 140460853387263, +ERASE, 140460853387264, 140460853403647, +STORE, 140460853387264, 140460853403647, +SNULL, 140460853379071, 140460853387263, +STORE, 140460853362688, 140460853379071, +STORE, 140460853379072, 140460853387263, +SNULL, 94178684477439, 94178684481535, +STORE, 94178684469248, 94178684477439, +STORE, 94178684477440, 94178684481535, +SNULL, 140460855648255, 140460855652351, +STORE, 140460855644160, 140460855648255, +STORE, 
140460855648256, 140460855652351, +ERASE, 140460855615488, 140460855644159, +STORE, 94178692063232, 94178692198399, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140733096603648, 140737488351231, +SNULL, 140733096611839, 140737488351231, +STORE, 140733096603648, 140733096611839, +STORE, 140733096472576, 140733096611839, +STORE, 94796716122112, 94796718325759, +SNULL, 94796716224511, 94796718325759, +STORE, 94796716122112, 94796716224511, +STORE, 94796716224512, 94796718325759, +ERASE, 94796716224512, 94796718325759, +STORE, 94796718317568, 94796718325759, +STORE, 139667892793344, 139667895046143, +SNULL, 139667892936703, 139667895046143, +STORE, 139667892793344, 139667892936703, +STORE, 139667892936704, 139667895046143, +ERASE, 139667892936704, 139667895046143, +STORE, 139667895033856, 139667895042047, +STORE, 139667895042048, 139667895046143, +STORE, 140733096857600, 140733096861695, +STORE, 140733096845312, 140733096857599, +STORE, 139667895005184, 139667895033855, +STORE, 139667894996992, 139667895005183, +STORE, 139667890532352, 139667892793343, +SNULL, 139667890532352, 139667890683903, +STORE, 139667890683904, 139667892793343, +STORE, 139667890532352, 139667890683903, +SNULL, 139667892776959, 139667892793343, +STORE, 139667890683904, 139667892776959, +STORE, 139667892776960, 139667892793343, +SNULL, 139667892776960, 139667892785151, +STORE, 139667892785152, 139667892793343, +STORE, 139667892776960, 139667892785151, +ERASE, 139667892776960, 139667892785151, +STORE, 139667892776960, 139667892785151, +ERASE, 139667892785152, 139667892793343, +STORE, 139667892785152, 139667892793343, +STORE, 139667886735360, 139667890532351, +SNULL, 139667886735360, 139667888394239, +STORE, 139667888394240, 139667890532351, +STORE, 139667886735360, 139667888394239, +SNULL, 139667890491391, 139667890532351, +STORE, 139667888394240, 139667890491391, +STORE, 139667890491392, 139667890532351, +SNULL, 139667890491392, 139667890515967, 
+STORE, 139667890515968, 139667890532351, +STORE, 139667890491392, 139667890515967, +ERASE, 139667890491392, 139667890515967, +STORE, 139667890491392, 139667890515967, +ERASE, 139667890515968, 139667890532351, +STORE, 139667890515968, 139667890532351, +STORE, 139667884167168, 139667886735359, +SNULL, 139667884167168, 139667884634111, +STORE, 139667884634112, 139667886735359, +STORE, 139667884167168, 139667884634111, +SNULL, 139667886727167, 139667886735359, +STORE, 139667884634112, 139667886727167, +STORE, 139667886727168, 139667886735359, +ERASE, 139667886727168, 139667886735359, +STORE, 139667886727168, 139667886735359, +STORE, 139667882053632, 139667884167167, +SNULL, 139667882053632, 139667882065919, +STORE, 139667882065920, 139667884167167, +STORE, 139667882053632, 139667882065919, +SNULL, 139667884158975, 139667884167167, +STORE, 139667882065920, 139667884158975, +STORE, 139667884158976, 139667884167167, +ERASE, 139667884158976, 139667884167167, +STORE, 139667884158976, 139667884167167, +STORE, 139667879837696, 139667882053631, +SNULL, 139667879837696, 139667879935999, +STORE, 139667879936000, 139667882053631, +STORE, 139667879837696, 139667879935999, +SNULL, 139667882029055, 139667882053631, +STORE, 139667879936000, 139667882029055, +STORE, 139667882029056, 139667882053631, +SNULL, 139667882029056, 139667882037247, +STORE, 139667882037248, 139667882053631, +STORE, 139667882029056, 139667882037247, +ERASE, 139667882029056, 139667882037247, +STORE, 139667882029056, 139667882037247, +ERASE, 139667882037248, 139667882053631, +STORE, 139667882037248, 139667882053631, +STORE, 139667894988800, 139667895005183, +SNULL, 139667890507775, 139667890515967, +STORE, 139667890491392, 139667890507775, +STORE, 139667890507776, 139667890515967, +SNULL, 139667882033151, 139667882037247, +STORE, 139667882029056, 139667882033151, +STORE, 139667882033152, 139667882037247, +SNULL, 139667884163071, 139667884167167, +STORE, 139667884158976, 139667884163071, +STORE, 139667884163072, 
139667884167167, +SNULL, 139667886731263, 139667886735359, +STORE, 139667886727168, 139667886731263, +STORE, 139667886731264, 139667886735359, +SNULL, 139667892781055, 139667892785151, +STORE, 139667892776960, 139667892781055, +STORE, 139667892781056, 139667892785151, +SNULL, 94796718321663, 94796718325759, +STORE, 94796718317568, 94796718321663, +STORE, 94796718321664, 94796718325759, +SNULL, 139667895037951, 139667895042047, +STORE, 139667895033856, 139667895037951, +STORE, 139667895037952, 139667895042047, +ERASE, 139667895005184, 139667895033855, +STORE, 94796726063104, 94796726198271, +STORE, 139667893305344, 139667894988799, +STORE, 139667895005184, 139667895033855, +STORE, 94796726063104, 94796726333439, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722489507840, 140737488351231, +SNULL, 140722489516031, 140737488351231, +STORE, 140722489507840, 140722489516031, +STORE, 140722489376768, 140722489516031, +STORE, 93980993265664, 93980995489791, +SNULL, 93980993376255, 93980995489791, +STORE, 93980993265664, 93980993376255, +STORE, 93980993376256, 93980995489791, +ERASE, 93980993376256, 93980995489791, +STORE, 93980995469312, 93980995481599, +STORE, 93980995481600, 93980995489791, +STORE, 140261313593344, 140261315846143, +SNULL, 140261313736703, 140261315846143, +STORE, 140261313593344, 140261313736703, +STORE, 140261313736704, 140261315846143, +ERASE, 140261313736704, 140261315846143, +STORE, 140261315833856, 140261315842047, +STORE, 140261315842048, 140261315846143, +STORE, 140722489675776, 140722489679871, +STORE, 140722489663488, 140722489675775, +STORE, 140261315805184, 140261315833855, +STORE, 140261315796992, 140261315805183, +STORE, 140261309796352, 140261313593343, +SNULL, 140261309796352, 140261311455231, +STORE, 140261311455232, 140261313593343, +STORE, 140261309796352, 140261311455231, +SNULL, 140261313552383, 140261313593343, +STORE, 140261311455232, 140261313552383, +STORE, 140261313552384, 
140261313593343, +SNULL, 140261313552384, 140261313576959, +STORE, 140261313576960, 140261313593343, +STORE, 140261313552384, 140261313576959, +ERASE, 140261313552384, 140261313576959, +STORE, 140261313552384, 140261313576959, +ERASE, 140261313576960, 140261313593343, +STORE, 140261313576960, 140261313593343, +SNULL, 140261313568767, 140261313576959, +STORE, 140261313552384, 140261313568767, +STORE, 140261313568768, 140261313576959, +SNULL, 93980995477503, 93980995481599, +STORE, 93980995469312, 93980995477503, +STORE, 93980995477504, 93980995481599, +SNULL, 140261315837951, 140261315842047, +STORE, 140261315833856, 140261315837951, +STORE, 140261315837952, 140261315842047, +ERASE, 140261315805184, 140261315833855, +STORE, 93980997443584, 93980997578751, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140737488338944, 140737488351231, +STORE, 140734059450368, 140737488351231, +SNULL, 140734059462655, 140737488351231, +STORE, 140734059450368, 140734059462655, +STORE, 140734059319296, 140734059462655, +STORE, 4194304, 5128191, +STORE, 7221248, 7241727, +STORE, 7241728, 7249919, +STORE, 140307554983936, 140307557236735, +SNULL, 140307555127295, 140307557236735, +STORE, 140307554983936, 140307555127295, +STORE, 140307555127296, 140307557236735, +ERASE, 140307555127296, 140307557236735, +STORE, 140307557224448, 140307557232639, +STORE, 140307557232640, 140307557236735, +STORE, 140734059483136, 140734059487231, +STORE, 140734059470848, 140734059483135, +STORE, 140307557195776, 140307557224447, +STORE, 140307557187584, 140307557195775, +STORE, 140307551821824, 140307554983935, +SNULL, 140307551821824, 140307552882687, +STORE, 140307552882688, 140307554983935, +STORE, 140307551821824, 140307552882687, +SNULL, 140307554975743, 140307554983935, +STORE, 140307552882688, 140307554975743, +STORE, 140307554975744, 140307554983935, +ERASE, 140307554975744, 140307554983935, +STORE, 140307554975744, 140307554983935, +STORE, 
140307548024832, 140307551821823, +SNULL, 140307548024832, 140307549683711, +STORE, 140307549683712, 140307551821823, +STORE, 140307548024832, 140307549683711, +SNULL, 140307551780863, 140307551821823, +STORE, 140307549683712, 140307551780863, +STORE, 140307551780864, 140307551821823, +SNULL, 140307551780864, 140307551805439, +STORE, 140307551805440, 140307551821823, +STORE, 140307551780864, 140307551805439, +ERASE, 140307551780864, 140307551805439, +STORE, 140307551780864, 140307551805439, +ERASE, 140307551805440, 140307551821823, +STORE, 140307551805440, 140307551821823, +STORE, 140307557175296, 140307557195775, +SNULL, 140307551797247, 140307551805439, +STORE, 140307551780864, 140307551797247, +STORE, 140307551797248, 140307551805439, +SNULL, 140307554979839, 140307554983935, +STORE, 140307554975744, 140307554979839, +STORE, 140307554979840, 140307554983935, +SNULL, 7233535, 7241727, +STORE, 7221248, 7233535, +STORE, 7233536, 7241727, +SNULL, 140307557228543, 140307557232639, +STORE, 140307557224448, 140307557228543, +STORE, 140307557228544, 140307557232639, +ERASE, 140307557195776, 140307557224447, +STORE, 39698432, 39833599, +STORE, 39698432, 39981055, +STORE, 94306485321728, 94306485432319, +STORE, 94306487525376, 94306487533567, +STORE, 94306487533568, 94306487537663, +STORE, 94306487537664, 94306487545855, +STORE, 94306488868864, 94306489004031, +STORE, 140497673998336, 140497675657215, +STORE, 140497675657216, 140497677754367, +STORE, 140497677754368, 140497677770751, +STORE, 140497677770752, 140497677778943, +STORE, 140497677778944, 140497677795327, +STORE, 140497677795328, 140497677938687, +STORE, 140497679998976, 140497680007167, +STORE, 140497680035840, 140497680039935, +STORE, 140497680039936, 140497680044031, +STORE, 140497680044032, 140497680048127, +STORE, 140732780462080, 140732780601343, +STORE, 140732782239744, 140732782252031, +STORE, 140732782252032, 140732782256127, +STORE, 94236915900416, 94236916011007, +STORE, 94236918104064, 
94236918112255, +STORE, 94236918112256, 94236918116351, +STORE, 94236918116352, 94236918124543, +STORE, 94236939489280, 94236939624447, +STORE, 140046091743232, 140046093402111, +STORE, 140046093402112, 140046095499263, +STORE, 140046095499264, 140046095515647, +STORE, 140046095515648, 140046095523839, +STORE, 140046095523840, 140046095540223, +STORE, 140046095540224, 140046095683583, +STORE, 140046097743872, 140046097752063, +STORE, 140046097780736, 140046097784831, +STORE, 140046097784832, 140046097788927, +STORE, 140046097788928, 140046097793023, +STORE, 140726694449152, 140726694588415, +STORE, 140726695313408, 140726695325695, +STORE, 140726695325696, 140726695329791, +STORE, 94894582779904, 94894582992895, +STORE, 94894585090048, 94894585094143, +STORE, 94894585094144, 94894585102335, +STORE, 94894585102336, 94894585114623, +STORE, 94894592868352, 94894594293759, +STORE, 139733563842560, 139733565501439, +STORE, 139733565501440, 139733567598591, +STORE, 139733567598592, 139733567614975, +STORE, 139733567614976, 139733567623167, +STORE, 139733567623168, 139733567639551, +STORE, 139733567639552, 139733567651839, +STORE, 139733567651840, 139733569744895, +STORE, 139733569744896, 139733569748991, +STORE, 139733569748992, 139733569753087, +STORE, 139733569753088, 139733569896447, +STORE, 139733570265088, 139733571948543, +STORE, 139733571948544, 139733571964927, +STORE, 139733571993600, 139733571997695, +STORE, 139733571997696, 139733572001791, +STORE, 139733572001792, 139733572005887, +STORE, 140726369255424, 140726369394687, +STORE, 140726370402304, 140726370414591, +STORE, 140726370414592, 140726370418687, +STORE, 94899236483072, 94899236696063, +STORE, 94899238793216, 94899238797311, +STORE, 94899238797312, 94899238805503, +STORE, 94899238805504, 94899238817791, +STORE, 94899263045632, 94899263979519, +STORE, 140040959893504, 140040961552383, +STORE, 140040961552384, 140040963649535, +STORE, 140040963649536, 140040963665919, +STORE, 140040963665920, 
140040963674111, +STORE, 140040963674112, 140040963690495, +STORE, 140040963690496, 140040963702783, +STORE, 140040963702784, 140040965795839, +STORE, 140040965795840, 140040965799935, +STORE, 140040965799936, 140040965804031, +STORE, 140040965804032, 140040965947391, +STORE, 140040966316032, 140040967999487, +STORE, 140040967999488, 140040968015871, +STORE, 140040968044544, 140040968048639, +STORE, 140040968048640, 140040968052735, +STORE, 140040968052736, 140040968056831, +STORE, 140729921359872, 140729921499135, +STORE, 140729921613824, 140729921626111, +STORE, 140729921626112, 140729921630207, +STORE, 94818265190400, 94818265403391, +STORE, 94818267500544, 94818267504639, +STORE, 94818267504640, 94818267512831, +STORE, 94818267512832, 94818267525119, +STORE, 94818283372544, 94818285858815, +STORE, 139818425675776, 139818427334655, +STORE, 139818427334656, 139818429431807, +STORE, 139818429431808, 139818429448191, +STORE, 139818429448192, 139818429456383, +STORE, 139818429456384, 139818429472767, +STORE, 139818429472768, 139818429485055, +STORE, 139818429485056, 139818431578111, +STORE, 139818431578112, 139818431582207, +STORE, 139818431582208, 139818431586303, +STORE, 139818431586304, 139818431729663, +STORE, 139818432098304, 139818433781759, +STORE, 139818433781760, 139818433798143, +STORE, 139818433826816, 139818433830911, +STORE, 139818433830912, 139818433835007, +STORE, 139818433835008, 139818433839103, +STORE, 140726170509312, 140726170648575, +STORE, 140726171824128, 140726171836415, +STORE, 140726171836416, 140726171840511, +STORE, 94611513188352, 94611513401343, +STORE, 94611515498496, 94611515502591, +STORE, 94611515502592, 94611515510783, +STORE, 94611515510784, 94611515523071, +STORE, 94611516502016, 94611516907519, +STORE, 140596246388736, 140596248047615, +STORE, 140596248047616, 140596250144767, +STORE, 140596250144768, 140596250161151, +STORE, 140596250161152, 140596250169343, +STORE, 140596250169344, 140596250185727, +STORE, 140596250185728, 
140596250198015, +STORE, 140596250198016, 140596252291071, +STORE, 140596252291072, 140596252295167, +STORE, 140596252295168, 140596252299263, +STORE, 140596252299264, 140596252442623, +STORE, 140596252811264, 140596254494719, +STORE, 140596254494720, 140596254511103, +STORE, 140596254539776, 140596254543871, +STORE, 140596254543872, 140596254547967, +STORE, 140596254547968, 140596254552063, +STORE, 140731551338496, 140731551477759, +STORE, 140731551780864, 140731551793151, +STORE, 140731551793152, 140731551797247, +STORE, 94313835851776, 94313836064767, +STORE, 94313838161920, 94313838166015, +STORE, 94313838166016, 94313838174207, +STORE, 94313838174208, 94313838186495, +STORE, 94313858416640, 94313861906431, +STORE, 140693503918080, 140693505576959, +STORE, 140693505576960, 140693507674111, +STORE, 140693507674112, 140693507690495, +STORE, 140693507690496, 140693507698687, +STORE, 140693507698688, 140693507715071, +STORE, 140693507715072, 140693507727359, +STORE, 140693507727360, 140693509820415, +STORE, 140693509820416, 140693509824511, +STORE, 140693509824512, 140693509828607, +STORE, 140693509828608, 140693509971967, +STORE, 140693510340608, 140693512024063, +STORE, 140693512024064, 140693512040447, +STORE, 140693512069120, 140693512073215, +STORE, 140693512073216, 140693512077311, +STORE, 140693512077312, 140693512081407, +STORE, 140721116065792, 140721116205055, +STORE, 140721117831168, 140721117843455, +STORE, 140721117843456, 140721117847551, +STORE, 94843650150400, 94843650363391, +STORE, 94843652460544, 94843652464639, +STORE, 94843652464640, 94843652472831, +STORE, 94843652472832, 94843652485119, +STORE, 94843685388288, 94843686281215, +STORE, 140484193681408, 140484195340287, +STORE, 140484195340288, 140484197437439, +STORE, 140484197437440, 140484197453823, +STORE, 140484197453824, 140484197462015, +STORE, 140484197462016, 140484197478399, +STORE, 140484197478400, 140484197490687, +STORE, 140484197490688, 140484199583743, +STORE, 140484199583744, 
140484199587839, +STORE, 140484199587840, 140484199591935, +STORE, 140484199591936, 140484199735295, +STORE, 140484200103936, 140484201787391, +STORE, 140484201787392, 140484201803775, +STORE, 140484201832448, 140484201836543, +STORE, 140484201836544, 140484201840639, +STORE, 140484201840640, 140484201844735, +STORE, 140726294315008, 140726294454271, +STORE, 140726295646208, 140726295658495, +STORE, 140726295658496, 140726295662591, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140720422371328, 140737488351231, +SNULL, 140720422379519, 140737488351231, +STORE, 140720422371328, 140720422379519, +STORE, 140720422240256, 140720422379519, +STORE, 94417967845376, 94417970180095, +SNULL, 94417968058367, 94417970180095, +STORE, 94417967845376, 94417968058367, +STORE, 94417968058368, 94417970180095, +ERASE, 94417968058368, 94417970180095, +STORE, 94417970155520, 94417970167807, +STORE, 94417970167808, 94417970180095, +STORE, 140252450045952, 140252452298751, +SNULL, 140252450189311, 140252452298751, +STORE, 140252450045952, 140252450189311, +STORE, 140252450189312, 140252452298751, +ERASE, 140252450189312, 140252452298751, +STORE, 140252452286464, 140252452294655, +STORE, 140252452294656, 140252452298751, +STORE, 140720422416384, 140720422420479, +STORE, 140720422404096, 140720422416383, +STORE, 140252452257792, 140252452286463, +STORE, 140252452249600, 140252452257791, +STORE, 140252447932416, 140252450045951, +SNULL, 140252447932416, 140252447944703, +STORE, 140252447944704, 140252450045951, +STORE, 140252447932416, 140252447944703, +SNULL, 140252450037759, 140252450045951, +STORE, 140252447944704, 140252450037759, +STORE, 140252450037760, 140252450045951, +ERASE, 140252450037760, 140252450045951, +STORE, 140252450037760, 140252450045951, +STORE, 140252444135424, 140252447932415, +SNULL, 140252444135424, 140252445794303, +STORE, 140252445794304, 140252447932415, +STORE, 140252444135424, 140252445794303, +SNULL, 
140252447891455, 140252447932415, +STORE, 140252445794304, 140252447891455, +STORE, 140252447891456, 140252447932415, +SNULL, 140252447891456, 140252447916031, +STORE, 140252447916032, 140252447932415, +STORE, 140252447891456, 140252447916031, +ERASE, 140252447891456, 140252447916031, +STORE, 140252447891456, 140252447916031, +ERASE, 140252447916032, 140252447932415, +STORE, 140252447916032, 140252447932415, +STORE, 140252452241408, 140252452257791, +SNULL, 140252447907839, 140252447916031, +STORE, 140252447891456, 140252447907839, +STORE, 140252447907840, 140252447916031, +SNULL, 140252450041855, 140252450045951, +STORE, 140252450037760, 140252450041855, +STORE, 140252450041856, 140252450045951, +SNULL, 94417970159615, 94417970167807, +STORE, 94417970155520, 94417970159615, +STORE, 94417970159616, 94417970167807, +SNULL, 140252452290559, 140252452294655, +STORE, 140252452286464, 140252452290559, +STORE, 140252452290560, 140252452294655, +ERASE, 140252452257792, 140252452286463, +STORE, 94417996333056, 94417996468223, +STORE, 140252450557952, 140252452241407, +STORE, 94417996333056, 94417996603391, +STORE, 94417996333056, 94417996738559, +STORE, 94417996333056, 94417996910591, +SNULL, 94417996881919, 94417996910591, +STORE, 94417996333056, 94417996881919, +STORE, 94417996881920, 94417996910591, +ERASE, 94417996881920, 94417996910591, +STORE, 94417996333056, 94417997017087, +STORE, 94417996333056, 94417997152255, +SNULL, 94417997135871, 94417997152255, +STORE, 94417996333056, 94417997135871, +STORE, 94417997135872, 94417997152255, +ERASE, 94417997135872, 94417997152255, +STORE, 94417996333056, 94417997291519, +SNULL, 94417997271039, 94417997291519, +STORE, 94417996333056, 94417997271039, +STORE, 94417997271040, 94417997291519, +ERASE, 94417997271040, 94417997291519, +STORE, 94417996333056, 94417997406207, +SNULL, 94417997381631, 94417997406207, +STORE, 94417996333056, 94417997381631, +STORE, 94417997381632, 94417997406207, +ERASE, 94417997381632, 94417997406207, 
+STORE, 94417996333056, 94417997516799, +SNULL, 94417997488127, 94417997516799, +STORE, 94417996333056, 94417997488127, +STORE, 94417997488128, 94417997516799, +ERASE, 94417997488128, 94417997516799, +STORE, 94417996333056, 94417997643775, +SNULL, 94417997631487, 94417997643775, +STORE, 94417996333056, 94417997631487, +STORE, 94417997631488, 94417997643775, +ERASE, 94417997631488, 94417997643775, +SNULL, 94417997590527, 94417997631487, +STORE, 94417996333056, 94417997590527, +STORE, 94417997590528, 94417997631487, +ERASE, 94417997590528, 94417997631487, +STORE, 94417996333056, 94417997733887, +STORE, 94417996333056, 94417997869055, +STORE, 94417996333056, 94417998004223, +SNULL, 94417998000127, 94417998004223, +STORE, 94417996333056, 94417998000127, +STORE, 94417998000128, 94417998004223, +ERASE, 94417998000128, 94417998004223, +STORE, 94049170993152, 94049171206143, +STORE, 94049173303296, 94049173307391, +STORE, 94049173307392, 94049173315583, +STORE, 94049173315584, 94049173327871, +STORE, 94049176236032, 94049183645695, +STORE, 139807795544064, 139807797202943, +STORE, 139807797202944, 139807799300095, +STORE, 139807799300096, 139807799316479, +STORE, 139807799316480, 139807799324671, +STORE, 139807799324672, 139807799341055, +STORE, 139807799341056, 139807799353343, +STORE, 139807799353344, 139807801446399, +STORE, 139807801446400, 139807801450495, +STORE, 139807801450496, 139807801454591, +STORE, 139807801454592, 139807801597951, +STORE, 139807801966592, 139807803650047, +STORE, 139807803650048, 139807803666431, +STORE, 139807803695104, 139807803699199, +STORE, 139807803699200, 139807803703295, +STORE, 139807803703296, 139807803707391, +STORE, 140727555538944, 140727555678207, +STORE, 140727555940352, 140727555952639, +STORE, 140727555952640, 140727555956735, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722483441664, 140737488351231, +SNULL, 140722483449855, 140737488351231, +STORE, 140722483441664, 
140722483449855, +STORE, 140722483310592, 140722483449855, +STORE, 94416704921600, 94416707145727, +SNULL, 94416705032191, 94416707145727, +STORE, 94416704921600, 94416705032191, +STORE, 94416705032192, 94416707145727, +ERASE, 94416705032192, 94416707145727, +STORE, 94416707125248, 94416707137535, +STORE, 94416707137536, 94416707145727, +STORE, 140555439296512, 140555441549311, +SNULL, 140555439439871, 140555441549311, +STORE, 140555439296512, 140555439439871, +STORE, 140555439439872, 140555441549311, +ERASE, 140555439439872, 140555441549311, +STORE, 140555441537024, 140555441545215, +STORE, 140555441545216, 140555441549311, +STORE, 140722484781056, 140722484785151, +STORE, 140722484768768, 140722484781055, +STORE, 140555441508352, 140555441537023, +STORE, 140555441500160, 140555441508351, +STORE, 140555435499520, 140555439296511, +SNULL, 140555435499520, 140555437158399, +STORE, 140555437158400, 140555439296511, +STORE, 140555435499520, 140555437158399, +SNULL, 140555439255551, 140555439296511, +STORE, 140555437158400, 140555439255551, +STORE, 140555439255552, 140555439296511, +SNULL, 140555439255552, 140555439280127, +STORE, 140555439280128, 140555439296511, +STORE, 140555439255552, 140555439280127, +ERASE, 140555439255552, 140555439280127, +STORE, 140555439255552, 140555439280127, +ERASE, 140555439280128, 140555439296511, +STORE, 140555439280128, 140555439296511, +SNULL, 140555439271935, 140555439280127, +STORE, 140555439255552, 140555439271935, +STORE, 140555439271936, 140555439280127, +SNULL, 94416707133439, 94416707137535, +STORE, 94416707125248, 94416707133439, +STORE, 94416707133440, 94416707137535, +SNULL, 140555441541119, 140555441545215, +STORE, 140555441537024, 140555441541119, +STORE, 140555441541120, 140555441545215, +ERASE, 140555441508352, 140555441537023, +STORE, 94416724672512, 94416724807679, +STORE, 94686636953600, 94686637166591, +STORE, 94686639263744, 94686639267839, +STORE, 94686639267840, 94686639276031, +STORE, 94686639276032, 
94686639288319, +STORE, 94686662193152, 94686663163903, +STORE, 140312944431104, 140312946089983, +STORE, 140312946089984, 140312948187135, +STORE, 140312948187136, 140312948203519, +STORE, 140312948203520, 140312948211711, +STORE, 140312948211712, 140312948228095, +STORE, 140312948228096, 140312948240383, +STORE, 140312948240384, 140312950333439, +STORE, 140312950333440, 140312950337535, +STORE, 140312950337536, 140312950341631, +STORE, 140312950341632, 140312950484991, +STORE, 140312950853632, 140312952537087, +STORE, 140312952537088, 140312952553471, +STORE, 140312952582144, 140312952586239, +STORE, 140312952586240, 140312952590335, +STORE, 140312952590336, 140312952594431, +STORE, 140730598920192, 140730599059455, +STORE, 140730599108608, 140730599120895, +STORE, 140730599120896, 140730599124991, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140726234079232, 140737488351231, +SNULL, 140726234087423, 140737488351231, +STORE, 140726234079232, 140726234087423, +STORE, 140726233948160, 140726234087423, +STORE, 94589467578368, 94589469802495, +SNULL, 94589467688959, 94589469802495, +STORE, 94589467578368, 94589467688959, +STORE, 94589467688960, 94589469802495, +ERASE, 94589467688960, 94589469802495, +STORE, 94589469782016, 94589469794303, +STORE, 94589469794304, 94589469802495, +STORE, 140587082842112, 140587085094911, +SNULL, 140587082985471, 140587085094911, +STORE, 140587082842112, 140587082985471, +STORE, 140587082985472, 140587085094911, +ERASE, 140587082985472, 140587085094911, +STORE, 140587085082624, 140587085090815, +STORE, 140587085090816, 140587085094911, +STORE, 140726234103808, 140726234107903, +STORE, 140726234091520, 140726234103807, +STORE, 140587085053952, 140587085082623, +STORE, 140587085045760, 140587085053951, +STORE, 140587079045120, 140587082842111, +SNULL, 140587079045120, 140587080703999, +STORE, 140587080704000, 140587082842111, +STORE, 140587079045120, 140587080703999, +SNULL, 140587082801151, 
140587082842111, +STORE, 140587080704000, 140587082801151, +STORE, 140587082801152, 140587082842111, +SNULL, 140587082801152, 140587082825727, +STORE, 140587082825728, 140587082842111, +STORE, 140587082801152, 140587082825727, +ERASE, 140587082801152, 140587082825727, +STORE, 140587082801152, 140587082825727, +ERASE, 140587082825728, 140587082842111, +STORE, 140587082825728, 140587082842111, +SNULL, 140587082817535, 140587082825727, +STORE, 140587082801152, 140587082817535, +STORE, 140587082817536, 140587082825727, +SNULL, 94589469790207, 94589469794303, +STORE, 94589469782016, 94589469790207, +STORE, 94589469790208, 94589469794303, +SNULL, 140587085086719, 140587085090815, +STORE, 140587085082624, 140587085086719, +STORE, 140587085086720, 140587085090815, +ERASE, 140587085053952, 140587085082623, +STORE, 94589477507072, 94589477642239, +STORE, 94225448325120, 94225448538111, +STORE, 94225450635264, 94225450639359, +STORE, 94225450639360, 94225450647551, +STORE, 94225450647552, 94225450659839, +STORE, 94225470246912, 94225473548287, +STORE, 140199245496320, 140199247155199, +STORE, 140199247155200, 140199249252351, +STORE, 140199249252352, 140199249268735, +STORE, 140199249268736, 140199249276927, +STORE, 140199249276928, 140199249293311, +STORE, 140199249293312, 140199249305599, +STORE, 140199249305600, 140199251398655, +STORE, 140199251398656, 140199251402751, +STORE, 140199251402752, 140199251406847, +STORE, 140199251406848, 140199251550207, +STORE, 140199251918848, 140199253602303, +STORE, 140199253602304, 140199253618687, +STORE, 140199253647360, 140199253651455, +STORE, 140199253651456, 140199253655551, +STORE, 140199253655552, 140199253659647, +STORE, 140726264414208, 140726264553471, +STORE, 140726265843712, 140726265855999, +STORE, 140726265856000, 140726265860095, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140733508358144, 140737488351231, +SNULL, 140733508366335, 140737488351231, +STORE, 140733508358144, 
140733508366335, +STORE, 140733508227072, 140733508366335, +STORE, 94766263947264, 94766266171391, +SNULL, 94766264057855, 94766266171391, +STORE, 94766263947264, 94766264057855, +STORE, 94766264057856, 94766266171391, +ERASE, 94766264057856, 94766266171391, +STORE, 94766266150912, 94766266163199, +STORE, 94766266163200, 94766266171391, +STORE, 140693985132544, 140693987385343, +SNULL, 140693985275903, 140693987385343, +STORE, 140693985132544, 140693985275903, +STORE, 140693985275904, 140693987385343, +ERASE, 140693985275904, 140693987385343, +STORE, 140693987373056, 140693987381247, +STORE, 140693987381248, 140693987385343, +STORE, 140733509939200, 140733509943295, +STORE, 140733509926912, 140733509939199, +STORE, 140693987344384, 140693987373055, +STORE, 140693987336192, 140693987344383, +STORE, 140693981335552, 140693985132543, +SNULL, 140693981335552, 140693982994431, +STORE, 140693982994432, 140693985132543, +STORE, 140693981335552, 140693982994431, +SNULL, 140693985091583, 140693985132543, +STORE, 140693982994432, 140693985091583, +STORE, 140693985091584, 140693985132543, +SNULL, 140693985091584, 140693985116159, +STORE, 140693985116160, 140693985132543, +STORE, 140693985091584, 140693985116159, +ERASE, 140693985091584, 140693985116159, +STORE, 140693985091584, 140693985116159, +ERASE, 140693985116160, 140693985132543, +STORE, 140693985116160, 140693985132543, +SNULL, 140693985107967, 140693985116159, +STORE, 140693985091584, 140693985107967, +STORE, 140693985107968, 140693985116159, +SNULL, 94766266159103, 94766266163199, +STORE, 94766266150912, 94766266159103, +STORE, 94766266159104, 94766266163199, +SNULL, 140693987377151, 140693987381247, +STORE, 140693987373056, 140693987377151, +STORE, 140693987377152, 140693987381247, +ERASE, 140693987344384, 140693987373055, +STORE, 94766282035200, 94766282170367, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140724769353728, 140737488351231, +SNULL, 140724769361919, 
140737488351231, +STORE, 140724769353728, 140724769361919, +STORE, 140724769222656, 140724769361919, +STORE, 94710460526592, 94710462750719, +SNULL, 94710460637183, 94710462750719, +STORE, 94710460526592, 94710460637183, +STORE, 94710460637184, 94710462750719, +ERASE, 94710460637184, 94710462750719, +STORE, 94710462730240, 94710462742527, +STORE, 94710462742528, 94710462750719, +STORE, 140469764395008, 140469766647807, +SNULL, 140469764538367, 140469766647807, +STORE, 140469764395008, 140469764538367, +STORE, 140469764538368, 140469766647807, +ERASE, 140469764538368, 140469766647807, +STORE, 140469766635520, 140469766643711, +STORE, 140469766643712, 140469766647807, +STORE, 140724770877440, 140724770881535, +STORE, 140724770865152, 140724770877439, +STORE, 140469766606848, 140469766635519, +STORE, 140469766598656, 140469766606847, +STORE, 140469760598016, 140469764395007, +SNULL, 140469760598016, 140469762256895, +STORE, 140469762256896, 140469764395007, +STORE, 140469760598016, 140469762256895, +SNULL, 140469764354047, 140469764395007, +STORE, 140469762256896, 140469764354047, +STORE, 140469764354048, 140469764395007, +SNULL, 140469764354048, 140469764378623, +STORE, 140469764378624, 140469764395007, +STORE, 140469764354048, 140469764378623, +ERASE, 140469764354048, 140469764378623, +STORE, 140469764354048, 140469764378623, +ERASE, 140469764378624, 140469764395007, +STORE, 140469764378624, 140469764395007, +SNULL, 140469764370431, 140469764378623, +STORE, 140469764354048, 140469764370431, +STORE, 140469764370432, 140469764378623, +SNULL, 94710462738431, 94710462742527, +STORE, 94710462730240, 94710462738431, +STORE, 94710462738432, 94710462742527, +SNULL, 140469766639615, 140469766643711, +STORE, 140469766635520, 140469766639615, +STORE, 140469766639616, 140469766643711, +ERASE, 140469766606848, 140469766635519, +STORE, 94710485581824, 94710485716991, +STORE, 94105755795456, 94105756008447, +STORE, 94105758105600, 94105758109695, +STORE, 94105758109696, 
94105758117887, +STORE, 94105758117888, 94105758130175, +STORE, 94105788981248, 94105794871295, +STORE, 140641190031360, 140641191690239, +STORE, 140641191690240, 140641193787391, +STORE, 140641193787392, 140641193803775, +STORE, 140641193803776, 140641193811967, +STORE, 140641193811968, 140641193828351, +STORE, 140641193828352, 140641193840639, +STORE, 140641193840640, 140641195933695, +STORE, 140641195933696, 140641195937791, +STORE, 140641195937792, 140641195941887, +STORE, 140641195941888, 140641196085247, +STORE, 140641196453888, 140641198137343, +STORE, 140641198137344, 140641198153727, +STORE, 140641198182400, 140641198186495, +STORE, 140641198186496, 140641198190591, +STORE, 140641198190592, 140641198194687, +STORE, 140731980034048, 140731980173311, +STORE, 140731981078528, 140731981090815, +STORE, 140731981090816, 140731981094911, +STORE, 93828086431744, 93828086644735, +STORE, 93828088741888, 93828088745983, +STORE, 93828088745984, 93828088754175, +STORE, 93828088754176, 93828088766463, +STORE, 93828094193664, 93828096831487, +STORE, 139844717334528, 139844718993407, +STORE, 139844718993408, 139844721090559, +STORE, 139844721090560, 139844721106943, +STORE, 139844721106944, 139844721115135, +STORE, 139844721115136, 139844721131519, +STORE, 139844721131520, 139844721143807, +STORE, 139844721143808, 139844723236863, +STORE, 139844723236864, 139844723240959, +STORE, 139844723240960, 139844723245055, +STORE, 139844723245056, 139844723388415, +STORE, 139844723757056, 139844725440511, +STORE, 139844725440512, 139844725456895, +STORE, 139844725485568, 139844725489663, +STORE, 139844725489664, 139844725493759, +STORE, 139844725493760, 139844725497855, +STORE, 140729996185600, 140729996324863, +STORE, 140729996828672, 140729996840959, +STORE, 140729996840960, 140729996845055, +STORE, 140737488347136, 140737488351231, +STORE, 140722494771200, 140737488351231, +SNULL, 140722494775295, 140737488351231, +STORE, 140722494771200, 140722494775295, +STORE, 
140722494640128, 140722494775295, +STORE, 94324011311104, 94324013535231, +SNULL, 94324011421695, 94324013535231, +STORE, 94324011311104, 94324011421695, +STORE, 94324011421696, 94324013535231, +ERASE, 94324011421696, 94324013535231, +STORE, 94324013514752, 94324013527039, +STORE, 94324013527040, 94324013535231, +STORE, 140151462309888, 140151464562687, +SNULL, 140151462453247, 140151464562687, +STORE, 140151462309888, 140151462453247, +STORE, 140151462453248, 140151464562687, +ERASE, 140151462453248, 140151464562687, +STORE, 140151464550400, 140151464558591, +STORE, 140151464558592, 140151464562687, +STORE, 140722495467520, 140722495471615, +STORE, 140722495455232, 140722495467519, +STORE, 140151464521728, 140151464550399, +STORE, 140151464513536, 140151464521727, +STORE, 140151458512896, 140151462309887, +SNULL, 140151458512896, 140151460171775, +STORE, 140151460171776, 140151462309887, +STORE, 140151458512896, 140151460171775, +SNULL, 140151462268927, 140151462309887, +STORE, 140151460171776, 140151462268927, +STORE, 140151462268928, 140151462309887, +SNULL, 140151462268928, 140151462293503, +STORE, 140151462293504, 140151462309887, +STORE, 140151462268928, 140151462293503, +ERASE, 140151462268928, 140151462293503, +STORE, 140151462268928, 140151462293503, +ERASE, 140151462293504, 140151462309887, +STORE, 140151462293504, 140151462309887, +SNULL, 140151462285311, 140151462293503, +STORE, 140151462268928, 140151462285311, +STORE, 140151462285312, 140151462293503, +SNULL, 94324013522943, 94324013527039, +STORE, 94324013514752, 94324013522943, +STORE, 94324013522944, 94324013527039, +SNULL, 140151464554495, 140151464558591, +STORE, 140151464550400, 140151464554495, +STORE, 140151464554496, 140151464558591, +ERASE, 140151464521728, 140151464550399, +STORE, 94324024778752, 94324024913919, +STORE, 94899262967808, 94899263180799, +STORE, 94899265277952, 94899265282047, +STORE, 94899265282048, 94899265290239, +STORE, 94899265290240, 94899265302527, +STORE, 
94899295469568, 94899298689023, +STORE, 140434388418560, 140434390077439, +STORE, 140434390077440, 140434392174591, +STORE, 140434392174592, 140434392190975, +STORE, 140434392190976, 140434392199167, +STORE, 140434392199168, 140434392215551, +STORE, 140434392215552, 140434392227839, +STORE, 140434392227840, 140434394320895, +STORE, 140434394320896, 140434394324991, +STORE, 140434394324992, 140434394329087, +STORE, 140434394329088, 140434394472447, +STORE, 140434394841088, 140434396524543, +STORE, 140434396524544, 140434396540927, +STORE, 140434396569600, 140434396573695, +STORE, 140434396573696, 140434396577791, +STORE, 140434396577792, 140434396581887, +STORE, 140720618135552, 140720618274815, +STORE, 140720618418176, 140720618430463, +STORE, 140720618430464, 140720618434559, +STORE, 94425529798656, 94425530011647, +STORE, 94425532108800, 94425532112895, +STORE, 94425532112896, 94425532121087, +STORE, 94425532121088, 94425532133375, +STORE, 94425557753856, 94425566576639, +STORE, 140600528470016, 140600530128895, +STORE, 140600530128896, 140600532226047, +STORE, 140600532226048, 140600532242431, +STORE, 140600532242432, 140600532250623, +STORE, 140600532250624, 140600532267007, +STORE, 140600532267008, 140600532279295, +STORE, 140600532279296, 140600534372351, +STORE, 140600534372352, 140600534376447, +STORE, 140600534376448, 140600534380543, +STORE, 140600534380544, 140600534523903, +STORE, 140600534892544, 140600536575999, +STORE, 140600536576000, 140600536592383, +STORE, 140600536621056, 140600536625151, +STORE, 140600536625152, 140600536629247, +STORE, 140600536629248, 140600536633343, +STORE, 140721857785856, 140721857925119, +STORE, 140721858068480, 140721858080767, +STORE, 140721858080768, 140721858084863, +STORE, 94425529798656, 94425530011647, +STORE, 94425532108800, 94425532112895, +STORE, 94425532112896, 94425532121087, +STORE, 94425532121088, 94425532133375, +STORE, 94425557753856, 94425568772095, +STORE, 140600528470016, 140600530128895, +STORE, 
140600530128896, 140600532226047, +STORE, 140600532226048, 140600532242431, +STORE, 140600532242432, 140600532250623, +STORE, 140600532250624, 140600532267007, +STORE, 140600532267008, 140600532279295, +STORE, 140600532279296, 140600534372351, +STORE, 140600534372352, 140600534376447, +STORE, 140600534376448, 140600534380543, +STORE, 140600534380544, 140600534523903, +STORE, 140600534892544, 140600536575999, +STORE, 140600536576000, 140600536592383, +STORE, 140600536621056, 140600536625151, +STORE, 140600536625152, 140600536629247, +STORE, 140600536629248, 140600536633343, +STORE, 140721857785856, 140721857925119, +STORE, 140721858068480, 140721858080767, +STORE, 140721858080768, 140721858084863, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140735611645952, 140737488351231, +SNULL, 140735611654143, 140737488351231, +STORE, 140735611645952, 140735611654143, +STORE, 140735611514880, 140735611654143, +STORE, 94592137641984, 94592139866111, +SNULL, 94592137752575, 94592139866111, +STORE, 94592137641984, 94592137752575, +STORE, 94592137752576, 94592139866111, +ERASE, 94592137752576, 94592139866111, +STORE, 94592139845632, 94592139857919, +STORE, 94592139857920, 94592139866111, +STORE, 140350425030656, 140350427283455, +SNULL, 140350425174015, 140350427283455, +STORE, 140350425030656, 140350425174015, +STORE, 140350425174016, 140350427283455, +ERASE, 140350425174016, 140350427283455, +STORE, 140350427271168, 140350427279359, +STORE, 140350427279360, 140350427283455, +STORE, 140735612043264, 140735612047359, +STORE, 140735612030976, 140735612043263, +STORE, 140350427242496, 140350427271167, +STORE, 140350427234304, 140350427242495, +STORE, 140350421233664, 140350425030655, +SNULL, 140350421233664, 140350422892543, +STORE, 140350422892544, 140350425030655, +STORE, 140350421233664, 140350422892543, +SNULL, 140350424989695, 140350425030655, +STORE, 140350422892544, 140350424989695, +STORE, 140350424989696, 140350425030655, 
+SNULL, 140350424989696, 140350425014271, +STORE, 140350425014272, 140350425030655, +STORE, 140350424989696, 140350425014271, +ERASE, 140350424989696, 140350425014271, +STORE, 140350424989696, 140350425014271, +ERASE, 140350425014272, 140350425030655, +STORE, 140350425014272, 140350425030655, +SNULL, 140350425006079, 140350425014271, +STORE, 140350424989696, 140350425006079, +STORE, 140350425006080, 140350425014271, +SNULL, 94592139853823, 94592139857919, +STORE, 94592139845632, 94592139853823, +STORE, 94592139853824, 94592139857919, +SNULL, 140350427275263, 140350427279359, +STORE, 140350427271168, 140350427275263, +STORE, 140350427275264, 140350427279359, +ERASE, 140350427242496, 140350427271167, +STORE, 94592164823040, 94592164958207, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140723500535808, 140737488351231, +SNULL, 140723500543999, 140737488351231, +STORE, 140723500535808, 140723500543999, +STORE, 140723500404736, 140723500543999, +STORE, 94458379010048, 94458381234175, +SNULL, 94458379120639, 94458381234175, +STORE, 94458379010048, 94458379120639, +STORE, 94458379120640, 94458381234175, +ERASE, 94458379120640, 94458381234175, +STORE, 94458381213696, 94458381225983, +STORE, 94458381225984, 94458381234175, +STORE, 139771674230784, 139771676483583, +SNULL, 139771674374143, 139771676483583, +STORE, 139771674230784, 139771674374143, +STORE, 139771674374144, 139771676483583, +ERASE, 139771674374144, 139771676483583, +STORE, 139771676471296, 139771676479487, +STORE, 139771676479488, 139771676483583, +STORE, 140723500769280, 140723500773375, +STORE, 140723500756992, 140723500769279, +STORE, 139771676442624, 139771676471295, +STORE, 139771676434432, 139771676442623, +STORE, 139771670433792, 139771674230783, +SNULL, 139771670433792, 139771672092671, +STORE, 139771672092672, 139771674230783, +STORE, 139771670433792, 139771672092671, +SNULL, 139771674189823, 139771674230783, +STORE, 139771672092672, 139771674189823, 
+STORE, 139771674189824, 139771674230783, +SNULL, 139771674189824, 139771674214399, +STORE, 139771674214400, 139771674230783, +STORE, 139771674189824, 139771674214399, +ERASE, 139771674189824, 139771674214399, +STORE, 139771674189824, 139771674214399, +ERASE, 139771674214400, 139771674230783, +STORE, 139771674214400, 139771674230783, +SNULL, 139771674206207, 139771674214399, +STORE, 139771674189824, 139771674206207, +STORE, 139771674206208, 139771674214399, +SNULL, 94458381221887, 94458381225983, +STORE, 94458381213696, 94458381221887, +STORE, 94458381221888, 94458381225983, +SNULL, 139771676475391, 139771676479487, +STORE, 139771676471296, 139771676475391, +STORE, 139771676475392, 139771676479487, +ERASE, 139771676442624, 139771676471295, +STORE, 94458401873920, 94458402009087, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140731316264960, 140737488351231, +SNULL, 140731316273151, 140737488351231, +STORE, 140731316264960, 140731316273151, +STORE, 140731316133888, 140731316273151, +STORE, 94437830881280, 94437833215999, +SNULL, 94437831094271, 94437833215999, +STORE, 94437830881280, 94437831094271, +STORE, 94437831094272, 94437833215999, +ERASE, 94437831094272, 94437833215999, +STORE, 94437833191424, 94437833203711, +STORE, 94437833203712, 94437833215999, +STORE, 140265986031616, 140265988284415, +SNULL, 140265986174975, 140265988284415, +STORE, 140265986031616, 140265986174975, +STORE, 140265986174976, 140265988284415, +ERASE, 140265986174976, 140265988284415, +STORE, 140265988272128, 140265988280319, +STORE, 140265988280320, 140265988284415, +STORE, 140731316318208, 140731316322303, +STORE, 140731316305920, 140731316318207, +STORE, 140265988243456, 140265988272127, +STORE, 140265988235264, 140265988243455, +STORE, 140265983918080, 140265986031615, +SNULL, 140265983918080, 140265983930367, +STORE, 140265983930368, 140265986031615, +STORE, 140265983918080, 140265983930367, +SNULL, 140265986023423, 140265986031615, 
+STORE, 140265983930368, 140265986023423, +STORE, 140265986023424, 140265986031615, +ERASE, 140265986023424, 140265986031615, +STORE, 140265986023424, 140265986031615, +STORE, 140265980121088, 140265983918079, +SNULL, 140265980121088, 140265981779967, +STORE, 140265981779968, 140265983918079, +STORE, 140265980121088, 140265981779967, +SNULL, 140265983877119, 140265983918079, +STORE, 140265981779968, 140265983877119, +STORE, 140265983877120, 140265983918079, +SNULL, 140265983877120, 140265983901695, +STORE, 140265983901696, 140265983918079, +STORE, 140265983877120, 140265983901695, +ERASE, 140265983877120, 140265983901695, +STORE, 140265983877120, 140265983901695, +ERASE, 140265983901696, 140265983918079, +STORE, 140265983901696, 140265983918079, +STORE, 140265988227072, 140265988243455, +SNULL, 140265983893503, 140265983901695, +STORE, 140265983877120, 140265983893503, +STORE, 140265983893504, 140265983901695, +SNULL, 140265986027519, 140265986031615, +STORE, 140265986023424, 140265986027519, +STORE, 140265986027520, 140265986031615, +SNULL, 94437833195519, 94437833203711, +STORE, 94437833191424, 94437833195519, +STORE, 94437833195520, 94437833203711, +SNULL, 140265988276223, 140265988280319, +STORE, 140265988272128, 140265988276223, +STORE, 140265988276224, 140265988280319, +ERASE, 140265988243456, 140265988272127, +STORE, 94437847638016, 94437847773183, +STORE, 140265986543616, 140265988227071, +STORE, 94437847638016, 94437847908351, +STORE, 94437847638016, 94437848043519, +STORE, 94437847638016, 94437848190975, +SNULL, 94437848178687, 94437848190975, +STORE, 94437847638016, 94437848178687, +STORE, 94437848178688, 94437848190975, +ERASE, 94437848178688, 94437848190975, +STORE, 94437847638016, 94437848330239, +STORE, 94437847638016, 94437848465407, +SNULL, 94437848444927, 94437848465407, +STORE, 94437847638016, 94437848444927, +STORE, 94437848444928, 94437848465407, +ERASE, 94437848444928, 94437848465407, +STORE, 94437847638016, 94437848584191, +STORE, 
94437847638016, 94437848719359, +SNULL, 94437848678399, 94437848719359, +STORE, 94437847638016, 94437848678399, +STORE, 94437848678400, 94437848719359, +ERASE, 94437848678400, 94437848719359, +STORE, 94437847638016, 94437848842239, +SNULL, 94437848825855, 94437848842239, +STORE, 94437847638016, 94437848825855, +STORE, 94437848825856, 94437848842239, +ERASE, 94437848825856, 94437848842239, +STORE, 94437847638016, 94437848961023, +STORE, 94437847638016, 94437849096191, +STORE, 94661814710272, 94661814923263, +STORE, 94661817020416, 94661817024511, +STORE, 94661817024512, 94661817032703, +STORE, 94661817032704, 94661817044991, +STORE, 94661840424960, 94661841240063, +STORE, 140582259814400, 140582261473279, +STORE, 140582261473280, 140582263570431, +STORE, 140582263570432, 140582263586815, +STORE, 140582263586816, 140582263595007, +STORE, 140582263595008, 140582263611391, +STORE, 140582263611392, 140582263623679, +STORE, 140582263623680, 140582265716735, +STORE, 140582265716736, 140582265720831, +STORE, 140582265720832, 140582265724927, +STORE, 140582265724928, 140582265868287, +STORE, 140582266236928, 140582267920383, +STORE, 140582267920384, 140582267936767, +STORE, 140582267965440, 140582267969535, +STORE, 140582267969536, 140582267973631, +STORE, 140582267973632, 140582267977727, +STORE, 140735472508928, 140735472648191, +STORE, 140735472672768, 140735472685055, +STORE, 140735472685056, 140735472689151, +STORE, 94440069140480, 94440069353471, +STORE, 94440071450624, 94440071454719, +STORE, 94440071454720, 94440071462911, +STORE, 94440071462912, 94440071475199, +STORE, 94440072122368, 94440079048703, +STORE, 140112218095616, 140112219754495, +STORE, 140112219754496, 140112221851647, +STORE, 140112221851648, 140112221868031, +STORE, 140112221868032, 140112221876223, +STORE, 140112221876224, 140112221892607, +STORE, 140112221892608, 140112221904895, +STORE, 140112221904896, 140112223997951, +STORE, 140112223997952, 140112224002047, +STORE, 140112224002048, 
140112224006143, +STORE, 140112224006144, 140112224149503, +STORE, 140112224518144, 140112226201599, +STORE, 140112226201600, 140112226217983, +STORE, 140112226246656, 140112226250751, +STORE, 140112226250752, 140112226254847, +STORE, 140112226254848, 140112226258943, +STORE, 140737460969472, 140737461108735, +STORE, 140737462083584, 140737462095871, +STORE, 140737462095872, 140737462099967, +STORE, 94257654345728, 94257654390783, +STORE, 94257656483840, 94257656487935, +STORE, 94257656487936, 94257656492031, +STORE, 94257656492032, 94257656496127, +STORE, 94257665859584, 94257665994751, +STORE, 140507070345216, 140507070386175, +STORE, 140507070386176, 140507072483327, +STORE, 140507072483328, 140507072487423, +STORE, 140507072487424, 140507072491519, +STORE, 140507072491520, 140507072516095, +STORE, 140507072516096, 140507072561151, +STORE, 140507072561152, 140507074654207, +STORE, 140507074654208, 140507074658303, +STORE, 140507074658304, 140507074662399, +STORE, 140507074662400, 140507074744319, +STORE, 140507074744320, 140507076841471, +STORE, 140507076841472, 140507076845567, +STORE, 140507076845568, 140507076849663, +STORE, 140507076849664, 140507076857855, +STORE, 140507076857856, 140507076886527, +STORE, 140507076886528, 140507078979583, +STORE, 140507078979584, 140507078983679, +STORE, 140507078983680, 140507078987775, +STORE, 140507078987776, 140507079086079, +STORE, 140507079086080, 140507081179135, +STORE, 140507081179136, 140507081183231, +STORE, 140507081183232, 140507081187327, +STORE, 140507081187328, 140507081203711, +STORE, 140507081203712, 140507081220095, +STORE, 140507081220096, 140507083317247, +STORE, 140507083317248, 140507083321343, +STORE, 140507083321344, 140507083325439, +STORE, 140507083325440, 140507083792383, +STORE, 140507083792384, 140507085885439, +STORE, 140507085885440, 140507085889535, +STORE, 140507085889536, 140507085893631, +STORE, 140507085893632, 140507085905919, +STORE, 140507085905920, 140507087998975, +STORE, 
140507087998976, 140507088003071, +STORE, 140507088003072, 140507088007167, +STORE, 140507088007168, 140507088125951, +STORE, 140507088125952, 140507090219007, +STORE, 140507090219008, 140507090223103, +STORE, 140507090223104, 140507090227199, +STORE, 140507090227200, 140507090268159, +STORE, 140507090268160, 140507091927039, +STORE, 140507091927040, 140507094024191, +STORE, 140507094024192, 140507094040575, +STORE, 140507094040576, 140507094048767, +STORE, 140507094048768, 140507094065151, +STORE, 140507094065152, 140507094216703, +STORE, 140507094216704, 140507096309759, +STORE, 140507096309760, 140507096313855, +STORE, 140507096313856, 140507096317951, +STORE, 140507096317952, 140507096326143, +STORE, 140507096326144, 140507096379391, +STORE, 140507096379392, 140507098472447, +STORE, 140507098472448, 140507098476543, +STORE, 140507098476544, 140507098480639, +STORE, 140507098480640, 140507098623999, +STORE, 140507098980352, 140507100663807, +STORE, 140507100663808, 140507100692479, +STORE, 140507100721152, 140507100725247, +STORE, 140507100725248, 140507100729343, +STORE, 140507100729344, 140507100733439, +STORE, 140728152780800, 140728152915967, +STORE, 140728153698304, 140728153710591, +STORE, 140728153710592, 140728153714687, +STORE, 140507068137472, 140507070345215, +SNULL, 140507068137472, 140507068190719, +STORE, 140507068190720, 140507070345215, +STORE, 140507068137472, 140507068190719, +SNULL, 140507070287871, 140507070345215, +STORE, 140507068190720, 140507070287871, +STORE, 140507070287872, 140507070345215, +SNULL, 140507070287872, 140507070296063, +STORE, 140507070296064, 140507070345215, +STORE, 140507070287872, 140507070296063, +ERASE, 140507070287872, 140507070296063, +STORE, 140507070287872, 140507070296063, +ERASE, 140507070296064, 140507070345215, +STORE, 140507070296064, 140507070345215, +STORE, 140507100692480, 140507100721151, +STORE, 140507065810944, 140507068137471, +SNULL, 140507065810944, 140507065843711, +STORE, 140507065843712, 
140507068137471, +STORE, 140507065810944, 140507065843711, +SNULL, 140507067940863, 140507068137471, +STORE, 140507065843712, 140507067940863, +STORE, 140507067940864, 140507068137471, +SNULL, 140507067940864, 140507067949055, +STORE, 140507067949056, 140507068137471, +STORE, 140507067940864, 140507067949055, +ERASE, 140507067940864, 140507067949055, +STORE, 140507067940864, 140507067949055, +ERASE, 140507067949056, 140507068137471, +STORE, 140507067949056, 140507068137471, +SNULL, 140507067944959, 140507067949055, +STORE, 140507067940864, 140507067944959, +STORE, 140507067944960, 140507067949055, +SNULL, 140507070291967, 140507070296063, +STORE, 140507070287872, 140507070291967, +STORE, 140507070291968, 140507070296063, +ERASE, 140507100692480, 140507100721151, +STORE, 140507063705600, 140507065810943, +SNULL, 140507063705600, 140507063709695, +STORE, 140507063709696, 140507065810943, +STORE, 140507063705600, 140507063709695, +SNULL, 140507065802751, 140507065810943, +STORE, 140507063709696, 140507065802751, +STORE, 140507065802752, 140507065810943, +ERASE, 140507065802752, 140507065810943, +STORE, 140507065802752, 140507065810943, +SNULL, 140507065806847, 140507065810943, +STORE, 140507065802752, 140507065806847, +STORE, 140507065806848, 140507065810943, +STORE, 140507061600256, 140507063705599, +SNULL, 140507061600256, 140507061604351, +STORE, 140507061604352, 140507063705599, +STORE, 140507061600256, 140507061604351, +SNULL, 140507063697407, 140507063705599, +STORE, 140507061604352, 140507063697407, +STORE, 140507063697408, 140507063705599, +ERASE, 140507063697408, 140507063705599, +STORE, 140507063697408, 140507063705599, +SNULL, 140507063701503, 140507063705599, +STORE, 140507063697408, 140507063701503, +STORE, 140507063701504, 140507063705599, +STORE, 140507059490816, 140507061600255, +SNULL, 140507059490816, 140507059499007, +STORE, 140507059499008, 140507061600255, +STORE, 140507059490816, 140507059499007, +SNULL, 140507061592063, 140507061600255, +STORE, 
140507059499008, 140507061592063, +STORE, 140507061592064, 140507061600255, +ERASE, 140507061592064, 140507061600255, +STORE, 140507061592064, 140507061600255, +SNULL, 140507061596159, 140507061600255, +STORE, 140507061592064, 140507061596159, +STORE, 140507061596160, 140507061600255, +STORE, 140507057377280, 140507059490815, +SNULL, 140507057377280, 140507057389567, +STORE, 140507057389568, 140507059490815, +STORE, 140507057377280, 140507057389567, +SNULL, 140507059482623, 140507059490815, +STORE, 140507057389568, 140507059482623, +STORE, 140507059482624, 140507059490815, +ERASE, 140507059482624, 140507059490815, +STORE, 140507059482624, 140507059490815, +SNULL, 140507059486719, 140507059490815, +STORE, 140507059482624, 140507059486719, +STORE, 140507059486720, 140507059490815, +STORE, 140507055255552, 140507057377279, +SNULL, 140507055255552, 140507055276031, +STORE, 140507055276032, 140507057377279, +STORE, 140507055255552, 140507055276031, +SNULL, 140507057369087, 140507057377279, +STORE, 140507055276032, 140507057369087, +STORE, 140507057369088, 140507057377279, +ERASE, 140507057369088, 140507057377279, +STORE, 140507057369088, 140507057377279, +SNULL, 140507057373183, 140507057377279, +STORE, 140507057369088, 140507057373183, +STORE, 140507057373184, 140507057377279, +STORE, 140507098693632, 140507098980351, +SNULL, 140507098959871, 140507098980351, +STORE, 140507098693632, 140507098959871, +STORE, 140507098959872, 140507098980351, +SNULL, 140507098959872, 140507098976255, +STORE, 140507098976256, 140507098980351, +STORE, 140507098959872, 140507098976255, +ERASE, 140507098959872, 140507098976255, +STORE, 140507098959872, 140507098976255, +ERASE, 140507098976256, 140507098980351, +STORE, 140507098976256, 140507098980351, +STORE, 140507100692480, 140507100721151, +STORE, 140507053125632, 140507055255551, +SNULL, 140507053125632, 140507053154303, +STORE, 140507053154304, 140507055255551, +STORE, 140507053125632, 140507053154303, +SNULL, 140507055247359, 
140507055255551, +STORE, 140507053154304, 140507055247359, +STORE, 140507055247360, 140507055255551, +ERASE, 140507055247360, 140507055255551, +STORE, 140507055247360, 140507055255551, +STORE, 140507051012096, 140507053125631, +SNULL, 140507051012096, 140507051024383, +STORE, 140507051024384, 140507053125631, +STORE, 140507051012096, 140507051024383, +SNULL, 140507053117439, 140507053125631, +STORE, 140507051024384, 140507053117439, +STORE, 140507053117440, 140507053125631, +ERASE, 140507053117440, 140507053125631, +STORE, 140507053117440, 140507053125631, +SNULL, 140507053121535, 140507053125631, +STORE, 140507053117440, 140507053121535, +STORE, 140507053121536, 140507053125631, +SNULL, 140507055251455, 140507055255551, +STORE, 140507055247360, 140507055251455, +STORE, 140507055251456, 140507055255551, +SNULL, 140507098972159, 140507098976255, +STORE, 140507098959872, 140507098972159, +STORE, 140507098972160, 140507098976255, +ERASE, 140507100692480, 140507100721151, +STORE, 140507100717056, 140507100721151, +ERASE, 140507100717056, 140507100721151, +STORE, 140507100717056, 140507100721151, +ERASE, 140507100717056, 140507100721151, +STORE, 140507100717056, 140507100721151, +ERASE, 140507100717056, 140507100721151, +STORE, 140507100717056, 140507100721151, +ERASE, 140507100717056, 140507100721151, +STORE, 140507100692480, 140507100721151, +ERASE, 140507068137472, 140507068190719, +ERASE, 140507068190720, 140507070287871, +ERASE, 140507070287872, 140507070291967, +ERASE, 140507070291968, 140507070296063, +ERASE, 140507070296064, 140507070345215, +ERASE, 140507065810944, 140507065843711, +ERASE, 140507065843712, 140507067940863, +ERASE, 140507067940864, 140507067944959, +ERASE, 140507067944960, 140507067949055, +ERASE, 140507067949056, 140507068137471, +ERASE, 140507063705600, 140507063709695, +ERASE, 140507063709696, 140507065802751, +ERASE, 140507065802752, 140507065806847, +ERASE, 140507065806848, 140507065810943, +ERASE, 140507061600256, 140507061604351, +ERASE, 
140507061604352, 140507063697407, +ERASE, 140507063697408, 140507063701503, +ERASE, 140507063701504, 140507063705599, +ERASE, 140507059490816, 140507059499007, +ERASE, 140507059499008, 140507061592063, +ERASE, 140507061592064, 140507061596159, +ERASE, 140507061596160, 140507061600255, +ERASE, 140507057377280, 140507057389567, +ERASE, 140507057389568, 140507059482623, +ERASE, 140507059482624, 140507059486719, +ERASE, 140507059486720, 140507059490815, +ERASE, 140507055255552, 140507055276031, +ERASE, 140507055276032, 140507057369087, +ERASE, 140507057369088, 140507057373183, +ERASE, 140507057373184, 140507057377279, +ERASE, 140507098693632, 140507098959871, +ERASE, 140507098959872, 140507098972159, +ERASE, 140507098972160, 140507098976255, +ERASE, 140507098976256, 140507098980351, +ERASE, 140507051012096, 140507051024383, +ERASE, 140507051024384, 140507053117439, +ERASE, 140507053117440, 140507053121535, +ERASE, 140507053121536, 140507053125631, +STORE, 94036448296960, 94036448509951, +STORE, 94036450607104, 94036450611199, +STORE, 94036450611200, 94036450619391, +STORE, 94036450619392, 94036450631679, +STORE, 94036482445312, 94036502376447, +STORE, 140469487013888, 140469488672767, +STORE, 140469488672768, 140469490769919, +STORE, 140469490769920, 140469490786303, +STORE, 140469490786304, 140469490794495, +STORE, 140469490794496, 140469490810879, +STORE, 140469490810880, 140469490823167, +STORE, 140469490823168, 140469492916223, +STORE, 140469492916224, 140469492920319, +STORE, 140469492920320, 140469492924415, +STORE, 140469492924416, 140469493067775, +STORE, 140469493436416, 140469495119871, +STORE, 140469495119872, 140469495136255, +STORE, 140469495164928, 140469495169023, +STORE, 140469495169024, 140469495173119, +STORE, 140469495173120, 140469495177215, +STORE, 140732281446400, 140732281585663, +STORE, 140732282736640, 140732282748927, +STORE, 140732282748928, 140732282753023, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, 
+STORE, 140723411931136, 140737488351231, +SNULL, 140723411939327, 140737488351231, +STORE, 140723411931136, 140723411939327, +STORE, 140723411800064, 140723411939327, +STORE, 93993768685568, 93993770909695, +SNULL, 93993768796159, 93993770909695, +STORE, 93993768685568, 93993768796159, +STORE, 93993768796160, 93993770909695, +ERASE, 93993768796160, 93993770909695, +STORE, 93993770889216, 93993770901503, +STORE, 93993770901504, 93993770909695, +STORE, 140508681740288, 140508683993087, +SNULL, 140508681883647, 140508683993087, +STORE, 140508681740288, 140508681883647, +STORE, 140508681883648, 140508683993087, +ERASE, 140508681883648, 140508683993087, +STORE, 140508683980800, 140508683988991, +STORE, 140508683988992, 140508683993087, +STORE, 140723412070400, 140723412074495, +STORE, 140723412058112, 140723412070399, +STORE, 140508683952128, 140508683980799, +STORE, 140508683943936, 140508683952127, +STORE, 140508677943296, 140508681740287, +SNULL, 140508677943296, 140508679602175, +STORE, 140508679602176, 140508681740287, +STORE, 140508677943296, 140508679602175, +SNULL, 140508681699327, 140508681740287, +STORE, 140508679602176, 140508681699327, +STORE, 140508681699328, 140508681740287, +SNULL, 140508681699328, 140508681723903, +STORE, 140508681723904, 140508681740287, +STORE, 140508681699328, 140508681723903, +ERASE, 140508681699328, 140508681723903, +STORE, 140508681699328, 140508681723903, +ERASE, 140508681723904, 140508681740287, +STORE, 140508681723904, 140508681740287, +SNULL, 140508681715711, 140508681723903, +STORE, 140508681699328, 140508681715711, +STORE, 140508681715712, 140508681723903, +SNULL, 93993770897407, 93993770901503, +STORE, 93993770889216, 93993770897407, +STORE, 93993770897408, 93993770901503, +SNULL, 140508683984895, 140508683988991, +STORE, 140508683980800, 140508683984895, +STORE, 140508683984896, 140508683988991, +ERASE, 140508683952128, 140508683980799, +STORE, 93993791582208, 93993791717375, +STORE, 140737488347136, 140737488351231, 
+STORE, 140737488343040, 140737488351231, +STORE, 140734685458432, 140737488351231, +SNULL, 140734685466623, 140737488351231, +STORE, 140734685458432, 140734685466623, +STORE, 140734685327360, 140734685466623, +STORE, 93832321548288, 93832323772415, +SNULL, 93832321658879, 93832323772415, +STORE, 93832321548288, 93832321658879, +STORE, 93832321658880, 93832323772415, +ERASE, 93832321658880, 93832323772415, +STORE, 93832323751936, 93832323764223, +STORE, 93832323764224, 93832323772415, +STORE, 140650945118208, 140650947371007, +SNULL, 140650945261567, 140650947371007, +STORE, 140650945118208, 140650945261567, +STORE, 140650945261568, 140650947371007, +ERASE, 140650945261568, 140650947371007, +STORE, 140650947358720, 140650947366911, +STORE, 140650947366912, 140650947371007, +STORE, 140734686081024, 140734686085119, +STORE, 140734686068736, 140734686081023, +STORE, 140650947330048, 140650947358719, +STORE, 140650947321856, 140650947330047, +STORE, 140650941321216, 140650945118207, +SNULL, 140650941321216, 140650942980095, +STORE, 140650942980096, 140650945118207, +STORE, 140650941321216, 140650942980095, +SNULL, 140650945077247, 140650945118207, +STORE, 140650942980096, 140650945077247, +STORE, 140650945077248, 140650945118207, +SNULL, 140650945077248, 140650945101823, +STORE, 140650945101824, 140650945118207, +STORE, 140650945077248, 140650945101823, +ERASE, 140650945077248, 140650945101823, +STORE, 140650945077248, 140650945101823, +ERASE, 140650945101824, 140650945118207, +STORE, 140650945101824, 140650945118207, +SNULL, 140650945093631, 140650945101823, +STORE, 140650945077248, 140650945093631, +STORE, 140650945093632, 140650945101823, +SNULL, 93832323760127, 93832323764223, +STORE, 93832323751936, 93832323760127, +STORE, 93832323760128, 93832323764223, +SNULL, 140650947362815, 140650947366911, +STORE, 140650947358720, 140650947362815, +STORE, 140650947362816, 140650947366911, +ERASE, 140650947330048, 140650947358719, +STORE, 93832331890688, 93832332025855, 
+STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140728333520896, 140737488351231, +SNULL, 140728333529087, 140737488351231, +STORE, 140728333520896, 140728333529087, +STORE, 140728333389824, 140728333529087, +STORE, 94872734732288, 94872736956415, +SNULL, 94872734842879, 94872736956415, +STORE, 94872734732288, 94872734842879, +STORE, 94872734842880, 94872736956415, +ERASE, 94872734842880, 94872736956415, +STORE, 94872736935936, 94872736948223, +STORE, 94872736948224, 94872736956415, +STORE, 139755193257984, 139755195510783, +SNULL, 139755193401343, 139755195510783, +STORE, 139755193257984, 139755193401343, +STORE, 139755193401344, 139755195510783, +ERASE, 139755193401344, 139755195510783, +STORE, 139755195498496, 139755195506687, +STORE, 139755195506688, 139755195510783, +STORE, 140728333926400, 140728333930495, +STORE, 140728333914112, 140728333926399, +STORE, 139755195469824, 139755195498495, +STORE, 139755195461632, 139755195469823, +STORE, 139755189460992, 139755193257983, +SNULL, 139755189460992, 139755191119871, +STORE, 139755191119872, 139755193257983, +STORE, 139755189460992, 139755191119871, +SNULL, 139755193217023, 139755193257983, +STORE, 139755191119872, 139755193217023, +STORE, 139755193217024, 139755193257983, +SNULL, 139755193217024, 139755193241599, +STORE, 139755193241600, 139755193257983, +STORE, 139755193217024, 139755193241599, +ERASE, 139755193217024, 139755193241599, +STORE, 139755193217024, 139755193241599, +ERASE, 139755193241600, 139755193257983, +STORE, 139755193241600, 139755193257983, +SNULL, 139755193233407, 139755193241599, +STORE, 139755193217024, 139755193233407, +STORE, 139755193233408, 139755193241599, +SNULL, 94872736944127, 94872736948223, +STORE, 94872736935936, 94872736944127, +STORE, 94872736944128, 94872736948223, +SNULL, 139755195502591, 139755195506687, +STORE, 139755195498496, 139755195502591, +STORE, 139755195502592, 139755195506687, +ERASE, 139755195469824, 139755195498495, 
+STORE, 94872749744128, 94872749879295, +STORE, 94720243642368, 94720243855359, +STORE, 94720245952512, 94720245956607, +STORE, 94720245956608, 94720245964799, +STORE, 94720245964800, 94720245977087, +STORE, 94720277745664, 94720278151167, +STORE, 140453174497280, 140453176156159, +STORE, 140453176156160, 140453178253311, +STORE, 140453178253312, 140453178269695, +STORE, 140453178269696, 140453178277887, +STORE, 140453178277888, 140453178294271, +STORE, 140453178294272, 140453178306559, +STORE, 140453178306560, 140453180399615, +STORE, 140453180399616, 140453180403711, +STORE, 140453180403712, 140453180407807, +STORE, 140453180407808, 140453180551167, +STORE, 140453180919808, 140453182603263, +STORE, 140453182603264, 140453182619647, +STORE, 140453182648320, 140453182652415, +STORE, 140453182652416, 140453182656511, +STORE, 140453182656512, 140453182660607, +STORE, 140733223923712, 140733224062975, +STORE, 140733224808448, 140733224820735, +STORE, 140733224820736, 140733224824831, +STORE, 94321091141632, 94321091354623, +STORE, 94321093451776, 94321093455871, +STORE, 94321093455872, 94321093464063, +STORE, 94321093464064, 94321093476351, +STORE, 94321115873280, 94321117229055, +STORE, 139695978840064, 139695980498943, +STORE, 139695980498944, 139695982596095, +STORE, 139695982596096, 139695982612479, +STORE, 139695982612480, 139695982620671, +STORE, 139695982620672, 139695982637055, +STORE, 139695982637056, 139695982649343, +STORE, 139695982649344, 139695984742399, +STORE, 139695984742400, 139695984746495, +STORE, 139695984746496, 139695984750591, +STORE, 139695984750592, 139695984893951, +STORE, 139695985262592, 139695986946047, +STORE, 139695986946048, 139695986962431, +STORE, 139695986991104, 139695986995199, +STORE, 139695986995200, 139695986999295, +STORE, 139695986999296, 139695987003391, +STORE, 140734650564608, 140734650703871, +STORE, 140734650785792, 140734650798079, +STORE, 140734650798080, 140734650802175, +STORE, 94523438456832, 94523438669823, +STORE, 
94523440766976, 94523440771071, +STORE, 94523440771072, 94523440779263, +STORE, 94523440779264, 94523440791551, +STORE, 94523464544256, 94523465842687, +STORE, 140453231493120, 140453233151999, +STORE, 140453233152000, 140453235249151, +STORE, 140453235249152, 140453235265535, +STORE, 140453235265536, 140453235273727, +STORE, 140453235273728, 140453235290111, +STORE, 140453235290112, 140453235302399, +STORE, 140453235302400, 140453237395455, +STORE, 140453237395456, 140453237399551, +STORE, 140453237399552, 140453237403647, +STORE, 140453237403648, 140453237547007, +STORE, 140453237915648, 140453239599103, +STORE, 140453239599104, 140453239615487, +STORE, 140453239644160, 140453239648255, +STORE, 140453239648256, 140453239652351, +STORE, 140453239652352, 140453239656447, +STORE, 140734679445504, 140734679584767, +STORE, 140734680018944, 140734680031231, +STORE, 140734680031232, 140734680035327, +STORE, 94614776987648, 94614777200639, +STORE, 94614779297792, 94614779301887, +STORE, 94614779301888, 94614779310079, +STORE, 94614779310080, 94614779322367, +STORE, 94614798467072, 94614800699391, +STORE, 139677037182976, 139677038841855, +STORE, 139677038841856, 139677040939007, +STORE, 139677040939008, 139677040955391, +STORE, 139677040955392, 139677040963583, +STORE, 139677040963584, 139677040979967, +STORE, 139677040979968, 139677040992255, +STORE, 139677040992256, 139677043085311, +STORE, 139677043085312, 139677043089407, +STORE, 139677043089408, 139677043093503, +STORE, 139677043093504, 139677043236863, +STORE, 139677043605504, 139677045288959, +STORE, 139677045288960, 139677045305343, +STORE, 139677045334016, 139677045338111, +STORE, 139677045338112, 139677045342207, +STORE, 139677045342208, 139677045346303, +STORE, 140721604411392, 140721604550655, +STORE, 140721606135808, 140721606148095, +STORE, 140721606148096, 140721606152191, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140729280544768, 140737488351231, +SNULL, 
140729280552959, 140737488351231, +STORE, 140729280544768, 140729280552959, +STORE, 140729280413696, 140729280552959, +STORE, 94863939334144, 94863941558271, +SNULL, 94863939444735, 94863941558271, +STORE, 94863939334144, 94863939444735, +STORE, 94863939444736, 94863941558271, +ERASE, 94863939444736, 94863941558271, +STORE, 94863941537792, 94863941550079, +STORE, 94863941550080, 94863941558271, +STORE, 139691047276544, 139691049529343, +SNULL, 139691047419903, 139691049529343, +STORE, 139691047276544, 139691047419903, +STORE, 139691047419904, 139691049529343, +ERASE, 139691047419904, 139691049529343, +STORE, 139691049517056, 139691049525247, +STORE, 139691049525248, 139691049529343, +STORE, 140729281679360, 140729281683455, +STORE, 140729281667072, 140729281679359, +STORE, 139691049488384, 139691049517055, +STORE, 139691049480192, 139691049488383, +STORE, 139691043479552, 139691047276543, +SNULL, 139691043479552, 139691045138431, +STORE, 139691045138432, 139691047276543, +STORE, 139691043479552, 139691045138431, +SNULL, 139691047235583, 139691047276543, +STORE, 139691045138432, 139691047235583, +STORE, 139691047235584, 139691047276543, +SNULL, 139691047235584, 139691047260159, +STORE, 139691047260160, 139691047276543, +STORE, 139691047235584, 139691047260159, +ERASE, 139691047235584, 139691047260159, +STORE, 139691047235584, 139691047260159, +ERASE, 139691047260160, 139691047276543, +STORE, 139691047260160, 139691047276543, +SNULL, 139691047251967, 139691047260159, +STORE, 139691047235584, 139691047251967, +STORE, 139691047251968, 139691047260159, +SNULL, 94863941545983, 94863941550079, +STORE, 94863941537792, 94863941545983, +STORE, 94863941545984, 94863941550079, +SNULL, 139691049521151, 139691049525247, +STORE, 139691049517056, 139691049521151, +STORE, 139691049521152, 139691049525247, +ERASE, 139691049488384, 139691049517055, +STORE, 94863951294464, 94863951429631, +STORE, 93998209294336, 93998209507327, +STORE, 93998211604480, 93998211608575, +STORE, 
93998211608576, 93998211616767, +STORE, 93998211616768, 93998211629055, +STORE, 93998227210240, 93998227615743, +STORE, 140243029913600, 140243031572479, +STORE, 140243031572480, 140243033669631, +STORE, 140243033669632, 140243033686015, +STORE, 140243033686016, 140243033694207, +STORE, 140243033694208, 140243033710591, +STORE, 140243033710592, 140243033722879, +STORE, 140243033722880, 140243035815935, +STORE, 140243035815936, 140243035820031, +STORE, 140243035820032, 140243035824127, +STORE, 140243035824128, 140243035967487, +STORE, 140243036336128, 140243038019583, +STORE, 140243038019584, 140243038035967, +STORE, 140243038064640, 140243038068735, +STORE, 140243038068736, 140243038072831, +STORE, 140243038072832, 140243038076927, +STORE, 140734976479232, 140734976618495, +STORE, 140734977978368, 140734977990655, +STORE, 140734977990656, 140734977994751, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722742775808, 140737488351231, +SNULL, 140722742783999, 140737488351231, +STORE, 140722742775808, 140722742783999, +STORE, 140722742644736, 140722742783999, +STORE, 93857673662464, 93857675997183, +SNULL, 93857673875455, 93857675997183, +STORE, 93857673662464, 93857673875455, +STORE, 93857673875456, 93857675997183, +ERASE, 93857673875456, 93857675997183, +STORE, 93857675972608, 93857675984895, +STORE, 93857675984896, 93857675997183, +STORE, 140629677498368, 140629679751167, +SNULL, 140629677641727, 140629679751167, +STORE, 140629677498368, 140629677641727, +STORE, 140629677641728, 140629679751167, +ERASE, 140629677641728, 140629679751167, +STORE, 140629679738880, 140629679747071, +STORE, 140629679747072, 140629679751167, +STORE, 140722743222272, 140722743226367, +STORE, 140722743209984, 140722743222271, +STORE, 140629679710208, 140629679738879, +STORE, 140629679702016, 140629679710207, +STORE, 140629675384832, 140629677498367, +SNULL, 140629675384832, 140629675397119, +STORE, 140629675397120, 140629677498367, +STORE, 
140629675384832, 140629675397119, +SNULL, 140629677490175, 140629677498367, +STORE, 140629675397120, 140629677490175, +STORE, 140629677490176, 140629677498367, +ERASE, 140629677490176, 140629677498367, +STORE, 140629677490176, 140629677498367, +STORE, 140629671587840, 140629675384831, +SNULL, 140629671587840, 140629673246719, +STORE, 140629673246720, 140629675384831, +STORE, 140629671587840, 140629673246719, +SNULL, 140629675343871, 140629675384831, +STORE, 140629673246720, 140629675343871, +STORE, 140629675343872, 140629675384831, +SNULL, 140629675343872, 140629675368447, +STORE, 140629675368448, 140629675384831, +STORE, 140629675343872, 140629675368447, +ERASE, 140629675343872, 140629675368447, +STORE, 140629675343872, 140629675368447, +ERASE, 140629675368448, 140629675384831, +STORE, 140629675368448, 140629675384831, +STORE, 140629679693824, 140629679710207, +SNULL, 140629675360255, 140629675368447, +STORE, 140629675343872, 140629675360255, +STORE, 140629675360256, 140629675368447, +SNULL, 140629677494271, 140629677498367, +STORE, 140629677490176, 140629677494271, +STORE, 140629677494272, 140629677498367, +SNULL, 93857675976703, 93857675984895, +STORE, 93857675972608, 93857675976703, +STORE, 93857675976704, 93857675984895, +SNULL, 140629679742975, 140629679747071, +STORE, 140629679738880, 140629679742975, +STORE, 140629679742976, 140629679747071, +ERASE, 140629679710208, 140629679738879, +STORE, 93857705832448, 93857705967615, +STORE, 140629678010368, 140629679693823, +STORE, 93857705832448, 93857706102783, +STORE, 93857705832448, 93857706237951, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140735922421760, 140737488351231, +SNULL, 140735922429951, 140737488351231, +STORE, 140735922421760, 140735922429951, +STORE, 140735922290688, 140735922429951, +STORE, 94651136139264, 94651138363391, +SNULL, 94651136249855, 94651138363391, +STORE, 94651136139264, 94651136249855, +STORE, 94651136249856, 94651138363391, +ERASE, 
94651136249856, 94651138363391, +STORE, 94651138342912, 94651138355199, +STORE, 94651138355200, 94651138363391, +STORE, 140325788266496, 140325790519295, +SNULL, 140325788409855, 140325790519295, +STORE, 140325788266496, 140325788409855, +STORE, 140325788409856, 140325790519295, +ERASE, 140325788409856, 140325790519295, +STORE, 140325790507008, 140325790515199, +STORE, 140325790515200, 140325790519295, +STORE, 140735923572736, 140735923576831, +STORE, 140735923560448, 140735923572735, +STORE, 140325790478336, 140325790507007, +STORE, 140325790470144, 140325790478335, +STORE, 140325784469504, 140325788266495, +SNULL, 140325784469504, 140325786128383, +STORE, 140325786128384, 140325788266495, +STORE, 140325784469504, 140325786128383, +SNULL, 140325788225535, 140325788266495, +STORE, 140325786128384, 140325788225535, +STORE, 140325788225536, 140325788266495, +SNULL, 140325788225536, 140325788250111, +STORE, 140325788250112, 140325788266495, +STORE, 140325788225536, 140325788250111, +ERASE, 140325788225536, 140325788250111, +STORE, 140325788225536, 140325788250111, +ERASE, 140325788250112, 140325788266495, +STORE, 140325788250112, 140325788266495, +SNULL, 140325788241919, 140325788250111, +STORE, 140325788225536, 140325788241919, +STORE, 140325788241920, 140325788250111, +SNULL, 94651138351103, 94651138355199, +STORE, 94651138342912, 94651138351103, +STORE, 94651138351104, 94651138355199, +SNULL, 140325790511103, 140325790515199, +STORE, 140325790507008, 140325790511103, +STORE, 140325790511104, 140325790515199, +ERASE, 140325790478336, 140325790507007, +STORE, 94651146297344, 94651146432511, +STORE, 94212330168320, 94212330381311, +STORE, 94212332478464, 94212332482559, +STORE, 94212332482560, 94212332490751, +STORE, 94212332490752, 94212332503039, +STORE, 94212348891136, 94212349825023, +STORE, 140611630604288, 140611632263167, +STORE, 140611632263168, 140611634360319, +STORE, 140611634360320, 140611634376703, +STORE, 140611634376704, 140611634384895, +STORE, 
140611634384896, 140611634401279, +STORE, 140611634401280, 140611634413567, +STORE, 140611634413568, 140611636506623, +STORE, 140611636506624, 140611636510719, +STORE, 140611636510720, 140611636514815, +STORE, 140611636514816, 140611636658175, +STORE, 140611637026816, 140611638710271, +STORE, 140611638710272, 140611638726655, +STORE, 140611638755328, 140611638759423, +STORE, 140611638759424, 140611638763519, +STORE, 140611638763520, 140611638767615, +STORE, 140726974533632, 140726974672895, +STORE, 140726974943232, 140726974955519, +STORE, 140726974955520, 140726974959615, +STORE, 94572463521792, 94572463734783, +STORE, 94572465831936, 94572465836031, +STORE, 94572465836032, 94572465844223, +STORE, 94572465844224, 94572465856511, +STORE, 94572491534336, 94572492865535, +STORE, 140644351492096, 140644353150975, +STORE, 140644353150976, 140644355248127, +STORE, 140644355248128, 140644355264511, +STORE, 140644355264512, 140644355272703, +STORE, 140644355272704, 140644355289087, +STORE, 140644355289088, 140644355301375, +STORE, 140644355301376, 140644357394431, +STORE, 140644357394432, 140644357398527, +STORE, 140644357398528, 140644357402623, +STORE, 140644357402624, 140644357545983, +STORE, 140644357914624, 140644359598079, +STORE, 140644359598080, 140644359614463, +STORE, 140644359643136, 140644359647231, +STORE, 140644359647232, 140644359651327, +STORE, 140644359651328, 140644359655423, +STORE, 140727841824768, 140727841964031, +STORE, 140727843188736, 140727843201023, +STORE, 140727843201024, 140727843205119, +STORE, 94144315457536, 94144315670527, +STORE, 94144317767680, 94144317771775, +STORE, 94144317771776, 94144317779967, +STORE, 94144317779968, 94144317792255, +STORE, 94144318369792, 94144320815103, +STORE, 140316717645824, 140316719304703, +STORE, 140316719304704, 140316721401855, +STORE, 140316721401856, 140316721418239, +STORE, 140316721418240, 140316721426431, +STORE, 140316721426432, 140316721442815, +STORE, 140316721442816, 140316721455103, +STORE, 
140316721455104, 140316723548159, +STORE, 140316723548160, 140316723552255, +STORE, 140316723552256, 140316723556351, +STORE, 140316723556352, 140316723699711, +STORE, 140316724068352, 140316725751807, +STORE, 140316725751808, 140316725768191, +STORE, 140316725796864, 140316725800959, +STORE, 140316725800960, 140316725805055, +STORE, 140316725805056, 140316725809151, +STORE, 140725744283648, 140725744422911, +STORE, 140725745852416, 140725745864703, +STORE, 140725745864704, 140725745868799, +STORE, 94646858846208, 94646859059199, +STORE, 94646861156352, 94646861160447, +STORE, 94646861160448, 94646861168639, +STORE, 94646861168640, 94646861180927, +STORE, 94646879805440, 94646881894399, +STORE, 140435449745408, 140435451404287, +STORE, 140435451404288, 140435453501439, +STORE, 140435453501440, 140435453517823, +STORE, 140435453517824, 140435453526015, +STORE, 140435453526016, 140435453542399, +STORE, 140435453542400, 140435453554687, +STORE, 140435453554688, 140435455647743, +STORE, 140435455647744, 140435455651839, +STORE, 140435455651840, 140435455655935, +STORE, 140435455655936, 140435455799295, +STORE, 140435456167936, 140435457851391, +STORE, 140435457851392, 140435457867775, +STORE, 140435457896448, 140435457900543, +STORE, 140435457900544, 140435457904639, +STORE, 140435457904640, 140435457908735, +STORE, 140721033818112, 140721033957375, +STORE, 140721034018816, 140721034031103, +STORE, 140721034031104, 140721034035199, +STORE, 94872903438336, 94872903651327, +STORE, 94872905748480, 94872905752575, +STORE, 94872905752576, 94872905760767, +STORE, 94872905760768, 94872905773055, +STORE, 94872931246080, 94872931651583, +STORE, 139771607810048, 139771609468927, +STORE, 139771609468928, 139771611566079, +STORE, 139771611566080, 139771611582463, +STORE, 139771611582464, 139771611590655, +STORE, 139771611590656, 139771611607039, +STORE, 139771611607040, 139771611619327, +STORE, 139771611619328, 139771613712383, +STORE, 139771613712384, 139771613716479, +STORE, 
139771613716480, 139771613720575, +STORE, 139771613720576, 139771613863935, +STORE, 139771614232576, 139771615916031, +STORE, 139771615916032, 139771615932415, +STORE, 139771615961088, 139771615965183, +STORE, 139771615965184, 139771615969279, +STORE, 139771615969280, 139771615973375, +STORE, 140725402931200, 140725403070463, +STORE, 140725403852800, 140725403865087, +STORE, 140725403865088, 140725403869183, +STORE, 94740737736704, 94740737949695, +STORE, 94740740046848, 94740740050943, +STORE, 94740740050944, 94740740059135, +STORE, 94740740059136, 94740740071423, +STORE, 94740743249920, 94740744724479, +STORE, 140640287010816, 140640288669695, +STORE, 140640288669696, 140640290766847, +STORE, 140640290766848, 140640290783231, +STORE, 140640290783232, 140640290791423, +STORE, 140640290791424, 140640290807807, +STORE, 140640290807808, 140640290820095, +STORE, 140640290820096, 140640292913151, +STORE, 140640292913152, 140640292917247, +STORE, 140640292917248, 140640292921343, +STORE, 140640292921344, 140640293064703, +STORE, 140640293433344, 140640295116799, +STORE, 140640295116800, 140640295133183, +STORE, 140640295161856, 140640295165951, +STORE, 140640295165952, 140640295170047, +STORE, 140640295170048, 140640295174143, +STORE, 140725133303808, 140725133443071, +STORE, 140725133684736, 140725133697023, +STORE, 140725133697024, 140725133701119, +STORE, 140737488347136, 140737488351231, +STORE, 140722826371072, 140737488351231, +SNULL, 140722826375167, 140737488351231, +STORE, 140722826371072, 140722826375167, +STORE, 140722826240000, 140722826375167, +STORE, 94113818611712, 94113820835839, +SNULL, 94113818722303, 94113820835839, +STORE, 94113818611712, 94113818722303, +STORE, 94113818722304, 94113820835839, +ERASE, 94113818722304, 94113820835839, +STORE, 94113820815360, 94113820827647, +STORE, 94113820827648, 94113820835839, +STORE, 139628194508800, 139628196761599, +SNULL, 139628194652159, 139628196761599, +STORE, 139628194508800, 139628194652159, +STORE, 
139628194652160, 139628196761599, +ERASE, 139628194652160, 139628196761599, +STORE, 139628196749312, 139628196757503, +STORE, 139628196757504, 139628196761599, +STORE, 140722826727424, 140722826731519, +STORE, 140722826715136, 140722826727423, +STORE, 139628196720640, 139628196749311, +STORE, 139628196712448, 139628196720639, +STORE, 139628190711808, 139628194508799, +SNULL, 139628190711808, 139628192370687, +STORE, 139628192370688, 139628194508799, +STORE, 139628190711808, 139628192370687, +SNULL, 139628194467839, 139628194508799, +STORE, 139628192370688, 139628194467839, +STORE, 139628194467840, 139628194508799, +SNULL, 139628194467840, 139628194492415, +STORE, 139628194492416, 139628194508799, +STORE, 139628194467840, 139628194492415, +ERASE, 139628194467840, 139628194492415, +STORE, 139628194467840, 139628194492415, +ERASE, 139628194492416, 139628194508799, +STORE, 139628194492416, 139628194508799, +SNULL, 139628194484223, 139628194492415, +STORE, 139628194467840, 139628194484223, +STORE, 139628194484224, 139628194492415, +SNULL, 94113820823551, 94113820827647, +STORE, 94113820815360, 94113820823551, +STORE, 94113820823552, 94113820827647, +SNULL, 139628196753407, 139628196757503, +STORE, 139628196749312, 139628196753407, +STORE, 139628196753408, 139628196757503, +ERASE, 139628196720640, 139628196749311, +STORE, 94113830850560, 94113830985727, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140731865833472, 140737488351231, +SNULL, 140731865841663, 140737488351231, +STORE, 140731865833472, 140731865841663, +STORE, 140731865702400, 140731865841663, +STORE, 94763339386880, 94763341611007, +SNULL, 94763339497471, 94763341611007, +STORE, 94763339386880, 94763339497471, +STORE, 94763339497472, 94763341611007, +ERASE, 94763339497472, 94763341611007, +STORE, 94763341590528, 94763341602815, +STORE, 94763341602816, 94763341611007, +STORE, 139778398486528, 139778400739327, +SNULL, 139778398629887, 139778400739327, +STORE, 
139778398486528, 139778398629887, +STORE, 139778398629888, 139778400739327, +ERASE, 139778398629888, 139778400739327, +STORE, 139778400727040, 139778400735231, +STORE, 139778400735232, 139778400739327, +STORE, 140731865858048, 140731865862143, +STORE, 140731865845760, 140731865858047, +STORE, 139778400698368, 139778400727039, +STORE, 139778400690176, 139778400698367, +STORE, 139778394689536, 139778398486527, +SNULL, 139778394689536, 139778396348415, +STORE, 139778396348416, 139778398486527, +STORE, 139778394689536, 139778396348415, +SNULL, 139778398445567, 139778398486527, +STORE, 139778396348416, 139778398445567, +STORE, 139778398445568, 139778398486527, +SNULL, 139778398445568, 139778398470143, +STORE, 139778398470144, 139778398486527, +STORE, 139778398445568, 139778398470143, +ERASE, 139778398445568, 139778398470143, +STORE, 139778398445568, 139778398470143, +ERASE, 139778398470144, 139778398486527, +STORE, 139778398470144, 139778398486527, +SNULL, 139778398461951, 139778398470143, +STORE, 139778398445568, 139778398461951, +STORE, 139778398461952, 139778398470143, +SNULL, 94763341598719, 94763341602815, +STORE, 94763341590528, 94763341598719, +STORE, 94763341598720, 94763341602815, +SNULL, 139778400731135, 139778400735231, +STORE, 139778400727040, 139778400731135, +STORE, 139778400731136, 139778400735231, +ERASE, 139778400698368, 139778400727039, +STORE, 94763362197504, 94763362332671, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140737488338944, 140737488351231, +STORE, 140732053192704, 140737488351231, +SNULL, 140732053204991, 140737488351231, +STORE, 140732053192704, 140732053204991, +STORE, 140732053061632, 140732053204991, +STORE, 4194304, 26279935, +STORE, 28372992, 28454911, +STORE, 28454912, 29806591, +STORE, 140176018599936, 140176020852735, +SNULL, 140176018743295, 140176020852735, +STORE, 140176018599936, 140176018743295, +STORE, 140176018743296, 140176020852735, +ERASE, 140176018743296, 140176020852735, 
+STORE, 140176020840448, 140176020848639, +STORE, 140176020848640, 140176020852735, +STORE, 140732053381120, 140732053385215, +STORE, 140732053368832, 140732053381119, +STORE, 140176020811776, 140176020840447, +STORE, 140176020803584, 140176020811775, +STORE, 140176014766080, 140176018599935, +SNULL, 140176014766080, 140176016474111, +STORE, 140176016474112, 140176018599935, +STORE, 140176014766080, 140176016474111, +SNULL, 140176018567167, 140176018599935, +STORE, 140176016474112, 140176018567167, +STORE, 140176018567168, 140176018599935, +ERASE, 140176018567168, 140176018599935, +STORE, 140176018567168, 140176018599935, +STORE, 140176012570624, 140176014766079, +SNULL, 140176012570624, 140176012664831, +STORE, 140176012664832, 140176014766079, +STORE, 140176012570624, 140176012664831, +SNULL, 140176014757887, 140176014766079, +STORE, 140176012664832, 140176014757887, +STORE, 140176014757888, 140176014766079, +ERASE, 140176014757888, 140176014766079, +STORE, 140176014757888, 140176014766079, +STORE, 140176010051584, 140176012570623, +SNULL, 140176010051584, 140176010465279, +STORE, 140176010465280, 140176012570623, +STORE, 140176010051584, 140176010465279, +SNULL, 140176012558335, 140176012570623, +STORE, 140176010465280, 140176012558335, +STORE, 140176012558336, 140176012570623, +ERASE, 140176012558336, 140176012570623, +STORE, 140176012558336, 140176012570623, +STORE, 140176007417856, 140176010051583, +SNULL, 140176007417856, 140176007946239, +STORE, 140176007946240, 140176010051583, +STORE, 140176007417856, 140176007946239, +SNULL, 140176010043391, 140176010051583, +STORE, 140176007946240, 140176010043391, +STORE, 140176010043392, 140176010051583, +ERASE, 140176010043392, 140176010051583, +STORE, 140176010043392, 140176010051583, +STORE, 140176005304320, 140176007417855, +SNULL, 140176005304320, 140176005316607, +STORE, 140176005316608, 140176007417855, +STORE, 140176005304320, 140176005316607, +SNULL, 140176007409663, 140176007417855, +STORE, 140176005316608, 
140176007409663, +STORE, 140176007409664, 140176007417855, +ERASE, 140176007409664, 140176007417855, +STORE, 140176007409664, 140176007417855, +STORE, 140176003100672, 140176005304319, +SNULL, 140176003100672, 140176003203071, +STORE, 140176003203072, 140176005304319, +STORE, 140176003100672, 140176003203071, +SNULL, 140176005296127, 140176005304319, +STORE, 140176003203072, 140176005296127, +STORE, 140176005296128, 140176005304319, +ERASE, 140176005296128, 140176005304319, +STORE, 140176005296128, 140176005304319, +STORE, 140176020795392, 140176020811775, +STORE, 140175999938560, 140176003100671, +SNULL, 140175999938560, 140176000999423, +STORE, 140176000999424, 140176003100671, +STORE, 140175999938560, 140176000999423, +SNULL, 140176003092479, 140176003100671, +STORE, 140176000999424, 140176003092479, +STORE, 140176003092480, 140176003100671, +ERASE, 140176003092480, 140176003100671, +STORE, 140176003092480, 140176003100671, +STORE, 140175996141568, 140175999938559, +SNULL, 140175996141568, 140175997800447, +STORE, 140175997800448, 140175999938559, +STORE, 140175996141568, 140175997800447, +SNULL, 140175999897599, 140175999938559, +STORE, 140175997800448, 140175999897599, +STORE, 140175999897600, 140175999938559, +SNULL, 140175999897600, 140175999922175, +STORE, 140175999922176, 140175999938559, +STORE, 140175999897600, 140175999922175, +ERASE, 140175999897600, 140175999922175, +STORE, 140175999897600, 140175999922175, +ERASE, 140175999922176, 140175999938559, +STORE, 140175999922176, 140175999938559, +STORE, 140176020783104, 140176020811775, +SNULL, 140175999913983, 140175999922175, +STORE, 140175999897600, 140175999913983, +STORE, 140175999913984, 140175999922175, +SNULL, 140176003096575, 140176003100671, +STORE, 140176003092480, 140176003096575, +STORE, 140176003096576, 140176003100671, +SNULL, 140176005300223, 140176005304319, +STORE, 140176005296128, 140176005300223, +STORE, 140176005300224, 140176005304319, +SNULL, 140176007413759, 140176007417855, +STORE, 
140176007409664, 140176007413759, +STORE, 140176007413760, 140176007417855, +SNULL, 140176010047487, 140176010051583, +STORE, 140176010043392, 140176010047487, +STORE, 140176010047488, 140176010051583, +SNULL, 140176012566527, 140176012570623, +STORE, 140176012558336, 140176012566527, +STORE, 140176012566528, 140176012570623, +SNULL, 140176014761983, 140176014766079, +STORE, 140176014757888, 140176014761983, +STORE, 140176014761984, 140176014766079, +SNULL, 140176018571263, 140176018599935, +STORE, 140176018567168, 140176018571263, +STORE, 140176018571264, 140176018599935, +SNULL, 28405759, 28454911, +STORE, 28372992, 28405759, +STORE, 28405760, 28454911, +SNULL, 140176020844543, 140176020848639, +STORE, 140176020840448, 140176020844543, +STORE, 140176020844544, 140176020848639, +ERASE, 140176020811776, 140176020840447, +STORE, 53080064, 53215231, +STORE, 140176019099648, 140176020783103, +STORE, 140176020836352, 140176020840447, +STORE, 140176018964480, 140176019099647, +STORE, 53080064, 53358591, +STORE, 140175994044416, 140175996141567, +STORE, 140176020828160, 140176020840447, +STORE, 140176020819968, 140176020840447, +STORE, 140176020783104, 140176020819967, +STORE, 140176018948096, 140176019099647, +STORE, 53080064, 53493759, +STORE, 53080064, 53649407, +STORE, 140176018939904, 140176019099647, +STORE, 140176018931712, 140176019099647, +STORE, 53080064, 53784575, +STORE, 53080064, 53919743, +STORE, 140176018915328, 140176019099647, +STORE, 140176018907136, 140176019099647, +STORE, 53080064, 54059007, +STORE, 140175993769984, 140175996141567, +STORE, 140176018747392, 140176019099647, +STORE, 53080064, 54198271, +SNULL, 54190079, 54198271, +STORE, 53080064, 54190079, +STORE, 54190080, 54198271, +ERASE, 54190080, 54198271, +SNULL, 54181887, 54190079, +STORE, 53080064, 54181887, +STORE, 54181888, 54190079, +ERASE, 54181888, 54190079, +SNULL, 54173695, 54181887, +STORE, 53080064, 54173695, +STORE, 54173696, 54181887, +ERASE, 54173696, 54181887, +SNULL, 54165503, 
54173695, +STORE, 53080064, 54165503, +STORE, 54165504, 54173695, +ERASE, 54165504, 54173695, +STORE, 140175993753600, 140175996141567, +STORE, 140175993688064, 140175996141567, +STORE, 140175993655296, 140175996141567, +STORE, 140175991558144, 140175996141567, +STORE, 140175991492608, 140175996141567, +STORE, 53080064, 54312959, +STORE, 140175991361536, 140175996141567, +STORE, 140175991099392, 140175996141567, +STORE, 140175991091200, 140175996141567, +STORE, 140175991074816, 140175996141567, +STORE, 140175991066624, 140175996141567, +STORE, 140175991058432, 140175996141567, +STORE, 53080064, 54448127, +SNULL, 54439935, 54448127, +STORE, 53080064, 54439935, +STORE, 54439936, 54448127, +ERASE, 54439936, 54448127, +SNULL, 54431743, 54439935, +STORE, 53080064, 54431743, +STORE, 54431744, 54439935, +ERASE, 54431744, 54439935, +SNULL, 54419455, 54431743, +STORE, 53080064, 54419455, +STORE, 54419456, 54431743, +ERASE, 54419456, 54431743, +SNULL, 54403071, 54419455, +STORE, 53080064, 54403071, +STORE, 54403072, 54419455, +ERASE, 54403072, 54419455, +STORE, 140175991042048, 140175996141567, +STORE, 53080064, 54538239, +SNULL, 54534143, 54538239, +STORE, 53080064, 54534143, +STORE, 54534144, 54538239, +ERASE, 54534144, 54538239, +SNULL, 54530047, 54534143, +STORE, 53080064, 54530047, +STORE, 54530048, 54534143, +ERASE, 54530048, 54534143, +SNULL, 54525951, 54530047, +STORE, 53080064, 54525951, +STORE, 54525952, 54530047, +ERASE, 54525952, 54530047, +SNULL, 54521855, 54525951, +STORE, 53080064, 54521855, +STORE, 54521856, 54525951, +ERASE, 54521856, 54525951, +SNULL, 54517759, 54521855, +STORE, 53080064, 54517759, +STORE, 54517760, 54521855, +ERASE, 54517760, 54521855, +SNULL, 54513663, 54517759, +STORE, 53080064, 54513663, +STORE, 54513664, 54517759, +ERASE, 54513664, 54517759, +SNULL, 54509567, 54513663, +STORE, 53080064, 54509567, +STORE, 54509568, 54513663, +ERASE, 54509568, 54513663, +STORE, 140175991025664, 140175996141567, +STORE, 140175990992896, 140175996141567, 
+STORE, 53080064, 54644735, +SNULL, 54628351, 54644735, +STORE, 53080064, 54628351, +STORE, 54628352, 54644735, +ERASE, 54628352, 54644735, +SNULL, 54616063, 54628351, +STORE, 53080064, 54616063, +STORE, 54616064, 54628351, +ERASE, 54616064, 54628351, +STORE, 140175988895744, 140175996141567, +STORE, 53080064, 54767615, +STORE, 140175988879360, 140175996141567, +STORE, 140175988617216, 140175996141567, +STORE, 140175988609024, 140175996141567, +STORE, 140175988600832, 140175996141567, +STORE, 53080064, 54906879, +SNULL, 54898687, 54906879, +STORE, 53080064, 54898687, +STORE, 54898688, 54906879, +ERASE, 54898688, 54906879, +SNULL, 54853631, 54898687, +STORE, 53080064, 54853631, +STORE, 54853632, 54898687, +ERASE, 54853632, 54898687, +STORE, 140175986503680, 140175996141567, +STORE, 53080064, 54996991, +STORE, 140175986495488, 140175996141567, +STORE, 140175986487296, 140175996141567, +STORE, 140175985438720, 140175996141567, +STORE, 53080064, 55136255, +STORE, 140175985405952, 140175996141567, +STORE, 140175985139712, 140175996141567, +SNULL, 140176018964479, 140176019099647, +STORE, 140176018747392, 140176018964479, +STORE, 140176018964480, 140176019099647, +ERASE, 140176018964480, 140176019099647, +STORE, 140175983042560, 140175996141567, +STORE, 140175982518272, 140175996141567, +STORE, 140175980421120, 140175996141567, +STORE, 53080064, 55287807, +STORE, 53080064, 55427071, +STORE, 140176019091456, 140176019099647, +STORE, 140176019083264, 140176019099647, +STORE, 140176019075072, 140176019099647, +STORE, 140176019066880, 140176019099647, +STORE, 140176019058688, 140176019099647, +STORE, 140175980158976, 140175996141567, +STORE, 140176019050496, 140176019099647, +STORE, 140176019042304, 140176019099647, +STORE, 140176019034112, 140176019099647, +STORE, 140176019025920, 140176019099647, +STORE, 140176019017728, 140176019099647, +STORE, 140176019009536, 140176019099647, +STORE, 140176019001344, 140176019099647, +STORE, 140176018993152, 140176019099647, +STORE, 
140176018984960, 140176019099647, +STORE, 140176018976768, 140176019099647, +STORE, 140176018968576, 140176019099647, +STORE, 140175978061824, 140175996141567, +STORE, 53080064, 55603199, +STORE, 140175978029056, 140175996141567, +STORE, 140175977996288, 140175996141567, +STORE, 53080064, 55738367, +STORE, 53080064, 55881727, +STORE, 140175977963520, 140175996141567, +STORE, 140175977930752, 140175996141567, +STORE, 53080064, 56041471, +STORE, 140175977897984, 140175996141567, +STORE, 140175977865216, 140175996141567, +SNULL, 55881727, 56041471, +STORE, 53080064, 55881727, +STORE, 55881728, 56041471, +ERASE, 55881728, 56041471, +SNULL, 55721983, 55881727, +STORE, 53080064, 55721983, +STORE, 55721984, 55881727, +ERASE, 55721984, 55881727, +SNULL, 55570431, 55721983, +STORE, 53080064, 55570431, +STORE, 55570432, 55721983, +ERASE, 55570432, 55721983, +STORE, 140175977857024, 140175996141567, +STORE, 140175975759872, 140175996141567, +STORE, 53080064, 55754751, +STORE, 53080064, 55943167, +STORE, 140175975751680, 140175996141567, +STORE, 140175975743488, 140175996141567, +STORE, 140175975735296, 140175996141567, +STORE, 140175975727104, 140175996141567, +STORE, 140175975718912, 140175996141567, +STORE, 140175975710720, 140175996141567, +STORE, 140175975702528, 140175996141567, +STORE, 140175975694336, 140175996141567, +STORE, 140175975686144, 140175996141567, +STORE, 140175975677952, 140175996141567, +STORE, 140175975669760, 140175996141567, +STORE, 140175974621184, 140175996141567, +STORE, 140175974612992, 140175996141567, +STORE, 53080064, 56139775, +STORE, 140175972515840, 140175996141567, +STORE, 53080064, 56401919, +STORE, 140175970418688, 140175996141567, +STORE, 140175970410496, 140175996141567, +STORE, 140175970402304, 140175996141567, +STORE, 140175970394112, 140175996141567, +STORE, 53080064, 56569855, +STORE, 140175969865728, 140175996141567, +SNULL, 140175985139711, 140175996141567, +STORE, 140175969865728, 140175985139711, +STORE, 140175985139712, 
140175996141567, +SNULL, 140175985139712, 140175985405951, +STORE, 140175985405952, 140175996141567, +STORE, 140175985139712, 140175985405951, +ERASE, 140175985139712, 140175985405951, +STORE, 140175965671424, 140175985139711, +STORE, 140175985397760, 140175996141567, +STORE, 140175985389568, 140175996141567, +STORE, 140175985381376, 140175996141567, +STORE, 140175985373184, 140175996141567, +STORE, 140175985364992, 140175996141567, +STORE, 140175985356800, 140175996141567, +STORE, 140175985348608, 140175996141567, +STORE, 140175985340416, 140175996141567, +STORE, 140175985332224, 140175996141567, +STORE, 140175985324032, 140175996141567, +STORE, 140175985315840, 140175996141567, +STORE, 140175985307648, 140175996141567, +STORE, 140175985299456, 140175996141567, +STORE, 140175985291264, 140175996141567, +STORE, 140175985283072, 140175996141567, +STORE, 140175985274880, 140175996141567, +STORE, 140175963574272, 140175985139711, +STORE, 140175985266688, 140175996141567, +STORE, 140175961477120, 140175985139711, +STORE, 53080064, 56831999, +STORE, 140175959379968, 140175985139711, +STORE, 140175985258496, 140175996141567, +STORE, 140175957282816, 140175985139711, +STORE, 140175985250304, 140175996141567, +STORE, 140175985242112, 140175996141567, +STORE, 140175985233920, 140175996141567, +STORE, 140175985225728, 140175996141567, +STORE, 140175985217536, 140175996141567, +STORE, 140175957151744, 140175985139711, +STORE, 140175956627456, 140175985139711, +SNULL, 140175980158975, 140175985139711, +STORE, 140175956627456, 140175980158975, +STORE, 140175980158976, 140175985139711, +SNULL, 140175980158976, 140175980421119, +STORE, 140175980421120, 140175985139711, +STORE, 140175980158976, 140175980421119, +ERASE, 140175980158976, 140175980421119, +STORE, 140175954530304, 140175980158975, +STORE, 140175985209344, 140175996141567, +STORE, 53080064, 57094143, +STORE, 140175952433152, 140175980158975, +STORE, 140175985192960, 140175996141567, +STORE, 140175985184768, 
140175996141567, +STORE, 140175985176576, 140175996141567, +STORE, 140175985168384, 140175996141567, +STORE, 140175985160192, 140175996141567, +STORE, 140175985152000, 140175996141567, +STORE, 140175985143808, 140175996141567, +STORE, 140175980412928, 140175985139711, +STORE, 140175980404736, 140175985139711, +STORE, 140175980396544, 140175985139711, +STORE, 140175980388352, 140175985139711, +STORE, 140175980380160, 140175985139711, +STORE, 140175980371968, 140175985139711, +STORE, 140175980363776, 140175985139711, +STORE, 140175980355584, 140175985139711, +STORE, 140175980347392, 140175985139711, +STORE, 140175980339200, 140175985139711, +STORE, 53080064, 57356287, +SNULL, 140176018747392, 140176018907135, +STORE, 140176018907136, 140176018964479, +STORE, 140176018747392, 140176018907135, +ERASE, 140176018747392, 140176018907135, +STORE, 140175952146432, 140175980158975, +STORE, 140175950049280, 140175980158975, +SNULL, 140175952146431, 140175980158975, +STORE, 140175950049280, 140175952146431, +STORE, 140175952146432, 140175980158975, +SNULL, 140175952146432, 140175952433151, +STORE, 140175952433152, 140175980158975, +STORE, 140175952146432, 140175952433151, +ERASE, 140175952146432, 140175952433151, +STORE, 140176018898944, 140176018964479, +STORE, 53080064, 57749503, +STORE, 140175949520896, 140175952146431, +STORE, 140175947423744, 140175952146431, +SNULL, 140175993769983, 140175996141567, +STORE, 140175985143808, 140175993769983, +STORE, 140175993769984, 140175996141567, +SNULL, 140175993769984, 140175994044415, +STORE, 140175994044416, 140175996141567, +STORE, 140175993769984, 140175994044415, +ERASE, 140175993769984, 140175994044415, +STORE, 140176018890752, 140176018964479, +STORE, 140176018882560, 140176018964479, +STORE, 140176018874368, 140176018964479, +STORE, 140176018866176, 140176018964479, +STORE, 140176018849792, 140176018964479, +STORE, 140176018841600, 140176018964479, +STORE, 140176018825216, 140176018964479, +STORE, 140176018817024, 
140176018964479, +STORE, 140176018800640, 140176018964479, +STORE, 140176018792448, 140176018964479, +STORE, 140176018759680, 140176018964479, +STORE, 140176018751488, 140176018964479, +STORE, 140175994028032, 140175996141567, +STORE, 140176018743296, 140176018964479, +STORE, 140175994011648, 140175996141567, +STORE, 140175994003456, 140175996141567, +STORE, 140175993987072, 140175996141567, +STORE, 140175993978880, 140175996141567, +STORE, 140175993946112, 140175996141567, +STORE, 140175993937920, 140175996141567, +STORE, 140175993921536, 140175996141567, +STORE, 140175993913344, 140175996141567, +STORE, 140175993896960, 140175996141567, +STORE, 140175993888768, 140175996141567, +STORE, 140175993872384, 140175996141567, +STORE, 140175993864192, 140175996141567, +STORE, 140175993831424, 140175996141567, +STORE, 140175993823232, 140175996141567, +STORE, 140175993806848, 140175996141567, +STORE, 140175993798656, 140175996141567, +STORE, 140175993782272, 140175996141567, +STORE, 140175993774080, 140175996141567, +STORE, 140175980322816, 140175985139711, +STORE, 140175980314624, 140175985139711, +STORE, 140175980281856, 140175985139711, +STORE, 140175980273664, 140175985139711, +STORE, 140175980257280, 140175985139711, +STORE, 140175945326592, 140175952146431, +STORE, 140175980249088, 140175985139711, +STORE, 140175980232704, 140175985139711, +STORE, 140175980224512, 140175985139711, +STORE, 140175980208128, 140175985139711, +STORE, 140175980199936, 140175985139711, +STORE, 140175980167168, 140175985139711, +STORE, 140175952433152, 140175985139711, +STORE, 140175952416768, 140175985139711, +STORE, 140175952408576, 140175985139711, +STORE, 140175952392192, 140175985139711, +STORE, 140175952384000, 140175985139711, +STORE, 140175952367616, 140175985139711, +STORE, 140175943229440, 140175952146431, +STORE, 140175952359424, 140175985139711, +STORE, 140175952326656, 140175985139711, +STORE, 140175952318464, 140175985139711, +STORE, 140175952302080, 140175985139711, +STORE, 
140175952293888, 140175985139711, +STORE, 140175952277504, 140175985139711, +STORE, 140175952269312, 140175985139711, +STORE, 140175952252928, 140175985139711, +STORE, 140175952244736, 140175985139711, +STORE, 140175952211968, 140175985139711, +STORE, 140175952203776, 140175985139711, +STORE, 140175952187392, 140175985139711, +STORE, 140175952179200, 140175985139711, +STORE, 140175952162816, 140175985139711, +STORE, 140175952154624, 140175985139711, +STORE, 140175943213056, 140175952146431, +STORE, 140175943213056, 140175985139711, +STORE, 140175943180288, 140175985139711, +STORE, 140175943172096, 140175985139711, +STORE, 140175943155712, 140175985139711, +STORE, 140175943147520, 140175985139711, +STORE, 140175943131136, 140175985139711, +STORE, 140175943122944, 140175985139711, +STORE, 140175943106560, 140175985139711, +STORE, 140175943098368, 140175985139711, +STORE, 140175943065600, 140175985139711, +STORE, 140175943057408, 140175985139711, +STORE, 140175943041024, 140175985139711, +STORE, 140175943032832, 140175985139711, +STORE, 140175943016448, 140175985139711, +STORE, 140175943008256, 140175985139711, +STORE, 140175942991872, 140175985139711, +STORE, 140175942983680, 140175985139711, +STORE, 140175942950912, 140175985139711, +STORE, 140175942942720, 140175985139711, +STORE, 140175942926336, 140175985139711, +STORE, 140175942918144, 140175985139711, +STORE, 140175942901760, 140175985139711, +STORE, 140175942893568, 140175985139711, +STORE, 140175942877184, 140175985139711, +STORE, 140175942868992, 140175985139711, +STORE, 140175942836224, 140175985139711, +STORE, 140175942828032, 140175985139711, +STORE, 140175942811648, 140175985139711, +STORE, 140175942803456, 140175985139711, +STORE, 140175942787072, 140175985139711, +STORE, 140175942778880, 140175985139711, +STORE, 140175942762496, 140175985139711, +STORE, 140175942754304, 140175985139711, +STORE, 140175942721536, 140175985139711, +STORE, 140175942713344, 140175985139711, +STORE, 140175942696960, 
140175985139711, +STORE, 140175942688768, 140175985139711, +STORE, 140175942672384, 140175985139711, +STORE, 140175942664192, 140175985139711, +STORE, 140175942647808, 140175985139711, +STORE, 140175942639616, 140175985139711, +STORE, 140175942606848, 140175985139711, +STORE, 140175942598656, 140175985139711, +STORE, 140175942582272, 140175985139711, +STORE, 140175942574080, 140175985139711, +STORE, 140175942557696, 140175985139711, +STORE, 140175942549504, 140175985139711, +STORE, 140175942533120, 140175985139711, +STORE, 140175942524928, 140175985139711, +STORE, 140175942492160, 140175985139711, +STORE, 140175942483968, 140175985139711, +STORE, 140175942467584, 140175985139711, +STORE, 140175942459392, 140175985139711, +STORE, 140175942443008, 140175985139711, +STORE, 140175942434816, 140175985139711, +STORE, 140175942418432, 140175985139711, +STORE, 140175942410240, 140175985139711, +STORE, 140175942377472, 140175985139711, +STORE, 140175942369280, 140175985139711, +STORE, 140175942352896, 140175985139711, +STORE, 140175942344704, 140175985139711, +STORE, 140175942328320, 140175985139711, +STORE, 140175942320128, 140175985139711, +STORE, 140175942303744, 140175985139711, +STORE, 140175942295552, 140175985139711, +STORE, 140175942262784, 140175985139711, +STORE, 140175942254592, 140175985139711, +STORE, 140175942238208, 140175985139711, +STORE, 140175942230016, 140175985139711, +STORE, 140175942213632, 140175985139711, +STORE, 140175942205440, 140175985139711, +STORE, 140175942189056, 140175985139711, +STORE, 140175942180864, 140175985139711, +STORE, 140175942148096, 140175985139711, +STORE, 140175942139904, 140175985139711, +STORE, 140175942123520, 140175985139711, +STORE, 140175942115328, 140175985139711, +STORE, 140175942098944, 140175985139711, +STORE, 140175942090752, 140175985139711, +STORE, 140175942074368, 140175985139711, +STORE, 140175942066176, 140175985139711, +STORE, 140175942033408, 140175985139711, +STORE, 140175942025216, 140175985139711, +STORE, 
140175942008832, 140175985139711, +STORE, 140175942000640, 140175985139711, +STORE, 140175941984256, 140175985139711, +STORE, 140175941976064, 140175985139711, +STORE, 140175941959680, 140175985139711, +STORE, 140175939862528, 140175985139711, +STORE, 140175939854336, 140175985139711, +STORE, 140175939821568, 140175985139711, +STORE, 140175939813376, 140175985139711, +STORE, 140175939796992, 140175985139711, +STORE, 140175939788800, 140175985139711, +STORE, 140175939772416, 140175985139711, +STORE, 140175939764224, 140175985139711, +STORE, 140175939747840, 140175985139711, +STORE, 140175939739648, 140175985139711, +STORE, 140175939706880, 140175985139711, +STORE, 140175939698688, 140175985139711, +STORE, 140175939682304, 140175985139711, +STORE, 140175939674112, 140175985139711, +STORE, 140175939657728, 140175985139711, +STORE, 140175939649536, 140175985139711, +STORE, 140175939633152, 140175985139711, +STORE, 140175939624960, 140175985139711, +STORE, 140175939592192, 140175985139711, +STORE, 140175939584000, 140175985139711, +STORE, 140175939567616, 140175985139711, +STORE, 140175939559424, 140175985139711, +STORE, 140175939543040, 140175985139711, +STORE, 140175939534848, 140175985139711, +STORE, 140175939518464, 140175985139711, +STORE, 140175939510272, 140175985139711, +STORE, 140175939477504, 140175985139711, +STORE, 140175939469312, 140175985139711, +STORE, 140175939452928, 140175985139711, +STORE, 140175939444736, 140175985139711, +STORE, 140175939428352, 140175985139711, +STORE, 140175939420160, 140175985139711, +STORE, 140175939403776, 140175985139711, +STORE, 140175939395584, 140175985139711, +STORE, 140175939362816, 140175985139711, +STORE, 140175939354624, 140175985139711, +STORE, 140175939338240, 140175985139711, +STORE, 140175939330048, 140175985139711, +STORE, 140175939313664, 140175985139711, +STORE, 140175939305472, 140175985139711, +STORE, 140175939289088, 140175985139711, +STORE, 140175939280896, 140175985139711, +STORE, 140175939248128, 
140175985139711, +STORE, 140175939239936, 140175985139711, +STORE, 140175939223552, 140175985139711, +STORE, 140175939215360, 140175985139711, +STORE, 140175939198976, 140175985139711, +STORE, 140175939190784, 140175985139711, +STORE, 140175939174400, 140175985139711, +STORE, 140175939166208, 140175985139711, +STORE, 140175939133440, 140175985139711, +STORE, 140175939125248, 140175985139711, +STORE, 140175939108864, 140175985139711, +STORE, 140175939100672, 140175985139711, +STORE, 140175939084288, 140175985139711, +STORE, 140175939076096, 140175985139711, +STORE, 140175939059712, 140175985139711, +STORE, 140175939051520, 140175985139711, +STORE, 140175939018752, 140175985139711, +STORE, 140175939010560, 140175985139711, +STORE, 140175938994176, 140175985139711, +STORE, 140175938985984, 140175985139711, +STORE, 140175938969600, 140175985139711, +STORE, 140175938961408, 140175985139711, +STORE, 140175938945024, 140175985139711, +STORE, 140175938936832, 140175985139711, +STORE, 140175938904064, 140175985139711, +STORE, 140175938895872, 140175985139711, +STORE, 140175938879488, 140175985139711, +STORE, 140175938871296, 140175985139711, +STORE, 140175938854912, 140175985139711, +STORE, 140175938846720, 140175985139711, +STORE, 140175938830336, 140175985139711, +STORE, 140175938822144, 140175985139711, +STORE, 140175938789376, 140175985139711, +STORE, 140175938781184, 140175985139711, +STORE, 140175938764800, 140175985139711, +STORE, 140175938756608, 140175985139711, +STORE, 140175938740224, 140175985139711, +STORE, 140175938732032, 140175985139711, +STORE, 140175938715648, 140175985139711, +STORE, 140175938707456, 140175985139711, +STORE, 140175938674688, 140175985139711, +STORE, 140175938666496, 140175985139711, +STORE, 140175938650112, 140175985139711, +STORE, 140175938641920, 140175985139711, +STORE, 140175938625536, 140175985139711, +STORE, 140175938617344, 140175985139711, +STORE, 140175938600960, 140175985139711, +STORE, 140175938592768, 140175985139711, +STORE, 
140175938560000, 140175985139711, +STORE, 140175938551808, 140175985139711, +STORE, 140175938535424, 140175985139711, +STORE, 140175938527232, 140175985139711, +STORE, 140175938510848, 140175985139711, +STORE, 140175938502656, 140175985139711, +STORE, 140175938486272, 140175985139711, +STORE, 140175938478080, 140175985139711, +STORE, 140175938445312, 140175985139711, +STORE, 140175938437120, 140175985139711, +STORE, 140175938420736, 140175985139711, +STORE, 140175938412544, 140175985139711, +STORE, 140175938396160, 140175985139711, +STORE, 140175938387968, 140175985139711, +STORE, 140175938371584, 140175985139711, +STORE, 140175938363392, 140175985139711, +STORE, 140175938330624, 140175985139711, +STORE, 140175938322432, 140175985139711, +STORE, 140175938306048, 140175985139711, +STORE, 140175938297856, 140175985139711, +STORE, 140175938281472, 140175985139711, +STORE, 140175938273280, 140175985139711, +STORE, 140175938256896, 140175985139711, +STORE, 140175938248704, 140175985139711, +STORE, 140175938215936, 140175985139711, +STORE, 140175938207744, 140175985139711, +STORE, 140175938191360, 140175985139711, +STORE, 140175938183168, 140175985139711, +STORE, 140175938166784, 140175985139711, +STORE, 140175938158592, 140175985139711, +STORE, 140175938142208, 140175985139711, +STORE, 140175936045056, 140175985139711, +STORE, 140175936036864, 140175985139711, +STORE, 140175936004096, 140175985139711, +STORE, 140175935995904, 140175985139711, +STORE, 140175935979520, 140175985139711, +STORE, 140175935971328, 140175985139711, +STORE, 140175935954944, 140175985139711, +STORE, 140175935946752, 140175985139711, +STORE, 140175935930368, 140175985139711, +STORE, 140175935922176, 140175985139711, +STORE, 140175935889408, 140175985139711, +STORE, 140175935881216, 140175985139711, +STORE, 140175935864832, 140175985139711, +STORE, 140175935856640, 140175985139711, +STORE, 140175935840256, 140175985139711, +STORE, 140175935832064, 140175985139711, +STORE, 140175935815680, 
140175985139711, +STORE, 140175935807488, 140175985139711, +STORE, 140175935774720, 140175985139711, +STORE, 140175935766528, 140175985139711, +STORE, 140175935750144, 140175985139711, +STORE, 140175935741952, 140175985139711, +STORE, 140175935725568, 140175985139711, +STORE, 140175935717376, 140175985139711, +STORE, 140175935700992, 140175985139711, +STORE, 140175935692800, 140175985139711, +STORE, 140175935660032, 140175985139711, +STORE, 140175935651840, 140175985139711, +STORE, 140175935635456, 140175985139711, +STORE, 140175935627264, 140175985139711, +STORE, 140175935610880, 140175985139711, +STORE, 140175935602688, 140175985139711, +STORE, 140175935586304, 140175985139711, +STORE, 140175935578112, 140175985139711, +STORE, 140175935545344, 140175985139711, +STORE, 140175935537152, 140175985139711, +STORE, 140175935520768, 140175985139711, +STORE, 140175935512576, 140175985139711, +STORE, 140175935496192, 140175985139711, +STORE, 140175935488000, 140175985139711, +STORE, 140175935471616, 140175985139711, +STORE, 140175935463424, 140175985139711, +STORE, 140175935430656, 140175985139711, +STORE, 140175935422464, 140175985139711, +STORE, 140175935406080, 140175985139711, +STORE, 140175935397888, 140175985139711, +STORE, 140175935381504, 140175985139711, +STORE, 140175935373312, 140175985139711, +STORE, 140175935356928, 140175985139711, +STORE, 140175935348736, 140175985139711, +STORE, 140175935315968, 140175985139711, +STORE, 140175935307776, 140175985139711, +STORE, 140175935291392, 140175985139711, +STORE, 140175935283200, 140175985139711, +STORE, 140175935266816, 140175985139711, +STORE, 140175935258624, 140175985139711, +STORE, 140175935242240, 140175985139711, +STORE, 140175935234048, 140175985139711, +STORE, 140175935201280, 140175985139711, +STORE, 140175935193088, 140175985139711, +STORE, 140175935176704, 140175985139711, +STORE, 140175935168512, 140175985139711, +STORE, 140175935152128, 140175985139711, +STORE, 140175935143936, 140175985139711, +STORE, 
140175935127552, 140175985139711, +STORE, 140175935119360, 140175985139711, +STORE, 140175935086592, 140175985139711, +STORE, 140175935078400, 140175985139711, +STORE, 140175935062016, 140175985139711, +STORE, 140175935053824, 140175985139711, +STORE, 140175935037440, 140175985139711, +STORE, 140175935029248, 140175985139711, +STORE, 140175935012864, 140175985139711, +STORE, 140175935004672, 140175985139711, +STORE, 140175934971904, 140175985139711, +STORE, 140175934963712, 140175985139711, +STORE, 140175934947328, 140175985139711, +STORE, 140175934939136, 140175985139711, +STORE, 140175934922752, 140175985139711, +STORE, 140175934914560, 140175985139711, +STORE, 140175934898176, 140175985139711, +STORE, 140175934889984, 140175985139711, +STORE, 140175934857216, 140175985139711, +STORE, 140175934849024, 140175985139711, +STORE, 140175934832640, 140175985139711, +STORE, 140175934824448, 140175985139711, +STORE, 140175934808064, 140175985139711, +STORE, 140175934799872, 140175985139711, +STORE, 140175934783488, 140175985139711, +STORE, 140175934775296, 140175985139711, +STORE, 140175934742528, 140175985139711, +STORE, 140175934734336, 140175985139711, +STORE, 140175934717952, 140175985139711, +STORE, 140175934709760, 140175985139711, +STORE, 140175934693376, 140175985139711, +STORE, 140175934685184, 140175985139711, +STORE, 140175934668800, 140175985139711, +STORE, 140175934660608, 140175985139711, +STORE, 140175934627840, 140175985139711, +STORE, 140175934619648, 140175985139711, +STORE, 140175934603264, 140175985139711, +STORE, 140175934595072, 140175985139711, +STORE, 140175934578688, 140175985139711, +STORE, 140175934570496, 140175985139711, +STORE, 140175934554112, 140175985139711, +STORE, 140175934545920, 140175985139711, +STORE, 140175934513152, 140175985139711, +STORE, 140175934504960, 140175985139711, +STORE, 140175934488576, 140175985139711, +STORE, 140175934480384, 140175985139711, +STORE, 140175934464000, 140175985139711, +STORE, 140175934455808, 
140175985139711, +STORE, 140175934439424, 140175985139711, +STORE, 140175934431232, 140175985139711, +STORE, 140175934398464, 140175985139711, +STORE, 140175934390272, 140175985139711, +STORE, 140175934373888, 140175985139711, +STORE, 140175934365696, 140175985139711, +STORE, 140175934349312, 140175985139711, +STORE, 140175934341120, 140175985139711, +STORE, 140175934324736, 140175985139711, +STORE, 140175932227584, 140175985139711, +STORE, 140175932219392, 140175985139711, +STORE, 140175932186624, 140175985139711, +STORE, 140175932178432, 140175985139711, +STORE, 140175932162048, 140175985139711, +STORE, 140175932153856, 140175985139711, +STORE, 140175932137472, 140175985139711, +STORE, 53080064, 57884671, +STORE, 140175932129280, 140175985139711, +STORE, 140175932112896, 140175985139711, +STORE, 140175932104704, 140175985139711, +STORE, 140175932071936, 140175985139711, +STORE, 140175932063744, 140175985139711, +STORE, 140175932047360, 140175985139711, +STORE, 140175932039168, 140175985139711, +STORE, 140175932022784, 140175985139711, +STORE, 140175932014592, 140175985139711, +STORE, 140175931998208, 140175985139711, +STORE, 140175931990016, 140175985139711, +STORE, 140175931957248, 140175985139711, +STORE, 140175931949056, 140175985139711, +STORE, 140175931932672, 140175985139711, +STORE, 140175931924480, 140175985139711, +STORE, 140175931908096, 140175985139711, +STORE, 140175931899904, 140175985139711, +STORE, 140175931883520, 140175985139711, +STORE, 140175931875328, 140175985139711, +STORE, 140175931842560, 140175985139711, +STORE, 140175931834368, 140175985139711, +STORE, 140175931817984, 140175985139711, +STORE, 140175931809792, 140175985139711, +STORE, 140175931793408, 140175985139711, +STORE, 140175931785216, 140175985139711, +STORE, 140175931768832, 140175985139711, +STORE, 140175931760640, 140175985139711, +STORE, 140175931727872, 140175985139711, +STORE, 140175931719680, 140175985139711, +STORE, 140175931703296, 140175985139711, +STORE, 
140175931695104, 140175985139711, +STORE, 140175931678720, 140175985139711, +STORE, 140175931670528, 140175985139711, +STORE, 140175931654144, 140175985139711, +STORE, 140175931645952, 140175985139711, +STORE, 140175931613184, 140175985139711, +STORE, 140175931604992, 140175985139711, +STORE, 140175931588608, 140175985139711, +STORE, 140175931580416, 140175985139711, +STORE, 140175931564032, 140175985139711, +STORE, 140175931555840, 140175985139711, +STORE, 140175931539456, 140175985139711, +STORE, 140175931531264, 140175985139711, +STORE, 140175931498496, 140175985139711, +STORE, 140175931490304, 140175985139711, +STORE, 140175931473920, 140175985139711, +STORE, 140175931465728, 140175985139711, +STORE, 140175931449344, 140175985139711, +STORE, 140175931441152, 140175985139711, +STORE, 140175931424768, 140175985139711, +STORE, 140175931416576, 140175985139711, +STORE, 140175931383808, 140175985139711, +STORE, 140175931375616, 140175985139711, +STORE, 140175931359232, 140175985139711, +STORE, 140175931351040, 140175985139711, +STORE, 140175931334656, 140175985139711, +STORE, 140175931326464, 140175985139711, +STORE, 140175931310080, 140175985139711, +STORE, 140175931301888, 140175985139711, +STORE, 140175931269120, 140175985139711, +STORE, 140175931260928, 140175985139711, +STORE, 140175931244544, 140175985139711, +STORE, 140175931236352, 140175985139711, +STORE, 140175931219968, 140175985139711, +STORE, 140175931211776, 140175985139711, +STORE, 140175931195392, 140175985139711, +STORE, 140175931187200, 140175985139711, +STORE, 140175931154432, 140175985139711, +STORE, 140175931146240, 140175985139711, +STORE, 140175931129856, 140175985139711, +STORE, 140175931121664, 140175985139711, +STORE, 140175931105280, 140175985139711, +STORE, 140175931097088, 140175985139711, +STORE, 140175931080704, 140175985139711, +STORE, 140175931072512, 140175985139711, +STORE, 140175931039744, 140175985139711, +STORE, 140175931031552, 140175985139711, +STORE, 140175931015168, 
140175985139711, +STORE, 140175931006976, 140175985139711, +STORE, 140175930990592, 140175985139711, +STORE, 140175930982400, 140175985139711, +STORE, 140175930966016, 140175985139711, +STORE, 140175930957824, 140175985139711, +STORE, 140175930925056, 140175985139711, +STORE, 140175930916864, 140175985139711, +STORE, 140175930900480, 140175985139711, +STORE, 140175930892288, 140175985139711, +STORE, 140175930875904, 140175985139711, +STORE, 140175930867712, 140175985139711, +STORE, 140175930851328, 140175985139711, +STORE, 140175930843136, 140175985139711, +STORE, 140175930810368, 140175985139711, +STORE, 140175930802176, 140175985139711, +STORE, 140175930785792, 140175985139711, +STORE, 140175930777600, 140175985139711, +STORE, 140175930761216, 140175985139711, +STORE, 140175930753024, 140175985139711, +STORE, 140175930736640, 140175985139711, +STORE, 140175930728448, 140175985139711, +STORE, 140175930695680, 140175985139711, +STORE, 140175930687488, 140175985139711, +STORE, 140175930671104, 140175985139711, +STORE, 140175930662912, 140175985139711, +STORE, 140175930646528, 140175985139711, +STORE, 140175930638336, 140175985139711, +STORE, 140175930621952, 140175985139711, +STORE, 140175930613760, 140175985139711, +STORE, 140175930580992, 140175985139711, +STORE, 140175930572800, 140175985139711, +STORE, 140175930556416, 140175985139711, +STORE, 140175930548224, 140175985139711, +STORE, 140175930531840, 140175985139711, +STORE, 140175930523648, 140175985139711, +STORE, 140175930507264, 140175985139711, +STORE, 140175928410112, 140175985139711, +STORE, 140175928401920, 140175985139711, +STORE, 140175928369152, 140175985139711, +STORE, 140175928360960, 140175985139711, +STORE, 140175928344576, 140175985139711, +STORE, 140175928336384, 140175985139711, +STORE, 140175928320000, 140175985139711, +STORE, 140175928311808, 140175985139711, +STORE, 140175928295424, 140175985139711, +STORE, 140175927242752, 140175985139711, +SNULL, 140175956627455, 140175985139711, +STORE, 
140175927242752, 140175956627455, +STORE, 140175956627456, 140175985139711, + }; + unsigned long set24[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140735281639424, 140737488351231, +SNULL, 140735281643519, 140737488351231, +STORE, 140735281639424, 140735281643519, +STORE, 140735281508352, 140735281643519, +STORE, 94717834911744, 94717834928127, +SNULL, 94717834915839, 94717834928127, +STORE, 94717834911744, 94717834915839, +STORE, 94717834915840, 94717834928127, +ERASE, 94717834915840, 94717834928127, +STORE, 94717834919936, 94717834928127, +STORE, 140428246065152, 140428248317951, +SNULL, 140428246208511, 140428248317951, +STORE, 140428246065152, 140428246208511, +STORE, 140428246208512, 140428248317951, +ERASE, 140428246208512, 140428248317951, +STORE, 140428248305664, 140428248313855, +STORE, 140428248313856, 140428248317951, +STORE, 140735281811456, 140735281815551, +STORE, 140735281799168, 140735281811455, +STORE, 140428248297472, 140428248305663, +STORE, 140428243841024, 140428246065151, +SNULL, 140428245491711, 140428246065151, +STORE, 140428243841024, 140428245491711, +STORE, 140428245491712, 140428246065151, +SNULL, 140428245491712, 140428246061055, +STORE, 140428246061056, 140428246065151, +STORE, 140428245491712, 140428246061055, +ERASE, 140428245491712, 140428246061055, +STORE, 140428245491712, 140428246061055, +ERASE, 140428246061056, 140428246065151, +STORE, 140428246061056, 140428246065151, +STORE, 140428248268800, 140428248297471, +STORE, 140428241625088, 140428243841023, +SNULL, 140428241625088, 140428241723391, +STORE, 140428241723392, 140428243841023, +STORE, 140428241625088, 140428241723391, +SNULL, 140428243816447, 140428243841023, +STORE, 140428241723392, 140428243816447, +STORE, 140428243816448, 140428243841023, +SNULL, 140428243816448, 140428243824639, +STORE, 140428243824640, 140428243841023, +STORE, 140428243816448, 140428243824639, +ERASE, 140428243816448, 140428243824639, +STORE, 140428243816448, 140428243824639, +ERASE, 
140428243824640, 140428243841023, +STORE, 140428243824640, 140428243841023, +STORE, 140428237828096, 140428241625087, +SNULL, 140428237828096, 140428239486975, +STORE, 140428239486976, 140428241625087, +STORE, 140428237828096, 140428239486975, +SNULL, 140428241584127, 140428241625087, +STORE, 140428239486976, 140428241584127, +STORE, 140428241584128, 140428241625087, +SNULL, 140428241584128, 140428241608703, +STORE, 140428241608704, 140428241625087, +STORE, 140428241584128, 140428241608703, +ERASE, 140428241584128, 140428241608703, +STORE, 140428241584128, 140428241608703, +ERASE, 140428241608704, 140428241625087, +STORE, 140428241608704, 140428241625087, +STORE, 140428235567104, 140428237828095, +SNULL, 140428235567104, 140428235718655, +STORE, 140428235718656, 140428237828095, +STORE, 140428235567104, 140428235718655, +SNULL, 140428237811711, 140428237828095, +STORE, 140428235718656, 140428237811711, +STORE, 140428237811712, 140428237828095, +SNULL, 140428237811712, 140428237819903, +STORE, 140428237819904, 140428237828095, +STORE, 140428237811712, 140428237819903, +ERASE, 140428237811712, 140428237819903, +STORE, 140428237811712, 140428237819903, +ERASE, 140428237819904, 140428237828095, +STORE, 140428237819904, 140428237828095, +STORE, 140428233445376, 140428235567103, +SNULL, 140428233445376, 140428233461759, +STORE, 140428233461760, 140428235567103, +STORE, 140428233445376, 140428233461759, +SNULL, 140428235558911, 140428235567103, +STORE, 140428233461760, 140428235558911, +STORE, 140428235558912, 140428235567103, +ERASE, 140428235558912, 140428235567103, +STORE, 140428235558912, 140428235567103, +STORE, 140428231315456, 140428233445375, +SNULL, 140428231315456, 140428231344127, +STORE, 140428231344128, 140428233445375, +STORE, 140428231315456, 140428231344127, +SNULL, 140428233437183, 140428233445375, +STORE, 140428231344128, 140428233437183, +STORE, 140428233437184, 140428233445375, +ERASE, 140428233437184, 140428233445375, +STORE, 140428233437184, 
140428233445375, +STORE, 140428248260608, 140428248268799, +STORE, 140428229062656, 140428231315455, +SNULL, 140428229062656, 140428229214207, +STORE, 140428229214208, 140428231315455, +STORE, 140428229062656, 140428229214207, +SNULL, 140428231307263, 140428231315455, +STORE, 140428229214208, 140428231307263, +STORE, 140428231307264, 140428231315455, +ERASE, 140428231307264, 140428231315455, +STORE, 140428231307264, 140428231315455, +STORE, 140428226891776, 140428229062655, +SNULL, 140428226891776, 140428226961407, +STORE, 140428226961408, 140428229062655, +STORE, 140428226891776, 140428226961407, +SNULL, 140428229054463, 140428229062655, +STORE, 140428226961408, 140428229054463, +STORE, 140428229054464, 140428229062655, +ERASE, 140428229054464, 140428229062655, +STORE, 140428229054464, 140428229062655, +STORE, 140428223680512, 140428226891775, +SNULL, 140428223680512, 140428224757759, +STORE, 140428224757760, 140428226891775, +STORE, 140428223680512, 140428224757759, +SNULL, 140428226854911, 140428226891775, +STORE, 140428224757760, 140428226854911, +STORE, 140428226854912, 140428226891775, +ERASE, 140428226854912, 140428226891775, +STORE, 140428226854912, 140428226891775, +STORE, 140428221546496, 140428223680511, +SNULL, 140428221546496, 140428221575167, +STORE, 140428221575168, 140428223680511, +STORE, 140428221546496, 140428221575167, +SNULL, 140428223672319, 140428223680511, +STORE, 140428221575168, 140428223672319, +STORE, 140428223672320, 140428223680511, +ERASE, 140428223672320, 140428223680511, +STORE, 140428223672320, 140428223680511, +STORE, 140428219236352, 140428221546495, +SNULL, 140428219236352, 140428219441151, +STORE, 140428219441152, 140428221546495, +STORE, 140428219236352, 140428219441151, +SNULL, 140428221538303, 140428221546495, +STORE, 140428219441152, 140428221538303, +STORE, 140428221538304, 140428221546495, +ERASE, 140428221538304, 140428221546495, +STORE, 140428221538304, 140428221546495, +STORE, 140428216852480, 140428219236351, +SNULL, 
140428216852480, 140428217044991, +STORE, 140428217044992, 140428219236351, +STORE, 140428216852480, 140428217044991, +SNULL, 140428219138047, 140428219236351, +STORE, 140428217044992, 140428219138047, +STORE, 140428219138048, 140428219236351, +ERASE, 140428219138048, 140428219236351, +STORE, 140428219138048, 140428219236351, +STORE, 140428248252416, 140428248268799, +STORE, 140428214284288, 140428216852479, +SNULL, 140428214284288, 140428214751231, +STORE, 140428214751232, 140428216852479, +STORE, 140428214284288, 140428214751231, +SNULL, 140428216844287, 140428216852479, +STORE, 140428214751232, 140428216844287, +STORE, 140428216844288, 140428216852479, +ERASE, 140428216844288, 140428216852479, +STORE, 140428216844288, 140428216852479, +STORE, 140428212170752, 140428214284287, +SNULL, 140428212170752, 140428212183039, +STORE, 140428212183040, 140428214284287, +STORE, 140428212170752, 140428212183039, +SNULL, 140428214276095, 140428214284287, +STORE, 140428212183040, 140428214276095, +STORE, 140428214276096, 140428214284287, +ERASE, 140428214276096, 140428214284287, +STORE, 140428214276096, 140428214284287, +STORE, 140428209991680, 140428212170751, +SNULL, 140428209991680, 140428210069503, +STORE, 140428210069504, 140428212170751, +STORE, 140428209991680, 140428210069503, +SNULL, 140428212162559, 140428212170751, +STORE, 140428210069504, 140428212162559, +STORE, 140428212162560, 140428212170751, +ERASE, 140428212162560, 140428212170751, +STORE, 140428212162560, 140428212170751, +STORE, 140428207874048, 140428209991679, +SNULL, 140428207874048, 140428207890431, +STORE, 140428207890432, 140428209991679, +STORE, 140428207874048, 140428207890431, +SNULL, 140428209983487, 140428209991679, +STORE, 140428207890432, 140428209983487, +STORE, 140428209983488, 140428209991679, +ERASE, 140428209983488, 140428209991679, +STORE, 140428209983488, 140428209991679, +STORE, 140428248244224, 140428248268799, +STORE, 140428248231936, 140428248268799, +SNULL, 140428241600511, 
140428241608703, +STORE, 140428241584128, 140428241600511, +STORE, 140428241600512, 140428241608703, +SNULL, 140428209987583, 140428209991679, +STORE, 140428209983488, 140428209987583, +STORE, 140428209987584, 140428209991679, +SNULL, 140428212166655, 140428212170751, +STORE, 140428212162560, 140428212166655, +STORE, 140428212166656, 140428212170751, +SNULL, 140428214280191, 140428214284287, +STORE, 140428214276096, 140428214280191, +STORE, 140428214280192, 140428214284287, +SNULL, 140428243820543, 140428243824639, +STORE, 140428243816448, 140428243820543, +STORE, 140428243820544, 140428243824639, +SNULL, 140428216848383, 140428216852479, +STORE, 140428216844288, 140428216848383, +STORE, 140428216848384, 140428216852479, +SNULL, 140428219232255, 140428219236351, +STORE, 140428219138048, 140428219232255, +STORE, 140428219232256, 140428219236351, +SNULL, 140428221542399, 140428221546495, +STORE, 140428221538304, 140428221542399, +STORE, 140428221542400, 140428221546495, +SNULL, 140428223676415, 140428223680511, +STORE, 140428223672320, 140428223676415, +STORE, 140428223676416, 140428223680511, +SNULL, 140428226863103, 140428226891775, +STORE, 140428226854912, 140428226863103, +STORE, 140428226863104, 140428226891775, +SNULL, 140428229058559, 140428229062655, +STORE, 140428229054464, 140428229058559, +STORE, 140428229058560, 140428229062655, +SNULL, 140428231311359, 140428231315455, +STORE, 140428231307264, 140428231311359, +STORE, 140428231311360, 140428231315455, +SNULL, 140428233441279, 140428233445375, +STORE, 140428233437184, 140428233441279, +STORE, 140428233441280, 140428233445375, +SNULL, 140428235563007, 140428235567103, +STORE, 140428235558912, 140428235563007, +STORE, 140428235563008, 140428235567103, +SNULL, 140428237815807, 140428237819903, +STORE, 140428237811712, 140428237815807, +STORE, 140428237815808, 140428237819903, +SNULL, 140428246056959, 140428246061055, +STORE, 140428245491712, 140428246056959, +STORE, 140428246056960, 140428246061055, +SNULL, 
94717834924031, 94717834928127, +STORE, 94717834919936, 94717834924031, +STORE, 94717834924032, 94717834928127, +SNULL, 140428248309759, 140428248313855, +STORE, 140428248305664, 140428248309759, +STORE, 140428248309760, 140428248313855, +ERASE, 140428248268800, 140428248297471, +STORE, 94717843058688, 94717843193855, +STORE, 94749677137920, 94749677559807, +STORE, 94749677563904, 94749677604863, +STORE, 94749677604864, 94749677608959, +STORE, 94749710970880, 94749711241215, +STORE, 140490884894720, 140490884935679, +STORE, 140490884935680, 140490887032831, +STORE, 140490887032832, 140490887036927, +STORE, 140490887036928, 140490887041023, +STORE, 140490887041024, 140490887065599, +STORE, 140490887065600, 140490887110655, +STORE, 140490887110656, 140490889203711, +STORE, 140490889203712, 140490889207807, +STORE, 140490889207808, 140490889211903, +STORE, 140490889211904, 140490889293823, +STORE, 140490889293824, 140490891390975, +STORE, 140490891390976, 140490891395071, +STORE, 140490891395072, 140490891399167, +STORE, 140490891399168, 140490891407359, +STORE, 140490891407360, 140490891436031, +STORE, 140490891436032, 140490893529087, +STORE, 140490893529088, 140490893533183, +STORE, 140490893533184, 140490893537279, +STORE, 140490893537280, 140490901979135, +STORE, 140490901979136, 140490901991423, +STORE, 140490901991424, 140490904084479, +STORE, 140490904084480, 140490904088575, +STORE, 140490904088576, 140490904092671, +STORE, 140490904092672, 140490904559615, +STORE, 140490904559616, 140490906652671, +STORE, 140490906652672, 140490906656767, +STORE, 140490906656768, 140490906660863, +STORE, 140490906660864, 140490906677247, +STORE, 140490906677248, 140490908770303, +STORE, 140490908770304, 140490908774399, +STORE, 140490908774400, 140490908778495, +STORE, 140490908778496, 140490908794879, +STORE, 140490908794880, 140490910887935, +STORE, 140490910887936, 140490910892031, +STORE, 140490910892032, 140490910896127, +STORE, 140490910896128, 140490912555007, +STORE, 
140490912555008, 140490914652159, +STORE, 140490914652160, 140490914668543, +STORE, 140490914668544, 140490914676735, +STORE, 140490914676736, 140490914693119, +STORE, 140490914693120, 140490914791423, +STORE, 140490914791424, 140490916884479, +STORE, 140490916884480, 140490916888575, +STORE, 140490916888576, 140490916892671, +STORE, 140490916892672, 140490916909055, +STORE, 140490916909056, 140490916937727, +STORE, 140490916937728, 140490919030783, +STORE, 140490919030784, 140490919034879, +STORE, 140490919034880, 140490919038975, +STORE, 140490919038976, 140490919190527, +STORE, 140490919190528, 140490921283583, +STORE, 140490921283584, 140490921287679, +STORE, 140490921287680, 140490921291775, +STORE, 140490921291776, 140490921299967, +STORE, 140490921299968, 140490921390079, +STORE, 140490921390080, 140490923483135, +STORE, 140490923483136, 140490923487231, +STORE, 140490923487232, 140490923491327, +STORE, 140490923491328, 140490923757567, +STORE, 140490923757568, 140490925850623, +STORE, 140490925850624, 140490925867007, +STORE, 140490925867008, 140490925871103, +STORE, 140490925871104, 140490925875199, +STORE, 140490925875200, 140490925903871, +STORE, 140490925903872, 140490928001023, +STORE, 140490928001024, 140490928005119, +STORE, 140490928005120, 140490928009215, +STORE, 140490928009216, 140490928152575, +STORE, 140490930184192, 140490930221055, +STORE, 140490930221056, 140490930237439, +STORE, 140490930237440, 140490930241535, +STORE, 140490930241536, 140490930245631, +STORE, 140490930245632, 140490930249727, +STORE, 140490930249728, 140490930253823, +STORE, 140490930253824, 140490930257919, +STORE, 140490930257920, 140490930262015, +STORE, 140724611694592, 140724611829759, +STORE, 140724612427776, 140724612440063, +STORE, 140724612440064, 140724612444159, +STORE, 94103163662336, 94103163772927, +STORE, 94103165865984, 94103165874175, +STORE, 94103165874176, 94103165878271, +STORE, 94103165878272, 94103165886463, +STORE, 94103182548992, 94103182684159, 
+STORE, 140092694708224, 140092696367103, +STORE, 140092696367104, 140092698464255, +STORE, 140092698464256, 140092698480639, +STORE, 140092698480640, 140092698488831, +STORE, 140092698488832, 140092698505215, +STORE, 140092698505216, 140092698648575, +STORE, 140092700708864, 140092700717055, +STORE, 140092700745728, 140092700749823, +STORE, 140092700749824, 140092700753919, +STORE, 140092700753920, 140092700758015, +STORE, 140736800911360, 140736801046527, +STORE, 140736802308096, 140736802320383, +STORE, 140736802320384, 140736802324479, +STORE, 93948802064384, 93948802174975, +STORE, 93948804268032, 93948804276223, +STORE, 93948804276224, 93948804280319, +STORE, 93948804280320, 93948804288511, +STORE, 93948806266880, 93948806402047, +STORE, 140222999113728, 140223000772607, +STORE, 140223000772608, 140223002869759, +STORE, 140223002869760, 140223002886143, +STORE, 140223002886144, 140223002894335, +STORE, 140223002894336, 140223002910719, +STORE, 140223002910720, 140223003054079, +STORE, 140223005114368, 140223005122559, +STORE, 140223005151232, 140223005155327, +STORE, 140223005155328, 140223005159423, +STORE, 140223005159424, 140223005163519, +STORE, 140720877506560, 140720877641727, +STORE, 140720878231552, 140720878243839, +STORE, 140720878243840, 140720878247935, +STORE, 140737488347136, 140737488351231, +STORE, 140733232087040, 140737488351231, +SNULL, 140733232091135, 140737488351231, +STORE, 140733232087040, 140733232091135, +STORE, 140733231955968, 140733232091135, +STORE, 4194304, 5128191, +STORE, 7221248, 7241727, +STORE, 7241728, 7249919, +STORE, 140161681321984, 140161683574783, +SNULL, 140161681465343, 140161683574783, +STORE, 140161681321984, 140161681465343, +STORE, 140161681465344, 140161683574783, +ERASE, 140161681465344, 140161683574783, +STORE, 140161683562496, 140161683570687, +STORE, 140161683570688, 140161683574783, +STORE, 140733232214016, 140733232218111, +STORE, 140733232201728, 140733232214015, +STORE, 140161683533824, 140161683562495, 
+STORE, 140161683525632, 140161683533823, +STORE, 140161678159872, 140161681321983, +SNULL, 140161678159872, 140161679220735, +STORE, 140161679220736, 140161681321983, +STORE, 140161678159872, 140161679220735, +SNULL, 140161681313791, 140161681321983, +STORE, 140161679220736, 140161681313791, +STORE, 140161681313792, 140161681321983, +ERASE, 140161681313792, 140161681321983, +STORE, 140161681313792, 140161681321983, +STORE, 140161674362880, 140161678159871, +SNULL, 140161674362880, 140161676021759, +STORE, 140161676021760, 140161678159871, +STORE, 140161674362880, 140161676021759, +SNULL, 140161678118911, 140161678159871, +STORE, 140161676021760, 140161678118911, +STORE, 140161678118912, 140161678159871, +SNULL, 140161678118912, 140161678143487, +STORE, 140161678143488, 140161678159871, +STORE, 140161678118912, 140161678143487, +ERASE, 140161678118912, 140161678143487, +STORE, 140161678118912, 140161678143487, +ERASE, 140161678143488, 140161678159871, +STORE, 140161678143488, 140161678159871, +STORE, 140161683513344, 140161683533823, +SNULL, 140161678135295, 140161678143487, +STORE, 140161678118912, 140161678135295, +STORE, 140161678135296, 140161678143487, +SNULL, 140161681317887, 140161681321983, +STORE, 140161681313792, 140161681317887, +STORE, 140161681317888, 140161681321983, +SNULL, 7233535, 7241727, +STORE, 7221248, 7233535, +STORE, 7233536, 7241727, +SNULL, 140161683566591, 140161683570687, +STORE, 140161683562496, 140161683566591, +STORE, 140161683566592, 140161683570687, +ERASE, 140161683533824, 140161683562495, +STORE, 25477120, 25612287, +STORE, 25477120, 25759743, +STORE, 140161681829888, 140161683513343, +STORE, 25477120, 25915391, +STORE, 25477120, 26054655, +SNULL, 25800703, 26054655, +STORE, 25477120, 25800703, +STORE, 25800704, 26054655, +ERASE, 25800704, 26054655, +STORE, 140737488347136, 140737488351231, +STORE, 140723218452480, 140737488351231, +SNULL, 140723218456575, 140737488351231, +STORE, 140723218452480, 140723218456575, +STORE, 
140723218321408, 140723218456575, +STORE, 4194304, 26279935, +STORE, 28372992, 28454911, +STORE, 28454912, 29806591, +STORE, 140398872264704, 140398874517503, +SNULL, 140398872408063, 140398874517503, +STORE, 140398872264704, 140398872408063, +STORE, 140398872408064, 140398874517503, +ERASE, 140398872408064, 140398874517503, +STORE, 140398874505216, 140398874513407, +STORE, 140398874513408, 140398874517503, +STORE, 140723219247104, 140723219251199, +STORE, 140723219234816, 140723219247103, +STORE, 140398874476544, 140398874505215, +STORE, 140398874468352, 140398874476543, +STORE, 140398868430848, 140398872264703, +SNULL, 140398868430848, 140398870138879, +STORE, 140398870138880, 140398872264703, +STORE, 140398868430848, 140398870138879, +SNULL, 140398872231935, 140398872264703, +STORE, 140398870138880, 140398872231935, +STORE, 140398872231936, 140398872264703, +ERASE, 140398872231936, 140398872264703, +STORE, 140398872231936, 140398872264703, +STORE, 140398866235392, 140398868430847, +SNULL, 140398866235392, 140398866329599, +STORE, 140398866329600, 140398868430847, +STORE, 140398866235392, 140398866329599, +SNULL, 140398868422655, 140398868430847, +STORE, 140398866329600, 140398868422655, +STORE, 140398868422656, 140398868430847, +ERASE, 140398868422656, 140398868430847, +STORE, 140398868422656, 140398868430847, +STORE, 140398863716352, 140398866235391, +SNULL, 140398863716352, 140398864130047, +STORE, 140398864130048, 140398866235391, +STORE, 140398863716352, 140398864130047, +SNULL, 140398866223103, 140398866235391, +STORE, 140398864130048, 140398866223103, +STORE, 140398866223104, 140398866235391, +ERASE, 140398866223104, 140398866235391, +STORE, 140398866223104, 140398866235391, +STORE, 140398861082624, 140398863716351, +SNULL, 140398861082624, 140398861611007, +STORE, 140398861611008, 140398863716351, +STORE, 140398861082624, 140398861611007, +SNULL, 140398863708159, 140398863716351, +STORE, 140398861611008, 140398863708159, +STORE, 140398863708160, 
140398863716351, +ERASE, 140398863708160, 140398863716351, +STORE, 140398863708160, 140398863716351, +STORE, 140398858969088, 140398861082623, +SNULL, 140398858969088, 140398858981375, +STORE, 140398858981376, 140398861082623, +STORE, 140398858969088, 140398858981375, +SNULL, 140398861074431, 140398861082623, +STORE, 140398858981376, 140398861074431, +STORE, 140398861074432, 140398861082623, +ERASE, 140398861074432, 140398861082623, +STORE, 140398861074432, 140398861082623, +STORE, 140398856765440, 140398858969087, +SNULL, 140398856765440, 140398856867839, +STORE, 140398856867840, 140398858969087, +STORE, 140398856765440, 140398856867839, +SNULL, 140398858960895, 140398858969087, +STORE, 140398856867840, 140398858960895, +STORE, 140398858960896, 140398858969087, +ERASE, 140398858960896, 140398858969087, +STORE, 140398858960896, 140398858969087, +STORE, 140398874460160, 140398874476543, +STORE, 140398853603328, 140398856765439, +SNULL, 140398853603328, 140398854664191, +STORE, 140398854664192, 140398856765439, +STORE, 140398853603328, 140398854664191, +SNULL, 140398856757247, 140398856765439, +STORE, 140398854664192, 140398856757247, +STORE, 140398856757248, 140398856765439, +ERASE, 140398856757248, 140398856765439, +STORE, 140398856757248, 140398856765439, +STORE, 140398849806336, 140398853603327, +SNULL, 140398849806336, 140398851465215, +STORE, 140398851465216, 140398853603327, +STORE, 140398849806336, 140398851465215, +SNULL, 140398853562367, 140398853603327, +STORE, 140398851465216, 140398853562367, +STORE, 140398853562368, 140398853603327, +SNULL, 140398853562368, 140398853586943, +STORE, 140398853586944, 140398853603327, +STORE, 140398853562368, 140398853586943, +ERASE, 140398853562368, 140398853586943, +STORE, 140398853562368, 140398853586943, +ERASE, 140398853586944, 140398853603327, +STORE, 140398853586944, 140398853603327, +STORE, 140398874447872, 140398874476543, +SNULL, 140398853578751, 140398853586943, +STORE, 140398853562368, 140398853578751, +STORE, 
140398853578752, 140398853586943, +SNULL, 140398856761343, 140398856765439, +STORE, 140398856757248, 140398856761343, +STORE, 140398856761344, 140398856765439, +SNULL, 140398858964991, 140398858969087, +STORE, 140398858960896, 140398858964991, +STORE, 140398858964992, 140398858969087, +SNULL, 140398861078527, 140398861082623, +STORE, 140398861074432, 140398861078527, +STORE, 140398861078528, 140398861082623, +SNULL, 140398863712255, 140398863716351, +STORE, 140398863708160, 140398863712255, +STORE, 140398863712256, 140398863716351, +SNULL, 140398866231295, 140398866235391, +STORE, 140398866223104, 140398866231295, +STORE, 140398866231296, 140398866235391, +SNULL, 140398868426751, 140398868430847, +STORE, 140398868422656, 140398868426751, +STORE, 140398868426752, 140398868430847, +SNULL, 140398872236031, 140398872264703, +STORE, 140398872231936, 140398872236031, +STORE, 140398872236032, 140398872264703, +SNULL, 28405759, 28454911, +STORE, 28372992, 28405759, +STORE, 28405760, 28454911, +SNULL, 140398874509311, 140398874513407, +STORE, 140398874505216, 140398874509311, +STORE, 140398874509312, 140398874513407, +ERASE, 140398874476544, 140398874505215, +STORE, 43278336, 43413503, +STORE, 140398872764416, 140398874447871, +STORE, 140398874501120, 140398874505215, +STORE, 140398872629248, 140398872764415, +STORE, 43278336, 43556863, +STORE, 140398847709184, 140398849806335, +STORE, 140398874492928, 140398874505215, +STORE, 140398874484736, 140398874505215, +STORE, 140398874447872, 140398874484735, +STORE, 140398872612864, 140398872764415, +STORE, 43278336, 43692031, +STORE, 43278336, 43880447, +STORE, 140398872604672, 140398872764415, +STORE, 140398872596480, 140398872764415, +STORE, 43278336, 44044287, +STORE, 140398872580096, 140398872764415, +STORE, 140737488347136, 140737488351231, +STORE, 140734403092480, 140737488351231, +SNULL, 140734403096575, 140737488351231, +STORE, 140734403092480, 140734403096575, +STORE, 140734402961408, 140734403096575, +STORE, 4194304, 
5128191, +STORE, 7221248, 7241727, +STORE, 7241728, 7249919, +STORE, 140240662380544, 140240664633343, +SNULL, 140240662523903, 140240664633343, +STORE, 140240662380544, 140240662523903, +STORE, 140240662523904, 140240664633343, +ERASE, 140240662523904, 140240664633343, +STORE, 140240664621056, 140240664629247, +STORE, 140240664629248, 140240664633343, +STORE, 140734403145728, 140734403149823, +STORE, 140734403133440, 140734403145727, +STORE, 140240664592384, 140240664621055, +STORE, 140240664584192, 140240664592383, +STORE, 140240659218432, 140240662380543, +SNULL, 140240659218432, 140240660279295, +STORE, 140240660279296, 140240662380543, +STORE, 140240659218432, 140240660279295, +SNULL, 140240662372351, 140240662380543, +STORE, 140240660279296, 140240662372351, +STORE, 140240662372352, 140240662380543, +ERASE, 140240662372352, 140240662380543, +STORE, 140240662372352, 140240662380543, +STORE, 140240655421440, 140240659218431, +SNULL, 140240655421440, 140240657080319, +STORE, 140240657080320, 140240659218431, +STORE, 140240655421440, 140240657080319, +SNULL, 140240659177471, 140240659218431, +STORE, 140240657080320, 140240659177471, +STORE, 140240659177472, 140240659218431, +SNULL, 140240659177472, 140240659202047, +STORE, 140240659202048, 140240659218431, +STORE, 140240659177472, 140240659202047, +ERASE, 140240659177472, 140240659202047, +STORE, 140240659177472, 140240659202047, +ERASE, 140240659202048, 140240659218431, +STORE, 140240659202048, 140240659218431, +STORE, 140240664571904, 140240664592383, +SNULL, 140240659193855, 140240659202047, +STORE, 140240659177472, 140240659193855, +STORE, 140240659193856, 140240659202047, +SNULL, 140240662376447, 140240662380543, +STORE, 140240662372352, 140240662376447, +STORE, 140240662376448, 140240662380543, +SNULL, 7233535, 7241727, +STORE, 7221248, 7233535, +STORE, 7233536, 7241727, +SNULL, 140240664625151, 140240664629247, +STORE, 140240664621056, 140240664625151, +STORE, 140240664625152, 140240664629247, +ERASE, 
140240664592384, 140240664621055, +STORE, 30646272, 30781439, +STORE, 30646272, 30928895, +STORE, 140240662888448, 140240664571903, +STORE, 94256659468288, 94256659578879, +STORE, 94256661671936, 94256661680127, +STORE, 94256661680128, 94256661684223, +STORE, 94256661684224, 94256661692415, +STORE, 94256687980544, 94256688115711, +STORE, 139801712504832, 139801714163711, +STORE, 139801714163712, 139801716260863, +STORE, 139801716260864, 139801716277247, +STORE, 139801716277248, 139801716285439, +STORE, 139801716285440, 139801716301823, +STORE, 139801716301824, 139801716445183, +STORE, 139801718505472, 139801718513663, +STORE, 139801718542336, 139801718546431, +STORE, 139801718546432, 139801718550527, +STORE, 139801718550528, 139801718554623, +STORE, 140721575538688, 140721575673855, +STORE, 140721577013248, 140721577025535, +STORE, 140721577025536, 140721577029631, +STORE, 140737488347136, 140737488351231, +STORE, 140729259393024, 140737488351231, +SNULL, 140729259397119, 140737488351231, +STORE, 140729259393024, 140729259397119, +STORE, 140729259261952, 140729259397119, +STORE, 4194304, 5128191, +STORE, 7221248, 7241727, +STORE, 7241728, 7249919, +STORE, 139682376638464, 139682378891263, +SNULL, 139682376781823, 139682378891263, +STORE, 139682376638464, 139682376781823, +STORE, 139682376781824, 139682378891263, +ERASE, 139682376781824, 139682378891263, +STORE, 139682378878976, 139682378887167, +STORE, 139682378887168, 139682378891263, +STORE, 140729260462080, 140729260466175, +STORE, 140729260449792, 140729260462079, +STORE, 139682378850304, 139682378878975, +STORE, 139682378842112, 139682378850303, +STORE, 139682373476352, 139682376638463, +SNULL, 139682373476352, 139682374537215, +STORE, 139682374537216, 139682376638463, +STORE, 139682373476352, 139682374537215, +SNULL, 139682376630271, 139682376638463, +STORE, 139682374537216, 139682376630271, +STORE, 139682376630272, 139682376638463, +ERASE, 139682376630272, 139682376638463, +STORE, 139682376630272, 
139682376638463, +STORE, 139682369679360, 139682373476351, +SNULL, 139682369679360, 139682371338239, +STORE, 139682371338240, 139682373476351, +STORE, 139682369679360, 139682371338239, +SNULL, 139682373435391, 139682373476351, +STORE, 139682371338240, 139682373435391, +STORE, 139682373435392, 139682373476351, +SNULL, 139682373435392, 139682373459967, +STORE, 139682373459968, 139682373476351, +STORE, 139682373435392, 139682373459967, +ERASE, 139682373435392, 139682373459967, +STORE, 139682373435392, 139682373459967, +ERASE, 139682373459968, 139682373476351, +STORE, 139682373459968, 139682373476351, +STORE, 139682378829824, 139682378850303, +SNULL, 139682373451775, 139682373459967, +STORE, 139682373435392, 139682373451775, +STORE, 139682373451776, 139682373459967, +SNULL, 139682376634367, 139682376638463, +STORE, 139682376630272, 139682376634367, +STORE, 139682376634368, 139682376638463, +SNULL, 7233535, 7241727, +STORE, 7221248, 7233535, +STORE, 7233536, 7241727, +SNULL, 139682378883071, 139682378887167, +STORE, 139682378878976, 139682378883071, +STORE, 139682378883072, 139682378887167, +ERASE, 139682378850304, 139682378878975, +STORE, 10022912, 10158079, +STORE, 10022912, 10305535, +STORE, 139682377146368, 139682378829823, +STORE, 140737488347136, 140737488351231, +STORE, 140731831926784, 140737488351231, +SNULL, 140731831930879, 140737488351231, +STORE, 140731831926784, 140731831930879, +STORE, 140731831795712, 140731831930879, +STORE, 94615305261056, 94615307485183, +SNULL, 94615305371647, 94615307485183, +STORE, 94615305261056, 94615305371647, +STORE, 94615305371648, 94615307485183, +ERASE, 94615305371648, 94615307485183, +STORE, 94615307464704, 94615307476991, +STORE, 94615307476992, 94615307485183, +STORE, 140163912994816, 140163915247615, +SNULL, 140163913138175, 140163915247615, +STORE, 140163912994816, 140163913138175, +STORE, 140163913138176, 140163915247615, +ERASE, 140163913138176, 140163915247615, +STORE, 140163915235328, 140163915243519, +STORE, 
140163915243520, 140163915247615, +STORE, 140731832217600, 140731832221695, +STORE, 140731832205312, 140731832217599, +STORE, 140163915206656, 140163915235327, +STORE, 140163915198464, 140163915206655, +STORE, 140163909197824, 140163912994815, +SNULL, 140163909197824, 140163910856703, +STORE, 140163910856704, 140163912994815, +STORE, 140163909197824, 140163910856703, +SNULL, 140163912953855, 140163912994815, +STORE, 140163910856704, 140163912953855, +STORE, 140163912953856, 140163912994815, +SNULL, 140163912953856, 140163912978431, +STORE, 140163912978432, 140163912994815, +STORE, 140163912953856, 140163912978431, +ERASE, 140163912953856, 140163912978431, +STORE, 140163912953856, 140163912978431, +ERASE, 140163912978432, 140163912994815, +STORE, 140163912978432, 140163912994815, +SNULL, 140163912970239, 140163912978431, +STORE, 140163912953856, 140163912970239, +STORE, 140163912970240, 140163912978431, +SNULL, 94615307472895, 94615307476991, +STORE, 94615307464704, 94615307472895, +STORE, 94615307472896, 94615307476991, +SNULL, 140163915239423, 140163915243519, +STORE, 140163915235328, 140163915239423, +STORE, 140163915239424, 140163915243519, +ERASE, 140163915206656, 140163915235327, +STORE, 94615330672640, 94615330807807, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140725254479872, 140737488351231, +SNULL, 140725254488063, 140737488351231, +STORE, 140725254479872, 140725254488063, +STORE, 140725254348800, 140725254488063, +STORE, 94572781277184, 94572785741823, +SNULL, 94572783312895, 94572785741823, +STORE, 94572781277184, 94572783312895, +STORE, 94572783312896, 94572785741823, +ERASE, 94572783312896, 94572785741823, +STORE, 94572785405952, 94572785455103, +STORE, 94572785455104, 94572785741823, +STORE, 139636001341440, 139636003594239, +SNULL, 139636001484799, 139636003594239, +STORE, 139636001341440, 139636001484799, +STORE, 139636001484800, 139636003594239, +ERASE, 139636001484800, 139636003594239, +STORE, 
139636003581952, 139636003590143, +STORE, 139636003590144, 139636003594239, +STORE, 140725255557120, 140725255561215, +STORE, 140725255544832, 140725255557119, +STORE, 139636003553280, 139636003581951, +STORE, 139636003545088, 139636003553279, +STORE, 139635998773248, 139636001341439, +SNULL, 139635998773248, 139635999240191, +STORE, 139635999240192, 139636001341439, +STORE, 139635998773248, 139635999240191, +SNULL, 139636001333247, 139636001341439, +STORE, 139635999240192, 139636001333247, +STORE, 139636001333248, 139636001341439, +ERASE, 139636001333248, 139636001341439, +STORE, 139636001333248, 139636001341439, +STORE, 139635996569600, 139635998773247, +SNULL, 139635996569600, 139635996671999, +STORE, 139635996672000, 139635998773247, +STORE, 139635996569600, 139635996671999, +SNULL, 139635998765055, 139635998773247, +STORE, 139635996672000, 139635998765055, +STORE, 139635998765056, 139635998773247, +ERASE, 139635998765056, 139635998773247, +STORE, 139635998765056, 139635998773247, +STORE, 139635994353664, 139635996569599, +SNULL, 139635994353664, 139635994451967, +STORE, 139635994451968, 139635996569599, +STORE, 139635994353664, 139635994451967, +SNULL, 139635996545023, 139635996569599, +STORE, 139635994451968, 139635996545023, +STORE, 139635996545024, 139635996569599, +SNULL, 139635996545024, 139635996553215, +STORE, 139635996553216, 139635996569599, +STORE, 139635996545024, 139635996553215, +ERASE, 139635996545024, 139635996553215, +STORE, 139635996545024, 139635996553215, +ERASE, 139635996553216, 139635996569599, +STORE, 139635996553216, 139635996569599, +STORE, 139635992223744, 139635994353663, +SNULL, 139635992223744, 139635992252415, +STORE, 139635992252416, 139635994353663, +STORE, 139635992223744, 139635992252415, +SNULL, 139635994345471, 139635994353663, +STORE, 139635992252416, 139635994345471, +STORE, 139635994345472, 139635994353663, +ERASE, 139635994345472, 139635994353663, +STORE, 139635994345472, 139635994353663, +STORE, 139635988426752, 
139635992223743, +SNULL, 139635988426752, 139635990085631, +STORE, 139635990085632, 139635992223743, +STORE, 139635988426752, 139635990085631, +SNULL, 139635992182783, 139635992223743, +STORE, 139635990085632, 139635992182783, +STORE, 139635992182784, 139635992223743, +SNULL, 139635992182784, 139635992207359, +STORE, 139635992207360, 139635992223743, +STORE, 139635992182784, 139635992207359, +ERASE, 139635992182784, 139635992207359, +STORE, 139635992182784, 139635992207359, +ERASE, 139635992207360, 139635992223743, +STORE, 139635992207360, 139635992223743, +STORE, 139636003536896, 139636003553279, +SNULL, 139635992199167, 139635992207359, +STORE, 139635992182784, 139635992199167, +STORE, 139635992199168, 139635992207359, +SNULL, 139635996549119, 139635996553215, +STORE, 139635996545024, 139635996549119, +STORE, 139635996549120, 139635996553215, +SNULL, 139635994349567, 139635994353663, +STORE, 139635994345472, 139635994349567, +STORE, 139635994349568, 139635994353663, +SNULL, 139635998769151, 139635998773247, +STORE, 139635998765056, 139635998769151, +STORE, 139635998769152, 139635998773247, +SNULL, 139636001337343, 139636001341439, +STORE, 139636001333248, 139636001337343, +STORE, 139636001337344, 139636001341439, +SNULL, 94572785418239, 94572785455103, +STORE, 94572785405952, 94572785418239, +STORE, 94572785418240, 94572785455103, +SNULL, 139636003586047, 139636003590143, +STORE, 139636003581952, 139636003586047, +STORE, 139636003586048, 139636003590143, +ERASE, 139636003553280, 139636003581951, +STORE, 94572798435328, 94572798570495, +STORE, 139636001853440, 139636003536895, +STORE, 139635981426688, 139635988426751, +STORE, 139635980615680, 139635981426687, +STORE, 94572798435328, 94572798705663, +STORE, 94572798435328, 94572798840831, +STORE, 94572798435328, 94572798975999, +STORE, 94572798435328, 94572799111167, +STORE, 94572798435328, 94572799246335, +STORE, 94572798435328, 94572799381503, +STORE, 94572798435328, 94572799516671, +STORE, 94572798435328, 
94572799651839, +STORE, 94572798435328, 94572799787007, +STORE, 94572798435328, 94572799922175, +STORE, 94572798435328, 94572800057343, +STORE, 94572798435328, 94572800192511, +STORE, 94572798435328, 94572800327679, +STORE, 94572798435328, 94572800462847, +STORE, 94572798435328, 94572800598015, +STORE, 94572798435328, 94572800733183, +STORE, 94572798435328, 94572800868351, +STORE, 94572798435328, 94572801003519, +STORE, 94572798435328, 94572801138687, +STORE, 94572798435328, 94572801273855, +STORE, 94572798435328, 94572801409023, +STORE, 94572798435328, 94572801544191, +STORE, 94572798435328, 94572801679359, +STORE, 94572798435328, 94572801814527, +STORE, 94572798435328, 94572801949695, +STORE, 94572798435328, 94572802084863, +STORE, 94572798435328, 94572802220031, +STORE, 94572798435328, 94572802355199, +STORE, 94572798435328, 94572802490367, +STORE, 94572798435328, 94572802625535, +STORE, 94572798435328, 94572802760703, +STORE, 94572798435328, 94572802895871, +STORE, 94572798435328, 94572803031039, +STORE, 94572798435328, 94572803166207, +STORE, 94572798435328, 94572803301375, +STORE, 94572798435328, 94572803436543, +STORE, 94572798435328, 94572803571711, +STORE, 94572798435328, 94572803706879, +STORE, 94572798435328, 94572803842047, +STORE, 94572798435328, 94572803977215, +STORE, 94572798435328, 94572804112383, +STORE, 94572798435328, 94572804247551, +STORE, 94572798435328, 94572804382719, +STORE, 94572798435328, 94572804517887, +STORE, 94572798435328, 94572804653055, +STORE, 94572798435328, 94572804788223, +STORE, 94572798435328, 94572804923391, +STORE, 94572798435328, 94572805058559, +STORE, 94572798435328, 94572805193727, +STORE, 94572798435328, 94572805328895, +STORE, 94572798435328, 94572805464063, +STORE, 94572798435328, 94572805599231, +STORE, 94572798435328, 94572805734399, +STORE, 94572798435328, 94572805869567, +STORE, 94572798435328, 94572806004735, +STORE, 94572798435328, 94572806139903, +STORE, 94572798435328, 94572806275071, +STORE, 94572798435328, 
94572806410239, +STORE, 94572798435328, 94572806545407, +STORE, 94572798435328, 94572806680575, +STORE, 94572798435328, 94572806815743, +STORE, 94572798435328, 94572806950911, +STORE, 94572798435328, 94572807086079, +STORE, 94572798435328, 94572807221247, +STORE, 94572798435328, 94572807356415, +STORE, 94572798435328, 94572807491583, +STORE, 94572798435328, 94572807626751, +STORE, 94572798435328, 94572807761919, +STORE, 94572798435328, 94572807897087, +STORE, 94572798435328, 94572808032255, +STORE, 94572798435328, 94572808167423, +STORE, 94572798435328, 94572808302591, +STORE, 94572798435328, 94572808437759, +STORE, 94572798435328, 94572808572927, +ERASE, 139635981426688, 139635988426751, +STORE, 139635985088512, 139635988426751, +STORE, 139635778273280, 139635980615679, +STORE, 139635567632384, 139635778273279, +STORE, 94572798435328, 94572808716287, +STORE, 139635984564224, 139635985088511, +STORE, 139635559239680, 139635567632383, +SNULL, 139635559243775, 139635567632383, +STORE, 139635559239680, 139635559243775, +STORE, 139635559243776, 139635567632383, +STORE, 139635550846976, 139635559239679, +SNULL, 139635550851071, 139635559239679, +STORE, 139635550846976, 139635550851071, +STORE, 139635550851072, 139635559239679, +STORE, 139635542454272, 139635550846975, +STORE, 139635408236544, 139635542454271, +SNULL, 139635408236544, 139635426590719, +STORE, 139635426590720, 139635542454271, +STORE, 139635408236544, 139635426590719, +ERASE, 139635408236544, 139635426590719, +STORE, 139635292372992, 139635542454271, +SNULL, 139635359481855, 139635542454271, +STORE, 139635292372992, 139635359481855, +STORE, 139635359481856, 139635542454271, +SNULL, 139635359481856, 139635426590719, +STORE, 139635426590720, 139635542454271, +STORE, 139635359481856, 139635426590719, +ERASE, 139635359481856, 139635426590719, +SNULL, 139635542458367, 139635550846975, +STORE, 139635542454272, 139635542458367, +STORE, 139635542458368, 139635550846975, +STORE, 139635418198016, 139635426590719, 
+SNULL, 139635493699583, 139635542454271, +STORE, 139635426590720, 139635493699583, +STORE, 139635493699584, 139635542454271, +ERASE, 139635493699584, 139635542454271, +SNULL, 139635426725887, 139635493699583, +STORE, 139635426590720, 139635426725887, +STORE, 139635426725888, 139635493699583, +SNULL, 139635292508159, 139635359481855, +STORE, 139635292372992, 139635292508159, +STORE, 139635292508160, 139635359481855, +SNULL, 139635418202111, 139635426590719, +STORE, 139635418198016, 139635418202111, +STORE, 139635418202112, 139635426590719, +STORE, 139635225264128, 139635292372991, +STORE, 139635534061568, 139635542454271, +SNULL, 139635534065663, 139635542454271, +STORE, 139635534061568, 139635534065663, +STORE, 139635534065664, 139635542454271, +STORE, 139635525668864, 139635534061567, +SNULL, 139635525672959, 139635534061567, +STORE, 139635525668864, 139635525672959, +STORE, 139635525672960, 139635534061567, +SNULL, 139635225399295, 139635292372991, +STORE, 139635225264128, 139635225399295, +STORE, 139635225399296, 139635292372991, +STORE, 139635091046400, 139635225264127, +SNULL, 139635158155263, 139635225264127, +STORE, 139635091046400, 139635158155263, +STORE, 139635158155264, 139635225264127, +ERASE, 139635158155264, 139635225264127, +STORE, 139634956828672, 139635158155263, +STORE, 139635517276160, 139635525668863, +SNULL, 139635517280255, 139635525668863, +STORE, 139635517276160, 139635517280255, +STORE, 139635517280256, 139635525668863, +SNULL, 139634956828672, 139635091046399, +STORE, 139635091046400, 139635158155263, +STORE, 139634956828672, 139635091046399, +SNULL, 139635091181567, 139635158155263, +STORE, 139635091046400, 139635091181567, +STORE, 139635091181568, 139635158155263, +SNULL, 139635023937535, 139635091046399, +STORE, 139634956828672, 139635023937535, +STORE, 139635023937536, 139635091046399, +ERASE, 139635023937536, 139635091046399, +STORE, 139634956828672, 139635091046399, +SNULL, 139634956828672, 139635023937535, +STORE, 139635023937536, 
139635091046399, +STORE, 139634956828672, 139635023937535, +SNULL, 139635024072703, 139635091046399, +STORE, 139635023937536, 139635024072703, +STORE, 139635024072704, 139635091046399, +STORE, 139635508883456, 139635517276159, +SNULL, 139635508887551, 139635517276159, +STORE, 139635508883456, 139635508887551, +STORE, 139635508887552, 139635517276159, +STORE, 139634822610944, 139635023937535, +SNULL, 139634822610944, 139634956828671, +STORE, 139634956828672, 139635023937535, +STORE, 139634822610944, 139634956828671, +SNULL, 139634956963839, 139635023937535, +STORE, 139634956828672, 139634956963839, +STORE, 139634956963840, 139635023937535, +STORE, 139635500490752, 139635508883455, +SNULL, 139634889719807, 139634956828671, +STORE, 139634822610944, 139634889719807, +STORE, 139634889719808, 139634956828671, +ERASE, 139634889719808, 139634956828671, +SNULL, 139635500494847, 139635508883455, +STORE, 139635500490752, 139635500494847, +STORE, 139635500494848, 139635508883455, +SNULL, 139634822746111, 139634889719807, +STORE, 139634822610944, 139634822746111, +STORE, 139634822746112, 139634889719807, +STORE, 139635409805312, 139635418198015, +STORE, 139634822746112, 139634956828671, +SNULL, 139634822746112, 139634889719807, +STORE, 139634889719808, 139634956828671, +STORE, 139634822746112, 139634889719807, +SNULL, 139634889854975, 139634956828671, +STORE, 139634889719808, 139634889854975, +STORE, 139634889854976, 139634956828671, +SNULL, 139635409809407, 139635418198015, +STORE, 139635409805312, 139635409809407, +STORE, 139635409809408, 139635418198015, +STORE, 139635401412608, 139635409805311, +STORE, 139634688393216, 139634822610943, +SNULL, 139634755502079, 139634822610943, +STORE, 139634688393216, 139634755502079, +STORE, 139634755502080, 139634822610943, +ERASE, 139634755502080, 139634822610943, +SNULL, 139635401416703, 139635409805311, +STORE, 139635401412608, 139635401416703, +STORE, 139635401416704, 139635409805311, +STORE, 139634554175488, 139634755502079, +SNULL, 
139634554175488, 139634688393215, +STORE, 139634688393216, 139634755502079, +STORE, 139634554175488, 139634688393215, +SNULL, 139634688528383, 139634755502079, +STORE, 139634688393216, 139634688528383, +STORE, 139634688528384, 139634755502079, +STORE, 139635393019904, 139635401412607, +SNULL, 139634621284351, 139634688393215, +STORE, 139634554175488, 139634621284351, +STORE, 139634621284352, 139634688393215, +ERASE, 139634621284352, 139634688393215, +SNULL, 139634554310655, 139634621284351, +STORE, 139634554175488, 139634554310655, +STORE, 139634554310656, 139634621284351, +STORE, 139634554310656, 139634688393215, +SNULL, 139635393023999, 139635401412607, +STORE, 139635393019904, 139635393023999, +STORE, 139635393024000, 139635401412607, +SNULL, 139634554310656, 139634621284351, +STORE, 139634621284352, 139634688393215, +STORE, 139634554310656, 139634621284351, +SNULL, 139634621419519, 139634688393215, +STORE, 139634621284352, 139634621419519, +STORE, 139634621419520, 139634688393215, +STORE, 139635384627200, 139635393019903, +SNULL, 139635384631295, 139635393019903, +STORE, 139635384627200, 139635384631295, +STORE, 139635384631296, 139635393019903, +STORE, 139635376234496, 139635384627199, +SNULL, 139635376238591, 139635384627199, +STORE, 139635376234496, 139635376238591, +STORE, 139635376238592, 139635384627199, +STORE, 139635367841792, 139635376234495, +SNULL, 139635367845887, 139635376234495, +STORE, 139635367841792, 139635367845887, +STORE, 139635367845888, 139635376234495, +STORE, 139634419957760, 139634554175487, +SNULL, 139634487066623, 139634554175487, +STORE, 139634419957760, 139634487066623, +STORE, 139634487066624, 139634554175487, +ERASE, 139634487066624, 139634554175487, +STORE, 139635216871424, 139635225264127, +SNULL, 139635216875519, 139635225264127, +STORE, 139635216871424, 139635216875519, +STORE, 139635216875520, 139635225264127, +SNULL, 139634420092927, 139634487066623, +STORE, 139634419957760, 139634420092927, +STORE, 139634420092928, 
139634487066623, +STORE, 139635208478720, 139635216871423, +SNULL, 139635208482815, 139635216871423, +STORE, 139635208478720, 139635208482815, +STORE, 139635208482816, 139635216871423, +STORE, 139635200086016, 139635208478719, +SNULL, 139635200090111, 139635208478719, +STORE, 139635200086016, 139635200090111, +STORE, 139635200090112, 139635208478719, +STORE, 139635191693312, 139635200086015, +SNULL, 139635191697407, 139635200086015, +STORE, 139635191693312, 139635191697407, +STORE, 139635191697408, 139635200086015, +STORE, 139635183300608, 139635191693311, +SNULL, 139635183304703, 139635191693311, +STORE, 139635183300608, 139635183304703, +STORE, 139635183304704, 139635191693311, +STORE, 139634420092928, 139634554175487, +SNULL, 139634420092928, 139634487066623, +STORE, 139634487066624, 139634554175487, +STORE, 139634420092928, 139634487066623, +SNULL, 139634487201791, 139634554175487, +STORE, 139634487066624, 139634487201791, +STORE, 139634487201792, 139634554175487, +ERASE, 139635559239680, 139635559243775, +ERASE, 139635559243776, 139635567632383, +ERASE, 139635550846976, 139635550851071, +ERASE, 139635550851072, 139635559239679, +ERASE, 139635542454272, 139635542458367, +ERASE, 139635542458368, 139635550846975, +ERASE, 139635418198016, 139635418202111, +ERASE, 139635418202112, 139635426590719, +ERASE, 139635534061568, 139635534065663, +ERASE, 139635534065664, 139635542454271, +ERASE, 139635525668864, 139635525672959, +ERASE, 139635525672960, 139635534061567, +ERASE, 139635517276160, 139635517280255, +ERASE, 139635517280256, 139635525668863, +ERASE, 139635508883456, 139635508887551, +ERASE, 139635508887552, 139635517276159, +ERASE, 139635500490752, 139635500494847, +ERASE, 139635500494848, 139635508883455, +ERASE, 139635409805312, 139635409809407, +ERASE, 139635409809408, 139635418198015, +ERASE, 139635401412608, 139635401416703, +ERASE, 139635401416704, 139635409805311, +ERASE, 139635393019904, 139635393023999, +ERASE, 139635393024000, 139635401412607, +ERASE, 
139635384627200, 139635384631295, +ERASE, 139635384631296, 139635393019903, + }; + unsigned long set25[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722547441664, 140737488351231, +SNULL, 140722547449855, 140737488351231, +STORE, 140722547441664, 140722547449855, +STORE, 140722547310592, 140722547449855, +STORE, 94827521732608, 94827523956735, +SNULL, 94827521843199, 94827523956735, +STORE, 94827521732608, 94827521843199, +STORE, 94827521843200, 94827523956735, +ERASE, 94827521843200, 94827523956735, +STORE, 94827523936256, 94827523948543, +STORE, 94827523948544, 94827523956735, +STORE, 139816136847360, 139816139100159, +SNULL, 139816136990719, 139816139100159, +STORE, 139816136847360, 139816136990719, +STORE, 139816136990720, 139816139100159, +ERASE, 139816136990720, 139816139100159, +STORE, 139816139087872, 139816139096063, +STORE, 139816139096064, 139816139100159, +STORE, 140722548142080, 140722548146175, +STORE, 140722548129792, 140722548142079, +STORE, 139816139059200, 139816139087871, +STORE, 139816139051008, 139816139059199, +STORE, 139816133050368, 139816136847359, +SNULL, 139816133050368, 139816134709247, +STORE, 139816134709248, 139816136847359, +STORE, 139816133050368, 139816134709247, +SNULL, 139816136806399, 139816136847359, +STORE, 139816134709248, 139816136806399, +STORE, 139816136806400, 139816136847359, +SNULL, 139816136806400, 139816136830975, +STORE, 139816136830976, 139816136847359, +STORE, 139816136806400, 139816136830975, +ERASE, 139816136806400, 139816136830975, +STORE, 139816136806400, 139816136830975, +ERASE, 139816136830976, 139816136847359, +STORE, 139816136830976, 139816136847359, +SNULL, 139816136822783, 139816136830975, +STORE, 139816136806400, 139816136822783, +STORE, 139816136822784, 139816136830975, +SNULL, 94827523944447, 94827523948543, +STORE, 94827523936256, 94827523944447, +STORE, 94827523944448, 94827523948543, +SNULL, 139816139091967, 139816139096063, +STORE, 
139816139087872, 139816139091967, +STORE, 139816139091968, 139816139096063, +ERASE, 139816139059200, 139816139087871, +STORE, 94827534970880, 94827535106047, +STORE, 94114394132480, 94114394345471, +STORE, 94114396442624, 94114396446719, +STORE, 94114396446720, 94114396454911, +STORE, 94114396454912, 94114396467199, +STORE, 94114421575680, 94114427715583, +STORE, 139934313955328, 139934315614207, +STORE, 139934315614208, 139934317711359, +STORE, 139934317711360, 139934317727743, +STORE, 139934317727744, 139934317735935, +STORE, 139934317735936, 139934317752319, +STORE, 139934317752320, 139934317764607, +STORE, 139934317764608, 139934319857663, +STORE, 139934319857664, 139934319861759, +STORE, 139934319861760, 139934319865855, +STORE, 139934319865856, 139934320009215, +STORE, 139934320377856, 139934322061311, +STORE, 139934322061312, 139934322077695, +STORE, 139934322106368, 139934322110463, +STORE, 139934322110464, 139934322114559, +STORE, 139934322114560, 139934322118655, +STORE, 140731200376832, 140731200516095, +STORE, 140731200929792, 140731200942079, +STORE, 140731200942080, 140731200946175, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140734133174272, 140737488351231, +SNULL, 140734133182463, 140737488351231, +STORE, 140734133174272, 140734133182463, +STORE, 140734133043200, 140734133182463, +STORE, 94412675600384, 94412677824511, +SNULL, 94412675710975, 94412677824511, +STORE, 94412675600384, 94412675710975, +STORE, 94412675710976, 94412677824511, +ERASE, 94412675710976, 94412677824511, +STORE, 94412677804032, 94412677816319, +STORE, 94412677816320, 94412677824511, +STORE, 140320087945216, 140320090198015, +SNULL, 140320088088575, 140320090198015, +STORE, 140320087945216, 140320088088575, +STORE, 140320088088576, 140320090198015, +ERASE, 140320088088576, 140320090198015, +STORE, 140320090185728, 140320090193919, +STORE, 140320090193920, 140320090198015, +STORE, 140734134591488, 140734134595583, +STORE, 
140734134579200, 140734134591487, +STORE, 140320090157056, 140320090185727, +STORE, 140320090148864, 140320090157055, +STORE, 140320084148224, 140320087945215, +SNULL, 140320084148224, 140320085807103, +STORE, 140320085807104, 140320087945215, +STORE, 140320084148224, 140320085807103, +SNULL, 140320087904255, 140320087945215, +STORE, 140320085807104, 140320087904255, +STORE, 140320087904256, 140320087945215, +SNULL, 140320087904256, 140320087928831, +STORE, 140320087928832, 140320087945215, +STORE, 140320087904256, 140320087928831, +ERASE, 140320087904256, 140320087928831, +STORE, 140320087904256, 140320087928831, +ERASE, 140320087928832, 140320087945215, +STORE, 140320087928832, 140320087945215, +SNULL, 140320087920639, 140320087928831, +STORE, 140320087904256, 140320087920639, +STORE, 140320087920640, 140320087928831, +SNULL, 94412677812223, 94412677816319, +STORE, 94412677804032, 94412677812223, +STORE, 94412677812224, 94412677816319, +SNULL, 140320090189823, 140320090193919, +STORE, 140320090185728, 140320090189823, +STORE, 140320090189824, 140320090193919, +ERASE, 140320090157056, 140320090185727, +STORE, 94412684546048, 94412684681215, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140723005485056, 140737488351231, +SNULL, 140723005493247, 140737488351231, +STORE, 140723005485056, 140723005493247, +STORE, 140723005353984, 140723005493247, +STORE, 94387431936000, 94387434160127, +SNULL, 94387432046591, 94387434160127, +STORE, 94387431936000, 94387432046591, +STORE, 94387432046592, 94387434160127, +ERASE, 94387432046592, 94387434160127, +STORE, 94387434139648, 94387434151935, +STORE, 94387434151936, 94387434160127, +STORE, 140151675392000, 140151677644799, +SNULL, 140151675535359, 140151677644799, +STORE, 140151675392000, 140151675535359, +STORE, 140151675535360, 140151677644799, +ERASE, 140151675535360, 140151677644799, +STORE, 140151677632512, 140151677640703, +STORE, 140151677640704, 140151677644799, +STORE, 
140723005784064, 140723005788159, +STORE, 140723005771776, 140723005784063, +STORE, 140151677603840, 140151677632511, +STORE, 140151677595648, 140151677603839, +STORE, 140151671595008, 140151675391999, +SNULL, 140151671595008, 140151673253887, +STORE, 140151673253888, 140151675391999, +STORE, 140151671595008, 140151673253887, +SNULL, 140151675351039, 140151675391999, +STORE, 140151673253888, 140151675351039, +STORE, 140151675351040, 140151675391999, +SNULL, 140151675351040, 140151675375615, +STORE, 140151675375616, 140151675391999, +STORE, 140151675351040, 140151675375615, +ERASE, 140151675351040, 140151675375615, +STORE, 140151675351040, 140151675375615, +ERASE, 140151675375616, 140151675391999, +STORE, 140151675375616, 140151675391999, +SNULL, 140151675367423, 140151675375615, +STORE, 140151675351040, 140151675367423, +STORE, 140151675367424, 140151675375615, +SNULL, 94387434147839, 94387434151935, +STORE, 94387434139648, 94387434147839, +STORE, 94387434147840, 94387434151935, +SNULL, 140151677636607, 140151677640703, +STORE, 140151677632512, 140151677636607, +STORE, 140151677636608, 140151677640703, +ERASE, 140151677603840, 140151677632511, +STORE, 94387458818048, 94387458953215, +STORE, 94909010997248, 94909011210239, +STORE, 94909013307392, 94909013311487, +STORE, 94909013311488, 94909013319679, +STORE, 94909013319680, 94909013331967, +STORE, 94909014827008, 94909023371263, +STORE, 140712411975680, 140712413634559, +STORE, 140712413634560, 140712415731711, +STORE, 140712415731712, 140712415748095, +STORE, 140712415748096, 140712415756287, +STORE, 140712415756288, 140712415772671, +STORE, 140712415772672, 140712415784959, +STORE, 140712415784960, 140712417878015, +STORE, 140712417878016, 140712417882111, +STORE, 140712417882112, 140712417886207, +STORE, 140712417886208, 140712418029567, +STORE, 140712418398208, 140712420081663, +STORE, 140712420081664, 140712420098047, +STORE, 140712420126720, 140712420130815, +STORE, 140712420130816, 140712420134911, +STORE, 
140712420134912, 140712420139007, +STORE, 140729293111296, 140729293250559, +STORE, 140729293307904, 140729293320191, +STORE, 140729293320192, 140729293324287, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140720541691904, 140737488351231, +SNULL, 140720541700095, 140737488351231, +STORE, 140720541691904, 140720541700095, +STORE, 140720541560832, 140720541700095, +STORE, 94203603419136, 94203605643263, +SNULL, 94203603529727, 94203605643263, +STORE, 94203603419136, 94203603529727, +STORE, 94203603529728, 94203605643263, +ERASE, 94203603529728, 94203605643263, +STORE, 94203605622784, 94203605635071, +STORE, 94203605635072, 94203605643263, +STORE, 139847623081984, 139847625334783, +SNULL, 139847623225343, 139847625334783, +STORE, 139847623081984, 139847623225343, +STORE, 139847623225344, 139847625334783, +ERASE, 139847623225344, 139847625334783, +STORE, 139847625322496, 139847625330687, +STORE, 139847625330688, 139847625334783, +STORE, 140720542547968, 140720542552063, +STORE, 140720542535680, 140720542547967, +STORE, 139847625293824, 139847625322495, +STORE, 139847625285632, 139847625293823, +STORE, 139847619284992, 139847623081983, +SNULL, 139847619284992, 139847620943871, +STORE, 139847620943872, 139847623081983, +STORE, 139847619284992, 139847620943871, +SNULL, 139847623041023, 139847623081983, +STORE, 139847620943872, 139847623041023, +STORE, 139847623041024, 139847623081983, +SNULL, 139847623041024, 139847623065599, +STORE, 139847623065600, 139847623081983, +STORE, 139847623041024, 139847623065599, +ERASE, 139847623041024, 139847623065599, +STORE, 139847623041024, 139847623065599, +ERASE, 139847623065600, 139847623081983, +STORE, 139847623065600, 139847623081983, +SNULL, 139847623057407, 139847623065599, +STORE, 139847623041024, 139847623057407, +STORE, 139847623057408, 139847623065599, +SNULL, 94203605630975, 94203605635071, +STORE, 94203605622784, 94203605630975, +STORE, 94203605630976, 94203605635071, +SNULL, 
139847625326591, 139847625330687, +STORE, 139847625322496, 139847625326591, +STORE, 139847625326592, 139847625330687, +ERASE, 139847625293824, 139847625322495, +STORE, 94203634880512, 94203635015679, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140721428738048, 140737488351231, +SNULL, 140721428746239, 140737488351231, +STORE, 140721428738048, 140721428746239, +STORE, 140721428606976, 140721428746239, +STORE, 93968808378368, 93968810602495, +SNULL, 93968808488959, 93968810602495, +STORE, 93968808378368, 93968808488959, +STORE, 93968808488960, 93968810602495, +ERASE, 93968808488960, 93968810602495, +STORE, 93968810582016, 93968810594303, +STORE, 93968810594304, 93968810602495, +STORE, 140397757026304, 140397759279103, +SNULL, 140397757169663, 140397759279103, +STORE, 140397757026304, 140397757169663, +STORE, 140397757169664, 140397759279103, +ERASE, 140397757169664, 140397759279103, +STORE, 140397759266816, 140397759275007, +STORE, 140397759275008, 140397759279103, +STORE, 140721430368256, 140721430372351, +STORE, 140721430355968, 140721430368255, +STORE, 140397759238144, 140397759266815, +STORE, 140397759229952, 140397759238143, +STORE, 140397753229312, 140397757026303, +SNULL, 140397753229312, 140397754888191, +STORE, 140397754888192, 140397757026303, +STORE, 140397753229312, 140397754888191, +SNULL, 140397756985343, 140397757026303, +STORE, 140397754888192, 140397756985343, +STORE, 140397756985344, 140397757026303, +SNULL, 140397756985344, 140397757009919, +STORE, 140397757009920, 140397757026303, +STORE, 140397756985344, 140397757009919, +ERASE, 140397756985344, 140397757009919, +STORE, 140397756985344, 140397757009919, +ERASE, 140397757009920, 140397757026303, +STORE, 140397757009920, 140397757026303, +SNULL, 140397757001727, 140397757009919, +STORE, 140397756985344, 140397757001727, +STORE, 140397757001728, 140397757009919, +SNULL, 93968810590207, 93968810594303, +STORE, 93968810582016, 93968810590207, +STORE, 
93968810590208, 93968810594303, +SNULL, 140397759270911, 140397759275007, +STORE, 140397759266816, 140397759270911, +STORE, 140397759270912, 140397759275007, +ERASE, 140397759238144, 140397759266815, +STORE, 93968837025792, 93968837160959, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140721751044096, 140737488351231, +SNULL, 140721751052287, 140737488351231, +STORE, 140721751044096, 140721751052287, +STORE, 140721750913024, 140721751052287, +STORE, 94426051657728, 94426053881855, +SNULL, 94426051768319, 94426053881855, +STORE, 94426051657728, 94426051768319, +STORE, 94426051768320, 94426053881855, +ERASE, 94426051768320, 94426053881855, +STORE, 94426053861376, 94426053873663, +STORE, 94426053873664, 94426053881855, +STORE, 140228456181760, 140228458434559, +SNULL, 140228456325119, 140228458434559, +STORE, 140228456181760, 140228456325119, +STORE, 140228456325120, 140228458434559, +ERASE, 140228456325120, 140228458434559, +STORE, 140228458422272, 140228458430463, +STORE, 140228458430464, 140228458434559, +STORE, 140721751117824, 140721751121919, +STORE, 140721751105536, 140721751117823, +STORE, 140228458393600, 140228458422271, +STORE, 140228458385408, 140228458393599, +STORE, 140228452384768, 140228456181759, +SNULL, 140228452384768, 140228454043647, +STORE, 140228454043648, 140228456181759, +STORE, 140228452384768, 140228454043647, +SNULL, 140228456140799, 140228456181759, +STORE, 140228454043648, 140228456140799, +STORE, 140228456140800, 140228456181759, +SNULL, 140228456140800, 140228456165375, +STORE, 140228456165376, 140228456181759, +STORE, 140228456140800, 140228456165375, +ERASE, 140228456140800, 140228456165375, +STORE, 140228456140800, 140228456165375, +ERASE, 140228456165376, 140228456181759, +STORE, 140228456165376, 140228456181759, +SNULL, 140228456157183, 140228456165375, +STORE, 140228456140800, 140228456157183, +STORE, 140228456157184, 140228456165375, +SNULL, 94426053869567, 94426053873663, +STORE, 
94426053861376, 94426053869567, +STORE, 94426053869568, 94426053873663, +SNULL, 140228458426367, 140228458430463, +STORE, 140228458422272, 140228458426367, +STORE, 140228458426368, 140228458430463, +ERASE, 140228458393600, 140228458422271, +STORE, 94426073681920, 94426073817087, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140732727623680, 140737488351231, +SNULL, 140732727631871, 140737488351231, +STORE, 140732727623680, 140732727631871, +STORE, 140732727492608, 140732727631871, +STORE, 94537485996032, 94537488220159, +SNULL, 94537486106623, 94537488220159, +STORE, 94537485996032, 94537486106623, +STORE, 94537486106624, 94537488220159, +ERASE, 94537486106624, 94537488220159, +STORE, 94537488199680, 94537488211967, +STORE, 94537488211968, 94537488220159, +STORE, 140446578036736, 140446580289535, +SNULL, 140446578180095, 140446580289535, +STORE, 140446578036736, 140446578180095, +STORE, 140446578180096, 140446580289535, +ERASE, 140446578180096, 140446580289535, +STORE, 140446580277248, 140446580285439, +STORE, 140446580285440, 140446580289535, +STORE, 140732727758848, 140732727762943, +STORE, 140732727746560, 140732727758847, +STORE, 140446580248576, 140446580277247, +STORE, 140446580240384, 140446580248575, +STORE, 140446574239744, 140446578036735, +SNULL, 140446574239744, 140446575898623, +STORE, 140446575898624, 140446578036735, +STORE, 140446574239744, 140446575898623, +SNULL, 140446577995775, 140446578036735, +STORE, 140446575898624, 140446577995775, +STORE, 140446577995776, 140446578036735, +SNULL, 140446577995776, 140446578020351, +STORE, 140446578020352, 140446578036735, +STORE, 140446577995776, 140446578020351, +ERASE, 140446577995776, 140446578020351, +STORE, 140446577995776, 140446578020351, +ERASE, 140446578020352, 140446578036735, +STORE, 140446578020352, 140446578036735, +SNULL, 140446578012159, 140446578020351, +STORE, 140446577995776, 140446578012159, +STORE, 140446578012160, 140446578020351, +SNULL, 
94537488207871, 94537488211967, +STORE, 94537488199680, 94537488207871, +STORE, 94537488207872, 94537488211967, +SNULL, 140446580281343, 140446580285439, +STORE, 140446580277248, 140446580281343, +STORE, 140446580281344, 140446580285439, +ERASE, 140446580248576, 140446580277247, +STORE, 94537489014784, 94537489149951, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140728766808064, 140737488351231, +SNULL, 140728766816255, 140737488351231, +STORE, 140728766808064, 140728766816255, +STORE, 140728766676992, 140728766816255, +STORE, 94418513866752, 94418516090879, +SNULL, 94418513977343, 94418516090879, +STORE, 94418513866752, 94418513977343, +STORE, 94418513977344, 94418516090879, +ERASE, 94418513977344, 94418516090879, +STORE, 94418516070400, 94418516082687, +STORE, 94418516082688, 94418516090879, +STORE, 140556479520768, 140556481773567, +SNULL, 140556479664127, 140556481773567, +STORE, 140556479520768, 140556479664127, +STORE, 140556479664128, 140556481773567, +ERASE, 140556479664128, 140556481773567, +STORE, 140556481761280, 140556481769471, +STORE, 140556481769472, 140556481773567, +STORE, 140728767148032, 140728767152127, +STORE, 140728767135744, 140728767148031, +STORE, 140556481732608, 140556481761279, +STORE, 140556481724416, 140556481732607, +STORE, 140556475723776, 140556479520767, +SNULL, 140556475723776, 140556477382655, +STORE, 140556477382656, 140556479520767, +STORE, 140556475723776, 140556477382655, +SNULL, 140556479479807, 140556479520767, +STORE, 140556477382656, 140556479479807, +STORE, 140556479479808, 140556479520767, +SNULL, 140556479479808, 140556479504383, +STORE, 140556479504384, 140556479520767, +STORE, 140556479479808, 140556479504383, +ERASE, 140556479479808, 140556479504383, +STORE, 140556479479808, 140556479504383, +ERASE, 140556479504384, 140556479520767, +STORE, 140556479504384, 140556479520767, +SNULL, 140556479496191, 140556479504383, +STORE, 140556479479808, 140556479496191, +STORE, 
140556479496192, 140556479504383, +SNULL, 94418516078591, 94418516082687, +STORE, 94418516070400, 94418516078591, +STORE, 94418516078592, 94418516082687, +SNULL, 140556481765375, 140556481769471, +STORE, 140556481761280, 140556481765375, +STORE, 140556481765376, 140556481769471, +ERASE, 140556481732608, 140556481761279, +STORE, 94418541113344, 94418541248511, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140723945873408, 140737488351231, +SNULL, 140723945881599, 140737488351231, +STORE, 140723945873408, 140723945881599, +STORE, 140723945742336, 140723945881599, +STORE, 94543169773568, 94543171997695, +SNULL, 94543169884159, 94543171997695, +STORE, 94543169773568, 94543169884159, +STORE, 94543169884160, 94543171997695, +ERASE, 94543169884160, 94543171997695, +STORE, 94543171977216, 94543171989503, +STORE, 94543171989504, 94543171997695, +STORE, 139890420883456, 139890423136255, +SNULL, 139890421026815, 139890423136255, +STORE, 139890420883456, 139890421026815, +STORE, 139890421026816, 139890423136255, +ERASE, 139890421026816, 139890423136255, +STORE, 139890423123968, 139890423132159, +STORE, 139890423132160, 139890423136255, +STORE, 140723946102784, 140723946106879, +STORE, 140723946090496, 140723946102783, +STORE, 139890423095296, 139890423123967, +STORE, 139890423087104, 139890423095295, +STORE, 139890417086464, 139890420883455, +SNULL, 139890417086464, 139890418745343, +STORE, 139890418745344, 139890420883455, +STORE, 139890417086464, 139890418745343, +SNULL, 139890420842495, 139890420883455, +STORE, 139890418745344, 139890420842495, +STORE, 139890420842496, 139890420883455, +SNULL, 139890420842496, 139890420867071, +STORE, 139890420867072, 139890420883455, +STORE, 139890420842496, 139890420867071, +ERASE, 139890420842496, 139890420867071, +STORE, 139890420842496, 139890420867071, +ERASE, 139890420867072, 139890420883455, +STORE, 139890420867072, 139890420883455, +SNULL, 139890420858879, 139890420867071, +STORE, 
139890420842496, 139890420858879, +STORE, 139890420858880, 139890420867071, +SNULL, 94543171985407, 94543171989503, +STORE, 94543171977216, 94543171985407, +STORE, 94543171985408, 94543171989503, +SNULL, 139890423128063, 139890423132159, +STORE, 139890423123968, 139890423128063, +STORE, 139890423128064, 139890423132159, +ERASE, 139890423095296, 139890423123967, +STORE, 94543197097984, 94543197233151, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140736205979648, 140737488351231, +SNULL, 140736205987839, 140737488351231, +STORE, 140736205979648, 140736205987839, +STORE, 140736205848576, 140736205987839, +STORE, 94913209913344, 94913212137471, +SNULL, 94913210023935, 94913212137471, +STORE, 94913209913344, 94913210023935, +STORE, 94913210023936, 94913212137471, +ERASE, 94913210023936, 94913212137471, +STORE, 94913212116992, 94913212129279, +STORE, 94913212129280, 94913212137471, +STORE, 140006323052544, 140006325305343, +SNULL, 140006323195903, 140006325305343, +STORE, 140006323052544, 140006323195903, +STORE, 140006323195904, 140006325305343, +ERASE, 140006323195904, 140006325305343, +STORE, 140006325293056, 140006325301247, +STORE, 140006325301248, 140006325305343, +STORE, 140736206716928, 140736206721023, +STORE, 140736206704640, 140736206716927, +STORE, 140006325264384, 140006325293055, +STORE, 140006325256192, 140006325264383, +STORE, 140006319255552, 140006323052543, +SNULL, 140006319255552, 140006320914431, +STORE, 140006320914432, 140006323052543, +STORE, 140006319255552, 140006320914431, +SNULL, 140006323011583, 140006323052543, +STORE, 140006320914432, 140006323011583, +STORE, 140006323011584, 140006323052543, +SNULL, 140006323011584, 140006323036159, +STORE, 140006323036160, 140006323052543, +STORE, 140006323011584, 140006323036159, +ERASE, 140006323011584, 140006323036159, +STORE, 140006323011584, 140006323036159, +ERASE, 140006323036160, 140006323052543, +STORE, 140006323036160, 140006323052543, +SNULL, 
140006323027967, 140006323036159, +STORE, 140006323011584, 140006323027967, +STORE, 140006323027968, 140006323036159, +SNULL, 94913212125183, 94913212129279, +STORE, 94913212116992, 94913212125183, +STORE, 94913212125184, 94913212129279, +SNULL, 140006325297151, 140006325301247, +STORE, 140006325293056, 140006325297151, +STORE, 140006325297152, 140006325301247, +ERASE, 140006325264384, 140006325293055, +STORE, 94913239932928, 94913240068095, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140726926897152, 140737488351231, +SNULL, 140726926905343, 140737488351231, +STORE, 140726926897152, 140726926905343, +STORE, 140726926766080, 140726926905343, +STORE, 94213246820352, 94213249044479, +SNULL, 94213246930943, 94213249044479, +STORE, 94213246820352, 94213246930943, +STORE, 94213246930944, 94213249044479, +ERASE, 94213246930944, 94213249044479, +STORE, 94213249024000, 94213249036287, +STORE, 94213249036288, 94213249044479, +STORE, 140368830242816, 140368832495615, +SNULL, 140368830386175, 140368832495615, +STORE, 140368830242816, 140368830386175, +STORE, 140368830386176, 140368832495615, +ERASE, 140368830386176, 140368832495615, +STORE, 140368832483328, 140368832491519, +STORE, 140368832491520, 140368832495615, +STORE, 140726926999552, 140726927003647, +STORE, 140726926987264, 140726926999551, +STORE, 140368832454656, 140368832483327, +STORE, 140368832446464, 140368832454655, +STORE, 140368826445824, 140368830242815, +SNULL, 140368826445824, 140368828104703, +STORE, 140368828104704, 140368830242815, +STORE, 140368826445824, 140368828104703, +SNULL, 140368830201855, 140368830242815, +STORE, 140368828104704, 140368830201855, +STORE, 140368830201856, 140368830242815, +SNULL, 140368830201856, 140368830226431, +STORE, 140368830226432, 140368830242815, +STORE, 140368830201856, 140368830226431, +ERASE, 140368830201856, 140368830226431, +STORE, 140368830201856, 140368830226431, +ERASE, 140368830226432, 140368830242815, +STORE, 
140368830226432, 140368830242815, +SNULL, 140368830218239, 140368830226431, +STORE, 140368830201856, 140368830218239, +STORE, 140368830218240, 140368830226431, +SNULL, 94213249032191, 94213249036287, +STORE, 94213249024000, 94213249032191, +STORE, 94213249032192, 94213249036287, +SNULL, 140368832487423, 140368832491519, +STORE, 140368832483328, 140368832487423, +STORE, 140368832487424, 140368832491519, +ERASE, 140368832454656, 140368832483327, +STORE, 94213267435520, 94213267570687, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140728954130432, 140737488351231, +SNULL, 140728954138623, 140737488351231, +STORE, 140728954130432, 140728954138623, +STORE, 140728953999360, 140728954138623, +STORE, 94672570966016, 94672573190143, +SNULL, 94672571076607, 94672573190143, +STORE, 94672570966016, 94672571076607, +STORE, 94672571076608, 94672573190143, +ERASE, 94672571076608, 94672573190143, +STORE, 94672573169664, 94672573181951, +STORE, 94672573181952, 94672573190143, +STORE, 140201696735232, 140201698988031, +SNULL, 140201696878591, 140201698988031, +STORE, 140201696735232, 140201696878591, +STORE, 140201696878592, 140201698988031, +ERASE, 140201696878592, 140201698988031, +STORE, 140201698975744, 140201698983935, +STORE, 140201698983936, 140201698988031, +STORE, 140728954163200, 140728954167295, +STORE, 140728954150912, 140728954163199, +STORE, 140201698947072, 140201698975743, +STORE, 140201698938880, 140201698947071, +STORE, 140201692938240, 140201696735231, +SNULL, 140201692938240, 140201694597119, +STORE, 140201694597120, 140201696735231, +STORE, 140201692938240, 140201694597119, +SNULL, 140201696694271, 140201696735231, +STORE, 140201694597120, 140201696694271, +STORE, 140201696694272, 140201696735231, +SNULL, 140201696694272, 140201696718847, +STORE, 140201696718848, 140201696735231, +STORE, 140201696694272, 140201696718847, +ERASE, 140201696694272, 140201696718847, +STORE, 140201696694272, 140201696718847, +ERASE, 
140201696718848, 140201696735231, +STORE, 140201696718848, 140201696735231, +SNULL, 140201696710655, 140201696718847, +STORE, 140201696694272, 140201696710655, +STORE, 140201696710656, 140201696718847, +SNULL, 94672573177855, 94672573181951, +STORE, 94672573169664, 94672573177855, +STORE, 94672573177856, 94672573181951, +SNULL, 140201698979839, 140201698983935, +STORE, 140201698975744, 140201698979839, +STORE, 140201698979840, 140201698983935, +ERASE, 140201698947072, 140201698975743, +STORE, 94672595689472, 94672595824639, +STORE, 94114394132480, 94114394345471, +STORE, 94114396442624, 94114396446719, +STORE, 94114396446720, 94114396454911, +STORE, 94114396454912, 94114396467199, +STORE, 94114421575680, 94114428256255, +STORE, 139934313955328, 139934315614207, +STORE, 139934315614208, 139934317711359, +STORE, 139934317711360, 139934317727743, +STORE, 139934317727744, 139934317735935, +STORE, 139934317735936, 139934317752319, +STORE, 139934317752320, 139934317764607, +STORE, 139934317764608, 139934319857663, +STORE, 139934319857664, 139934319861759, +STORE, 139934319861760, 139934319865855, +STORE, 139934319865856, 139934320009215, +STORE, 139934320377856, 139934322061311, +STORE, 139934322061312, 139934322077695, +STORE, 139934322106368, 139934322110463, +STORE, 139934322110464, 139934322114559, +STORE, 139934322114560, 139934322118655, +STORE, 140731200376832, 140731200516095, +STORE, 140731200929792, 140731200942079, +STORE, 140731200942080, 140731200946175, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140721532362752, 140737488351231, +SNULL, 140721532370943, 140737488351231, +STORE, 140721532362752, 140721532370943, +STORE, 140721532231680, 140721532370943, +STORE, 94467222597632, 94467224821759, +SNULL, 94467222708223, 94467224821759, +STORE, 94467222597632, 94467222708223, +STORE, 94467222708224, 94467224821759, +ERASE, 94467222708224, 94467224821759, +STORE, 94467224801280, 94467224813567, +STORE, 
94467224813568, 94467224821759, +STORE, 140191433543680, 140191435796479, +SNULL, 140191433687039, 140191435796479, +STORE, 140191433543680, 140191433687039, +STORE, 140191433687040, 140191435796479, +ERASE, 140191433687040, 140191435796479, +STORE, 140191435784192, 140191435792383, +STORE, 140191435792384, 140191435796479, +STORE, 140721533034496, 140721533038591, +STORE, 140721533022208, 140721533034495, +STORE, 140191435755520, 140191435784191, +STORE, 140191435747328, 140191435755519, +STORE, 140191429746688, 140191433543679, +SNULL, 140191429746688, 140191431405567, +STORE, 140191431405568, 140191433543679, +STORE, 140191429746688, 140191431405567, +SNULL, 140191433502719, 140191433543679, +STORE, 140191431405568, 140191433502719, +STORE, 140191433502720, 140191433543679, +SNULL, 140191433502720, 140191433527295, +STORE, 140191433527296, 140191433543679, +STORE, 140191433502720, 140191433527295, +ERASE, 140191433502720, 140191433527295, +STORE, 140191433502720, 140191433527295, +ERASE, 140191433527296, 140191433543679, +STORE, 140191433527296, 140191433543679, +SNULL, 140191433519103, 140191433527295, +STORE, 140191433502720, 140191433519103, +STORE, 140191433519104, 140191433527295, +SNULL, 94467224809471, 94467224813567, +STORE, 94467224801280, 94467224809471, +STORE, 94467224809472, 94467224813567, +SNULL, 140191435788287, 140191435792383, +STORE, 140191435784192, 140191435788287, +STORE, 140191435788288, 140191435792383, +ERASE, 140191435755520, 140191435784191, +STORE, 94467251847168, 94467251982335, +STORE, 94367895400448, 94367895613439, +STORE, 94367897710592, 94367897714687, +STORE, 94367897714688, 94367897722879, +STORE, 94367897722880, 94367897735167, +STORE, 94367925264384, 94367926861823, +STORE, 139801317548032, 139801319206911, +STORE, 139801319206912, 139801321304063, +STORE, 139801321304064, 139801321320447, +STORE, 139801321320448, 139801321328639, +STORE, 139801321328640, 139801321345023, +STORE, 139801321345024, 139801321357311, +STORE, 
139801321357312, 139801323450367, +STORE, 139801323450368, 139801323454463, +STORE, 139801323454464, 139801323458559, +STORE, 139801323458560, 139801323601919, +STORE, 139801323970560, 139801325654015, +STORE, 139801325654016, 139801325670399, +STORE, 139801325699072, 139801325703167, +STORE, 139801325703168, 139801325707263, +STORE, 139801325707264, 139801325711359, +STORE, 140724442861568, 140724443000831, +STORE, 140724443611136, 140724443623423, +STORE, 140724443623424, 140724443627519, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140731353149440, 140737488351231, +SNULL, 140731353157631, 140737488351231, +STORE, 140731353149440, 140731353157631, +STORE, 140731353018368, 140731353157631, +STORE, 94310379503616, 94310381838335, +SNULL, 94310379716607, 94310381838335, +STORE, 94310379503616, 94310379716607, +STORE, 94310379716608, 94310381838335, +ERASE, 94310379716608, 94310381838335, +STORE, 94310381813760, 94310381826047, +STORE, 94310381826048, 94310381838335, +STORE, 140515434659840, 140515436912639, +SNULL, 140515434803199, 140515436912639, +STORE, 140515434659840, 140515434803199, +STORE, 140515434803200, 140515436912639, +ERASE, 140515434803200, 140515436912639, +STORE, 140515436900352, 140515436908543, +STORE, 140515436908544, 140515436912639, +STORE, 140731353886720, 140731353890815, +STORE, 140731353874432, 140731353886719, +STORE, 140515436871680, 140515436900351, +STORE, 140515436863488, 140515436871679, +STORE, 140515432546304, 140515434659839, +SNULL, 140515432546304, 140515432558591, +STORE, 140515432558592, 140515434659839, +STORE, 140515432546304, 140515432558591, +SNULL, 140515434651647, 140515434659839, +STORE, 140515432558592, 140515434651647, +STORE, 140515434651648, 140515434659839, +ERASE, 140515434651648, 140515434659839, +STORE, 140515434651648, 140515434659839, +STORE, 140515428749312, 140515432546303, +SNULL, 140515428749312, 140515430408191, +STORE, 140515430408192, 140515432546303, 
+STORE, 140515428749312, 140515430408191, +SNULL, 140515432505343, 140515432546303, +STORE, 140515430408192, 140515432505343, +STORE, 140515432505344, 140515432546303, +SNULL, 140515432505344, 140515432529919, +STORE, 140515432529920, 140515432546303, +STORE, 140515432505344, 140515432529919, +ERASE, 140515432505344, 140515432529919, +STORE, 140515432505344, 140515432529919, +ERASE, 140515432529920, 140515432546303, +STORE, 140515432529920, 140515432546303, +STORE, 140515436855296, 140515436871679, +SNULL, 140515432521727, 140515432529919, +STORE, 140515432505344, 140515432521727, +STORE, 140515432521728, 140515432529919, +SNULL, 140515434655743, 140515434659839, +STORE, 140515434651648, 140515434655743, +STORE, 140515434655744, 140515434659839, +SNULL, 94310381817855, 94310381826047, +STORE, 94310381813760, 94310381817855, +STORE, 94310381817856, 94310381826047, +SNULL, 140515436904447, 140515436908543, +STORE, 140515436900352, 140515436904447, +STORE, 140515436904448, 140515436908543, +ERASE, 140515436871680, 140515436900351, +STORE, 94310395457536, 94310395592703, +STORE, 140515435171840, 140515436855295, +STORE, 94310395457536, 94310395727871, +STORE, 94310395457536, 94310395863039, +STORE, 94310395457536, 94310396047359, +SNULL, 94310396022783, 94310396047359, +STORE, 94310395457536, 94310396022783, +STORE, 94310396022784, 94310396047359, +ERASE, 94310396022784, 94310396047359, +STORE, 94310395457536, 94310396157951, +STORE, 94310395457536, 94310396293119, +SNULL, 94310396276735, 94310396293119, +STORE, 94310395457536, 94310396276735, +STORE, 94310396276736, 94310396293119, +ERASE, 94310396276736, 94310396293119, +STORE, 94310395457536, 94310396411903, +SNULL, 94310396383231, 94310396411903, +STORE, 94310395457536, 94310396383231, +STORE, 94310396383232, 94310396411903, +ERASE, 94310396383232, 94310396411903, +STORE, 94310395457536, 94310396522495, +STORE, 94310395457536, 94310396674047, +SNULL, 94310396657663, 94310396674047, +STORE, 94310395457536, 
94310396657663, +STORE, 94310396657664, 94310396674047, +ERASE, 94310396657664, 94310396674047, +SNULL, 94310396624895, 94310396657663, +STORE, 94310395457536, 94310396624895, +STORE, 94310396624896, 94310396657663, +ERASE, 94310396624896, 94310396657663, +STORE, 94310395457536, 94310396776447, +SNULL, 94310396764159, 94310396776447, +STORE, 94310395457536, 94310396764159, +STORE, 94310396764160, 94310396776447, +ERASE, 94310396764160, 94310396776447, +SNULL, 94310396739583, 94310396764159, +STORE, 94310395457536, 94310396739583, +STORE, 94310396739584, 94310396764159, +ERASE, 94310396739584, 94310396764159, +STORE, 94310395457536, 94310396882943, +STORE, 94310395457536, 94310397018111, +STORE, 94310395457536, 94310397161471, +STORE, 94310395457536, 94310397300735, +SNULL, 94310397292543, 94310397300735, +STORE, 94310395457536, 94310397292543, +STORE, 94310397292544, 94310397300735, +ERASE, 94310397292544, 94310397300735, +STORE, 94359222210560, 94359222423551, +STORE, 94359224520704, 94359224524799, +STORE, 94359224524800, 94359224532991, +STORE, 94359224532992, 94359224545279, +STORE, 94359238348800, 94359239385087, +STORE, 140675699838976, 140675701497855, +STORE, 140675701497856, 140675703595007, +STORE, 140675703595008, 140675703611391, +STORE, 140675703611392, 140675703619583, +STORE, 140675703619584, 140675703635967, +STORE, 140675703635968, 140675703648255, +STORE, 140675703648256, 140675705741311, +STORE, 140675705741312, 140675705745407, +STORE, 140675705745408, 140675705749503, +STORE, 140675705749504, 140675705892863, +STORE, 140675706261504, 140675707944959, +STORE, 140675707944960, 140675707961343, +STORE, 140675707990016, 140675707994111, +STORE, 140675707994112, 140675707998207, +STORE, 140675707998208, 140675708002303, +STORE, 140721324634112, 140721324773375, +STORE, 140721324810240, 140721324822527, +STORE, 140721324822528, 140721324826623, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 
140724099678208, 140737488351231, +SNULL, 140724099686399, 140737488351231, +STORE, 140724099678208, 140724099686399, +STORE, 140724099547136, 140724099686399, +STORE, 94586638516224, 94586640850943, +SNULL, 94586638729215, 94586640850943, +STORE, 94586638516224, 94586638729215, +STORE, 94586638729216, 94586640850943, +ERASE, 94586638729216, 94586640850943, +STORE, 94586640826368, 94586640838655, +STORE, 94586640838656, 94586640850943, +STORE, 140371033796608, 140371036049407, +SNULL, 140371033939967, 140371036049407, +STORE, 140371033796608, 140371033939967, +STORE, 140371033939968, 140371036049407, +ERASE, 140371033939968, 140371036049407, +STORE, 140371036037120, 140371036045311, +STORE, 140371036045312, 140371036049407, +STORE, 140724100001792, 140724100005887, +STORE, 140724099989504, 140724100001791, +STORE, 140371036008448, 140371036037119, +STORE, 140371036000256, 140371036008447, +STORE, 140371031683072, 140371033796607, +SNULL, 140371031683072, 140371031695359, +STORE, 140371031695360, 140371033796607, +STORE, 140371031683072, 140371031695359, +SNULL, 140371033788415, 140371033796607, +STORE, 140371031695360, 140371033788415, +STORE, 140371033788416, 140371033796607, +ERASE, 140371033788416, 140371033796607, +STORE, 140371033788416, 140371033796607, +STORE, 140371027886080, 140371031683071, +SNULL, 140371027886080, 140371029544959, +STORE, 140371029544960, 140371031683071, +STORE, 140371027886080, 140371029544959, +SNULL, 140371031642111, 140371031683071, +STORE, 140371029544960, 140371031642111, +STORE, 140371031642112, 140371031683071, +SNULL, 140371031642112, 140371031666687, +STORE, 140371031666688, 140371031683071, +STORE, 140371031642112, 140371031666687, +ERASE, 140371031642112, 140371031666687, +STORE, 140371031642112, 140371031666687, +ERASE, 140371031666688, 140371031683071, +STORE, 140371031666688, 140371031683071, +STORE, 140371035992064, 140371036008447, +SNULL, 140371031658495, 140371031666687, +STORE, 140371031642112, 140371031658495, 
+STORE, 140371031658496, 140371031666687, +SNULL, 140371033792511, 140371033796607, +STORE, 140371033788416, 140371033792511, +STORE, 140371033792512, 140371033796607, +SNULL, 94586640830463, 94586640838655, +STORE, 94586640826368, 94586640830463, +STORE, 94586640830464, 94586640838655, +SNULL, 140371036041215, 140371036045311, +STORE, 140371036037120, 140371036041215, +STORE, 140371036041216, 140371036045311, +ERASE, 140371036008448, 140371036037119, +STORE, 94586663849984, 94586663985151, +STORE, 140371034308608, 140371035992063, +STORE, 94586663849984, 94586664120319, +STORE, 94586663849984, 94586664255487, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140727532937216, 140737488351231, +SNULL, 140727532945407, 140737488351231, +STORE, 140727532937216, 140727532945407, +STORE, 140727532806144, 140727532945407, +STORE, 94849780191232, 94849782525951, +SNULL, 94849780404223, 94849782525951, +STORE, 94849780191232, 94849780404223, +STORE, 94849780404224, 94849782525951, +ERASE, 94849780404224, 94849782525951, +STORE, 94849782501376, 94849782513663, +STORE, 94849782513664, 94849782525951, +STORE, 140382070218752, 140382072471551, +SNULL, 140382070362111, 140382072471551, +STORE, 140382070218752, 140382070362111, +STORE, 140382070362112, 140382072471551, +ERASE, 140382070362112, 140382072471551, +STORE, 140382072459264, 140382072467455, +STORE, 140382072467456, 140382072471551, +STORE, 140727533092864, 140727533096959, +STORE, 140727533080576, 140727533092863, +STORE, 140382072430592, 140382072459263, +STORE, 140382072422400, 140382072430591, +STORE, 140382068105216, 140382070218751, +SNULL, 140382068105216, 140382068117503, +STORE, 140382068117504, 140382070218751, +STORE, 140382068105216, 140382068117503, +SNULL, 140382070210559, 140382070218751, +STORE, 140382068117504, 140382070210559, +STORE, 140382070210560, 140382070218751, +ERASE, 140382070210560, 140382070218751, +STORE, 140382070210560, 140382070218751, +STORE, 
140382064308224, 140382068105215, +SNULL, 140382064308224, 140382065967103, +STORE, 140382065967104, 140382068105215, +STORE, 140382064308224, 140382065967103, +SNULL, 140382068064255, 140382068105215, +STORE, 140382065967104, 140382068064255, +STORE, 140382068064256, 140382068105215, +SNULL, 140382068064256, 140382068088831, +STORE, 140382068088832, 140382068105215, +STORE, 140382068064256, 140382068088831, +ERASE, 140382068064256, 140382068088831, +STORE, 140382068064256, 140382068088831, +ERASE, 140382068088832, 140382068105215, +STORE, 140382068088832, 140382068105215, +STORE, 140382072414208, 140382072430591, +SNULL, 140382068080639, 140382068088831, +STORE, 140382068064256, 140382068080639, +STORE, 140382068080640, 140382068088831, +SNULL, 140382070214655, 140382070218751, +STORE, 140382070210560, 140382070214655, +STORE, 140382070214656, 140382070218751, +SNULL, 94849782505471, 94849782513663, +STORE, 94849782501376, 94849782505471, +STORE, 94849782505472, 94849782513663, +SNULL, 140382072463359, 140382072467455, +STORE, 140382072459264, 140382072463359, +STORE, 140382072463360, 140382072467455, +ERASE, 140382072430592, 140382072459263, +STORE, 94849782845440, 94849782980607, +STORE, 140382070730752, 140382072414207, +STORE, 94849782845440, 94849783115775, +STORE, 94849782845440, 94849783250943, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722594377728, 140737488351231, +SNULL, 140722594385919, 140737488351231, +STORE, 140722594377728, 140722594385919, +STORE, 140722594246656, 140722594385919, +STORE, 94421466353664, 94421468577791, +SNULL, 94421466464255, 94421468577791, +STORE, 94421466353664, 94421466464255, +STORE, 94421466464256, 94421468577791, +ERASE, 94421466464256, 94421468577791, +STORE, 94421468557312, 94421468569599, +STORE, 94421468569600, 94421468577791, +STORE, 140345458057216, 140345460310015, +SNULL, 140345458200575, 140345460310015, +STORE, 140345458057216, 140345458200575, +STORE, 
140345458200576, 140345460310015, +ERASE, 140345458200576, 140345460310015, +STORE, 140345460297728, 140345460305919, +STORE, 140345460305920, 140345460310015, +STORE, 140722595557376, 140722595561471, +STORE, 140722595545088, 140722595557375, +STORE, 140345460269056, 140345460297727, +STORE, 140345460260864, 140345460269055, +STORE, 140345454260224, 140345458057215, +SNULL, 140345454260224, 140345455919103, +STORE, 140345455919104, 140345458057215, +STORE, 140345454260224, 140345455919103, +SNULL, 140345458016255, 140345458057215, +STORE, 140345455919104, 140345458016255, +STORE, 140345458016256, 140345458057215, +SNULL, 140345458016256, 140345458040831, +STORE, 140345458040832, 140345458057215, +STORE, 140345458016256, 140345458040831, +ERASE, 140345458016256, 140345458040831, +STORE, 140345458016256, 140345458040831, +ERASE, 140345458040832, 140345458057215, +STORE, 140345458040832, 140345458057215, +SNULL, 140345458032639, 140345458040831, +STORE, 140345458016256, 140345458032639, +STORE, 140345458032640, 140345458040831, +SNULL, 94421468565503, 94421468569599, +STORE, 94421468557312, 94421468565503, +STORE, 94421468565504, 94421468569599, +SNULL, 140345460301823, 140345460305919, +STORE, 140345460297728, 140345460301823, +STORE, 140345460301824, 140345460305919, +ERASE, 140345460269056, 140345460297727, +STORE, 94421496004608, 94421496139775, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140726096302080, 140737488351231, +SNULL, 140726096310271, 140737488351231, +STORE, 140726096302080, 140726096310271, +STORE, 140726096171008, 140726096310271, +STORE, 94101992124416, 94101994459135, +SNULL, 94101992337407, 94101994459135, +STORE, 94101992124416, 94101992337407, +STORE, 94101992337408, 94101994459135, +ERASE, 94101992337408, 94101994459135, +STORE, 94101994434560, 94101994446847, +STORE, 94101994446848, 94101994459135, +STORE, 140192085594112, 140192087846911, +SNULL, 140192085737471, 140192087846911, +STORE, 
140192085594112, 140192085737471, +STORE, 140192085737472, 140192087846911, +ERASE, 140192085737472, 140192087846911, +STORE, 140192087834624, 140192087842815, +STORE, 140192087842816, 140192087846911, +STORE, 140726096375808, 140726096379903, +STORE, 140726096363520, 140726096375807, +STORE, 140192087805952, 140192087834623, +STORE, 140192087797760, 140192087805951, +STORE, 140192083480576, 140192085594111, +SNULL, 140192083480576, 140192083492863, +STORE, 140192083492864, 140192085594111, +STORE, 140192083480576, 140192083492863, +SNULL, 140192085585919, 140192085594111, +STORE, 140192083492864, 140192085585919, +STORE, 140192085585920, 140192085594111, +ERASE, 140192085585920, 140192085594111, +STORE, 140192085585920, 140192085594111, +STORE, 140192079683584, 140192083480575, +SNULL, 140192079683584, 140192081342463, +STORE, 140192081342464, 140192083480575, +STORE, 140192079683584, 140192081342463, +SNULL, 140192083439615, 140192083480575, +STORE, 140192081342464, 140192083439615, +STORE, 140192083439616, 140192083480575, +SNULL, 140192083439616, 140192083464191, +STORE, 140192083464192, 140192083480575, +STORE, 140192083439616, 140192083464191, +ERASE, 140192083439616, 140192083464191, +STORE, 140192083439616, 140192083464191, +ERASE, 140192083464192, 140192083480575, +STORE, 140192083464192, 140192083480575, +STORE, 140192087789568, 140192087805951, +SNULL, 140192083455999, 140192083464191, +STORE, 140192083439616, 140192083455999, +STORE, 140192083456000, 140192083464191, +SNULL, 140192085590015, 140192085594111, +STORE, 140192085585920, 140192085590015, +STORE, 140192085590016, 140192085594111, +SNULL, 94101994438655, 94101994446847, +STORE, 94101994434560, 94101994438655, +STORE, 94101994438656, 94101994446847, +SNULL, 140192087838719, 140192087842815, +STORE, 140192087834624, 140192087838719, +STORE, 140192087838720, 140192087842815, +ERASE, 140192087805952, 140192087834623, +STORE, 94102011887616, 94102012022783, +STORE, 140192086106112, 140192087789567, 
+STORE, 94102011887616, 94102012157951, +STORE, 94102011887616, 94102012293119, +STORE, 94102011887616, 94102012440575, +SNULL, 94102012428287, 94102012440575, +STORE, 94102011887616, 94102012428287, +STORE, 94102012428288, 94102012440575, +ERASE, 94102012428288, 94102012440575, +STORE, 94102011887616, 94102012579839, +STORE, 94102011887616, 94102012715007, +SNULL, 94102012694527, 94102012715007, +STORE, 94102011887616, 94102012694527, +STORE, 94102012694528, 94102012715007, +ERASE, 94102012694528, 94102012715007, +STORE, 94102011887616, 94102012833791, +STORE, 94102011887616, 94102012968959, +SNULL, 94102012927999, 94102012968959, +STORE, 94102011887616, 94102012927999, +STORE, 94102012928000, 94102012968959, +ERASE, 94102012928000, 94102012968959, +STORE, 94102011887616, 94102013091839, +SNULL, 94102013075455, 94102013091839, +STORE, 94102011887616, 94102013075455, +STORE, 94102013075456, 94102013091839, +ERASE, 94102013075456, 94102013091839, +STORE, 94102011887616, 94102013210623, +STORE, 94102011887616, 94102013345791, +STORE, 93968727965696, 93968728178687, +STORE, 93968730275840, 93968730279935, +STORE, 93968730279936, 93968730288127, +STORE, 93968730288128, 93968730300415, +STORE, 93968731140096, 93968732704767, +STORE, 140588443168768, 140588444827647, +STORE, 140588444827648, 140588446924799, +STORE, 140588446924800, 140588446941183, +STORE, 140588446941184, 140588446949375, +STORE, 140588446949376, 140588446965759, +STORE, 140588446965760, 140588446978047, +STORE, 140588446978048, 140588449071103, +STORE, 140588449071104, 140588449075199, +STORE, 140588449075200, 140588449079295, +STORE, 140588449079296, 140588449222655, +STORE, 140588449591296, 140588451274751, +STORE, 140588451274752, 140588451291135, +STORE, 140588451319808, 140588451323903, +STORE, 140588451323904, 140588451327999, +STORE, 140588451328000, 140588451332095, +STORE, 140733877239808, 140733877379071, +STORE, 140733878702080, 140733878714367, +STORE, 140733878714368, 140733878718463, 
+STORE, 93968727965696, 93968728178687, +STORE, 93968730275840, 93968730279935, +STORE, 93968730279936, 93968730288127, +STORE, 93968730288128, 93968730300415, +STORE, 93968731140096, 93968732991487, +STORE, 140588443168768, 140588444827647, +STORE, 140588444827648, 140588446924799, +STORE, 140588446924800, 140588446941183, +STORE, 140588446941184, 140588446949375, +STORE, 140588446949376, 140588446965759, +STORE, 140588446965760, 140588446978047, +STORE, 140588446978048, 140588449071103, +STORE, 140588449071104, 140588449075199, +STORE, 140588449075200, 140588449079295, +STORE, 140588449079296, 140588449222655, +STORE, 140588449591296, 140588451274751, +STORE, 140588451274752, 140588451291135, +STORE, 140588451319808, 140588451323903, +STORE, 140588451323904, 140588451327999, +STORE, 140588451328000, 140588451332095, +STORE, 140733877239808, 140733877379071, +STORE, 140733878702080, 140733878714367, +STORE, 140733878714368, 140733878718463, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140733054472192, 140737488351231, +SNULL, 140733054480383, 140737488351231, +STORE, 140733054472192, 140733054480383, +STORE, 140733054341120, 140733054480383, +STORE, 93992873623552, 93992875847679, +SNULL, 93992873734143, 93992875847679, +STORE, 93992873623552, 93992873734143, +STORE, 93992873734144, 93992875847679, +ERASE, 93992873734144, 93992875847679, +STORE, 93992875827200, 93992875839487, +STORE, 93992875839488, 93992875847679, +STORE, 139790881488896, 139790883741695, +SNULL, 139790881632255, 139790883741695, +STORE, 139790881488896, 139790881632255, +STORE, 139790881632256, 139790883741695, +ERASE, 139790881632256, 139790883741695, +STORE, 139790883729408, 139790883737599, +STORE, 139790883737600, 139790883741695, +STORE, 140733054754816, 140733054758911, +STORE, 140733054742528, 140733054754815, +STORE, 139790883700736, 139790883729407, +STORE, 139790883692544, 139790883700735, +STORE, 139790877691904, 139790881488895, +SNULL, 
139790877691904, 139790879350783, +STORE, 139790879350784, 139790881488895, +STORE, 139790877691904, 139790879350783, +SNULL, 139790881447935, 139790881488895, +STORE, 139790879350784, 139790881447935, +STORE, 139790881447936, 139790881488895, +SNULL, 139790881447936, 139790881472511, +STORE, 139790881472512, 139790881488895, +STORE, 139790881447936, 139790881472511, +ERASE, 139790881447936, 139790881472511, +STORE, 139790881447936, 139790881472511, +ERASE, 139790881472512, 139790881488895, +STORE, 139790881472512, 139790881488895, +SNULL, 139790881464319, 139790881472511, +STORE, 139790881447936, 139790881464319, +STORE, 139790881464320, 139790881472511, +SNULL, 93992875835391, 93992875839487, +STORE, 93992875827200, 93992875835391, +STORE, 93992875835392, 93992875839487, +SNULL, 139790883733503, 139790883737599, +STORE, 139790883729408, 139790883733503, +STORE, 139790883733504, 139790883737599, +ERASE, 139790883700736, 139790883729407, +STORE, 93992877031424, 93992877166591, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140728550887424, 140737488351231, +SNULL, 140728550895615, 140737488351231, +STORE, 140728550887424, 140728550895615, +STORE, 140728550756352, 140728550895615, +STORE, 94707634077696, 94707636301823, +SNULL, 94707634188287, 94707636301823, +STORE, 94707634077696, 94707634188287, +STORE, 94707634188288, 94707636301823, +ERASE, 94707634188288, 94707636301823, +STORE, 94707636281344, 94707636293631, +STORE, 94707636293632, 94707636301823, +STORE, 140553545666560, 140553547919359, +SNULL, 140553545809919, 140553547919359, +STORE, 140553545666560, 140553545809919, +STORE, 140553545809920, 140553547919359, +ERASE, 140553545809920, 140553547919359, +STORE, 140553547907072, 140553547915263, +STORE, 140553547915264, 140553547919359, +STORE, 140728552374272, 140728552378367, +STORE, 140728552361984, 140728552374271, +STORE, 140553547878400, 140553547907071, +STORE, 140553547870208, 140553547878399, +STORE, 
140553541869568, 140553545666559, +SNULL, 140553541869568, 140553543528447, +STORE, 140553543528448, 140553545666559, +STORE, 140553541869568, 140553543528447, +SNULL, 140553545625599, 140553545666559, +STORE, 140553543528448, 140553545625599, +STORE, 140553545625600, 140553545666559, +SNULL, 140553545625600, 140553545650175, +STORE, 140553545650176, 140553545666559, +STORE, 140553545625600, 140553545650175, +ERASE, 140553545625600, 140553545650175, +STORE, 140553545625600, 140553545650175, +ERASE, 140553545650176, 140553545666559, +STORE, 140553545650176, 140553545666559, +SNULL, 140553545641983, 140553545650175, +STORE, 140553545625600, 140553545641983, +STORE, 140553545641984, 140553545650175, +SNULL, 94707636289535, 94707636293631, +STORE, 94707636281344, 94707636289535, +STORE, 94707636289536, 94707636293631, +SNULL, 140553547911167, 140553547915263, +STORE, 140553547907072, 140553547911167, +STORE, 140553547911168, 140553547915263, +ERASE, 140553547878400, 140553547907071, +STORE, 94707651411968, 94707651547135, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140732168695808, 140737488351231, +SNULL, 140732168703999, 140737488351231, +STORE, 140732168695808, 140732168703999, +STORE, 140732168564736, 140732168703999, +STORE, 94454287859712, 94454290083839, +SNULL, 94454287970303, 94454290083839, +STORE, 94454287859712, 94454287970303, +STORE, 94454287970304, 94454290083839, +ERASE, 94454287970304, 94454290083839, +STORE, 94454290063360, 94454290075647, +STORE, 94454290075648, 94454290083839, +STORE, 140564947107840, 140564949360639, +SNULL, 140564947251199, 140564949360639, +STORE, 140564947107840, 140564947251199, +STORE, 140564947251200, 140564949360639, +ERASE, 140564947251200, 140564949360639, +STORE, 140564949348352, 140564949356543, +STORE, 140564949356544, 140564949360639, +STORE, 140732168843264, 140732168847359, +STORE, 140732168830976, 140732168843263, +STORE, 140564949319680, 140564949348351, +STORE, 
140564949311488, 140564949319679, +STORE, 140564943310848, 140564947107839, +SNULL, 140564943310848, 140564944969727, +STORE, 140564944969728, 140564947107839, +STORE, 140564943310848, 140564944969727, +SNULL, 140564947066879, 140564947107839, +STORE, 140564944969728, 140564947066879, +STORE, 140564947066880, 140564947107839, +SNULL, 140564947066880, 140564947091455, +STORE, 140564947091456, 140564947107839, +STORE, 140564947066880, 140564947091455, +ERASE, 140564947066880, 140564947091455, +STORE, 140564947066880, 140564947091455, +ERASE, 140564947091456, 140564947107839, +STORE, 140564947091456, 140564947107839, +SNULL, 140564947083263, 140564947091455, +STORE, 140564947066880, 140564947083263, +STORE, 140564947083264, 140564947091455, +SNULL, 94454290071551, 94454290075647, +STORE, 94454290063360, 94454290071551, +STORE, 94454290071552, 94454290075647, +SNULL, 140564949352447, 140564949356543, +STORE, 140564949348352, 140564949352447, +STORE, 140564949352448, 140564949356543, +ERASE, 140564949319680, 140564949348351, +STORE, 94454316236800, 94454316371967, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140735155617792, 140737488351231, +SNULL, 140735155625983, 140737488351231, +STORE, 140735155617792, 140735155625983, +STORE, 140735155486720, 140735155625983, +STORE, 93915969556480, 93915971780607, +SNULL, 93915969667071, 93915971780607, +STORE, 93915969556480, 93915969667071, +STORE, 93915969667072, 93915971780607, +ERASE, 93915969667072, 93915971780607, +STORE, 93915971760128, 93915971772415, +STORE, 93915971772416, 93915971780607, +STORE, 140141164605440, 140141166858239, +SNULL, 140141164748799, 140141166858239, +STORE, 140141164605440, 140141164748799, +STORE, 140141164748800, 140141166858239, +ERASE, 140141164748800, 140141166858239, +STORE, 140141166845952, 140141166854143, +STORE, 140141166854144, 140141166858239, +STORE, 140735155691520, 140735155695615, +STORE, 140735155679232, 140735155691519, +STORE, 
140141166817280, 140141166845951, +STORE, 140141166809088, 140141166817279, +STORE, 140141160808448, 140141164605439, +SNULL, 140141160808448, 140141162467327, +STORE, 140141162467328, 140141164605439, +STORE, 140141160808448, 140141162467327, +SNULL, 140141164564479, 140141164605439, +STORE, 140141162467328, 140141164564479, +STORE, 140141164564480, 140141164605439, +SNULL, 140141164564480, 140141164589055, +STORE, 140141164589056, 140141164605439, +STORE, 140141164564480, 140141164589055, +ERASE, 140141164564480, 140141164589055, +STORE, 140141164564480, 140141164589055, +ERASE, 140141164589056, 140141164605439, +STORE, 140141164589056, 140141164605439, +SNULL, 140141164580863, 140141164589055, +STORE, 140141164564480, 140141164580863, +STORE, 140141164580864, 140141164589055, +SNULL, 93915971768319, 93915971772415, +STORE, 93915971760128, 93915971768319, +STORE, 93915971768320, 93915971772415, +SNULL, 140141166850047, 140141166854143, +STORE, 140141166845952, 140141166850047, +STORE, 140141166850048, 140141166854143, +ERASE, 140141166817280, 140141166845951, +STORE, 93916002775040, 93916002910207, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140728988409856, 140737488351231, +SNULL, 140728988418047, 140737488351231, +STORE, 140728988409856, 140728988418047, +STORE, 140728988278784, 140728988418047, +STORE, 94021634813952, 94021637038079, +SNULL, 94021634924543, 94021637038079, +STORE, 94021634813952, 94021634924543, +STORE, 94021634924544, 94021637038079, +ERASE, 94021634924544, 94021637038079, +STORE, 94021637017600, 94021637029887, +STORE, 94021637029888, 94021637038079, +STORE, 140638014038016, 140638016290815, +SNULL, 140638014181375, 140638016290815, +STORE, 140638014038016, 140638014181375, +STORE, 140638014181376, 140638016290815, +ERASE, 140638014181376, 140638016290815, +STORE, 140638016278528, 140638016286719, +STORE, 140638016286720, 140638016290815, +STORE, 140728988536832, 140728988540927, +STORE, 
140728988524544, 140728988536831, +STORE, 140638016249856, 140638016278527, +STORE, 140638016241664, 140638016249855, +STORE, 140638010241024, 140638014038015, +SNULL, 140638010241024, 140638011899903, +STORE, 140638011899904, 140638014038015, +STORE, 140638010241024, 140638011899903, +SNULL, 140638013997055, 140638014038015, +STORE, 140638011899904, 140638013997055, +STORE, 140638013997056, 140638014038015, +SNULL, 140638013997056, 140638014021631, +STORE, 140638014021632, 140638014038015, +STORE, 140638013997056, 140638014021631, +ERASE, 140638013997056, 140638014021631, +STORE, 140638013997056, 140638014021631, +ERASE, 140638014021632, 140638014038015, +STORE, 140638014021632, 140638014038015, +SNULL, 140638014013439, 140638014021631, +STORE, 140638013997056, 140638014013439, +STORE, 140638014013440, 140638014021631, +SNULL, 94021637025791, 94021637029887, +STORE, 94021637017600, 94021637025791, +STORE, 94021637025792, 94021637029887, +SNULL, 140638016282623, 140638016286719, +STORE, 140638016278528, 140638016282623, +STORE, 140638016282624, 140638016286719, +ERASE, 140638016249856, 140638016278527, +STORE, 94021643124736, 94021643259903, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140731219275776, 140737488351231, +SNULL, 140731219283967, 140737488351231, +STORE, 140731219275776, 140731219283967, +STORE, 140731219144704, 140731219283967, +STORE, 93888803647488, 93888805871615, +SNULL, 93888803758079, 93888805871615, +STORE, 93888803647488, 93888803758079, +STORE, 93888803758080, 93888805871615, +ERASE, 93888803758080, 93888805871615, +STORE, 93888805851136, 93888805863423, +STORE, 93888805863424, 93888805871615, +STORE, 139630576934912, 139630579187711, +SNULL, 139630577078271, 139630579187711, +STORE, 139630576934912, 139630577078271, +STORE, 139630577078272, 139630579187711, +ERASE, 139630577078272, 139630579187711, +STORE, 139630579175424, 139630579183615, +STORE, 139630579183616, 139630579187711, +STORE, 
140731219718144, 140731219722239, +STORE, 140731219705856, 140731219718143, +STORE, 139630579146752, 139630579175423, +STORE, 139630579138560, 139630579146751, +STORE, 139630573137920, 139630576934911, +SNULL, 139630573137920, 139630574796799, +STORE, 139630574796800, 139630576934911, +STORE, 139630573137920, 139630574796799, +SNULL, 139630576893951, 139630576934911, +STORE, 139630574796800, 139630576893951, +STORE, 139630576893952, 139630576934911, +SNULL, 139630576893952, 139630576918527, +STORE, 139630576918528, 139630576934911, +STORE, 139630576893952, 139630576918527, +ERASE, 139630576893952, 139630576918527, +STORE, 139630576893952, 139630576918527, +ERASE, 139630576918528, 139630576934911, +STORE, 139630576918528, 139630576934911, +SNULL, 139630576910335, 139630576918527, +STORE, 139630576893952, 139630576910335, +STORE, 139630576910336, 139630576918527, +SNULL, 93888805859327, 93888805863423, +STORE, 93888805851136, 93888805859327, +STORE, 93888805859328, 93888805863423, +SNULL, 139630579179519, 139630579183615, +STORE, 139630579175424, 139630579179519, +STORE, 139630579179520, 139630579183615, +ERASE, 139630579146752, 139630579175423, +STORE, 93888822235136, 93888822370303, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140733391151104, 140737488351231, +SNULL, 140733391159295, 140737488351231, +STORE, 140733391151104, 140733391159295, +STORE, 140733391020032, 140733391159295, +STORE, 94393875324928, 94393877549055, +SNULL, 94393875435519, 94393877549055, +STORE, 94393875324928, 94393875435519, +STORE, 94393875435520, 94393877549055, +ERASE, 94393875435520, 94393877549055, +STORE, 94393877528576, 94393877540863, +STORE, 94393877540864, 94393877549055, +STORE, 140292111740928, 140292113993727, +SNULL, 140292111884287, 140292113993727, +STORE, 140292111740928, 140292111884287, +STORE, 140292111884288, 140292113993727, +ERASE, 140292111884288, 140292113993727, +STORE, 140292113981440, 140292113989631, +STORE, 
140292113989632, 140292113993727, +STORE, 140733391532032, 140733391536127, +STORE, 140733391519744, 140733391532031, +STORE, 140292113952768, 140292113981439, +STORE, 140292113944576, 140292113952767, +STORE, 140292107943936, 140292111740927, +SNULL, 140292107943936, 140292109602815, +STORE, 140292109602816, 140292111740927, +STORE, 140292107943936, 140292109602815, +SNULL, 140292111699967, 140292111740927, +STORE, 140292109602816, 140292111699967, +STORE, 140292111699968, 140292111740927, +SNULL, 140292111699968, 140292111724543, +STORE, 140292111724544, 140292111740927, +STORE, 140292111699968, 140292111724543, +ERASE, 140292111699968, 140292111724543, +STORE, 140292111699968, 140292111724543, +ERASE, 140292111724544, 140292111740927, +STORE, 140292111724544, 140292111740927, +SNULL, 140292111716351, 140292111724543, +STORE, 140292111699968, 140292111716351, +STORE, 140292111716352, 140292111724543, +SNULL, 94393877536767, 94393877540863, +STORE, 94393877528576, 94393877536767, +STORE, 94393877536768, 94393877540863, +SNULL, 140292113985535, 140292113989631, +STORE, 140292113981440, 140292113985535, +STORE, 140292113985536, 140292113989631, +ERASE, 140292113952768, 140292113981439, +STORE, 94393909342208, 94393909477375, +STORE, 94458367512576, 94458367725567, +STORE, 94458369822720, 94458369826815, +STORE, 94458369826816, 94458369835007, +STORE, 94458369835008, 94458369847295, +STORE, 94458393292800, 94458399666175, +STORE, 140619773841408, 140619775500287, +STORE, 140619775500288, 140619777597439, +STORE, 140619777597440, 140619777613823, +STORE, 140619777613824, 140619777622015, +STORE, 140619777622016, 140619777638399, +STORE, 140619777638400, 140619777650687, +STORE, 140619777650688, 140619779743743, +STORE, 140619779743744, 140619779747839, +STORE, 140619779747840, 140619779751935, +STORE, 140619779751936, 140619779895295, +STORE, 140619780263936, 140619781947391, +STORE, 140619781947392, 140619781963775, +STORE, 140619781992448, 140619781996543, +STORE, 
140619781996544, 140619782000639, +STORE, 140619782000640, 140619782004735, +STORE, 140725811675136, 140725811814399, +STORE, 140725812813824, 140725812826111, +STORE, 140725812826112, 140725812830207, +STORE, 94458367512576, 94458367725567, +STORE, 94458369822720, 94458369826815, +STORE, 94458369826816, 94458369835007, +STORE, 94458369835008, 94458369847295, +STORE, 94458393292800, 94458400366591, +STORE, 140619773841408, 140619775500287, +STORE, 140619775500288, 140619777597439, +STORE, 140619777597440, 140619777613823, +STORE, 140619777613824, 140619777622015, +STORE, 140619777622016, 140619777638399, +STORE, 140619777638400, 140619777650687, +STORE, 140619777650688, 140619779743743, +STORE, 140619779743744, 140619779747839, +STORE, 140619779747840, 140619779751935, +STORE, 140619779751936, 140619779895295, +STORE, 140619780263936, 140619781947391, +STORE, 140619781947392, 140619781963775, +STORE, 140619781992448, 140619781996543, +STORE, 140619781996544, 140619782000639, +STORE, 140619782000640, 140619782004735, +STORE, 140725811675136, 140725811814399, +STORE, 140725812813824, 140725812826111, +STORE, 140725812826112, 140725812830207, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140728740679680, 140737488351231, +SNULL, 140728740687871, 140737488351231, +STORE, 140728740679680, 140728740687871, +STORE, 140728740548608, 140728740687871, +STORE, 94764075249664, 94764077473791, +SNULL, 94764075360255, 94764077473791, +STORE, 94764075249664, 94764075360255, +STORE, 94764075360256, 94764077473791, +ERASE, 94764075360256, 94764077473791, +STORE, 94764077453312, 94764077465599, +STORE, 94764077465600, 94764077473791, +STORE, 139766406791168, 139766409043967, +SNULL, 139766406934527, 139766409043967, +STORE, 139766406791168, 139766406934527, +STORE, 139766406934528, 139766409043967, +ERASE, 139766406934528, 139766409043967, +STORE, 139766409031680, 139766409039871, +STORE, 139766409039872, 139766409043967, +STORE, 
140728740913152, 140728740917247, +STORE, 140728740900864, 140728740913151, +STORE, 139766409003008, 139766409031679, +STORE, 139766408994816, 139766409003007, +STORE, 139766402994176, 139766406791167, +SNULL, 139766402994176, 139766404653055, +STORE, 139766404653056, 139766406791167, +STORE, 139766402994176, 139766404653055, +SNULL, 139766406750207, 139766406791167, +STORE, 139766404653056, 139766406750207, +STORE, 139766406750208, 139766406791167, +SNULL, 139766406750208, 139766406774783, +STORE, 139766406774784, 139766406791167, +STORE, 139766406750208, 139766406774783, +ERASE, 139766406750208, 139766406774783, +STORE, 139766406750208, 139766406774783, +ERASE, 139766406774784, 139766406791167, +STORE, 139766406774784, 139766406791167, +SNULL, 139766406766591, 139766406774783, +STORE, 139766406750208, 139766406766591, +STORE, 139766406766592, 139766406774783, +SNULL, 94764077461503, 94764077465599, +STORE, 94764077453312, 94764077461503, +STORE, 94764077461504, 94764077465599, +SNULL, 139766409035775, 139766409039871, +STORE, 139766409031680, 139766409035775, +STORE, 139766409035776, 139766409039871, +ERASE, 139766409003008, 139766409031679, +STORE, 94764090458112, 94764090593279, +STORE, 94758057480192, 94758057590783, +STORE, 94758059683840, 94758059692031, +STORE, 94758059692032, 94758059696127, +STORE, 94758059696128, 94758059704319, +STORE, 94758083215360, 94758083350527, +STORE, 139951456772096, 139951458430975, +STORE, 139951458430976, 139951460528127, +STORE, 139951460528128, 139951460544511, +STORE, 139951460544512, 139951460552703, +STORE, 139951460552704, 139951460569087, +STORE, 139951460569088, 139951460712447, +STORE, 139951462772736, 139951462780927, +STORE, 139951462809600, 139951462813695, +STORE, 139951462813696, 139951462817791, +STORE, 139951462817792, 139951462821887, +STORE, 140734098313216, 140734098452479, +STORE, 140734098911232, 140734098923519, +STORE, 140734098923520, 140734098927615, +STORE, 140737488347136, 140737488351231, +STORE, 
140737488343040, 140737488351231, +STORE, 140724904095744, 140737488351231, +SNULL, 140724904103935, 140737488351231, +STORE, 140724904095744, 140724904103935, +STORE, 140724903964672, 140724904103935, +STORE, 4194304, 5128191, +STORE, 7221248, 7241727, +STORE, 7241728, 7249919, +STORE, 140408497864704, 140408500117503, +SNULL, 140408498008063, 140408500117503, +STORE, 140408497864704, 140408498008063, +STORE, 140408498008064, 140408500117503, +ERASE, 140408498008064, 140408500117503, +STORE, 140408500105216, 140408500113407, +STORE, 140408500113408, 140408500117503, +STORE, 140724905369600, 140724905373695, +STORE, 140724905357312, 140724905369599, +STORE, 140408500076544, 140408500105215, +STORE, 140408500068352, 140408500076543, +STORE, 140408494702592, 140408497864703, +SNULL, 140408494702592, 140408495763455, +STORE, 140408495763456, 140408497864703, +STORE, 140408494702592, 140408495763455, +SNULL, 140408497856511, 140408497864703, +STORE, 140408495763456, 140408497856511, +STORE, 140408497856512, 140408497864703, +ERASE, 140408497856512, 140408497864703, +STORE, 140408497856512, 140408497864703, +STORE, 140408490905600, 140408494702591, +SNULL, 140408490905600, 140408492564479, +STORE, 140408492564480, 140408494702591, +STORE, 140408490905600, 140408492564479, +SNULL, 140408494661631, 140408494702591, +STORE, 140408492564480, 140408494661631, +STORE, 140408494661632, 140408494702591, +SNULL, 140408494661632, 140408494686207, +STORE, 140408494686208, 140408494702591, +STORE, 140408494661632, 140408494686207, +ERASE, 140408494661632, 140408494686207, +STORE, 140408494661632, 140408494686207, +ERASE, 140408494686208, 140408494702591, +STORE, 140408494686208, 140408494702591, +STORE, 140408500056064, 140408500076543, +SNULL, 140408494678015, 140408494686207, +STORE, 140408494661632, 140408494678015, +STORE, 140408494678016, 140408494686207, +SNULL, 140408497860607, 140408497864703, +STORE, 140408497856512, 140408497860607, +STORE, 140408497860608, 
140408497864703, +SNULL, 7233535, 7241727, +STORE, 7221248, 7233535, +STORE, 7233536, 7241727, +SNULL, 140408500109311, 140408500113407, +STORE, 140408500105216, 140408500109311, +STORE, 140408500109312, 140408500113407, +ERASE, 140408500076544, 140408500105215, +STORE, 25235456, 25370623, +STORE, 25235456, 25518079, +STORE, 140408498372608, 140408500056063, +STORE, 94543937388544, 94543937499135, +STORE, 94543939592192, 94543939600383, +STORE, 94543939600384, 94543939604479, +STORE, 94543939604480, 94543939612671, +STORE, 94543941447680, 94543941582847, +STORE, 140282621947904, 140282623606783, +STORE, 140282623606784, 140282625703935, +STORE, 140282625703936, 140282625720319, +STORE, 140282625720320, 140282625728511, +STORE, 140282625728512, 140282625744895, +STORE, 140282625744896, 140282625888255, +STORE, 140282627948544, 140282627956735, +STORE, 140282627985408, 140282627989503, +STORE, 140282627989504, 140282627993599, +STORE, 140282627993600, 140282627997695, +STORE, 140728295723008, 140728295862271, +STORE, 140728296476672, 140728296488959, +STORE, 140728296488960, 140728296493055, +STORE, 94431504838656, 94431505051647, +STORE, 94431507148800, 94431507152895, +STORE, 94431507152896, 94431507161087, +STORE, 94431507161088, 94431507173375, +STORE, 94431510286336, 94431510691839, +STORE, 139818797948928, 139818799607807, +STORE, 139818799607808, 139818801704959, +STORE, 139818801704960, 139818801721343, +STORE, 139818801721344, 139818801729535, +STORE, 139818801729536, 139818801745919, +STORE, 139818801745920, 139818801758207, +STORE, 139818801758208, 139818803851263, +STORE, 139818803851264, 139818803855359, +STORE, 139818803855360, 139818803859455, +STORE, 139818803859456, 139818804002815, +STORE, 139818804371456, 139818806054911, +STORE, 139818806054912, 139818806071295, +STORE, 139818806099968, 139818806104063, +STORE, 139818806104064, 139818806108159, +STORE, 139818806108160, 139818806112255, +STORE, 140731430457344, 140731430596607, +STORE, 
140731431227392, 140731431239679, +STORE, 140731431239680, 140731431243775, +STORE, 94431504838656, 94431505051647, +STORE, 94431507148800, 94431507152895, +STORE, 94431507152896, 94431507161087, +STORE, 94431507161088, 94431507173375, +STORE, 94431510286336, 94431510691839, +STORE, 139818797948928, 139818799607807, +STORE, 139818799607808, 139818801704959, +STORE, 139818801704960, 139818801721343, +STORE, 139818801721344, 139818801729535, +STORE, 139818801729536, 139818801745919, +STORE, 139818801745920, 139818801758207, +STORE, 139818801758208, 139818803851263, +STORE, 139818803851264, 139818803855359, +STORE, 139818803855360, 139818803859455, +STORE, 139818803859456, 139818804002815, +STORE, 139818804371456, 139818806054911, +STORE, 139818806054912, 139818806071295, +STORE, 139818806099968, 139818806104063, +STORE, 139818806104064, 139818806108159, +STORE, 139818806108160, 139818806112255, +STORE, 140731430457344, 140731430596607, +STORE, 140731431227392, 140731431239679, +STORE, 140731431239680, 140731431243775, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140737488338944, 140737488351231, +STORE, 140736944451584, 140737488351231, +SNULL, 140736944463871, 140737488351231, +STORE, 140736944451584, 140736944463871, +STORE, 140736944320512, 140736944463871, +STORE, 4194304, 26279935, +STORE, 28372992, 28454911, +STORE, 28454912, 29806591, +STORE, 139693609893888, 139693612146687, +SNULL, 139693610037247, 139693612146687, +STORE, 139693609893888, 139693610037247, +STORE, 139693610037248, 139693612146687, +ERASE, 139693610037248, 139693612146687, +STORE, 139693612134400, 139693612142591, +STORE, 139693612142592, 139693612146687, +STORE, 140736945152000, 140736945156095, +STORE, 140736945139712, 140736945151999, +STORE, 139693612105728, 139693612134399, +STORE, 139693612097536, 139693612105727, +STORE, 139693606060032, 139693609893887, +SNULL, 139693606060032, 139693607768063, +STORE, 139693607768064, 139693609893887, 
+STORE, 139693606060032, 139693607768063, +SNULL, 139693609861119, 139693609893887, +STORE, 139693607768064, 139693609861119, +STORE, 139693609861120, 139693609893887, +ERASE, 139693609861120, 139693609893887, +STORE, 139693609861120, 139693609893887, +STORE, 139693603864576, 139693606060031, +SNULL, 139693603864576, 139693603958783, +STORE, 139693603958784, 139693606060031, +STORE, 139693603864576, 139693603958783, +SNULL, 139693606051839, 139693606060031, +STORE, 139693603958784, 139693606051839, +STORE, 139693606051840, 139693606060031, +ERASE, 139693606051840, 139693606060031, +STORE, 139693606051840, 139693606060031, +STORE, 139693601345536, 139693603864575, +SNULL, 139693601345536, 139693601759231, +STORE, 139693601759232, 139693603864575, +STORE, 139693601345536, 139693601759231, +SNULL, 139693603852287, 139693603864575, +STORE, 139693601759232, 139693603852287, +STORE, 139693603852288, 139693603864575, +ERASE, 139693603852288, 139693603864575, +STORE, 139693603852288, 139693603864575, +STORE, 139693598711808, 139693601345535, +SNULL, 139693598711808, 139693599240191, +STORE, 139693599240192, 139693601345535, +STORE, 139693598711808, 139693599240191, +SNULL, 139693601337343, 139693601345535, +STORE, 139693599240192, 139693601337343, +STORE, 139693601337344, 139693601345535, +ERASE, 139693601337344, 139693601345535, +STORE, 139693601337344, 139693601345535, +STORE, 139693596598272, 139693598711807, +SNULL, 139693596598272, 139693596610559, +STORE, 139693596610560, 139693598711807, +STORE, 139693596598272, 139693596610559, +SNULL, 139693598703615, 139693598711807, +STORE, 139693596610560, 139693598703615, +STORE, 139693598703616, 139693598711807, +ERASE, 139693598703616, 139693598711807, +STORE, 139693598703616, 139693598711807, +STORE, 139693594394624, 139693596598271, +SNULL, 139693594394624, 139693594497023, +STORE, 139693594497024, 139693596598271, +STORE, 139693594394624, 139693594497023, +SNULL, 139693596590079, 139693596598271, +STORE, 139693594497024, 
139693596590079, +STORE, 139693596590080, 139693596598271, +ERASE, 139693596590080, 139693596598271, +STORE, 139693596590080, 139693596598271, +STORE, 139693612089344, 139693612105727, +STORE, 139693591232512, 139693594394623, +SNULL, 139693591232512, 139693592293375, +STORE, 139693592293376, 139693594394623, +STORE, 139693591232512, 139693592293375, +SNULL, 139693594386431, 139693594394623, +STORE, 139693592293376, 139693594386431, +STORE, 139693594386432, 139693594394623, +ERASE, 139693594386432, 139693594394623, +STORE, 139693594386432, 139693594394623, +STORE, 139693587435520, 139693591232511, +SNULL, 139693587435520, 139693589094399, +STORE, 139693589094400, 139693591232511, +STORE, 139693587435520, 139693589094399, +SNULL, 139693591191551, 139693591232511, +STORE, 139693589094400, 139693591191551, +STORE, 139693591191552, 139693591232511, +SNULL, 139693591191552, 139693591216127, +STORE, 139693591216128, 139693591232511, +STORE, 139693591191552, 139693591216127, +ERASE, 139693591191552, 139693591216127, +STORE, 139693591191552, 139693591216127, +ERASE, 139693591216128, 139693591232511, +STORE, 139693591216128, 139693591232511, +STORE, 139693612077056, 139693612105727, +SNULL, 139693591207935, 139693591216127, +STORE, 139693591191552, 139693591207935, +STORE, 139693591207936, 139693591216127, +SNULL, 139693594390527, 139693594394623, +STORE, 139693594386432, 139693594390527, +STORE, 139693594390528, 139693594394623, +SNULL, 139693596594175, 139693596598271, +STORE, 139693596590080, 139693596594175, +STORE, 139693596594176, 139693596598271, +SNULL, 139693598707711, 139693598711807, +STORE, 139693598703616, 139693598707711, +STORE, 139693598707712, 139693598711807, +SNULL, 139693601341439, 139693601345535, +STORE, 139693601337344, 139693601341439, +STORE, 139693601341440, 139693601345535, +SNULL, 139693603860479, 139693603864575, +STORE, 139693603852288, 139693603860479, +STORE, 139693603860480, 139693603864575, +SNULL, 139693606055935, 139693606060031, +STORE, 
139693606051840, 139693606055935, +STORE, 139693606055936, 139693606060031, +SNULL, 139693609865215, 139693609893887, +STORE, 139693609861120, 139693609865215, +STORE, 139693609865216, 139693609893887, +SNULL, 28405759, 28454911, +STORE, 28372992, 28405759, +STORE, 28405760, 28454911, +SNULL, 139693612138495, 139693612142591, +STORE, 139693612134400, 139693612138495, +STORE, 139693612138496, 139693612142591, +ERASE, 139693612105728, 139693612134399, +STORE, 39976960, 40112127, +STORE, 139693610393600, 139693612077055, +STORE, 139693612130304, 139693612134399, +STORE, 139693610258432, 139693610393599, +STORE, 39976960, 40255487, +STORE, 139693585338368, 139693587435519, +STORE, 139693612122112, 139693612134399, +STORE, 139693612113920, 139693612134399, +STORE, 139693612077056, 139693612113919, +STORE, 139693610242048, 139693610393599, +STORE, 39976960, 40390655, +STORE, 39976960, 40546303, +STORE, 139693610233856, 139693610393599, +STORE, 139693610225664, 139693610393599, +STORE, 39976960, 40714239, +STORE, 139693610209280, 139693610393599, +STORE, 39976960, 40861695, +STORE, 94431504838656, 94431505051647, +STORE, 94431507148800, 94431507152895, +STORE, 94431507152896, 94431507161087, +STORE, 94431507161088, 94431507173375, +STORE, 94431510286336, 94431528759295, +STORE, 139818797948928, 139818799607807, +STORE, 139818799607808, 139818801704959, +STORE, 139818801704960, 139818801721343, +STORE, 139818801721344, 139818801729535, +STORE, 139818801729536, 139818801745919, +STORE, 139818801745920, 139818801758207, +STORE, 139818801758208, 139818803851263, +STORE, 139818803851264, 139818803855359, +STORE, 139818803855360, 139818803859455, +STORE, 139818803859456, 139818804002815, +STORE, 139818804371456, 139818806054911, +STORE, 139818806054912, 139818806071295, +STORE, 139818806099968, 139818806104063, +STORE, 139818806104064, 139818806108159, +STORE, 139818806108160, 139818806112255, +STORE, 140731430457344, 140731430596607, +STORE, 140731431227392, 140731431239679, 
+STORE, 140731431239680, 140731431243775, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140729993904128, 140737488351231, +SNULL, 140729993912319, 140737488351231, +STORE, 140729993904128, 140729993912319, +STORE, 140729993773056, 140729993912319, +STORE, 93926271991808, 93926274215935, +SNULL, 93926272102399, 93926274215935, +STORE, 93926271991808, 93926272102399, +STORE, 93926272102400, 93926274215935, +ERASE, 93926272102400, 93926274215935, +STORE, 93926274195456, 93926274207743, +STORE, 93926274207744, 93926274215935, +STORE, 139962167296000, 139962169548799, +SNULL, 139962167439359, 139962169548799, +STORE, 139962167296000, 139962167439359, +STORE, 139962167439360, 139962169548799, +ERASE, 139962167439360, 139962169548799, +STORE, 139962169536512, 139962169544703, +STORE, 139962169544704, 139962169548799, +STORE, 140729995096064, 140729995100159, +STORE, 140729995083776, 140729995096063, +STORE, 139962169507840, 139962169536511, +STORE, 139962169499648, 139962169507839, +STORE, 139962163499008, 139962167295999, +SNULL, 139962163499008, 139962165157887, +STORE, 139962165157888, 139962167295999, +STORE, 139962163499008, 139962165157887, +SNULL, 139962167255039, 139962167295999, +STORE, 139962165157888, 139962167255039, +STORE, 139962167255040, 139962167295999, +SNULL, 139962167255040, 139962167279615, +STORE, 139962167279616, 139962167295999, +STORE, 139962167255040, 139962167279615, +ERASE, 139962167255040, 139962167279615, +STORE, 139962167255040, 139962167279615, +ERASE, 139962167279616, 139962167295999, +STORE, 139962167279616, 139962167295999, +SNULL, 139962167271423, 139962167279615, +STORE, 139962167255040, 139962167271423, +STORE, 139962167271424, 139962167279615, +SNULL, 93926274203647, 93926274207743, +STORE, 93926274195456, 93926274203647, +STORE, 93926274203648, 93926274207743, +SNULL, 139962169540607, 139962169544703, +STORE, 139962169536512, 139962169540607, +STORE, 139962169540608, 139962169544703, 
+ERASE, 139962169507840, 139962169536511, +STORE, 93926291120128, 93926291255295, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140724960579584, 140737488351231, +SNULL, 140724960587775, 140737488351231, +STORE, 140724960579584, 140724960587775, +STORE, 140724960448512, 140724960587775, +STORE, 94246489489408, 94246491713535, +SNULL, 94246489599999, 94246491713535, +STORE, 94246489489408, 94246489599999, +STORE, 94246489600000, 94246491713535, +ERASE, 94246489600000, 94246491713535, +STORE, 94246491693056, 94246491705343, +STORE, 94246491705344, 94246491713535, +STORE, 140098174926848, 140098177179647, +SNULL, 140098175070207, 140098177179647, +STORE, 140098174926848, 140098175070207, +STORE, 140098175070208, 140098177179647, +ERASE, 140098175070208, 140098177179647, +STORE, 140098177167360, 140098177175551, +STORE, 140098177175552, 140098177179647, +STORE, 140724961439744, 140724961443839, +STORE, 140724961427456, 140724961439743, +STORE, 140098177138688, 140098177167359, +STORE, 140098177130496, 140098177138687, +STORE, 140098171129856, 140098174926847, +SNULL, 140098171129856, 140098172788735, +STORE, 140098172788736, 140098174926847, +STORE, 140098171129856, 140098172788735, +SNULL, 140098174885887, 140098174926847, +STORE, 140098172788736, 140098174885887, +STORE, 140098174885888, 140098174926847, +SNULL, 140098174885888, 140098174910463, +STORE, 140098174910464, 140098174926847, +STORE, 140098174885888, 140098174910463, +ERASE, 140098174885888, 140098174910463, +STORE, 140098174885888, 140098174910463, +ERASE, 140098174910464, 140098174926847, +STORE, 140098174910464, 140098174926847, +SNULL, 140098174902271, 140098174910463, +STORE, 140098174885888, 140098174902271, +STORE, 140098174902272, 140098174910463, +SNULL, 94246491701247, 94246491705343, +STORE, 94246491693056, 94246491701247, +STORE, 94246491701248, 94246491705343, +SNULL, 140098177171455, 140098177175551, +STORE, 140098177167360, 140098177171455, 
+STORE, 140098177171456, 140098177175551, +ERASE, 140098177138688, 140098177167359, +STORE, 94246516998144, 94246517133311, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140730522918912, 140737488351231, +SNULL, 140730522927103, 140737488351231, +STORE, 140730522918912, 140730522927103, +STORE, 140730522787840, 140730522927103, +STORE, 94196043120640, 94196045344767, +SNULL, 94196043231231, 94196045344767, +STORE, 94196043120640, 94196043231231, +STORE, 94196043231232, 94196045344767, +ERASE, 94196043231232, 94196045344767, +STORE, 94196045324288, 94196045336575, +STORE, 94196045336576, 94196045344767, +STORE, 139815918940160, 139815921192959, +SNULL, 139815919083519, 139815921192959, +STORE, 139815918940160, 139815919083519, +STORE, 139815919083520, 139815921192959, +ERASE, 139815919083520, 139815921192959, +STORE, 139815921180672, 139815921188863, +STORE, 139815921188864, 139815921192959, +STORE, 140730523344896, 140730523348991, +STORE, 140730523332608, 140730523344895, +STORE, 139815921152000, 139815921180671, +STORE, 139815921143808, 139815921151999, +STORE, 139815915143168, 139815918940159, +SNULL, 139815915143168, 139815916802047, +STORE, 139815916802048, 139815918940159, +STORE, 139815915143168, 139815916802047, +SNULL, 139815918899199, 139815918940159, +STORE, 139815916802048, 139815918899199, +STORE, 139815918899200, 139815918940159, +SNULL, 139815918899200, 139815918923775, +STORE, 139815918923776, 139815918940159, +STORE, 139815918899200, 139815918923775, +ERASE, 139815918899200, 139815918923775, +STORE, 139815918899200, 139815918923775, +ERASE, 139815918923776, 139815918940159, +STORE, 139815918923776, 139815918940159, +SNULL, 139815918915583, 139815918923775, +STORE, 139815918899200, 139815918915583, +STORE, 139815918915584, 139815918923775, +SNULL, 94196045332479, 94196045336575, +STORE, 94196045324288, 94196045332479, +STORE, 94196045332480, 94196045336575, +SNULL, 139815921184767, 139815921188863, 
+STORE, 139815921180672, 139815921184767, +STORE, 139815921184768, 139815921188863, +ERASE, 139815921152000, 139815921180671, +STORE, 94196076183552, 94196076318719, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722460393472, 140737488351231, +SNULL, 140722460401663, 140737488351231, +STORE, 140722460393472, 140722460401663, +STORE, 140722460262400, 140722460401663, +STORE, 94569810399232, 94569812623359, +SNULL, 94569810509823, 94569812623359, +STORE, 94569810399232, 94569810509823, +STORE, 94569810509824, 94569812623359, +ERASE, 94569810509824, 94569812623359, +STORE, 94569812602880, 94569812615167, +STORE, 94569812615168, 94569812623359, +STORE, 139681565450240, 139681567703039, +SNULL, 139681565593599, 139681567703039, +STORE, 139681565450240, 139681565593599, +STORE, 139681565593600, 139681567703039, +ERASE, 139681565593600, 139681567703039, +STORE, 139681567690752, 139681567698943, +STORE, 139681567698944, 139681567703039, +STORE, 140722460569600, 140722460573695, +STORE, 140722460557312, 140722460569599, +STORE, 139681567662080, 139681567690751, +STORE, 139681567653888, 139681567662079, +STORE, 139681561653248, 139681565450239, +SNULL, 139681561653248, 139681563312127, +STORE, 139681563312128, 139681565450239, +STORE, 139681561653248, 139681563312127, +SNULL, 139681565409279, 139681565450239, +STORE, 139681563312128, 139681565409279, +STORE, 139681565409280, 139681565450239, +SNULL, 139681565409280, 139681565433855, +STORE, 139681565433856, 139681565450239, +STORE, 139681565409280, 139681565433855, +ERASE, 139681565409280, 139681565433855, +STORE, 139681565409280, 139681565433855, +ERASE, 139681565433856, 139681565450239, +STORE, 139681565433856, 139681565450239, +SNULL, 139681565425663, 139681565433855, +STORE, 139681565409280, 139681565425663, +STORE, 139681565425664, 139681565433855, +SNULL, 94569812611071, 94569812615167, +STORE, 94569812602880, 94569812611071, +STORE, 94569812611072, 94569812615167, 
+SNULL, 139681567694847, 139681567698943, +STORE, 139681567690752, 139681567694847, +STORE, 139681567694848, 139681567698943, +ERASE, 139681567662080, 139681567690751, +STORE, 94569818066944, 94569818202111, +STORE, 94431504838656, 94431505051647, +STORE, 94431507148800, 94431507152895, +STORE, 94431507152896, 94431507161087, +STORE, 94431507161088, 94431507173375, +STORE, 94431510286336, 94431534280703, +STORE, 139818797948928, 139818799607807, +STORE, 139818799607808, 139818801704959, +STORE, 139818801704960, 139818801721343, +STORE, 139818801721344, 139818801729535, +STORE, 139818801729536, 139818801745919, +STORE, 139818801745920, 139818801758207, +STORE, 139818801758208, 139818803851263, +STORE, 139818803851264, 139818803855359, +STORE, 139818803855360, 139818803859455, +STORE, 139818803859456, 139818804002815, +STORE, 139818804371456, 139818806054911, +STORE, 139818806054912, 139818806071295, +STORE, 139818806099968, 139818806104063, +STORE, 139818806104064, 139818806108159, +STORE, 139818806108160, 139818806112255, +STORE, 140731430457344, 140731430596607, +STORE, 140731431227392, 140731431239679, +STORE, 140731431239680, 140731431243775, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140725452365824, 140737488351231, +SNULL, 140725452374015, 140737488351231, +STORE, 140725452365824, 140725452374015, +STORE, 140725452234752, 140725452374015, +STORE, 94395067465728, 94395069689855, +SNULL, 94395067576319, 94395069689855, +STORE, 94395067465728, 94395067576319, +STORE, 94395067576320, 94395069689855, +ERASE, 94395067576320, 94395069689855, +STORE, 94395069669376, 94395069681663, +STORE, 94395069681664, 94395069689855, +STORE, 140269941211136, 140269943463935, +SNULL, 140269941354495, 140269943463935, +STORE, 140269941211136, 140269941354495, +STORE, 140269941354496, 140269943463935, +ERASE, 140269941354496, 140269943463935, +STORE, 140269943451648, 140269943459839, +STORE, 140269943459840, 140269943463935, +STORE, 
140725452558336, 140725452562431, +STORE, 140725452546048, 140725452558335, +STORE, 140269943422976, 140269943451647, +STORE, 140269943414784, 140269943422975, +STORE, 140269937414144, 140269941211135, +SNULL, 140269937414144, 140269939073023, +STORE, 140269939073024, 140269941211135, +STORE, 140269937414144, 140269939073023, +SNULL, 140269941170175, 140269941211135, +STORE, 140269939073024, 140269941170175, +STORE, 140269941170176, 140269941211135, +SNULL, 140269941170176, 140269941194751, +STORE, 140269941194752, 140269941211135, +STORE, 140269941170176, 140269941194751, +ERASE, 140269941170176, 140269941194751, +STORE, 140269941170176, 140269941194751, +ERASE, 140269941194752, 140269941211135, +STORE, 140269941194752, 140269941211135, +SNULL, 140269941186559, 140269941194751, +STORE, 140269941170176, 140269941186559, +STORE, 140269941186560, 140269941194751, +SNULL, 94395069677567, 94395069681663, +STORE, 94395069669376, 94395069677567, +STORE, 94395069677568, 94395069681663, +SNULL, 140269943455743, 140269943459839, +STORE, 140269943451648, 140269943455743, +STORE, 140269943455744, 140269943459839, +ERASE, 140269943422976, 140269943451647, +STORE, 94395101691904, 94395101827071, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140733860118528, 140737488351231, +SNULL, 140733860126719, 140737488351231, +STORE, 140733860118528, 140733860126719, +STORE, 140733859987456, 140733860126719, +STORE, 94484752990208, 94484755214335, +SNULL, 94484753100799, 94484755214335, +STORE, 94484752990208, 94484753100799, +STORE, 94484753100800, 94484755214335, +ERASE, 94484753100800, 94484755214335, +STORE, 94484755193856, 94484755206143, +STORE, 94484755206144, 94484755214335, +STORE, 139958922309632, 139958924562431, +SNULL, 139958922452991, 139958924562431, +STORE, 139958922309632, 139958922452991, +STORE, 139958922452992, 139958924562431, +ERASE, 139958922452992, 139958924562431, +STORE, 139958924550144, 139958924558335, +STORE, 
139958924558336, 139958924562431, +STORE, 140733860253696, 140733860257791, +STORE, 140733860241408, 140733860253695, +STORE, 139958924521472, 139958924550143, +STORE, 139958924513280, 139958924521471, +STORE, 139958918512640, 139958922309631, +SNULL, 139958918512640, 139958920171519, +STORE, 139958920171520, 139958922309631, +STORE, 139958918512640, 139958920171519, +SNULL, 139958922268671, 139958922309631, +STORE, 139958920171520, 139958922268671, +STORE, 139958922268672, 139958922309631, +SNULL, 139958922268672, 139958922293247, +STORE, 139958922293248, 139958922309631, +STORE, 139958922268672, 139958922293247, +ERASE, 139958922268672, 139958922293247, +STORE, 139958922268672, 139958922293247, +ERASE, 139958922293248, 139958922309631, +STORE, 139958922293248, 139958922309631, +SNULL, 139958922285055, 139958922293247, +STORE, 139958922268672, 139958922285055, +STORE, 139958922285056, 139958922293247, +SNULL, 94484755202047, 94484755206143, +STORE, 94484755193856, 94484755202047, +STORE, 94484755202048, 94484755206143, +SNULL, 139958924554239, 139958924558335, +STORE, 139958924550144, 139958924554239, +STORE, 139958924554240, 139958924558335, +ERASE, 139958924521472, 139958924550143, +STORE, 94484777615360, 94484777750527, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140731051036672, 140737488351231, +SNULL, 140731051044863, 140737488351231, +STORE, 140731051036672, 140731051044863, +STORE, 140731050905600, 140731051044863, +STORE, 93945822998528, 93945825222655, +SNULL, 93945823109119, 93945825222655, +STORE, 93945822998528, 93945823109119, +STORE, 93945823109120, 93945825222655, +ERASE, 93945823109120, 93945825222655, +STORE, 93945825202176, 93945825214463, +STORE, 93945825214464, 93945825222655, +STORE, 140153503997952, 140153506250751, +SNULL, 140153504141311, 140153506250751, +STORE, 140153503997952, 140153504141311, +STORE, 140153504141312, 140153506250751, +ERASE, 140153504141312, 140153506250751, +STORE, 
140153506238464, 140153506246655, +STORE, 140153506246656, 140153506250751, +STORE, 140731051331584, 140731051335679, +STORE, 140731051319296, 140731051331583, +STORE, 140153506209792, 140153506238463, +STORE, 140153506201600, 140153506209791, +STORE, 140153500200960, 140153503997951, +SNULL, 140153500200960, 140153501859839, +STORE, 140153501859840, 140153503997951, +STORE, 140153500200960, 140153501859839, +SNULL, 140153503956991, 140153503997951, +STORE, 140153501859840, 140153503956991, +STORE, 140153503956992, 140153503997951, +SNULL, 140153503956992, 140153503981567, +STORE, 140153503981568, 140153503997951, +STORE, 140153503956992, 140153503981567, +ERASE, 140153503956992, 140153503981567, +STORE, 140153503956992, 140153503981567, +ERASE, 140153503981568, 140153503997951, +STORE, 140153503981568, 140153503997951, +SNULL, 140153503973375, 140153503981567, +STORE, 140153503956992, 140153503973375, +STORE, 140153503973376, 140153503981567, +SNULL, 93945825210367, 93945825214463, +STORE, 93945825202176, 93945825210367, +STORE, 93945825210368, 93945825214463, +SNULL, 140153506242559, 140153506246655, +STORE, 140153506238464, 140153506242559, +STORE, 140153506242560, 140153506246655, +ERASE, 140153506209792, 140153506238463, +STORE, 93945854537728, 93945854672895, +STORE, 94431504838656, 94431505051647, +STORE, 94431507148800, 94431507152895, +STORE, 94431507152896, 94431507161087, +STORE, 94431507161088, 94431507173375, +STORE, 94431510286336, 94431537885183, +STORE, 139818797948928, 139818799607807, +STORE, 139818799607808, 139818801704959, +STORE, 139818801704960, 139818801721343, +STORE, 139818801721344, 139818801729535, +STORE, 139818801729536, 139818801745919, +STORE, 139818801745920, 139818801758207, +STORE, 139818801758208, 139818803851263, +STORE, 139818803851264, 139818803855359, +STORE, 139818803855360, 139818803859455, +STORE, 139818803859456, 139818804002815, +STORE, 139818804371456, 139818806054911, +STORE, 139818806054912, 139818806071295, +STORE, 
139818806099968, 139818806104063, +STORE, 139818806104064, 139818806108159, +STORE, 139818806108160, 139818806112255, +STORE, 140731430457344, 140731430596607, +STORE, 140731431227392, 140731431239679, +STORE, 140731431239680, 140731431243775, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140736025325568, 140737488351231, +SNULL, 140736025333759, 140737488351231, +STORE, 140736025325568, 140736025333759, +STORE, 140736025194496, 140736025333759, +STORE, 94809095172096, 94809097396223, +SNULL, 94809095282687, 94809097396223, +STORE, 94809095172096, 94809095282687, +STORE, 94809095282688, 94809097396223, +ERASE, 94809095282688, 94809097396223, +STORE, 94809097375744, 94809097388031, +STORE, 94809097388032, 94809097396223, +STORE, 140194992517120, 140194994769919, +SNULL, 140194992660479, 140194994769919, +STORE, 140194992517120, 140194992660479, +STORE, 140194992660480, 140194994769919, +ERASE, 140194992660480, 140194994769919, +STORE, 140194994757632, 140194994765823, +STORE, 140194994765824, 140194994769919, +STORE, 140736026173440, 140736026177535, +STORE, 140736026161152, 140736026173439, +STORE, 140194994728960, 140194994757631, +STORE, 140194994720768, 140194994728959, +STORE, 140194988720128, 140194992517119, +SNULL, 140194988720128, 140194990379007, +STORE, 140194990379008, 140194992517119, +STORE, 140194988720128, 140194990379007, +SNULL, 140194992476159, 140194992517119, +STORE, 140194990379008, 140194992476159, +STORE, 140194992476160, 140194992517119, +SNULL, 140194992476160, 140194992500735, +STORE, 140194992500736, 140194992517119, +STORE, 140194992476160, 140194992500735, +ERASE, 140194992476160, 140194992500735, +STORE, 140194992476160, 140194992500735, +ERASE, 140194992500736, 140194992517119, +STORE, 140194992500736, 140194992517119, +SNULL, 140194992492543, 140194992500735, +STORE, 140194992476160, 140194992492543, +STORE, 140194992492544, 140194992500735, +SNULL, 94809097383935, 94809097388031, +STORE, 
94809097375744, 94809097383935, +STORE, 94809097383936, 94809097388031, +SNULL, 140194994761727, 140194994765823, +STORE, 140194994757632, 140194994761727, +STORE, 140194994761728, 140194994765823, +ERASE, 140194994728960, 140194994757631, +STORE, 94809124286464, 94809124421631, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140726342660096, 140737488351231, +SNULL, 140726342668287, 140737488351231, +STORE, 140726342660096, 140726342668287, +STORE, 140726342529024, 140726342668287, +STORE, 94140331462656, 94140333686783, +SNULL, 94140331573247, 94140333686783, +STORE, 94140331462656, 94140331573247, +STORE, 94140331573248, 94140333686783, +ERASE, 94140331573248, 94140333686783, +STORE, 94140333666304, 94140333678591, +STORE, 94140333678592, 94140333686783, +STORE, 140714077208576, 140714079461375, +SNULL, 140714077351935, 140714079461375, +STORE, 140714077208576, 140714077351935, +STORE, 140714077351936, 140714079461375, +ERASE, 140714077351936, 140714079461375, +STORE, 140714079449088, 140714079457279, +STORE, 140714079457280, 140714079461375, +STORE, 140726343933952, 140726343938047, +STORE, 140726343921664, 140726343933951, +STORE, 140714079420416, 140714079449087, +STORE, 140714079412224, 140714079420415, +STORE, 140714073411584, 140714077208575, +SNULL, 140714073411584, 140714075070463, +STORE, 140714075070464, 140714077208575, +STORE, 140714073411584, 140714075070463, +SNULL, 140714077167615, 140714077208575, +STORE, 140714075070464, 140714077167615, +STORE, 140714077167616, 140714077208575, +SNULL, 140714077167616, 140714077192191, +STORE, 140714077192192, 140714077208575, +STORE, 140714077167616, 140714077192191, +ERASE, 140714077167616, 140714077192191, +STORE, 140714077167616, 140714077192191, +ERASE, 140714077192192, 140714077208575, +STORE, 140714077192192, 140714077208575, +SNULL, 140714077183999, 140714077192191, +STORE, 140714077167616, 140714077183999, +STORE, 140714077184000, 140714077192191, +SNULL, 
94140333674495, 94140333678591, +STORE, 94140333666304, 94140333674495, +STORE, 94140333674496, 94140333678591, +SNULL, 140714079453183, 140714079457279, +STORE, 140714079449088, 140714079453183, +STORE, 140714079453184, 140714079457279, +ERASE, 140714079420416, 140714079449087, +STORE, 94140341432320, 94140341567487, +STORE, 94431504838656, 94431505051647, +STORE, 94431507148800, 94431507152895, +STORE, 94431507152896, 94431507161087, +STORE, 94431507161088, 94431507173375, +STORE, 94431510286336, 94431539601407, +STORE, 139818797948928, 139818799607807, +STORE, 139818799607808, 139818801704959, +STORE, 139818801704960, 139818801721343, +STORE, 139818801721344, 139818801729535, +STORE, 139818801729536, 139818801745919, +STORE, 139818801745920, 139818801758207, +STORE, 139818801758208, 139818803851263, +STORE, 139818803851264, 139818803855359, +STORE, 139818803855360, 139818803859455, +STORE, 139818803859456, 139818804002815, +STORE, 139818804371456, 139818806054911, +STORE, 139818806054912, 139818806071295, +STORE, 139818806099968, 139818806104063, +STORE, 139818806104064, 139818806108159, +STORE, 139818806108160, 139818806112255, +STORE, 140731430457344, 140731430596607, +STORE, 140731431227392, 140731431239679, +STORE, 140731431239680, 140731431243775, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140725843607552, 140737488351231, +SNULL, 140725843615743, 140737488351231, +STORE, 140725843607552, 140725843615743, +STORE, 140725843476480, 140725843615743, +STORE, 94889043505152, 94889045839871, +SNULL, 94889043718143, 94889045839871, +STORE, 94889043505152, 94889043718143, +STORE, 94889043718144, 94889045839871, +ERASE, 94889043718144, 94889045839871, +STORE, 94889045815296, 94889045827583, +STORE, 94889045827584, 94889045839871, +STORE, 140250965946368, 140250968199167, +SNULL, 140250966089727, 140250968199167, +STORE, 140250965946368, 140250966089727, +STORE, 140250966089728, 140250968199167, +ERASE, 
140250966089728, 140250968199167, +STORE, 140250968186880, 140250968195071, +STORE, 140250968195072, 140250968199167, +STORE, 140725844500480, 140725844504575, +STORE, 140725844488192, 140725844500479, +STORE, 140250968158208, 140250968186879, +STORE, 140250968150016, 140250968158207, +STORE, 140250963832832, 140250965946367, +SNULL, 140250963832832, 140250963845119, +STORE, 140250963845120, 140250965946367, +STORE, 140250963832832, 140250963845119, +SNULL, 140250965938175, 140250965946367, +STORE, 140250963845120, 140250965938175, +STORE, 140250965938176, 140250965946367, +ERASE, 140250965938176, 140250965946367, +STORE, 140250965938176, 140250965946367, +STORE, 140250960035840, 140250963832831, +SNULL, 140250960035840, 140250961694719, +STORE, 140250961694720, 140250963832831, +STORE, 140250960035840, 140250961694719, +SNULL, 140250963791871, 140250963832831, +STORE, 140250961694720, 140250963791871, +STORE, 140250963791872, 140250963832831, +SNULL, 140250963791872, 140250963816447, +STORE, 140250963816448, 140250963832831, +STORE, 140250963791872, 140250963816447, +ERASE, 140250963791872, 140250963816447, +STORE, 140250963791872, 140250963816447, +ERASE, 140250963816448, 140250963832831, +STORE, 140250963816448, 140250963832831, +STORE, 140250968141824, 140250968158207, +SNULL, 140250963808255, 140250963816447, +STORE, 140250963791872, 140250963808255, +STORE, 140250963808256, 140250963816447, +SNULL, 140250965942271, 140250965946367, +STORE, 140250965938176, 140250965942271, +STORE, 140250965942272, 140250965946367, +SNULL, 94889045819391, 94889045827583, +STORE, 94889045815296, 94889045819391, +STORE, 94889045819392, 94889045827583, +SNULL, 140250968190975, 140250968195071, +STORE, 140250968186880, 140250968190975, +STORE, 140250968190976, 140250968195071, +ERASE, 140250968158208, 140250968186879, +STORE, 94889052213248, 94889052348415, +STORE, 140250966458368, 140250968141823, +STORE, 94889052213248, 94889052483583, +STORE, 94889052213248, 94889052618751, 
+STORE, 94170851819520, 94170852032511, +STORE, 94170854129664, 94170854133759, +STORE, 94170854133760, 94170854141951, +STORE, 94170854141952, 94170854154239, +STORE, 94170866515968, 94170867740671, +STORE, 140062030422016, 140062032080895, +STORE, 140062032080896, 140062034178047, +STORE, 140062034178048, 140062034194431, +STORE, 140062034194432, 140062034202623, +STORE, 140062034202624, 140062034219007, +STORE, 140062034219008, 140062034231295, +STORE, 140062034231296, 140062036324351, +STORE, 140062036324352, 140062036328447, +STORE, 140062036328448, 140062036332543, +STORE, 140062036332544, 140062036475903, +STORE, 140062036844544, 140062038527999, +STORE, 140062038528000, 140062038544383, +STORE, 140062038573056, 140062038577151, +STORE, 140062038577152, 140062038581247, +STORE, 140062038581248, 140062038585343, +STORE, 140736210550784, 140736210690047, +STORE, 140736210759680, 140736210771967, +STORE, 140736210771968, 140736210776063, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140724272365568, 140737488351231, +SNULL, 140724272373759, 140737488351231, +STORE, 140724272365568, 140724272373759, +STORE, 140724272234496, 140724272373759, +STORE, 94607711965184, 94607714189311, +SNULL, 94607712075775, 94607714189311, +STORE, 94607711965184, 94607712075775, +STORE, 94607712075776, 94607714189311, +ERASE, 94607712075776, 94607714189311, +STORE, 94607714168832, 94607714181119, +STORE, 94607714181120, 94607714189311, +STORE, 140054949253120, 140054951505919, +SNULL, 140054949396479, 140054951505919, +STORE, 140054949253120, 140054949396479, +STORE, 140054949396480, 140054951505919, +ERASE, 140054949396480, 140054951505919, +STORE, 140054951493632, 140054951501823, +STORE, 140054951501824, 140054951505919, +STORE, 140724272992256, 140724272996351, +STORE, 140724272979968, 140724272992255, +STORE, 140054951464960, 140054951493631, +STORE, 140054951456768, 140054951464959, +STORE, 140054945456128, 140054949253119, +SNULL, 
140054945456128, 140054947115007, +STORE, 140054947115008, 140054949253119, +STORE, 140054945456128, 140054947115007, +SNULL, 140054949212159, 140054949253119, +STORE, 140054947115008, 140054949212159, +STORE, 140054949212160, 140054949253119, +SNULL, 140054949212160, 140054949236735, +STORE, 140054949236736, 140054949253119, +STORE, 140054949212160, 140054949236735, +ERASE, 140054949212160, 140054949236735, +STORE, 140054949212160, 140054949236735, +ERASE, 140054949236736, 140054949253119, +STORE, 140054949236736, 140054949253119, +SNULL, 140054949228543, 140054949236735, +STORE, 140054949212160, 140054949228543, +STORE, 140054949228544, 140054949236735, +SNULL, 94607714177023, 94607714181119, +STORE, 94607714168832, 94607714177023, +STORE, 94607714177024, 94607714181119, +SNULL, 140054951497727, 140054951501823, +STORE, 140054951493632, 140054951497727, +STORE, 140054951497728, 140054951501823, +ERASE, 140054951464960, 140054951493631, +STORE, 94607733374976, 94607733510143, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140733586923520, 140737488351231, +SNULL, 140733586931711, 140737488351231, +STORE, 140733586923520, 140733586931711, +STORE, 140733586792448, 140733586931711, +STORE, 93901634904064, 93901637128191, +SNULL, 93901635014655, 93901637128191, +STORE, 93901634904064, 93901635014655, +STORE, 93901635014656, 93901637128191, +ERASE, 93901635014656, 93901637128191, +STORE, 93901637107712, 93901637119999, +STORE, 93901637120000, 93901637128191, +STORE, 140086104784896, 140086107037695, +SNULL, 140086104928255, 140086107037695, +STORE, 140086104784896, 140086104928255, +STORE, 140086104928256, 140086107037695, +ERASE, 140086104928256, 140086107037695, +STORE, 140086107025408, 140086107033599, +STORE, 140086107033600, 140086107037695, +STORE, 140733587263488, 140733587267583, +STORE, 140733587251200, 140733587263487, +STORE, 140086106996736, 140086107025407, +STORE, 140086106988544, 140086106996735, +STORE, 
140086100987904, 140086104784895, +SNULL, 140086100987904, 140086102646783, +STORE, 140086102646784, 140086104784895, +STORE, 140086100987904, 140086102646783, +SNULL, 140086104743935, 140086104784895, +STORE, 140086102646784, 140086104743935, +STORE, 140086104743936, 140086104784895, +SNULL, 140086104743936, 140086104768511, +STORE, 140086104768512, 140086104784895, +STORE, 140086104743936, 140086104768511, +ERASE, 140086104743936, 140086104768511, +STORE, 140086104743936, 140086104768511, +ERASE, 140086104768512, 140086104784895, +STORE, 140086104768512, 140086104784895, +SNULL, 140086104760319, 140086104768511, +STORE, 140086104743936, 140086104760319, +STORE, 140086104760320, 140086104768511, +SNULL, 93901637115903, 93901637119999, +STORE, 93901637107712, 93901637115903, +STORE, 93901637115904, 93901637119999, +SNULL, 140086107029503, 140086107033599, +STORE, 140086107025408, 140086107029503, +STORE, 140086107029504, 140086107033599, +ERASE, 140086106996736, 140086107025407, +STORE, 93901662715904, 93901662851071, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140723365613568, 140737488351231, +SNULL, 140723365621759, 140737488351231, +STORE, 140723365613568, 140723365621759, +STORE, 140723365482496, 140723365621759, +STORE, 94759193546752, 94759195770879, +SNULL, 94759193657343, 94759195770879, +STORE, 94759193546752, 94759193657343, +STORE, 94759193657344, 94759195770879, +ERASE, 94759193657344, 94759195770879, +STORE, 94759195750400, 94759195762687, +STORE, 94759195762688, 94759195770879, +STORE, 140607636246528, 140607638499327, +SNULL, 140607636389887, 140607638499327, +STORE, 140607636246528, 140607636389887, +STORE, 140607636389888, 140607638499327, +ERASE, 140607636389888, 140607638499327, +STORE, 140607638487040, 140607638495231, +STORE, 140607638495232, 140607638499327, +STORE, 140723365900288, 140723365904383, +STORE, 140723365888000, 140723365900287, +STORE, 140607638458368, 140607638487039, +STORE, 
140607638450176, 140607638458367, +STORE, 140607632449536, 140607636246527, +SNULL, 140607632449536, 140607634108415, +STORE, 140607634108416, 140607636246527, +STORE, 140607632449536, 140607634108415, +SNULL, 140607636205567, 140607636246527, +STORE, 140607634108416, 140607636205567, +STORE, 140607636205568, 140607636246527, +SNULL, 140607636205568, 140607636230143, +STORE, 140607636230144, 140607636246527, +STORE, 140607636205568, 140607636230143, +ERASE, 140607636205568, 140607636230143, +STORE, 140607636205568, 140607636230143, +ERASE, 140607636230144, 140607636246527, +STORE, 140607636230144, 140607636246527, +SNULL, 140607636221951, 140607636230143, +STORE, 140607636205568, 140607636221951, +STORE, 140607636221952, 140607636230143, +SNULL, 94759195758591, 94759195762687, +STORE, 94759195750400, 94759195758591, +STORE, 94759195758592, 94759195762687, +SNULL, 140607638491135, 140607638495231, +STORE, 140607638487040, 140607638491135, +STORE, 140607638491136, 140607638495231, +ERASE, 140607638458368, 140607638487039, +STORE, 94759204995072, 94759205130239, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140732503789568, 140737488351231, +SNULL, 140732503797759, 140737488351231, +STORE, 140732503789568, 140732503797759, +STORE, 140732503658496, 140732503797759, +STORE, 94077792956416, 94077795180543, +SNULL, 94077793067007, 94077795180543, +STORE, 94077792956416, 94077793067007, +STORE, 94077793067008, 94077795180543, +ERASE, 94077793067008, 94077795180543, +STORE, 94077795160064, 94077795172351, +STORE, 94077795172352, 94077795180543, +STORE, 140359874252800, 140359876505599, +SNULL, 140359874396159, 140359876505599, +STORE, 140359874252800, 140359874396159, +STORE, 140359874396160, 140359876505599, +ERASE, 140359874396160, 140359876505599, +STORE, 140359876493312, 140359876501503, +STORE, 140359876501504, 140359876505599, +STORE, 140732504465408, 140732504469503, +STORE, 140732504453120, 140732504465407, +STORE, 
140359876464640, 140359876493311, +STORE, 140359876456448, 140359876464639, +STORE, 140359870455808, 140359874252799, +SNULL, 140359870455808, 140359872114687, +STORE, 140359872114688, 140359874252799, +STORE, 140359870455808, 140359872114687, +SNULL, 140359874211839, 140359874252799, +STORE, 140359872114688, 140359874211839, +STORE, 140359874211840, 140359874252799, +SNULL, 140359874211840, 140359874236415, +STORE, 140359874236416, 140359874252799, +STORE, 140359874211840, 140359874236415, +ERASE, 140359874211840, 140359874236415, +STORE, 140359874211840, 140359874236415, +ERASE, 140359874236416, 140359874252799, +STORE, 140359874236416, 140359874252799, +SNULL, 140359874228223, 140359874236415, +STORE, 140359874211840, 140359874228223, +STORE, 140359874228224, 140359874236415, +SNULL, 94077795168255, 94077795172351, +STORE, 94077795160064, 94077795168255, +STORE, 94077795168256, 94077795172351, +SNULL, 140359876497407, 140359876501503, +STORE, 140359876493312, 140359876497407, +STORE, 140359876497408, 140359876501503, +ERASE, 140359876464640, 140359876493311, +STORE, 94077808717824, 94077808852991, +STORE, 94549486252032, 94549486465023, +STORE, 94549488562176, 94549488566271, +STORE, 94549488566272, 94549488574463, +STORE, 94549488574464, 94549488586751, +STORE, 94549503492096, 94549506121727, +STORE, 140085800894464, 140085802553343, +STORE, 140085802553344, 140085804650495, +STORE, 140085804650496, 140085804666879, +STORE, 140085804666880, 140085804675071, +STORE, 140085804675072, 140085804691455, +STORE, 140085804691456, 140085804703743, +STORE, 140085804703744, 140085806796799, +STORE, 140085806796800, 140085806800895, +STORE, 140085806800896, 140085806804991, +STORE, 140085806804992, 140085806948351, +STORE, 140085807316992, 140085809000447, +STORE, 140085809000448, 140085809016831, +STORE, 140085809045504, 140085809049599, +STORE, 140085809049600, 140085809053695, +STORE, 140085809053696, 140085809057791, +STORE, 140731810545664, 140731810684927, +STORE, 
140731810967552, 140731810979839, +STORE, 140731810979840, 140731810983935, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140724752330752, 140737488351231, +SNULL, 140724752338943, 140737488351231, +STORE, 140724752330752, 140724752338943, +STORE, 140724752199680, 140724752338943, +STORE, 94656357539840, 94656359874559, +SNULL, 94656357752831, 94656359874559, +STORE, 94656357539840, 94656357752831, +STORE, 94656357752832, 94656359874559, +ERASE, 94656357752832, 94656359874559, +STORE, 94656359849984, 94656359862271, +STORE, 94656359862272, 94656359874559, +STORE, 139632585203712, 139632587456511, +SNULL, 139632585347071, 139632587456511, +STORE, 139632585203712, 139632585347071, +STORE, 139632585347072, 139632587456511, +ERASE, 139632585347072, 139632587456511, +STORE, 139632587444224, 139632587452415, +STORE, 139632587452416, 139632587456511, +STORE, 139632587440128, 139632587444223, +STORE, 139632587427840, 139632587440127, +STORE, 139632587399168, 139632587427839, +STORE, 139632587390976, 139632587399167, +STORE, 139632583090176, 139632585203711, +SNULL, 139632583090176, 139632583102463, +STORE, 139632583102464, 139632585203711, +STORE, 139632583090176, 139632583102463, +SNULL, 139632585195519, 139632585203711, +STORE, 139632583102464, 139632585195519, +STORE, 139632585195520, 139632585203711, +ERASE, 139632585195520, 139632585203711, +STORE, 139632585195520, 139632585203711, +STORE, 139632579293184, 139632583090175, +SNULL, 139632579293184, 139632580952063, +STORE, 139632580952064, 139632583090175, +STORE, 139632579293184, 139632580952063, +SNULL, 139632583049215, 139632583090175, +STORE, 139632580952064, 139632583049215, +STORE, 139632583049216, 139632583090175, +SNULL, 139632583049216, 139632583073791, +STORE, 139632583073792, 139632583090175, +STORE, 139632583049216, 139632583073791, +ERASE, 139632583049216, 139632583073791, +STORE, 139632583049216, 139632583073791, +ERASE, 139632583073792, 139632583090175, 
+STORE, 139632583073792, 139632583090175, +STORE, 139632587382784, 139632587399167, +SNULL, 139632583065599, 139632583073791, +STORE, 139632583049216, 139632583065599, +STORE, 139632583065600, 139632583073791, +SNULL, 139632585199615, 139632585203711, +STORE, 139632585195520, 139632585199615, +STORE, 139632585199616, 139632585203711, +SNULL, 94656359854079, 94656359862271, +STORE, 94656359849984, 94656359854079, +STORE, 94656359854080, 94656359862271, +SNULL, 139632587448319, 139632587452415, +STORE, 139632587444224, 139632587448319, +STORE, 139632587448320, 139632587452415, +ERASE, 139632587399168, 139632587427839, +STORE, 94656378912768, 94656379047935, +STORE, 139632585699328, 139632587382783, +STORE, 94656378912768, 94656379183103, +STORE, 94656378912768, 94656379318271, +STORE, 94656378912768, 94656379494399, +SNULL, 94656379469823, 94656379494399, +STORE, 94656378912768, 94656379469823, +STORE, 94656379469824, 94656379494399, +ERASE, 94656379469824, 94656379494399, +STORE, 94656378912768, 94656379621375, +STORE, 94656378912768, 94656379756543, +STORE, 94656378912768, 94656379912191, +STORE, 94656378912768, 94656380055551, +STORE, 94656378912768, 94656380190719, +STORE, 94656378912768, 94656380338175, +SNULL, 94656380313599, 94656380338175, +STORE, 94656378912768, 94656380313599, +STORE, 94656380313600, 94656380338175, +ERASE, 94656380313600, 94656380338175, +STORE, 94656378912768, 94656380448767, +SNULL, 94656380432383, 94656380448767, +STORE, 94656378912768, 94656380432383, +STORE, 94656380432384, 94656380448767, +ERASE, 94656380432384, 94656380448767, +STORE, 94656378912768, 94656380567551, +STORE, 94656378912768, 94656380719103, +STORE, 94656378912768, 94656380858367, +STORE, 94656378912768, 94656380997631, +STORE, 94656378912768, 94656381132799, +SNULL, 94656381124607, 94656381132799, +STORE, 94656378912768, 94656381124607, +STORE, 94656381124608, 94656381132799, +ERASE, 94656381124608, 94656381132799, +STORE, 94656378912768, 94656381276159, +STORE, 
94656378912768, 94656381427711, +STORE, 94604087611392, 94604087824383, +STORE, 94604089921536, 94604089925631, +STORE, 94604089925632, 94604089933823, +STORE, 94604089933824, 94604089946111, +STORE, 94604105125888, 94604106424319, +STORE, 140454937694208, 140454939353087, +STORE, 140454939353088, 140454941450239, +STORE, 140454941450240, 140454941466623, +STORE, 140454941466624, 140454941474815, +STORE, 140454941474816, 140454941491199, +STORE, 140454941491200, 140454941503487, +STORE, 140454941503488, 140454943596543, +STORE, 140454943596544, 140454943600639, +STORE, 140454943600640, 140454943604735, +STORE, 140454943604736, 140454943748095, +STORE, 140454944116736, 140454945800191, +STORE, 140454945800192, 140454945816575, +STORE, 140454945845248, 140454945849343, +STORE, 140454945849344, 140454945853439, +STORE, 140454945853440, 140454945857535, +STORE, 140728438214656, 140728438353919, +STORE, 140728439095296, 140728439107583, +STORE, 140728439107584, 140728439111679, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140727821099008, 140737488351231, +SNULL, 140727821107199, 140737488351231, +STORE, 140727821099008, 140727821107199, +STORE, 140727820967936, 140727821107199, +STORE, 94088457240576, 94088459575295, +SNULL, 94088457453567, 94088459575295, +STORE, 94088457240576, 94088457453567, +STORE, 94088457453568, 94088459575295, +ERASE, 94088457453568, 94088459575295, +STORE, 94088459550720, 94088459563007, +STORE, 94088459563008, 94088459575295, +STORE, 140234378989568, 140234381242367, +SNULL, 140234379132927, 140234381242367, +STORE, 140234378989568, 140234379132927, +STORE, 140234379132928, 140234381242367, +ERASE, 140234379132928, 140234381242367, +STORE, 140234381230080, 140234381238271, +STORE, 140234381238272, 140234381242367, +STORE, 140727822077952, 140727822082047, +STORE, 140727822065664, 140727822077951, +STORE, 140234381201408, 140234381230079, +STORE, 140234381193216, 140234381201407, +STORE, 
140234376876032, 140234378989567, +SNULL, 140234376876032, 140234376888319, +STORE, 140234376888320, 140234378989567, +STORE, 140234376876032, 140234376888319, +SNULL, 140234378981375, 140234378989567, +STORE, 140234376888320, 140234378981375, +STORE, 140234378981376, 140234378989567, +ERASE, 140234378981376, 140234378989567, +STORE, 140234378981376, 140234378989567, +STORE, 140234373079040, 140234376876031, +SNULL, 140234373079040, 140234374737919, +STORE, 140234374737920, 140234376876031, +STORE, 140234373079040, 140234374737919, +SNULL, 140234376835071, 140234376876031, +STORE, 140234374737920, 140234376835071, +STORE, 140234376835072, 140234376876031, +SNULL, 140234376835072, 140234376859647, +STORE, 140234376859648, 140234376876031, +STORE, 140234376835072, 140234376859647, +ERASE, 140234376835072, 140234376859647, +STORE, 140234376835072, 140234376859647, +ERASE, 140234376859648, 140234376876031, +STORE, 140234376859648, 140234376876031, +STORE, 140234381185024, 140234381201407, +SNULL, 140234376851455, 140234376859647, +STORE, 140234376835072, 140234376851455, +STORE, 140234376851456, 140234376859647, +SNULL, 140234378985471, 140234378989567, +STORE, 140234378981376, 140234378985471, +STORE, 140234378985472, 140234378989567, +SNULL, 94088459554815, 94088459563007, +STORE, 94088459550720, 94088459554815, +STORE, 94088459554816, 94088459563007, +SNULL, 140234381234175, 140234381238271, +STORE, 140234381230080, 140234381234175, +STORE, 140234381234176, 140234381238271, +ERASE, 140234381201408, 140234381230079, +STORE, 94088468852736, 94088468987903, +STORE, 140234379501568, 140234381185023, +STORE, 94088468852736, 94088469123071, +STORE, 94088468852736, 94088469258239, +STORE, 94110050402304, 94110050615295, +STORE, 94110052712448, 94110052716543, +STORE, 94110052716544, 94110052724735, +STORE, 94110052724736, 94110052737023, +STORE, 94110061875200, 94110062415871, +STORE, 140139439357952, 140139441016831, +STORE, 140139441016832, 140139443113983, +STORE, 
140139443113984, 140139443130367, +STORE, 140139443130368, 140139443138559, +STORE, 140139443138560, 140139443154943, +STORE, 140139443154944, 140139443167231, +STORE, 140139443167232, 140139445260287, +STORE, 140139445260288, 140139445264383, +STORE, 140139445264384, 140139445268479, +STORE, 140139445268480, 140139445411839, +STORE, 140139445780480, 140139447463935, +STORE, 140139447463936, 140139447480319, +STORE, 140139447508992, 140139447513087, +STORE, 140139447513088, 140139447517183, +STORE, 140139447517184, 140139447521279, +STORE, 140731901427712, 140731901566975, +STORE, 140731902259200, 140731902271487, +STORE, 140731902271488, 140731902275583, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140727282622464, 140737488351231, +SNULL, 140727282630655, 140737488351231, +STORE, 140727282622464, 140727282630655, +STORE, 140727282491392, 140727282630655, +STORE, 94266649866240, 94266652200959, +SNULL, 94266650079231, 94266652200959, +STORE, 94266649866240, 94266650079231, +STORE, 94266650079232, 94266652200959, +ERASE, 94266650079232, 94266652200959, +STORE, 94266652176384, 94266652188671, +STORE, 94266652188672, 94266652200959, +STORE, 139888497991680, 139888500244479, +SNULL, 139888498135039, 139888500244479, +STORE, 139888497991680, 139888498135039, +STORE, 139888498135040, 139888500244479, +ERASE, 139888498135040, 139888500244479, +STORE, 139888500232192, 139888500240383, +STORE, 139888500240384, 139888500244479, +STORE, 140727283113984, 140727283118079, +STORE, 140727283101696, 140727283113983, +STORE, 139888500203520, 139888500232191, +STORE, 139888500195328, 139888500203519, +STORE, 139888495878144, 139888497991679, +SNULL, 139888495878144, 139888495890431, +STORE, 139888495890432, 139888497991679, +STORE, 139888495878144, 139888495890431, +SNULL, 139888497983487, 139888497991679, +STORE, 139888495890432, 139888497983487, +STORE, 139888497983488, 139888497991679, +ERASE, 139888497983488, 139888497991679, 
+STORE, 139888497983488, 139888497991679, +STORE, 139888492081152, 139888495878143, +SNULL, 139888492081152, 139888493740031, +STORE, 139888493740032, 139888495878143, +STORE, 139888492081152, 139888493740031, +SNULL, 139888495837183, 139888495878143, +STORE, 139888493740032, 139888495837183, +STORE, 139888495837184, 139888495878143, +SNULL, 139888495837184, 139888495861759, +STORE, 139888495861760, 139888495878143, +STORE, 139888495837184, 139888495861759, +ERASE, 139888495837184, 139888495861759, +STORE, 139888495837184, 139888495861759, +ERASE, 139888495861760, 139888495878143, +STORE, 139888495861760, 139888495878143, +STORE, 139888500187136, 139888500203519, +SNULL, 139888495853567, 139888495861759, +STORE, 139888495837184, 139888495853567, +STORE, 139888495853568, 139888495861759, +SNULL, 139888497987583, 139888497991679, +STORE, 139888497983488, 139888497987583, +STORE, 139888497987584, 139888497991679, +SNULL, 94266652180479, 94266652188671, +STORE, 94266652176384, 94266652180479, +STORE, 94266652180480, 94266652188671, +SNULL, 139888500236287, 139888500240383, +STORE, 139888500232192, 139888500236287, +STORE, 139888500236288, 139888500240383, +ERASE, 139888500203520, 139888500232191, +STORE, 94266678542336, 94266678677503, +STORE, 139888498503680, 139888500187135, +STORE, 94266678542336, 94266678812671, +STORE, 94266678542336, 94266678947839, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722507702272, 140737488351231, +SNULL, 140722507710463, 140737488351231, +STORE, 140722507702272, 140722507710463, +STORE, 140722507571200, 140722507710463, +STORE, 94313981394944, 94313983729663, +SNULL, 94313981607935, 94313983729663, +STORE, 94313981394944, 94313981607935, +STORE, 94313981607936, 94313983729663, +ERASE, 94313981607936, 94313983729663, +STORE, 94313983705088, 94313983717375, +STORE, 94313983717376, 94313983729663, +STORE, 140456286076928, 140456288329727, +SNULL, 140456286220287, 140456288329727, +STORE, 
140456286076928, 140456286220287, +STORE, 140456286220288, 140456288329727, +ERASE, 140456286220288, 140456288329727, +STORE, 140456288317440, 140456288325631, +STORE, 140456288325632, 140456288329727, +STORE, 140722507997184, 140722508001279, +STORE, 140722507984896, 140722507997183, +STORE, 140456288288768, 140456288317439, +STORE, 140456288280576, 140456288288767, +STORE, 140456283963392, 140456286076927, +SNULL, 140456283963392, 140456283975679, +STORE, 140456283975680, 140456286076927, +STORE, 140456283963392, 140456283975679, +SNULL, 140456286068735, 140456286076927, +STORE, 140456283975680, 140456286068735, +STORE, 140456286068736, 140456286076927, +ERASE, 140456286068736, 140456286076927, +STORE, 140456286068736, 140456286076927, +STORE, 140456280166400, 140456283963391, +SNULL, 140456280166400, 140456281825279, +STORE, 140456281825280, 140456283963391, +STORE, 140456280166400, 140456281825279, +SNULL, 140456283922431, 140456283963391, +STORE, 140456281825280, 140456283922431, +STORE, 140456283922432, 140456283963391, +SNULL, 140456283922432, 140456283947007, +STORE, 140456283947008, 140456283963391, +STORE, 140456283922432, 140456283947007, +ERASE, 140456283922432, 140456283947007, +STORE, 140456283922432, 140456283947007, +ERASE, 140456283947008, 140456283963391, +STORE, 140456283947008, 140456283963391, +STORE, 140456288272384, 140456288288767, +SNULL, 140456283938815, 140456283947007, +STORE, 140456283922432, 140456283938815, +STORE, 140456283938816, 140456283947007, +SNULL, 140456286072831, 140456286076927, +STORE, 140456286068736, 140456286072831, +STORE, 140456286072832, 140456286076927, +SNULL, 94313983709183, 94313983717375, +STORE, 94313983705088, 94313983709183, +STORE, 94313983709184, 94313983717375, +SNULL, 140456288321535, 140456288325631, +STORE, 140456288317440, 140456288321535, +STORE, 140456288321536, 140456288325631, +ERASE, 140456288288768, 140456288317439, +STORE, 94314006716416, 94314006851583, +STORE, 140456286588928, 140456288272383, 
+STORE, 94314006716416, 94314006986751, +STORE, 94314006716416, 94314007121919, +STORE, 93948644454400, 93948644667391, +STORE, 93948646764544, 93948646768639, +STORE, 93948646768640, 93948646776831, +STORE, 93948646776832, 93948646789119, +STORE, 93948664999936, 93948667142143, +STORE, 140187350659072, 140187352317951, +STORE, 140187352317952, 140187354415103, +STORE, 140187354415104, 140187354431487, +STORE, 140187354431488, 140187354439679, +STORE, 140187354439680, 140187354456063, +STORE, 140187354456064, 140187354468351, +STORE, 140187354468352, 140187356561407, +STORE, 140187356561408, 140187356565503, +STORE, 140187356565504, 140187356569599, +STORE, 140187356569600, 140187356712959, +STORE, 140187357081600, 140187358765055, +STORE, 140187358765056, 140187358781439, +STORE, 140187358810112, 140187358814207, +STORE, 140187358814208, 140187358818303, +STORE, 140187358818304, 140187358822399, +STORE, 140730484518912, 140730484658175, +STORE, 140730485690368, 140730485702655, +STORE, 140730485702656, 140730485706751, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140721211551744, 140737488351231, +SNULL, 140721211559935, 140737488351231, +STORE, 140721211551744, 140721211559935, +STORE, 140721211420672, 140721211559935, +STORE, 94105221423104, 94105223757823, +SNULL, 94105221636095, 94105223757823, +STORE, 94105221423104, 94105221636095, +STORE, 94105221636096, 94105223757823, +ERASE, 94105221636096, 94105223757823, +STORE, 94105223733248, 94105223745535, +STORE, 94105223745536, 94105223757823, +STORE, 140474453676032, 140474455928831, +SNULL, 140474453819391, 140474455928831, +STORE, 140474453676032, 140474453819391, +STORE, 140474453819392, 140474455928831, +ERASE, 140474453819392, 140474455928831, +STORE, 140474455916544, 140474455924735, +STORE, 140474455924736, 140474455928831, +STORE, 140721211703296, 140721211707391, +STORE, 140721211691008, 140721211703295, +STORE, 140474455887872, 140474455916543, +STORE, 
140474455879680, 140474455887871, +STORE, 140474451562496, 140474453676031, +SNULL, 140474451562496, 140474451574783, +STORE, 140474451574784, 140474453676031, +STORE, 140474451562496, 140474451574783, +SNULL, 140474453667839, 140474453676031, +STORE, 140474451574784, 140474453667839, +STORE, 140474453667840, 140474453676031, +ERASE, 140474453667840, 140474453676031, +STORE, 140474453667840, 140474453676031, +STORE, 140474447765504, 140474451562495, +SNULL, 140474447765504, 140474449424383, +STORE, 140474449424384, 140474451562495, +STORE, 140474447765504, 140474449424383, +SNULL, 140474451521535, 140474451562495, +STORE, 140474449424384, 140474451521535, +STORE, 140474451521536, 140474451562495, +SNULL, 140474451521536, 140474451546111, +STORE, 140474451546112, 140474451562495, +STORE, 140474451521536, 140474451546111, +ERASE, 140474451521536, 140474451546111, +STORE, 140474451521536, 140474451546111, +ERASE, 140474451546112, 140474451562495, +STORE, 140474451546112, 140474451562495, +STORE, 140474455871488, 140474455887871, +SNULL, 140474451537919, 140474451546111, +STORE, 140474451521536, 140474451537919, +STORE, 140474451537920, 140474451546111, +SNULL, 140474453671935, 140474453676031, +STORE, 140474453667840, 140474453671935, +STORE, 140474453671936, 140474453676031, +SNULL, 94105223737343, 94105223745535, +STORE, 94105223733248, 94105223737343, +STORE, 94105223737344, 94105223745535, +SNULL, 140474455920639, 140474455924735, +STORE, 140474455916544, 140474455920639, +STORE, 140474455920640, 140474455924735, +ERASE, 140474455887872, 140474455916543, +STORE, 94105238712320, 94105238847487, +STORE, 140474454188032, 140474455871487, +STORE, 94105238712320, 94105238982655, +STORE, 94105238712320, 94105239117823, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140732356354048, 140737488351231, +SNULL, 140732356362239, 140737488351231, +STORE, 140732356354048, 140732356362239, +STORE, 140732356222976, 140732356362239, 
+STORE, 94461165989888, 94461168324607, +SNULL, 94461166202879, 94461168324607, +STORE, 94461165989888, 94461166202879, +STORE, 94461166202880, 94461168324607, +ERASE, 94461166202880, 94461168324607, +STORE, 94461168300032, 94461168312319, +STORE, 94461168312320, 94461168324607, +STORE, 140317255110656, 140317257363455, +SNULL, 140317255254015, 140317257363455, +STORE, 140317255110656, 140317255254015, +STORE, 140317255254016, 140317257363455, +ERASE, 140317255254016, 140317257363455, +STORE, 140317257351168, 140317257359359, +STORE, 140317257359360, 140317257363455, +STORE, 140732356583424, 140732356587519, +STORE, 140732356571136, 140732356583423, +STORE, 140317257322496, 140317257351167, +STORE, 140317257314304, 140317257322495, +STORE, 140317252997120, 140317255110655, +SNULL, 140317252997120, 140317253009407, +STORE, 140317253009408, 140317255110655, +STORE, 140317252997120, 140317253009407, +SNULL, 140317255102463, 140317255110655, +STORE, 140317253009408, 140317255102463, +STORE, 140317255102464, 140317255110655, +ERASE, 140317255102464, 140317255110655, +STORE, 140317255102464, 140317255110655, +STORE, 140317249200128, 140317252997119, +SNULL, 140317249200128, 140317250859007, +STORE, 140317250859008, 140317252997119, +STORE, 140317249200128, 140317250859007, +SNULL, 140317252956159, 140317252997119, +STORE, 140317250859008, 140317252956159, +STORE, 140317252956160, 140317252997119, +SNULL, 140317252956160, 140317252980735, +STORE, 140317252980736, 140317252997119, +STORE, 140317252956160, 140317252980735, +ERASE, 140317252956160, 140317252980735, +STORE, 140317252956160, 140317252980735, +ERASE, 140317252980736, 140317252997119, +STORE, 140317252980736, 140317252997119, +STORE, 140317257306112, 140317257322495, +SNULL, 140317252972543, 140317252980735, +STORE, 140317252956160, 140317252972543, +STORE, 140317252972544, 140317252980735, +SNULL, 140317255106559, 140317255110655, +STORE, 140317255102464, 140317255106559, +STORE, 140317255106560, 
140317255110655, +SNULL, 94461168304127, 94461168312319, +STORE, 94461168300032, 94461168304127, +STORE, 94461168304128, 94461168312319, +SNULL, 140317257355263, 140317257359359, +STORE, 140317257351168, 140317257355263, +STORE, 140317257355264, 140317257359359, +ERASE, 140317257322496, 140317257351167, +STORE, 94461195268096, 94461195403263, +STORE, 140317255622656, 140317257306111, +STORE, 94461195268096, 94461195538431, +STORE, 94461195268096, 94461195673599, +STORE, 94110050402304, 94110050615295, +STORE, 94110052712448, 94110052716543, +STORE, 94110052716544, 94110052724735, +STORE, 94110052724736, 94110052737023, +STORE, 94110061875200, 94110062415871, +STORE, 140139439357952, 140139441016831, +STORE, 140139441016832, 140139443113983, +STORE, 140139443113984, 140139443130367, +STORE, 140139443130368, 140139443138559, +STORE, 140139443138560, 140139443154943, +STORE, 140139443154944, 140139443167231, +STORE, 140139443167232, 140139445260287, +STORE, 140139445260288, 140139445264383, +STORE, 140139445264384, 140139445268479, +STORE, 140139445268480, 140139445411839, +STORE, 140139445780480, 140139447463935, +STORE, 140139447463936, 140139447480319, +STORE, 140139447508992, 140139447513087, +STORE, 140139447513088, 140139447517183, +STORE, 140139447517184, 140139447521279, +STORE, 140731901427712, 140731901566975, +STORE, 140731902259200, 140731902271487, +STORE, 140731902271488, 140731902275583, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140720941613056, 140737488351231, +SNULL, 140720941621247, 140737488351231, +STORE, 140720941613056, 140720941621247, +STORE, 140720941481984, 140720941621247, +STORE, 93902377721856, 93902379945983, +SNULL, 93902377832447, 93902379945983, +STORE, 93902377721856, 93902377832447, +STORE, 93902377832448, 93902379945983, +ERASE, 93902377832448, 93902379945983, +STORE, 93902379925504, 93902379937791, +STORE, 93902379937792, 93902379945983, +STORE, 139836543635456, 139836545888255, 
+SNULL, 139836543778815, 139836545888255, +STORE, 139836543635456, 139836543778815, +STORE, 139836543778816, 139836545888255, +ERASE, 139836543778816, 139836545888255, +STORE, 139836545875968, 139836545884159, +STORE, 139836545884160, 139836545888255, +STORE, 140720941711360, 140720941715455, +STORE, 140720941699072, 140720941711359, +STORE, 139836545847296, 139836545875967, +STORE, 139836545839104, 139836545847295, +STORE, 139836539838464, 139836543635455, +SNULL, 139836539838464, 139836541497343, +STORE, 139836541497344, 139836543635455, +STORE, 139836539838464, 139836541497343, +SNULL, 139836543594495, 139836543635455, +STORE, 139836541497344, 139836543594495, +STORE, 139836543594496, 139836543635455, +SNULL, 139836543594496, 139836543619071, +STORE, 139836543619072, 139836543635455, +STORE, 139836543594496, 139836543619071, +ERASE, 139836543594496, 139836543619071, +STORE, 139836543594496, 139836543619071, +ERASE, 139836543619072, 139836543635455, +STORE, 139836543619072, 139836543635455, +SNULL, 139836543610879, 139836543619071, +STORE, 139836543594496, 139836543610879, +STORE, 139836543610880, 139836543619071, +SNULL, 93902379933695, 93902379937791, +STORE, 93902379925504, 93902379933695, +STORE, 93902379933696, 93902379937791, +SNULL, 139836545880063, 139836545884159, +STORE, 139836545875968, 139836545880063, +STORE, 139836545880064, 139836545884159, +ERASE, 139836545847296, 139836545875967, +STORE, 93902396891136, 93902397026303, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140736538206208, 140737488351231, +SNULL, 140736538214399, 140737488351231, +STORE, 140736538206208, 140736538214399, +STORE, 140736538075136, 140736538214399, +STORE, 94173471399936, 94173473734655, +SNULL, 94173471612927, 94173473734655, +STORE, 94173471399936, 94173471612927, +STORE, 94173471612928, 94173473734655, +ERASE, 94173471612928, 94173473734655, +STORE, 94173473710080, 94173473722367, +STORE, 94173473722368, 94173473734655, 
+STORE, 140035513556992, 140035515809791, +SNULL, 140035513700351, 140035515809791, +STORE, 140035513556992, 140035513700351, +STORE, 140035513700352, 140035515809791, +ERASE, 140035513700352, 140035515809791, +STORE, 140035515797504, 140035515805695, +STORE, 140035515805696, 140035515809791, +STORE, 140736538329088, 140736538333183, +STORE, 140736538316800, 140736538329087, +STORE, 140035515768832, 140035515797503, +STORE, 140035515760640, 140035515768831, +STORE, 140035511443456, 140035513556991, +SNULL, 140035511443456, 140035511455743, +STORE, 140035511455744, 140035513556991, +STORE, 140035511443456, 140035511455743, +SNULL, 140035513548799, 140035513556991, +STORE, 140035511455744, 140035513548799, +STORE, 140035513548800, 140035513556991, +ERASE, 140035513548800, 140035513556991, +STORE, 140035513548800, 140035513556991, +STORE, 140035507646464, 140035511443455, +SNULL, 140035507646464, 140035509305343, +STORE, 140035509305344, 140035511443455, +STORE, 140035507646464, 140035509305343, +SNULL, 140035511402495, 140035511443455, +STORE, 140035509305344, 140035511402495, +STORE, 140035511402496, 140035511443455, +SNULL, 140035511402496, 140035511427071, +STORE, 140035511427072, 140035511443455, +STORE, 140035511402496, 140035511427071, +ERASE, 140035511402496, 140035511427071, +STORE, 140035511402496, 140035511427071, +ERASE, 140035511427072, 140035511443455, +STORE, 140035511427072, 140035511443455, +STORE, 140035515752448, 140035515768831, +SNULL, 140035511418879, 140035511427071, +STORE, 140035511402496, 140035511418879, +STORE, 140035511418880, 140035511427071, +SNULL, 140035513552895, 140035513556991, +STORE, 140035513548800, 140035513552895, +STORE, 140035513552896, 140035513556991, +SNULL, 94173473714175, 94173473722367, +STORE, 94173473710080, 94173473714175, +STORE, 94173473714176, 94173473722367, +SNULL, 140035515801599, 140035515805695, +STORE, 140035515797504, 140035515801599, +STORE, 140035515801600, 140035515805695, +ERASE, 140035515768832, 
140035515797503, +STORE, 94173478645760, 94173478780927, +STORE, 140035514068992, 140035515752447, +STORE, 94173478645760, 94173478916095, +STORE, 94173478645760, 94173479051263, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140724216176640, 140737488351231, +SNULL, 140724216184831, 140737488351231, +STORE, 140724216176640, 140724216184831, +STORE, 140724216045568, 140724216184831, +STORE, 94870930628608, 94870932963327, +SNULL, 94870930841599, 94870932963327, +STORE, 94870930628608, 94870930841599, +STORE, 94870930841600, 94870932963327, +ERASE, 94870930841600, 94870932963327, +STORE, 94870932938752, 94870932951039, +STORE, 94870932951040, 94870932963327, +STORE, 140453683736576, 140453685989375, +SNULL, 140453683879935, 140453685989375, +STORE, 140453683736576, 140453683879935, +STORE, 140453683879936, 140453685989375, +ERASE, 140453683879936, 140453685989375, +STORE, 140453685977088, 140453685985279, +STORE, 140453685985280, 140453685989375, +STORE, 140724216832000, 140724216836095, +STORE, 140724216819712, 140724216831999, +STORE, 140453685948416, 140453685977087, +STORE, 140453685940224, 140453685948415, +STORE, 140453681623040, 140453683736575, +SNULL, 140453681623040, 140453681635327, +STORE, 140453681635328, 140453683736575, +STORE, 140453681623040, 140453681635327, +SNULL, 140453683728383, 140453683736575, +STORE, 140453681635328, 140453683728383, +STORE, 140453683728384, 140453683736575, +ERASE, 140453683728384, 140453683736575, +STORE, 140453683728384, 140453683736575, +STORE, 140453677826048, 140453681623039, +SNULL, 140453677826048, 140453679484927, +STORE, 140453679484928, 140453681623039, +STORE, 140453677826048, 140453679484927, +SNULL, 140453681582079, 140453681623039, +STORE, 140453679484928, 140453681582079, +STORE, 140453681582080, 140453681623039, +SNULL, 140453681582080, 140453681606655, +STORE, 140453681606656, 140453681623039, +STORE, 140453681582080, 140453681606655, +ERASE, 140453681582080, 
140453681606655, +STORE, 140453681582080, 140453681606655, +ERASE, 140453681606656, 140453681623039, +STORE, 140453681606656, 140453681623039, +STORE, 140453685932032, 140453685948415, +SNULL, 140453681598463, 140453681606655, +STORE, 140453681582080, 140453681598463, +STORE, 140453681598464, 140453681606655, +SNULL, 140453683732479, 140453683736575, +STORE, 140453683728384, 140453683732479, +STORE, 140453683732480, 140453683736575, +SNULL, 94870932942847, 94870932951039, +STORE, 94870932938752, 94870932942847, +STORE, 94870932942848, 94870932951039, +SNULL, 140453685981183, 140453685985279, +STORE, 140453685977088, 140453685981183, +STORE, 140453685981184, 140453685985279, +ERASE, 140453685948416, 140453685977087, +STORE, 94870940565504, 94870940700671, +STORE, 140453684248576, 140453685932031, +STORE, 94870940565504, 94870940835839, +STORE, 94870940565504, 94870940971007, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140731275661312, 140737488351231, +SNULL, 140731275669503, 140737488351231, +STORE, 140731275661312, 140731275669503, +STORE, 140731275530240, 140731275669503, +STORE, 94642788548608, 94642790883327, +SNULL, 94642788761599, 94642790883327, +STORE, 94642788548608, 94642788761599, +STORE, 94642788761600, 94642790883327, +ERASE, 94642788761600, 94642790883327, +STORE, 94642790858752, 94642790871039, +STORE, 94642790871040, 94642790883327, +STORE, 140228458749952, 140228461002751, +SNULL, 140228458893311, 140228461002751, +STORE, 140228458749952, 140228458893311, +STORE, 140228458893312, 140228461002751, +ERASE, 140228458893312, 140228461002751, +STORE, 140228460990464, 140228460998655, +STORE, 140228460998656, 140228461002751, +STORE, 140731276349440, 140731276353535, +STORE, 140731276337152, 140731276349439, +STORE, 140228460961792, 140228460990463, +STORE, 140228460953600, 140228460961791, +STORE, 140228456636416, 140228458749951, +SNULL, 140228456636416, 140228456648703, +STORE, 140228456648704, 
140228458749951, +STORE, 140228456636416, 140228456648703, +SNULL, 140228458741759, 140228458749951, +STORE, 140228456648704, 140228458741759, +STORE, 140228458741760, 140228458749951, +ERASE, 140228458741760, 140228458749951, +STORE, 140228458741760, 140228458749951, +STORE, 140228452839424, 140228456636415, +SNULL, 140228452839424, 140228454498303, +STORE, 140228454498304, 140228456636415, +STORE, 140228452839424, 140228454498303, +SNULL, 140228456595455, 140228456636415, +STORE, 140228454498304, 140228456595455, +STORE, 140228456595456, 140228456636415, +SNULL, 140228456595456, 140228456620031, +STORE, 140228456620032, 140228456636415, +STORE, 140228456595456, 140228456620031, +ERASE, 140228456595456, 140228456620031, +STORE, 140228456595456, 140228456620031, +ERASE, 140228456620032, 140228456636415, +STORE, 140228456620032, 140228456636415, +STORE, 140228460945408, 140228460961791, +SNULL, 140228456611839, 140228456620031, +STORE, 140228456595456, 140228456611839, +STORE, 140228456611840, 140228456620031, +SNULL, 140228458745855, 140228458749951, +STORE, 140228458741760, 140228458745855, +STORE, 140228458745856, 140228458749951, +SNULL, 94642790862847, 94642790871039, +STORE, 94642790858752, 94642790862847, +STORE, 94642790862848, 94642790871039, +SNULL, 140228460994559, 140228460998655, +STORE, 140228460990464, 140228460994559, +STORE, 140228460994560, 140228460998655, +ERASE, 140228460961792, 140228460990463, +STORE, 94642801549312, 94642801684479, +STORE, 140228459261952, 140228460945407, +STORE, 94642801549312, 94642801819647, +STORE, 94642801549312, 94642801954815, +STORE, 94604087611392, 94604087824383, +STORE, 94604089921536, 94604089925631, +STORE, 94604089925632, 94604089933823, +STORE, 94604089933824, 94604089946111, +STORE, 94604105125888, 94604106424319, +STORE, 140454937694208, 140454939353087, +STORE, 140454939353088, 140454941450239, +STORE, 140454941450240, 140454941466623, +STORE, 140454941466624, 140454941474815, +STORE, 140454941474816, 
140454941491199, +STORE, 140454941491200, 140454941503487, +STORE, 140454941503488, 140454943596543, +STORE, 140454943596544, 140454943600639, +STORE, 140454943600640, 140454943604735, +STORE, 140454943604736, 140454943748095, +STORE, 140454944116736, 140454945800191, +STORE, 140454945800192, 140454945816575, +STORE, 140454945845248, 140454945849343, +STORE, 140454945849344, 140454945853439, +STORE, 140454945853440, 140454945857535, +STORE, 140728438214656, 140728438353919, +STORE, 140728439095296, 140728439107583, +STORE, 140728439107584, 140728439111679, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140721843453952, 140737488351231, +SNULL, 140721843462143, 140737488351231, +STORE, 140721843453952, 140721843462143, +STORE, 140721843322880, 140721843462143, +STORE, 94465962455040, 94465964789759, +SNULL, 94465962668031, 94465964789759, +STORE, 94465962455040, 94465962668031, +STORE, 94465962668032, 94465964789759, +ERASE, 94465962668032, 94465964789759, +STORE, 94465964765184, 94465964777471, +STORE, 94465964777472, 94465964789759, +STORE, 139913488314368, 139913490567167, +SNULL, 139913488457727, 139913490567167, +STORE, 139913488314368, 139913488457727, +STORE, 139913488457728, 139913490567167, +ERASE, 139913488457728, 139913490567167, +STORE, 139913490554880, 139913490563071, +STORE, 139913490563072, 139913490567167, +STORE, 140721843503104, 140721843507199, +STORE, 140721843490816, 140721843503103, +STORE, 139913490526208, 139913490554879, +STORE, 139913490518016, 139913490526207, +STORE, 139913486200832, 139913488314367, +SNULL, 139913486200832, 139913486213119, +STORE, 139913486213120, 139913488314367, +STORE, 139913486200832, 139913486213119, +SNULL, 139913488306175, 139913488314367, +STORE, 139913486213120, 139913488306175, +STORE, 139913488306176, 139913488314367, +ERASE, 139913488306176, 139913488314367, +STORE, 139913488306176, 139913488314367, +STORE, 139913482403840, 139913486200831, +SNULL, 
139913482403840, 139913484062719, +STORE, 139913484062720, 139913486200831, +STORE, 139913482403840, 139913484062719, +SNULL, 139913486159871, 139913486200831, +STORE, 139913484062720, 139913486159871, +STORE, 139913486159872, 139913486200831, +SNULL, 139913486159872, 139913486184447, +STORE, 139913486184448, 139913486200831, +STORE, 139913486159872, 139913486184447, +ERASE, 139913486159872, 139913486184447, +STORE, 139913486159872, 139913486184447, +ERASE, 139913486184448, 139913486200831, +STORE, 139913486184448, 139913486200831, +STORE, 139913490509824, 139913490526207, +SNULL, 139913486176255, 139913486184447, +STORE, 139913486159872, 139913486176255, +STORE, 139913486176256, 139913486184447, +SNULL, 139913488310271, 139913488314367, +STORE, 139913488306176, 139913488310271, +STORE, 139913488310272, 139913488314367, +SNULL, 94465964769279, 94465964777471, +STORE, 94465964765184, 94465964769279, +STORE, 94465964769280, 94465964777471, +SNULL, 139913490558975, 139913490563071, +STORE, 139913490554880, 139913490558975, +STORE, 139913490558976, 139913490563071, +ERASE, 139913490526208, 139913490554879, +STORE, 94465970024448, 94465970159615, +STORE, 139913488826368, 139913490509823, +STORE, 94465970024448, 94465970294783, +STORE, 94465970024448, 94465970429951, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140720583307264, 140737488351231, +SNULL, 140720583315455, 140737488351231, +STORE, 140720583307264, 140720583315455, +STORE, 140720583176192, 140720583315455, +STORE, 94212322082816, 94212324417535, +SNULL, 94212322295807, 94212324417535, +STORE, 94212322082816, 94212322295807, +STORE, 94212322295808, 94212324417535, +ERASE, 94212322295808, 94212324417535, +STORE, 94212324392960, 94212324405247, +STORE, 94212324405248, 94212324417535, +STORE, 139659688538112, 139659690790911, +SNULL, 139659688681471, 139659690790911, +STORE, 139659688538112, 139659688681471, +STORE, 139659688681472, 139659690790911, +ERASE, 
139659688681472, 139659690790911, +STORE, 139659690778624, 139659690786815, +STORE, 139659690786816, 139659690790911, +STORE, 140720584781824, 140720584785919, +STORE, 140720584769536, 140720584781823, +STORE, 139659690749952, 139659690778623, +STORE, 139659690741760, 139659690749951, +STORE, 139659686424576, 139659688538111, +SNULL, 139659686424576, 139659686436863, +STORE, 139659686436864, 139659688538111, +STORE, 139659686424576, 139659686436863, +SNULL, 139659688529919, 139659688538111, +STORE, 139659686436864, 139659688529919, +STORE, 139659688529920, 139659688538111, +ERASE, 139659688529920, 139659688538111, +STORE, 139659688529920, 139659688538111, +STORE, 139659682627584, 139659686424575, +SNULL, 139659682627584, 139659684286463, +STORE, 139659684286464, 139659686424575, +STORE, 139659682627584, 139659684286463, +SNULL, 139659686383615, 139659686424575, +STORE, 139659684286464, 139659686383615, +STORE, 139659686383616, 139659686424575, +SNULL, 139659686383616, 139659686408191, +STORE, 139659686408192, 139659686424575, +STORE, 139659686383616, 139659686408191, +ERASE, 139659686383616, 139659686408191, +STORE, 139659686383616, 139659686408191, +ERASE, 139659686408192, 139659686424575, +STORE, 139659686408192, 139659686424575, +STORE, 139659690733568, 139659690749951, +SNULL, 139659686399999, 139659686408191, +STORE, 139659686383616, 139659686399999, +STORE, 139659686400000, 139659686408191, +SNULL, 139659688534015, 139659688538111, +STORE, 139659688529920, 139659688534015, +STORE, 139659688534016, 139659688538111, +SNULL, 94212324397055, 94212324405247, +STORE, 94212324392960, 94212324397055, +STORE, 94212324397056, 94212324405247, +SNULL, 139659690782719, 139659690786815, +STORE, 139659690778624, 139659690782719, +STORE, 139659690782720, 139659690786815, +ERASE, 139659690749952, 139659690778623, +STORE, 94212355014656, 94212355149823, +STORE, 139659689050112, 139659690733567, +STORE, 94212355014656, 94212355284991, +STORE, 94212355014656, 94212355420159, 
+STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140727689830400, 140737488351231, +SNULL, 140727689838591, 140737488351231, +STORE, 140727689830400, 140727689838591, +STORE, 140727689699328, 140727689838591, +STORE, 94572390281216, 94572392615935, +SNULL, 94572390494207, 94572392615935, +STORE, 94572390281216, 94572390494207, +STORE, 94572390494208, 94572392615935, +ERASE, 94572390494208, 94572392615935, +STORE, 94572392591360, 94572392603647, +STORE, 94572392603648, 94572392615935, +STORE, 140575923769344, 140575926022143, +SNULL, 140575923912703, 140575926022143, +STORE, 140575923769344, 140575923912703, +STORE, 140575923912704, 140575926022143, +ERASE, 140575923912704, 140575926022143, +STORE, 140575926009856, 140575926018047, +STORE, 140575926018048, 140575926022143, +STORE, 140727689871360, 140727689875455, +STORE, 140727689859072, 140727689871359, +STORE, 140575925981184, 140575926009855, +STORE, 140575925972992, 140575925981183, +STORE, 140575921655808, 140575923769343, +SNULL, 140575921655808, 140575921668095, +STORE, 140575921668096, 140575923769343, +STORE, 140575921655808, 140575921668095, +SNULL, 140575923761151, 140575923769343, +STORE, 140575921668096, 140575923761151, +STORE, 140575923761152, 140575923769343, +ERASE, 140575923761152, 140575923769343, +STORE, 140575923761152, 140575923769343, +STORE, 140575917858816, 140575921655807, +SNULL, 140575917858816, 140575919517695, +STORE, 140575919517696, 140575921655807, +STORE, 140575917858816, 140575919517695, +SNULL, 140575921614847, 140575921655807, +STORE, 140575919517696, 140575921614847, +STORE, 140575921614848, 140575921655807, +SNULL, 140575921614848, 140575921639423, +STORE, 140575921639424, 140575921655807, +STORE, 140575921614848, 140575921639423, +ERASE, 140575921614848, 140575921639423, +STORE, 140575921614848, 140575921639423, +ERASE, 140575921639424, 140575921655807, +STORE, 140575921639424, 140575921655807, +STORE, 140575925964800, 
140575925981183, +SNULL, 140575921631231, 140575921639423, +STORE, 140575921614848, 140575921631231, +STORE, 140575921631232, 140575921639423, +SNULL, 140575923765247, 140575923769343, +STORE, 140575923761152, 140575923765247, +STORE, 140575923765248, 140575923769343, +SNULL, 94572392595455, 94572392603647, +STORE, 94572392591360, 94572392595455, +STORE, 94572392595456, 94572392603647, +SNULL, 140575926013951, 140575926018047, +STORE, 140575926009856, 140575926013951, +STORE, 140575926013952, 140575926018047, +ERASE, 140575925981184, 140575926009855, +STORE, 94572402278400, 94572402413567, +STORE, 140575924281344, 140575925964799, +STORE, 94572402278400, 94572402548735, +STORE, 94572402278400, 94572402683903, +STORE, 94572402278400, 94572402851839, +SNULL, 94572402827263, 94572402851839, +STORE, 94572402278400, 94572402827263, +STORE, 94572402827264, 94572402851839, +ERASE, 94572402827264, 94572402851839, +STORE, 94572402278400, 94572402966527, +STORE, 94572402278400, 94572403109887, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140725520506880, 140737488351231, +SNULL, 140725520515071, 140737488351231, +STORE, 140725520506880, 140725520515071, +STORE, 140725520375808, 140725520515071, +STORE, 93829948788736, 93829951012863, +SNULL, 93829948899327, 93829951012863, +STORE, 93829948788736, 93829948899327, +STORE, 93829948899328, 93829951012863, +ERASE, 93829948899328, 93829951012863, +STORE, 93829950992384, 93829951004671, +STORE, 93829951004672, 93829951012863, +STORE, 140133696794624, 140133699047423, +SNULL, 140133696937983, 140133699047423, +STORE, 140133696794624, 140133696937983, +STORE, 140133696937984, 140133699047423, +ERASE, 140133696937984, 140133699047423, +STORE, 140133699035136, 140133699043327, +STORE, 140133699043328, 140133699047423, +STORE, 140725520875520, 140725520879615, +STORE, 140725520863232, 140725520875519, +STORE, 140133699006464, 140133699035135, +STORE, 140133698998272, 140133699006463, 
+STORE, 140133692997632, 140133696794623, +SNULL, 140133692997632, 140133694656511, +STORE, 140133694656512, 140133696794623, +STORE, 140133692997632, 140133694656511, +SNULL, 140133696753663, 140133696794623, +STORE, 140133694656512, 140133696753663, +STORE, 140133696753664, 140133696794623, +SNULL, 140133696753664, 140133696778239, +STORE, 140133696778240, 140133696794623, +STORE, 140133696753664, 140133696778239, +ERASE, 140133696753664, 140133696778239, +STORE, 140133696753664, 140133696778239, +ERASE, 140133696778240, 140133696794623, +STORE, 140133696778240, 140133696794623, +SNULL, 140133696770047, 140133696778239, +STORE, 140133696753664, 140133696770047, +STORE, 140133696770048, 140133696778239, +SNULL, 93829951000575, 93829951004671, +STORE, 93829950992384, 93829951000575, +STORE, 93829951000576, 93829951004671, +SNULL, 140133699039231, 140133699043327, +STORE, 140133699035136, 140133699039231, +STORE, 140133699039232, 140133699043327, +ERASE, 140133699006464, 140133699035135, +STORE, 93829978693632, 93829978828799, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140736118022144, 140737488351231, +SNULL, 140736118030335, 140737488351231, +STORE, 140736118022144, 140736118030335, +STORE, 140736117891072, 140736118030335, +STORE, 94467663982592, 94467666206719, +SNULL, 94467664093183, 94467666206719, +STORE, 94467663982592, 94467664093183, +STORE, 94467664093184, 94467666206719, +ERASE, 94467664093184, 94467666206719, +STORE, 94467666186240, 94467666198527, +STORE, 94467666198528, 94467666206719, +STORE, 140525377327104, 140525379579903, +SNULL, 140525377470463, 140525379579903, +STORE, 140525377327104, 140525377470463, +STORE, 140525377470464, 140525379579903, +ERASE, 140525377470464, 140525379579903, +STORE, 140525379567616, 140525379575807, +STORE, 140525379575808, 140525379579903, +STORE, 140736118771712, 140736118775807, +STORE, 140736118759424, 140736118771711, +STORE, 140525379538944, 140525379567615, 
+STORE, 140525379530752, 140525379538943, +STORE, 140525373530112, 140525377327103, +SNULL, 140525373530112, 140525375188991, +STORE, 140525375188992, 140525377327103, +STORE, 140525373530112, 140525375188991, +SNULL, 140525377286143, 140525377327103, +STORE, 140525375188992, 140525377286143, +STORE, 140525377286144, 140525377327103, +SNULL, 140525377286144, 140525377310719, +STORE, 140525377310720, 140525377327103, +STORE, 140525377286144, 140525377310719, +ERASE, 140525377286144, 140525377310719, +STORE, 140525377286144, 140525377310719, +ERASE, 140525377310720, 140525377327103, +STORE, 140525377310720, 140525377327103, +SNULL, 140525377302527, 140525377310719, +STORE, 140525377286144, 140525377302527, +STORE, 140525377302528, 140525377310719, +SNULL, 94467666194431, 94467666198527, +STORE, 94467666186240, 94467666194431, +STORE, 94467666194432, 94467666198527, +SNULL, 140525379571711, 140525379575807, +STORE, 140525379567616, 140525379571711, +STORE, 140525379571712, 140525379575807, +ERASE, 140525379538944, 140525379567615, +STORE, 94467693379584, 94467693514751, +STORE, 94200172744704, 94200172957695, +STORE, 94200175054848, 94200175058943, +STORE, 94200175058944, 94200175067135, +STORE, 94200175067136, 94200175079423, +STORE, 94200196673536, 94200198905855, +STORE, 140053867720704, 140053869379583, +STORE, 140053869379584, 140053871476735, +STORE, 140053871476736, 140053871493119, +STORE, 140053871493120, 140053871501311, +STORE, 140053871501312, 140053871517695, +STORE, 140053871517696, 140053871529983, +STORE, 140053871529984, 140053873623039, +STORE, 140053873623040, 140053873627135, +STORE, 140053873627136, 140053873631231, +STORE, 140053873631232, 140053873774591, +STORE, 140053874143232, 140053875826687, +STORE, 140053875826688, 140053875843071, +STORE, 140053875871744, 140053875875839, +STORE, 140053875875840, 140053875879935, +STORE, 140053875879936, 140053875884031, +STORE, 140728538484736, 140728538623999, +STORE, 140728538652672, 140728538664959, 
+STORE, 140728538664960, 140728538669055, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140732307775488, 140737488351231, +SNULL, 140732307783679, 140737488351231, +STORE, 140732307775488, 140732307783679, +STORE, 140732307644416, 140732307783679, +STORE, 93831417630720, 93831419965439, +SNULL, 93831417843711, 93831419965439, +STORE, 93831417630720, 93831417843711, +STORE, 93831417843712, 93831419965439, +ERASE, 93831417843712, 93831419965439, +STORE, 93831419940864, 93831419953151, +STORE, 93831419953152, 93831419965439, +STORE, 140241062088704, 140241064341503, +SNULL, 140241062232063, 140241064341503, +STORE, 140241062088704, 140241062232063, +STORE, 140241062232064, 140241064341503, +ERASE, 140241062232064, 140241064341503, +STORE, 140241064329216, 140241064337407, +STORE, 140241064337408, 140241064341503, +STORE, 140732308140032, 140732308144127, +STORE, 140732308127744, 140732308140031, +STORE, 140241064300544, 140241064329215, +STORE, 140241064292352, 140241064300543, +STORE, 140241059975168, 140241062088703, +SNULL, 140241059975168, 140241059987455, +STORE, 140241059987456, 140241062088703, +STORE, 140241059975168, 140241059987455, +SNULL, 140241062080511, 140241062088703, +STORE, 140241059987456, 140241062080511, +STORE, 140241062080512, 140241062088703, +ERASE, 140241062080512, 140241062088703, +STORE, 140241062080512, 140241062088703, +STORE, 140241056178176, 140241059975167, +SNULL, 140241056178176, 140241057837055, +STORE, 140241057837056, 140241059975167, +STORE, 140241056178176, 140241057837055, +SNULL, 140241059934207, 140241059975167, +STORE, 140241057837056, 140241059934207, +STORE, 140241059934208, 140241059975167, +SNULL, 140241059934208, 140241059958783, +STORE, 140241059958784, 140241059975167, +STORE, 140241059934208, 140241059958783, +ERASE, 140241059934208, 140241059958783, +STORE, 140241059934208, 140241059958783, +ERASE, 140241059958784, 140241059975167, +STORE, 140241059958784, 
140241059975167, +STORE, 140241064284160, 140241064300543, +SNULL, 140241059950591, 140241059958783, +STORE, 140241059934208, 140241059950591, +STORE, 140241059950592, 140241059958783, +SNULL, 140241062084607, 140241062088703, +STORE, 140241062080512, 140241062084607, +STORE, 140241062084608, 140241062088703, +SNULL, 93831419944959, 93831419953151, +STORE, 93831419940864, 93831419944959, +STORE, 93831419944960, 93831419953151, +SNULL, 140241064333311, 140241064337407, +STORE, 140241064329216, 140241064333311, +STORE, 140241064333312, 140241064337407, +ERASE, 140241064300544, 140241064329215, +STORE, 93831435284480, 93831435419647, +STORE, 140241062600704, 140241064284159, +STORE, 93831435284480, 93831435554815, +STORE, 93831435284480, 93831435689983, +STORE, 93831435284480, 93831435862015, +SNULL, 93831435837439, 93831435862015, +STORE, 93831435284480, 93831435837439, +STORE, 93831435837440, 93831435862015, +ERASE, 93831435837440, 93831435862015, +STORE, 93831435284480, 93831435972607, +STORE, 93831435284480, 93831436107775, +SNULL, 93831436091391, 93831436107775, +STORE, 93831435284480, 93831436091391, +STORE, 93831436091392, 93831436107775, +ERASE, 93831436091392, 93831436107775, +STORE, 93831435284480, 93831436226559, +STORE, 93831435284480, 93831436361727, +STORE, 93831435284480, 93831436505087, +STORE, 93831435284480, 93831436652543, +STORE, 93831435284480, 93831436787711, +STORE, 93831435284480, 93831436926975, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140728546775040, 140737488351231, +SNULL, 140728546783231, 140737488351231, +STORE, 140728546775040, 140728546783231, +STORE, 140728546643968, 140728546783231, +STORE, 94456178786304, 94456181010431, +SNULL, 94456178896895, 94456181010431, +STORE, 94456178786304, 94456178896895, +STORE, 94456178896896, 94456181010431, +ERASE, 94456178896896, 94456181010431, +STORE, 94456180989952, 94456181002239, +STORE, 94456181002240, 94456181010431, +STORE, 140221893091328, 
140221895344127, +SNULL, 140221893234687, 140221895344127, +STORE, 140221893091328, 140221893234687, +STORE, 140221893234688, 140221895344127, +ERASE, 140221893234688, 140221895344127, +STORE, 140221895331840, 140221895340031, +STORE, 140221895340032, 140221895344127, +STORE, 140728547803136, 140728547807231, +STORE, 140728547790848, 140728547803135, +STORE, 140221895303168, 140221895331839, +STORE, 140221895294976, 140221895303167, +STORE, 140221889294336, 140221893091327, +SNULL, 140221889294336, 140221890953215, +STORE, 140221890953216, 140221893091327, +STORE, 140221889294336, 140221890953215, +SNULL, 140221893050367, 140221893091327, +STORE, 140221890953216, 140221893050367, +STORE, 140221893050368, 140221893091327, +SNULL, 140221893050368, 140221893074943, +STORE, 140221893074944, 140221893091327, +STORE, 140221893050368, 140221893074943, +ERASE, 140221893050368, 140221893074943, +STORE, 140221893050368, 140221893074943, +ERASE, 140221893074944, 140221893091327, +STORE, 140221893074944, 140221893091327, +SNULL, 140221893066751, 140221893074943, +STORE, 140221893050368, 140221893066751, +STORE, 140221893066752, 140221893074943, +SNULL, 94456180998143, 94456181002239, +STORE, 94456180989952, 94456180998143, +STORE, 94456180998144, 94456181002239, +SNULL, 140221895335935, 140221895340031, +STORE, 140221895331840, 140221895335935, +STORE, 140221895335936, 140221895340031, +ERASE, 140221895303168, 140221895331839, +STORE, 94456203730944, 94456203866111, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140734438637568, 140737488351231, +SNULL, 140734438645759, 140737488351231, +STORE, 140734438637568, 140734438645759, +STORE, 140734438506496, 140734438645759, +STORE, 94652233351168, 94652235575295, +SNULL, 94652233461759, 94652235575295, +STORE, 94652233351168, 94652233461759, +STORE, 94652233461760, 94652235575295, +ERASE, 94652233461760, 94652235575295, +STORE, 94652235554816, 94652235567103, +STORE, 94652235567104, 
94652235575295, +STORE, 140536493195264, 140536495448063, +SNULL, 140536493338623, 140536495448063, +STORE, 140536493195264, 140536493338623, +STORE, 140536493338624, 140536495448063, +ERASE, 140536493338624, 140536495448063, +STORE, 140536495435776, 140536495443967, +STORE, 140536495443968, 140536495448063, +STORE, 140734439002112, 140734439006207, +STORE, 140734438989824, 140734439002111, +STORE, 140536495407104, 140536495435775, +STORE, 140536495398912, 140536495407103, +STORE, 140536489398272, 140536493195263, +SNULL, 140536489398272, 140536491057151, +STORE, 140536491057152, 140536493195263, +STORE, 140536489398272, 140536491057151, +SNULL, 140536493154303, 140536493195263, +STORE, 140536491057152, 140536493154303, +STORE, 140536493154304, 140536493195263, +SNULL, 140536493154304, 140536493178879, +STORE, 140536493178880, 140536493195263, +STORE, 140536493154304, 140536493178879, +ERASE, 140536493154304, 140536493178879, +STORE, 140536493154304, 140536493178879, +ERASE, 140536493178880, 140536493195263, +STORE, 140536493178880, 140536493195263, +SNULL, 140536493170687, 140536493178879, +STORE, 140536493154304, 140536493170687, +STORE, 140536493170688, 140536493178879, +SNULL, 94652235563007, 94652235567103, +STORE, 94652235554816, 94652235563007, +STORE, 94652235563008, 94652235567103, +SNULL, 140536495439871, 140536495443967, +STORE, 140536495435776, 140536495439871, +STORE, 140536495439872, 140536495443967, +ERASE, 140536495407104, 140536495435775, +STORE, 94652265619456, 94652265754623, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140721814200320, 140737488351231, +SNULL, 140721814208511, 140737488351231, +STORE, 140721814200320, 140721814208511, +STORE, 140721814069248, 140721814208511, +STORE, 94062800691200, 94062802915327, +SNULL, 94062800801791, 94062802915327, +STORE, 94062800691200, 94062800801791, +STORE, 94062800801792, 94062802915327, +ERASE, 94062800801792, 94062802915327, +STORE, 94062802894848, 
94062802907135, +STORE, 94062802907136, 94062802915327, +STORE, 139717739700224, 139717741953023, +SNULL, 139717739843583, 139717741953023, +STORE, 139717739700224, 139717739843583, +STORE, 139717739843584, 139717741953023, +ERASE, 139717739843584, 139717741953023, +STORE, 139717741940736, 139717741948927, +STORE, 139717741948928, 139717741953023, +STORE, 140721814224896, 140721814228991, +STORE, 140721814212608, 140721814224895, +STORE, 139717741912064, 139717741940735, +STORE, 139717741903872, 139717741912063, +STORE, 139717735903232, 139717739700223, +SNULL, 139717735903232, 139717737562111, +STORE, 139717737562112, 139717739700223, +STORE, 139717735903232, 139717737562111, +SNULL, 139717739659263, 139717739700223, +STORE, 139717737562112, 139717739659263, +STORE, 139717739659264, 139717739700223, +SNULL, 139717739659264, 139717739683839, +STORE, 139717739683840, 139717739700223, +STORE, 139717739659264, 139717739683839, +ERASE, 139717739659264, 139717739683839, +STORE, 139717739659264, 139717739683839, +ERASE, 139717739683840, 139717739700223, +STORE, 139717739683840, 139717739700223, +SNULL, 139717739675647, 139717739683839, +STORE, 139717739659264, 139717739675647, +STORE, 139717739675648, 139717739683839, +SNULL, 94062802903039, 94062802907135, +STORE, 94062802894848, 94062802903039, +STORE, 94062802903040, 94062802907135, +SNULL, 139717741944831, 139717741948927, +STORE, 139717741940736, 139717741944831, +STORE, 139717741944832, 139717741948927, +ERASE, 139717741912064, 139717741940735, +STORE, 94062814060544, 94062814195711, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140723945754624, 140737488351231, +SNULL, 140723945762815, 140737488351231, +STORE, 140723945754624, 140723945762815, +STORE, 140723945623552, 140723945762815, +STORE, 94886119305216, 94886121639935, +SNULL, 94886119518207, 94886121639935, +STORE, 94886119305216, 94886119518207, +STORE, 94886119518208, 94886121639935, +ERASE, 94886119518208, 
94886121639935, +STORE, 94886121615360, 94886121627647, +STORE, 94886121627648, 94886121639935, +STORE, 140152532131840, 140152534384639, +SNULL, 140152532275199, 140152534384639, +STORE, 140152532131840, 140152532275199, +STORE, 140152532275200, 140152534384639, +ERASE, 140152532275200, 140152534384639, +STORE, 140152534372352, 140152534380543, +STORE, 140152534380544, 140152534384639, +STORE, 140723946213376, 140723946217471, +STORE, 140723946201088, 140723946213375, +STORE, 140152534343680, 140152534372351, +STORE, 140152534335488, 140152534343679, +STORE, 140152530018304, 140152532131839, +SNULL, 140152530018304, 140152530030591, +STORE, 140152530030592, 140152532131839, +STORE, 140152530018304, 140152530030591, +SNULL, 140152532123647, 140152532131839, +STORE, 140152530030592, 140152532123647, +STORE, 140152532123648, 140152532131839, +ERASE, 140152532123648, 140152532131839, +STORE, 140152532123648, 140152532131839, +STORE, 140152526221312, 140152530018303, +SNULL, 140152526221312, 140152527880191, +STORE, 140152527880192, 140152530018303, +STORE, 140152526221312, 140152527880191, +SNULL, 140152529977343, 140152530018303, +STORE, 140152527880192, 140152529977343, +STORE, 140152529977344, 140152530018303, +SNULL, 140152529977344, 140152530001919, +STORE, 140152530001920, 140152530018303, +STORE, 140152529977344, 140152530001919, +ERASE, 140152529977344, 140152530001919, +STORE, 140152529977344, 140152530001919, +ERASE, 140152530001920, 140152530018303, +STORE, 140152530001920, 140152530018303, +STORE, 140152534327296, 140152534343679, +SNULL, 140152529993727, 140152530001919, +STORE, 140152529977344, 140152529993727, +STORE, 140152529993728, 140152530001919, +SNULL, 140152532127743, 140152532131839, +STORE, 140152532123648, 140152532127743, +STORE, 140152532127744, 140152532131839, +SNULL, 94886121619455, 94886121627647, +STORE, 94886121615360, 94886121619455, +STORE, 94886121619456, 94886121627647, +SNULL, 140152534376447, 140152534380543, +STORE, 
140152534372352, 140152534376447, +STORE, 140152534376448, 140152534380543, +ERASE, 140152534343680, 140152534372351, +STORE, 94886129770496, 94886129905663, +STORE, 140152532643840, 140152534327295, +STORE, 94886129770496, 94886130040831, +STORE, 94886129770496, 94886130175999, +STORE, 94886129770496, 94886130348031, +SNULL, 94886130323455, 94886130348031, +STORE, 94886129770496, 94886130323455, +STORE, 94886130323456, 94886130348031, +ERASE, 94886130323456, 94886130348031, +STORE, 94886129770496, 94886130458623, +STORE, 94886129770496, 94886130606079, +SNULL, 94886130573311, 94886130606079, +STORE, 94886129770496, 94886130573311, +STORE, 94886130573312, 94886130606079, +ERASE, 94886130573312, 94886130606079, +STORE, 94886129770496, 94886130724863, +STORE, 94886129770496, 94886130876415, +STORE, 94886129770496, 94886131023871, +STORE, 94886129770496, 94886131175423, +STORE, 94886129770496, 94886131318783, +STORE, 94886129770496, 94886131453951, +SNULL, 94886131449855, 94886131453951, +STORE, 94886129770496, 94886131449855, +STORE, 94886131449856, 94886131453951, +ERASE, 94886131449856, 94886131453951, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140735450779648, 140737488351231, +SNULL, 140735450787839, 140737488351231, +STORE, 140735450779648, 140735450787839, +STORE, 140735450648576, 140735450787839, +STORE, 93947794079744, 93947796414463, +SNULL, 93947794292735, 93947796414463, +STORE, 93947794079744, 93947794292735, +STORE, 93947794292736, 93947796414463, +ERASE, 93947794292736, 93947796414463, +STORE, 93947796389888, 93947796402175, +STORE, 93947796402176, 93947796414463, +STORE, 139841993433088, 139841995685887, +SNULL, 139841993576447, 139841995685887, +STORE, 139841993433088, 139841993576447, +STORE, 139841993576448, 139841995685887, +ERASE, 139841993576448, 139841995685887, +STORE, 139841995673600, 139841995681791, +STORE, 139841995681792, 139841995685887, +STORE, 140735451308032, 140735451312127, +STORE, 
140735451295744, 140735451308031, +STORE, 139841995644928, 139841995673599, +STORE, 139841995636736, 139841995644927, +STORE, 139841991319552, 139841993433087, +SNULL, 139841991319552, 139841991331839, +STORE, 139841991331840, 139841993433087, +STORE, 139841991319552, 139841991331839, +SNULL, 139841993424895, 139841993433087, +STORE, 139841991331840, 139841993424895, +STORE, 139841993424896, 139841993433087, +ERASE, 139841993424896, 139841993433087, +STORE, 139841993424896, 139841993433087, +STORE, 139841987522560, 139841991319551, +SNULL, 139841987522560, 139841989181439, +STORE, 139841989181440, 139841991319551, +STORE, 139841987522560, 139841989181439, +SNULL, 139841991278591, 139841991319551, +STORE, 139841989181440, 139841991278591, +STORE, 139841991278592, 139841991319551, +SNULL, 139841991278592, 139841991303167, +STORE, 139841991303168, 139841991319551, +STORE, 139841991278592, 139841991303167, +ERASE, 139841991278592, 139841991303167, +STORE, 139841991278592, 139841991303167, +ERASE, 139841991303168, 139841991319551, +STORE, 139841991303168, 139841991319551, +STORE, 139841995628544, 139841995644927, +SNULL, 139841991294975, 139841991303167, +STORE, 139841991278592, 139841991294975, +STORE, 139841991294976, 139841991303167, +SNULL, 139841993428991, 139841993433087, +STORE, 139841993424896, 139841993428991, +STORE, 139841993428992, 139841993433087, +SNULL, 93947796393983, 93947796402175, +STORE, 93947796389888, 93947796393983, +STORE, 93947796393984, 93947796402175, +SNULL, 139841995677695, 139841995681791, +STORE, 139841995673600, 139841995677695, +STORE, 139841995677696, 139841995681791, +ERASE, 139841995644928, 139841995673599, +STORE, 93947829739520, 93947829874687, +STORE, 139841993945088, 139841995628543, +STORE, 93947829739520, 93947830009855, +STORE, 93947829739520, 93947830145023, +STORE, 94659351814144, 94659352027135, +STORE, 94659354124288, 94659354128383, +STORE, 94659354128384, 94659354136575, +STORE, 94659354136576, 94659354148863, +STORE, 
94659383476224, 94659385057279, +STORE, 139959054557184, 139959056216063, +STORE, 139959056216064, 139959058313215, +STORE, 139959058313216, 139959058329599, +STORE, 139959058329600, 139959058337791, +STORE, 139959058337792, 139959058354175, +STORE, 139959058354176, 139959058366463, +STORE, 139959058366464, 139959060459519, +STORE, 139959060459520, 139959060463615, +STORE, 139959060463616, 139959060467711, +STORE, 139959060467712, 139959060611071, +STORE, 139959060979712, 139959062663167, +STORE, 139959062663168, 139959062679551, +STORE, 139959062708224, 139959062712319, +STORE, 139959062712320, 139959062716415, +STORE, 139959062716416, 139959062720511, +STORE, 140735532539904, 140735532679167, +STORE, 140735532830720, 140735532843007, +STORE, 140735532843008, 140735532847103, +STORE, 93894361829376, 93894362042367, +STORE, 93894364139520, 93894364143615, +STORE, 93894364143616, 93894364151807, +STORE, 93894364151808, 93894364164095, +STORE, 93894396944384, 93894397624319, +STORE, 140075612573696, 140075614232575, +STORE, 140075614232576, 140075616329727, +STORE, 140075616329728, 140075616346111, +STORE, 140075616346112, 140075616354303, +STORE, 140075616354304, 140075616370687, +STORE, 140075616370688, 140075616382975, +STORE, 140075616382976, 140075618476031, +STORE, 140075618476032, 140075618480127, +STORE, 140075618480128, 140075618484223, +STORE, 140075618484224, 140075618627583, +STORE, 140075618996224, 140075620679679, +STORE, 140075620679680, 140075620696063, +STORE, 140075620724736, 140075620728831, +STORE, 140075620728832, 140075620732927, +STORE, 140075620732928, 140075620737023, +STORE, 140720830312448, 140720830451711, +STORE, 140720830631936, 140720830644223, +STORE, 140720830644224, 140720830648319, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140735116226560, 140737488351231, +SNULL, 140735116234751, 140737488351231, +STORE, 140735116226560, 140735116234751, +STORE, 140735116095488, 140735116234751, 
+STORE, 94873398054912, 94873400279039, +SNULL, 94873398165503, 94873400279039, +STORE, 94873398054912, 94873398165503, +STORE, 94873398165504, 94873400279039, +ERASE, 94873398165504, 94873400279039, +STORE, 94873400258560, 94873400270847, +STORE, 94873400270848, 94873400279039, +STORE, 140303828606976, 140303830859775, +SNULL, 140303828750335, 140303830859775, +STORE, 140303828606976, 140303828750335, +STORE, 140303828750336, 140303830859775, +ERASE, 140303828750336, 140303830859775, +STORE, 140303830847488, 140303830855679, +STORE, 140303830855680, 140303830859775, +STORE, 140735116251136, 140735116255231, +STORE, 140735116238848, 140735116251135, +STORE, 140303830818816, 140303830847487, +STORE, 140303830810624, 140303830818815, +STORE, 140303824809984, 140303828606975, +SNULL, 140303824809984, 140303826468863, +STORE, 140303826468864, 140303828606975, +STORE, 140303824809984, 140303826468863, +SNULL, 140303828566015, 140303828606975, +STORE, 140303826468864, 140303828566015, +STORE, 140303828566016, 140303828606975, +SNULL, 140303828566016, 140303828590591, +STORE, 140303828590592, 140303828606975, +STORE, 140303828566016, 140303828590591, +ERASE, 140303828566016, 140303828590591, +STORE, 140303828566016, 140303828590591, +ERASE, 140303828590592, 140303828606975, +STORE, 140303828590592, 140303828606975, +SNULL, 140303828582399, 140303828590591, +STORE, 140303828566016, 140303828582399, +STORE, 140303828582400, 140303828590591, +SNULL, 94873400266751, 94873400270847, +STORE, 94873400258560, 94873400266751, +STORE, 94873400266752, 94873400270847, +SNULL, 140303830851583, 140303830855679, +STORE, 140303830847488, 140303830851583, +STORE, 140303830851584, 140303830855679, +ERASE, 140303830818816, 140303830847487, +STORE, 94873413713920, 94873413849087, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140732349956096, 140737488351231, +SNULL, 140732349964287, 140737488351231, +STORE, 140732349956096, 140732349964287, 
+STORE, 140732349825024, 140732349964287, +STORE, 94009652736000, 94009655070719, +SNULL, 94009652948991, 94009655070719, +STORE, 94009652736000, 94009652948991, +STORE, 94009652948992, 94009655070719, +ERASE, 94009652948992, 94009655070719, +STORE, 94009655046144, 94009655058431, +STORE, 94009655058432, 94009655070719, +STORE, 140295688531968, 140295690784767, +SNULL, 140295688675327, 140295690784767, +STORE, 140295688531968, 140295688675327, +STORE, 140295688675328, 140295690784767, +ERASE, 140295688675328, 140295690784767, +STORE, 140295690772480, 140295690780671, +STORE, 140295690780672, 140295690784767, +STORE, 140732350005248, 140732350009343, +STORE, 140732349992960, 140732350005247, +STORE, 140295690743808, 140295690772479, +STORE, 140295690735616, 140295690743807, +STORE, 140295686418432, 140295688531967, +SNULL, 140295686418432, 140295686430719, +STORE, 140295686430720, 140295688531967, +STORE, 140295686418432, 140295686430719, +SNULL, 140295688523775, 140295688531967, +STORE, 140295686430720, 140295688523775, +STORE, 140295688523776, 140295688531967, +ERASE, 140295688523776, 140295688531967, +STORE, 140295688523776, 140295688531967, +STORE, 140295682621440, 140295686418431, +SNULL, 140295682621440, 140295684280319, +STORE, 140295684280320, 140295686418431, +STORE, 140295682621440, 140295684280319, +SNULL, 140295686377471, 140295686418431, +STORE, 140295684280320, 140295686377471, +STORE, 140295686377472, 140295686418431, +SNULL, 140295686377472, 140295686402047, +STORE, 140295686402048, 140295686418431, +STORE, 140295686377472, 140295686402047, +ERASE, 140295686377472, 140295686402047, +STORE, 140295686377472, 140295686402047, +ERASE, 140295686402048, 140295686418431, +STORE, 140295686402048, 140295686418431, +STORE, 140295690727424, 140295690743807, +SNULL, 140295686393855, 140295686402047, +STORE, 140295686377472, 140295686393855, +STORE, 140295686393856, 140295686402047, +SNULL, 140295688527871, 140295688531967, +STORE, 140295688523776, 
140295688527871, +STORE, 140295688527872, 140295688531967, +SNULL, 94009655050239, 94009655058431, +STORE, 94009655046144, 94009655050239, +STORE, 94009655050240, 94009655058431, +SNULL, 140295690776575, 140295690780671, +STORE, 140295690772480, 140295690776575, +STORE, 140295690776576, 140295690780671, +ERASE, 140295690743808, 140295690772479, +STORE, 94009672114176, 94009672249343, +STORE, 140295689043968, 140295690727423, +STORE, 94009672114176, 94009672384511, +STORE, 94009672114176, 94009672519679, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722376515584, 140737488351231, +SNULL, 140722376523775, 140737488351231, +STORE, 140722376515584, 140722376523775, +STORE, 140722376384512, 140722376523775, +STORE, 94089815773184, 94089818107903, +SNULL, 94089815986175, 94089818107903, +STORE, 94089815773184, 94089815986175, +STORE, 94089815986176, 94089818107903, +ERASE, 94089815986176, 94089818107903, +STORE, 94089818083328, 94089818095615, +STORE, 94089818095616, 94089818107903, +STORE, 140265595711488, 140265597964287, +SNULL, 140265595854847, 140265597964287, +STORE, 140265595711488, 140265595854847, +STORE, 140265595854848, 140265597964287, +ERASE, 140265595854848, 140265597964287, +STORE, 140265597952000, 140265597960191, +STORE, 140265597960192, 140265597964287, +STORE, 140722378297344, 140722378301439, +STORE, 140722378285056, 140722378297343, +STORE, 140265597923328, 140265597951999, +STORE, 140265597915136, 140265597923327, +STORE, 140265593597952, 140265595711487, +SNULL, 140265593597952, 140265593610239, +STORE, 140265593610240, 140265595711487, +STORE, 140265593597952, 140265593610239, +SNULL, 140265595703295, 140265595711487, +STORE, 140265593610240, 140265595703295, +STORE, 140265595703296, 140265595711487, +ERASE, 140265595703296, 140265595711487, +STORE, 140265595703296, 140265595711487, +STORE, 140265589800960, 140265593597951, +SNULL, 140265589800960, 140265591459839, +STORE, 140265591459840, 
140265593597951, +STORE, 140265589800960, 140265591459839, +SNULL, 140265593556991, 140265593597951, +STORE, 140265591459840, 140265593556991, +STORE, 140265593556992, 140265593597951, +SNULL, 140265593556992, 140265593581567, +STORE, 140265593581568, 140265593597951, +STORE, 140265593556992, 140265593581567, +ERASE, 140265593556992, 140265593581567, +STORE, 140265593556992, 140265593581567, +ERASE, 140265593581568, 140265593597951, +STORE, 140265593581568, 140265593597951, +STORE, 140265597906944, 140265597923327, +SNULL, 140265593573375, 140265593581567, +STORE, 140265593556992, 140265593573375, +STORE, 140265593573376, 140265593581567, +SNULL, 140265595707391, 140265595711487, +STORE, 140265595703296, 140265595707391, +STORE, 140265595707392, 140265595711487, +SNULL, 94089818087423, 94089818095615, +STORE, 94089818083328, 94089818087423, +STORE, 94089818087424, 94089818095615, +SNULL, 140265597956095, 140265597960191, +STORE, 140265597952000, 140265597956095, +STORE, 140265597956096, 140265597960191, +ERASE, 140265597923328, 140265597951999, +STORE, 94089837146112, 94089837281279, +STORE, 140265596223488, 140265597906943, +STORE, 94089837146112, 94089837416447, +STORE, 94089837146112, 94089837551615, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140735265218560, 140737488351231, +SNULL, 140735265226751, 140737488351231, +STORE, 140735265218560, 140735265226751, +STORE, 140735265087488, 140735265226751, +STORE, 94250422370304, 94250424705023, +SNULL, 94250422583295, 94250424705023, +STORE, 94250422370304, 94250422583295, +STORE, 94250422583296, 94250424705023, +ERASE, 94250422583296, 94250424705023, +STORE, 94250424680448, 94250424692735, +STORE, 94250424692736, 94250424705023, +STORE, 140344442474496, 140344444727295, +SNULL, 140344442617855, 140344444727295, +STORE, 140344442474496, 140344442617855, +STORE, 140344442617856, 140344444727295, +ERASE, 140344442617856, 140344444727295, +STORE, 140344444715008, 
140344444723199, +STORE, 140344444723200, 140344444727295, +STORE, 140735265341440, 140735265345535, +STORE, 140735265329152, 140735265341439, +STORE, 140344444686336, 140344444715007, +STORE, 140344444678144, 140344444686335, +STORE, 140344440360960, 140344442474495, +SNULL, 140344440360960, 140344440373247, +STORE, 140344440373248, 140344442474495, +STORE, 140344440360960, 140344440373247, +SNULL, 140344442466303, 140344442474495, +STORE, 140344440373248, 140344442466303, +STORE, 140344442466304, 140344442474495, +ERASE, 140344442466304, 140344442474495, +STORE, 140344442466304, 140344442474495, +STORE, 140344436563968, 140344440360959, +SNULL, 140344436563968, 140344438222847, +STORE, 140344438222848, 140344440360959, +STORE, 140344436563968, 140344438222847, +SNULL, 140344440319999, 140344440360959, +STORE, 140344438222848, 140344440319999, +STORE, 140344440320000, 140344440360959, +SNULL, 140344440320000, 140344440344575, +STORE, 140344440344576, 140344440360959, +STORE, 140344440320000, 140344440344575, +ERASE, 140344440320000, 140344440344575, +STORE, 140344440320000, 140344440344575, +ERASE, 140344440344576, 140344440360959, +STORE, 140344440344576, 140344440360959, +STORE, 140344444669952, 140344444686335, +SNULL, 140344440336383, 140344440344575, +STORE, 140344440320000, 140344440336383, +STORE, 140344440336384, 140344440344575, +SNULL, 140344442470399, 140344442474495, +STORE, 140344442466304, 140344442470399, +STORE, 140344442470400, 140344442474495, +SNULL, 94250424684543, 94250424692735, +STORE, 94250424680448, 94250424684543, +STORE, 94250424684544, 94250424692735, +SNULL, 140344444719103, 140344444723199, +STORE, 140344444715008, 140344444719103, +STORE, 140344444719104, 140344444723199, +ERASE, 140344444686336, 140344444715007, +STORE, 94250445512704, 94250445647871, +STORE, 140344442986496, 140344444669951, +STORE, 94250445512704, 94250445783039, +STORE, 94250445512704, 94250445918207, +STORE, 140737488347136, 140737488351231, +STORE, 
140737488343040, 140737488351231, +STORE, 140725762719744, 140737488351231, +SNULL, 140725762727935, 140737488351231, +STORE, 140725762719744, 140725762727935, +STORE, 140725762588672, 140725762727935, +STORE, 94819009097728, 94819011432447, +SNULL, 94819009310719, 94819011432447, +STORE, 94819009097728, 94819009310719, +STORE, 94819009310720, 94819011432447, +ERASE, 94819009310720, 94819011432447, +STORE, 94819011407872, 94819011420159, +STORE, 94819011420160, 94819011432447, +STORE, 139987985596416, 139987987849215, +SNULL, 139987985739775, 139987987849215, +STORE, 139987985596416, 139987985739775, +STORE, 139987985739776, 139987987849215, +ERASE, 139987985739776, 139987987849215, +STORE, 139987987836928, 139987987845119, +STORE, 139987987845120, 139987987849215, +STORE, 140725763072000, 140725763076095, +STORE, 140725763059712, 140725763071999, +STORE, 139987987808256, 139987987836927, +STORE, 139987987800064, 139987987808255, +STORE, 139987983482880, 139987985596415, +SNULL, 139987983482880, 139987983495167, +STORE, 139987983495168, 139987985596415, +STORE, 139987983482880, 139987983495167, +SNULL, 139987985588223, 139987985596415, +STORE, 139987983495168, 139987985588223, +STORE, 139987985588224, 139987985596415, +ERASE, 139987985588224, 139987985596415, +STORE, 139987985588224, 139987985596415, +STORE, 139987979685888, 139987983482879, +SNULL, 139987979685888, 139987981344767, +STORE, 139987981344768, 139987983482879, +STORE, 139987979685888, 139987981344767, +SNULL, 139987983441919, 139987983482879, +STORE, 139987981344768, 139987983441919, +STORE, 139987983441920, 139987983482879, +SNULL, 139987983441920, 139987983466495, +STORE, 139987983466496, 139987983482879, +STORE, 139987983441920, 139987983466495, +ERASE, 139987983441920, 139987983466495, +STORE, 139987983441920, 139987983466495, +ERASE, 139987983466496, 139987983482879, +STORE, 139987983466496, 139987983482879, +STORE, 139987987791872, 139987987808255, +SNULL, 139987983458303, 139987983466495, 
+STORE, 139987983441920, 139987983458303, +STORE, 139987983458304, 139987983466495, +SNULL, 139987985592319, 139987985596415, +STORE, 139987985588224, 139987985592319, +STORE, 139987985592320, 139987985596415, +SNULL, 94819011411967, 94819011420159, +STORE, 94819011407872, 94819011411967, +STORE, 94819011411968, 94819011420159, +SNULL, 139987987841023, 139987987845119, +STORE, 139987987836928, 139987987841023, +STORE, 139987987841024, 139987987845119, +ERASE, 139987987808256, 139987987836927, +STORE, 94819028176896, 94819028312063, +STORE, 139987986108416, 139987987791871, +STORE, 94819028176896, 94819028447231, +STORE, 94819028176896, 94819028582399, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722475413504, 140737488351231, +SNULL, 140722475421695, 140737488351231, +STORE, 140722475413504, 140722475421695, +STORE, 140722475282432, 140722475421695, +STORE, 94620599119872, 94620601343999, +SNULL, 94620599230463, 94620601343999, +STORE, 94620599119872, 94620599230463, +STORE, 94620599230464, 94620601343999, +ERASE, 94620599230464, 94620601343999, +STORE, 94620601323520, 94620601335807, +STORE, 94620601335808, 94620601343999, +STORE, 139891763060736, 139891765313535, +SNULL, 139891763204095, 139891765313535, +STORE, 139891763060736, 139891763204095, +STORE, 139891763204096, 139891765313535, +ERASE, 139891763204096, 139891765313535, +STORE, 139891765301248, 139891765309439, +STORE, 139891765309440, 139891765313535, +STORE, 140722475700224, 140722475704319, +STORE, 140722475687936, 140722475700223, +STORE, 139891765272576, 139891765301247, +STORE, 139891765264384, 139891765272575, +STORE, 139891759263744, 139891763060735, +SNULL, 139891759263744, 139891760922623, +STORE, 139891760922624, 139891763060735, +STORE, 139891759263744, 139891760922623, +SNULL, 139891763019775, 139891763060735, +STORE, 139891760922624, 139891763019775, +STORE, 139891763019776, 139891763060735, +SNULL, 139891763019776, 139891763044351, +STORE, 
139891763044352, 139891763060735, +STORE, 139891763019776, 139891763044351, +ERASE, 139891763019776, 139891763044351, +STORE, 139891763019776, 139891763044351, +ERASE, 139891763044352, 139891763060735, +STORE, 139891763044352, 139891763060735, +SNULL, 139891763036159, 139891763044351, +STORE, 139891763019776, 139891763036159, +STORE, 139891763036160, 139891763044351, +SNULL, 94620601331711, 94620601335807, +STORE, 94620601323520, 94620601331711, +STORE, 94620601331712, 94620601335807, +SNULL, 139891765305343, 139891765309439, +STORE, 139891765301248, 139891765305343, +STORE, 139891765305344, 139891765309439, +ERASE, 139891765272576, 139891765301247, +STORE, 94620610027520, 94620610162687, +STORE, 94031976210432, 94031976423423, +STORE, 94031978520576, 94031978524671, +STORE, 94031978524672, 94031978532863, +STORE, 94031978532864, 94031978545151, +STORE, 94031990398976, 94031992565759, +STORE, 140336240640000, 140336242298879, +STORE, 140336242298880, 140336244396031, +STORE, 140336244396032, 140336244412415, +STORE, 140336244412416, 140336244420607, +STORE, 140336244420608, 140336244436991, +STORE, 140336244436992, 140336244449279, +STORE, 140336244449280, 140336246542335, +STORE, 140336246542336, 140336246546431, +STORE, 140336246546432, 140336246550527, +STORE, 140336246550528, 140336246693887, +STORE, 140336247062528, 140336248745983, +STORE, 140336248745984, 140336248762367, +STORE, 140336248791040, 140336248795135, +STORE, 140336248795136, 140336248799231, +STORE, 140336248799232, 140336248803327, +STORE, 140728500064256, 140728500203519, +STORE, 140728501501952, 140728501514239, +STORE, 140728501514240, 140728501518335, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140730503987200, 140737488351231, +SNULL, 140730503995391, 140737488351231, +STORE, 140730503987200, 140730503995391, +STORE, 140730503856128, 140730503995391, +STORE, 93866544205824, 93866546429951, +SNULL, 93866544316415, 93866546429951, +STORE, 
93866544205824, 93866544316415, +STORE, 93866544316416, 93866546429951, +ERASE, 93866544316416, 93866546429951, +STORE, 93866546409472, 93866546421759, +STORE, 93866546421760, 93866546429951, +STORE, 140216311959552, 140216314212351, +SNULL, 140216312102911, 140216314212351, +STORE, 140216311959552, 140216312102911, +STORE, 140216312102912, 140216314212351, +ERASE, 140216312102912, 140216314212351, +STORE, 140216314200064, 140216314208255, +STORE, 140216314208256, 140216314212351, +STORE, 140730504626176, 140730504630271, +STORE, 140730504613888, 140730504626175, +STORE, 140216314171392, 140216314200063, +STORE, 140216314163200, 140216314171391, +STORE, 140216308162560, 140216311959551, +SNULL, 140216308162560, 140216309821439, +STORE, 140216309821440, 140216311959551, +STORE, 140216308162560, 140216309821439, +SNULL, 140216311918591, 140216311959551, +STORE, 140216309821440, 140216311918591, +STORE, 140216311918592, 140216311959551, +SNULL, 140216311918592, 140216311943167, +STORE, 140216311943168, 140216311959551, +STORE, 140216311918592, 140216311943167, +ERASE, 140216311918592, 140216311943167, +STORE, 140216311918592, 140216311943167, +ERASE, 140216311943168, 140216311959551, +STORE, 140216311943168, 140216311959551, +SNULL, 140216311934975, 140216311943167, +STORE, 140216311918592, 140216311934975, +STORE, 140216311934976, 140216311943167, +SNULL, 93866546417663, 93866546421759, +STORE, 93866546409472, 93866546417663, +STORE, 93866546417664, 93866546421759, +SNULL, 140216314204159, 140216314208255, +STORE, 140216314200064, 140216314204159, +STORE, 140216314204160, 140216314208255, +ERASE, 140216314171392, 140216314200063, +STORE, 93866550386688, 93866550521855, +STORE, 94074292674560, 94074292887551, +STORE, 94074294984704, 94074294988799, +STORE, 94074294988800, 94074294996991, +STORE, 94074294996992, 94074295009279, +STORE, 94074300219392, 94074301378559, +STORE, 139781563256832, 139781564915711, +STORE, 139781564915712, 139781567012863, +STORE, 
139781567012864, 139781567029247, +STORE, 139781567029248, 139781567037439, +STORE, 139781567037440, 139781567053823, +STORE, 139781567053824, 139781567066111, +STORE, 139781567066112, 139781569159167, +STORE, 139781569159168, 139781569163263, +STORE, 139781569163264, 139781569167359, +STORE, 139781569167360, 139781569310719, +STORE, 139781569679360, 139781571362815, +STORE, 139781571362816, 139781571379199, +STORE, 139781571407872, 139781571411967, +STORE, 139781571411968, 139781571416063, +STORE, 139781571416064, 139781571420159, +STORE, 140723688488960, 140723688628223, +STORE, 140723689005056, 140723689017343, +STORE, 140723689017344, 140723689021439, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140735189745664, 140737488351231, +SNULL, 140735189753855, 140737488351231, +STORE, 140735189745664, 140735189753855, +STORE, 140735189614592, 140735189753855, +STORE, 94172072177664, 94172074512383, +SNULL, 94172072390655, 94172074512383, +STORE, 94172072177664, 94172072390655, +STORE, 94172072390656, 94172074512383, +ERASE, 94172072390656, 94172074512383, +STORE, 94172074487808, 94172074500095, +STORE, 94172074500096, 94172074512383, +STORE, 140687827263488, 140687829516287, +SNULL, 140687827406847, 140687829516287, +STORE, 140687827263488, 140687827406847, +STORE, 140687827406848, 140687829516287, +ERASE, 140687827406848, 140687829516287, +STORE, 140687829504000, 140687829512191, +STORE, 140687829512192, 140687829516287, +STORE, 140735189766144, 140735189770239, +STORE, 140735189753856, 140735189766143, +STORE, 140687829475328, 140687829503999, +STORE, 140687829467136, 140687829475327, +STORE, 140687825149952, 140687827263487, +SNULL, 140687825149952, 140687825162239, +STORE, 140687825162240, 140687827263487, +STORE, 140687825149952, 140687825162239, +SNULL, 140687827255295, 140687827263487, +STORE, 140687825162240, 140687827255295, +STORE, 140687827255296, 140687827263487, +ERASE, 140687827255296, 140687827263487, 
+STORE, 140687827255296, 140687827263487, +STORE, 140687821352960, 140687825149951, +SNULL, 140687821352960, 140687823011839, +STORE, 140687823011840, 140687825149951, +STORE, 140687821352960, 140687823011839, +SNULL, 140687825108991, 140687825149951, +STORE, 140687823011840, 140687825108991, +STORE, 140687825108992, 140687825149951, +SNULL, 140687825108992, 140687825133567, +STORE, 140687825133568, 140687825149951, +STORE, 140687825108992, 140687825133567, +ERASE, 140687825108992, 140687825133567, +STORE, 140687825108992, 140687825133567, +ERASE, 140687825133568, 140687825149951, +STORE, 140687825133568, 140687825149951, +STORE, 140687829458944, 140687829475327, +SNULL, 140687825125375, 140687825133567, +STORE, 140687825108992, 140687825125375, +STORE, 140687825125376, 140687825133567, +SNULL, 140687827259391, 140687827263487, +STORE, 140687827255296, 140687827259391, +STORE, 140687827259392, 140687827263487, +SNULL, 94172074491903, 94172074500095, +STORE, 94172074487808, 94172074491903, +STORE, 94172074491904, 94172074500095, +SNULL, 140687829508095, 140687829512191, +STORE, 140687829504000, 140687829508095, +STORE, 140687829508096, 140687829512191, +ERASE, 140687829475328, 140687829503999, +STORE, 94172092432384, 94172092567551, +STORE, 140687827775488, 140687829458943, +STORE, 94172092432384, 94172092702719, +STORE, 94172092432384, 94172092837887, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140737229504512, 140737488351231, +SNULL, 140737229512703, 140737488351231, +STORE, 140737229504512, 140737229512703, +STORE, 140737229373440, 140737229512703, +STORE, 94155246866432, 94155249090559, +SNULL, 94155246977023, 94155249090559, +STORE, 94155246866432, 94155246977023, +STORE, 94155246977024, 94155249090559, +ERASE, 94155246977024, 94155249090559, +STORE, 94155249070080, 94155249082367, +STORE, 94155249082368, 94155249090559, +STORE, 140640993693696, 140640995946495, +SNULL, 140640993837055, 140640995946495, +STORE, 
140640993693696, 140640993837055, +STORE, 140640993837056, 140640995946495, +ERASE, 140640993837056, 140640995946495, +STORE, 140640995934208, 140640995942399, +STORE, 140640995942400, 140640995946495, +STORE, 140737230004224, 140737230008319, +STORE, 140737229991936, 140737230004223, +STORE, 140640995905536, 140640995934207, +STORE, 140640995897344, 140640995905535, +STORE, 140640989896704, 140640993693695, +SNULL, 140640989896704, 140640991555583, +STORE, 140640991555584, 140640993693695, +STORE, 140640989896704, 140640991555583, +SNULL, 140640993652735, 140640993693695, +STORE, 140640991555584, 140640993652735, +STORE, 140640993652736, 140640993693695, +SNULL, 140640993652736, 140640993677311, +STORE, 140640993677312, 140640993693695, +STORE, 140640993652736, 140640993677311, +ERASE, 140640993652736, 140640993677311, +STORE, 140640993652736, 140640993677311, +ERASE, 140640993677312, 140640993693695, +STORE, 140640993677312, 140640993693695, +SNULL, 140640993669119, 140640993677311, +STORE, 140640993652736, 140640993669119, +STORE, 140640993669120, 140640993677311, +SNULL, 94155249078271, 94155249082367, +STORE, 94155249070080, 94155249078271, +STORE, 94155249078272, 94155249082367, +SNULL, 140640995938303, 140640995942399, +STORE, 140640995934208, 140640995938303, +STORE, 140640995938304, 140640995942399, +ERASE, 140640995905536, 140640995934207, +STORE, 94155281035264, 94155281170431, +STORE, 94088066453504, 94088066564095, +STORE, 94088068657152, 94088068665343, +STORE, 94088068665344, 94088068669439, +STORE, 94088068669440, 94088068677631, +STORE, 94088090214400, 94088090349567, +STORE, 140503024627712, 140503026286591, +STORE, 140503026286592, 140503028383743, +STORE, 140503028383744, 140503028400127, +STORE, 140503028400128, 140503028408319, +STORE, 140503028408320, 140503028424703, +STORE, 140503028424704, 140503028568063, +STORE, 140503030628352, 140503030636543, +STORE, 140503030665216, 140503030669311, +STORE, 140503030669312, 140503030673407, +STORE, 
140503030673408, 140503030677503, +STORE, 140730894725120, 140730894864383, +STORE, 140730894880768, 140730894893055, +STORE, 140730894893056, 140730894897151, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140730434342912, 140737488351231, +SNULL, 140730434351103, 140737488351231, +STORE, 140730434342912, 140730434351103, +STORE, 140730434211840, 140730434351103, +STORE, 4194304, 5128191, +STORE, 7221248, 7241727, +STORE, 7241728, 7249919, +STORE, 140109041938432, 140109044191231, +SNULL, 140109042081791, 140109044191231, +STORE, 140109041938432, 140109042081791, +STORE, 140109042081792, 140109044191231, +ERASE, 140109042081792, 140109044191231, +STORE, 140109044178944, 140109044187135, +STORE, 140109044187136, 140109044191231, +STORE, 140730434850816, 140730434854911, +STORE, 140730434838528, 140730434850815, +STORE, 140109044150272, 140109044178943, +STORE, 140109044142080, 140109044150271, +STORE, 140109038776320, 140109041938431, +SNULL, 140109038776320, 140109039837183, +STORE, 140109039837184, 140109041938431, +STORE, 140109038776320, 140109039837183, +SNULL, 140109041930239, 140109041938431, +STORE, 140109039837184, 140109041930239, +STORE, 140109041930240, 140109041938431, +ERASE, 140109041930240, 140109041938431, +STORE, 140109041930240, 140109041938431, +STORE, 140109034979328, 140109038776319, +SNULL, 140109034979328, 140109036638207, +STORE, 140109036638208, 140109038776319, +STORE, 140109034979328, 140109036638207, +SNULL, 140109038735359, 140109038776319, +STORE, 140109036638208, 140109038735359, +STORE, 140109038735360, 140109038776319, +SNULL, 140109038735360, 140109038759935, +STORE, 140109038759936, 140109038776319, +STORE, 140109038735360, 140109038759935, +ERASE, 140109038735360, 140109038759935, +STORE, 140109038735360, 140109038759935, +ERASE, 140109038759936, 140109038776319, +STORE, 140109038759936, 140109038776319, +STORE, 140109044129792, 140109044150271, +SNULL, 140109038751743, 
140109038759935, +STORE, 140109038735360, 140109038751743, +STORE, 140109038751744, 140109038759935, +SNULL, 140109041934335, 140109041938431, +STORE, 140109041930240, 140109041934335, +STORE, 140109041934336, 140109041938431, +SNULL, 7233535, 7241727, +STORE, 7221248, 7233535, +STORE, 7233536, 7241727, +SNULL, 140109044183039, 140109044187135, +STORE, 140109044178944, 140109044183039, +STORE, 140109044183040, 140109044187135, +ERASE, 140109044150272, 140109044178943, +STORE, 20000768, 20135935, +STORE, 20000768, 20283391, +STORE, 140109042446336, 140109044129791, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140730853408768, 140737488351231, +SNULL, 140730853416959, 140737488351231, +STORE, 140730853408768, 140730853416959, +STORE, 140730853277696, 140730853416959, +STORE, 94865902977024, 94865905311743, +SNULL, 94865903190015, 94865905311743, +STORE, 94865902977024, 94865903190015, +STORE, 94865903190016, 94865905311743, +ERASE, 94865903190016, 94865905311743, +STORE, 94865905287168, 94865905299455, +STORE, 94865905299456, 94865905311743, +STORE, 139768865738752, 139768867991551, +SNULL, 139768865882111, 139768867991551, +STORE, 139768865738752, 139768865882111, +STORE, 139768865882112, 139768867991551, +ERASE, 139768865882112, 139768867991551, +STORE, 139768867979264, 139768867987455, +STORE, 139768867987456, 139768867991551, +STORE, 140730853957632, 140730853961727, +STORE, 140730853945344, 140730853957631, +STORE, 139768867950592, 139768867979263, +STORE, 139768867942400, 139768867950591, +STORE, 139768863625216, 139768865738751, +SNULL, 139768863625216, 139768863637503, +STORE, 139768863637504, 139768865738751, +STORE, 139768863625216, 139768863637503, +SNULL, 139768865730559, 139768865738751, +STORE, 139768863637504, 139768865730559, +STORE, 139768865730560, 139768865738751, +ERASE, 139768865730560, 139768865738751, +STORE, 139768865730560, 139768865738751, +STORE, 139768859828224, 139768863625215, +SNULL, 
139768859828224, 139768861487103, +STORE, 139768861487104, 139768863625215, +STORE, 139768859828224, 139768861487103, +SNULL, 139768863584255, 139768863625215, +STORE, 139768861487104, 139768863584255, +STORE, 139768863584256, 139768863625215, +SNULL, 139768863584256, 139768863608831, +STORE, 139768863608832, 139768863625215, +STORE, 139768863584256, 139768863608831, +ERASE, 139768863584256, 139768863608831, +STORE, 139768863584256, 139768863608831, +ERASE, 139768863608832, 139768863625215, +STORE, 139768863608832, 139768863625215, +STORE, 139768867934208, 139768867950591, +SNULL, 139768863600639, 139768863608831, +STORE, 139768863584256, 139768863600639, +STORE, 139768863600640, 139768863608831, +SNULL, 139768865734655, 139768865738751, +STORE, 139768865730560, 139768865734655, +STORE, 139768865734656, 139768865738751, +SNULL, 94865905291263, 94865905299455, +STORE, 94865905287168, 94865905291263, +STORE, 94865905291264, 94865905299455, +SNULL, 139768867983359, 139768867987455, +STORE, 139768867979264, 139768867983359, +STORE, 139768867983360, 139768867987455, +ERASE, 139768867950592, 139768867979263, +STORE, 94865923670016, 94865923805183, +STORE, 139768866250752, 139768867934207, +STORE, 94865923670016, 94865923940351, +STORE, 94865923670016, 94865924075519, +STORE, 94865923670016, 94865924222975, +SNULL, 94865924210687, 94865924222975, +STORE, 94865923670016, 94865924210687, +STORE, 94865924210688, 94865924222975, +ERASE, 94865924210688, 94865924222975, +STORE, 94865923670016, 94865924349951, +STORE, 94865923670016, 94865924493311, +STORE, 94865923670016, 94865924640767, +SNULL, 94865924603903, 94865924640767, +STORE, 94865923670016, 94865924603903, +STORE, 94865924603904, 94865924640767, +ERASE, 94865924603904, 94865924640767, +STORE, 94865923670016, 94865924747263, +STORE, 94865923670016, 94865924898815, +SNULL, 94865924874239, 94865924898815, +STORE, 94865923670016, 94865924874239, +STORE, 94865924874240, 94865924898815, +ERASE, 94865924874240, 
94865924898815, +STORE, 94865923670016, 94865925025791, +SNULL, 94865925013503, 94865925025791, +STORE, 94865923670016, 94865925013503, +STORE, 94865925013504, 94865925025791, +ERASE, 94865925013504, 94865925025791, +SNULL, 94865924988927, 94865925013503, +STORE, 94865923670016, 94865924988927, +STORE, 94865924988928, 94865925013503, +ERASE, 94865924988928, 94865925013503, +STORE, 94865923670016, 94865925152767, +SNULL, 94865925136383, 94865925152767, +STORE, 94865923670016, 94865925136383, +STORE, 94865925136384, 94865925152767, +ERASE, 94865925136384, 94865925152767, +STORE, 94865923670016, 94865925292031, +SNULL, 94865925279743, 94865925292031, +STORE, 94865923670016, 94865925279743, +STORE, 94865925279744, 94865925292031, +ERASE, 94865925279744, 94865925292031, +SNULL, 94865925255167, 94865925279743, +STORE, 94865923670016, 94865925255167, +STORE, 94865925255168, 94865925279743, +ERASE, 94865925255168, 94865925279743, +STORE, 94865923670016, 94865925406719, +SNULL, 94865925394431, 94865925406719, +STORE, 94865923670016, 94865925394431, +STORE, 94865925394432, 94865925406719, +ERASE, 94865925394432, 94865925406719, +STORE, 94865923670016, 94865925545983, +SNULL, 94865925533695, 94865925545983, +STORE, 94865923670016, 94865925533695, +STORE, 94865925533696, 94865925545983, +ERASE, 94865925533696, 94865925545983, +SNULL, 94865925492735, 94865925533695, +STORE, 94865923670016, 94865925492735, +STORE, 94865925492736, 94865925533695, +ERASE, 94865925492736, 94865925533695, +STORE, 94865923670016, 94865925627903, +SNULL, 94865925599231, 94865925627903, +STORE, 94865923670016, 94865925599231, +STORE, 94865925599232, 94865925627903, +ERASE, 94865925599232, 94865925627903, +STORE, 94865923670016, 94865925738495, +SNULL, 94865925726207, 94865925738495, +STORE, 94865923670016, 94865925726207, +STORE, 94865925726208, 94865925738495, +ERASE, 94865925726208, 94865925738495, +STORE, 94865923670016, 94865925877759, +SNULL, 94865925865471, 94865925877759, +STORE, 94865923670016, 
94865925865471, +STORE, 94865925865472, 94865925877759, +ERASE, 94865925865472, 94865925877759, +STORE, 94865923670016, 94865926021119, +SNULL, 94865926008831, 94865926021119, +STORE, 94865923670016, 94865926008831, +STORE, 94865926008832, 94865926021119, +ERASE, 94865926008832, 94865926021119, +SNULL, 94865925971967, 94865926008831, +STORE, 94865923670016, 94865925971967, +STORE, 94865925971968, 94865926008831, +ERASE, 94865925971968, 94865926008831, +STORE, 94865923670016, 94865926115327, +STORE, 94865923670016, 94865926254591, +SNULL, 94865926246399, 94865926254591, +STORE, 94865923670016, 94865926246399, +STORE, 94865926246400, 94865926254591, +ERASE, 94865926246400, 94865926254591, +STORE, 94865923670016, 94865926385663, +STORE, 94865923670016, 94865926537215, +STORE, 94865923670016, 94865926672383, +STORE, 94865923670016, 94865926815743, +STORE, 94865923670016, 94865926955007, +STORE, 94865923670016, 94865927094271, +STORE, 94865923670016, 94865927233535, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140731148435456, 140737488351231, +SNULL, 140731148443647, 140737488351231, +STORE, 140731148435456, 140731148443647, +STORE, 140731148304384, 140731148443647, +STORE, 94090775400448, 94090777735167, +SNULL, 94090775613439, 94090777735167, +STORE, 94090775400448, 94090775613439, +STORE, 94090775613440, 94090777735167, +ERASE, 94090775613440, 94090777735167, +STORE, 94090777710592, 94090777722879, +STORE, 94090777722880, 94090777735167, +STORE, 140301090283520, 140301092536319, +SNULL, 140301090426879, 140301092536319, +STORE, 140301090283520, 140301090426879, +STORE, 140301090426880, 140301092536319, +ERASE, 140301090426880, 140301092536319, +STORE, 140301092524032, 140301092532223, +STORE, 140301092532224, 140301092536319, +STORE, 140731148570624, 140731148574719, +STORE, 140731148558336, 140731148570623, +STORE, 140301092495360, 140301092524031, +STORE, 140301092487168, 140301092495359, +STORE, 140301088169984, 
140301090283519, +SNULL, 140301088169984, 140301088182271, +STORE, 140301088182272, 140301090283519, +STORE, 140301088169984, 140301088182271, +SNULL, 140301090275327, 140301090283519, +STORE, 140301088182272, 140301090275327, +STORE, 140301090275328, 140301090283519, +ERASE, 140301090275328, 140301090283519, +STORE, 140301090275328, 140301090283519, +STORE, 140301084372992, 140301088169983, +SNULL, 140301084372992, 140301086031871, +STORE, 140301086031872, 140301088169983, +STORE, 140301084372992, 140301086031871, +SNULL, 140301088129023, 140301088169983, +STORE, 140301086031872, 140301088129023, +STORE, 140301088129024, 140301088169983, +SNULL, 140301088129024, 140301088153599, +STORE, 140301088153600, 140301088169983, +STORE, 140301088129024, 140301088153599, +ERASE, 140301088129024, 140301088153599, +STORE, 140301088129024, 140301088153599, +ERASE, 140301088153600, 140301088169983, +STORE, 140301088153600, 140301088169983, +STORE, 140301092478976, 140301092495359, +SNULL, 140301088145407, 140301088153599, +STORE, 140301088129024, 140301088145407, +STORE, 140301088145408, 140301088153599, +SNULL, 140301090279423, 140301090283519, +STORE, 140301090275328, 140301090279423, +STORE, 140301090279424, 140301090283519, +SNULL, 94090777714687, 94090777722879, +STORE, 94090777710592, 94090777714687, +STORE, 94090777714688, 94090777722879, +SNULL, 140301092528127, 140301092532223, +STORE, 140301092524032, 140301092528127, +STORE, 140301092528128, 140301092532223, +ERASE, 140301092495360, 140301092524031, +STORE, 94090794590208, 94090794725375, +STORE, 140301090795520, 140301092478975, +STORE, 94090794590208, 94090794860543, +STORE, 94090794590208, 94090794995711, +STORE, 94090794590208, 94090795163647, +SNULL, 94090795139071, 94090795163647, +STORE, 94090794590208, 94090795139071, +STORE, 94090795139072, 94090795163647, +ERASE, 94090795139072, 94090795163647, +STORE, 94090794590208, 94090795278335, +STORE, 94090794590208, 94090795425791, +SNULL, 94090795388927, 
94090795425791, +STORE, 94090794590208, 94090795388927, +STORE, 94090795388928, 94090795425791, +ERASE, 94090795388928, 94090795425791, +STORE, 94090794590208, 94090795528191, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140733084430336, 140737488351231, +SNULL, 140733084438527, 140737488351231, +STORE, 140733084430336, 140733084438527, +STORE, 140733084299264, 140733084438527, +STORE, 94116169183232, 94116171517951, +SNULL, 94116169396223, 94116171517951, +STORE, 94116169183232, 94116169396223, +STORE, 94116169396224, 94116171517951, +ERASE, 94116169396224, 94116171517951, +STORE, 94116171493376, 94116171505663, +STORE, 94116171505664, 94116171517951, +STORE, 139772214128640, 139772216381439, +SNULL, 139772214271999, 139772216381439, +STORE, 139772214128640, 139772214271999, +STORE, 139772214272000, 139772216381439, +ERASE, 139772214272000, 139772216381439, +STORE, 139772216369152, 139772216377343, +STORE, 139772216377344, 139772216381439, +STORE, 140733085270016, 140733085274111, +STORE, 140733085257728, 140733085270015, +STORE, 139772216340480, 139772216369151, +STORE, 139772216332288, 139772216340479, +STORE, 139772212015104, 139772214128639, +SNULL, 139772212015104, 139772212027391, +STORE, 139772212027392, 139772214128639, +STORE, 139772212015104, 139772212027391, +SNULL, 139772214120447, 139772214128639, +STORE, 139772212027392, 139772214120447, +STORE, 139772214120448, 139772214128639, +ERASE, 139772214120448, 139772214128639, +STORE, 139772214120448, 139772214128639, +STORE, 139772208218112, 139772212015103, +SNULL, 139772208218112, 139772209876991, +STORE, 139772209876992, 139772212015103, +STORE, 139772208218112, 139772209876991, +SNULL, 139772211974143, 139772212015103, +STORE, 139772209876992, 139772211974143, +STORE, 139772211974144, 139772212015103, +SNULL, 139772211974144, 139772211998719, +STORE, 139772211998720, 139772212015103, +STORE, 139772211974144, 139772211998719, +ERASE, 139772211974144, 
139772211998719, +STORE, 139772211974144, 139772211998719, +ERASE, 139772211998720, 139772212015103, +STORE, 139772211998720, 139772212015103, +STORE, 139772216324096, 139772216340479, +SNULL, 139772211990527, 139772211998719, +STORE, 139772211974144, 139772211990527, +STORE, 139772211990528, 139772211998719, +SNULL, 139772214124543, 139772214128639, +STORE, 139772214120448, 139772214124543, +STORE, 139772214124544, 139772214128639, +SNULL, 94116171497471, 94116171505663, +STORE, 94116171493376, 94116171497471, +STORE, 94116171497472, 94116171505663, +SNULL, 139772216373247, 139772216377343, +STORE, 139772216369152, 139772216373247, +STORE, 139772216373248, 139772216377343, +ERASE, 139772216340480, 139772216369151, +STORE, 94116199383040, 94116199518207, +STORE, 139772214640640, 139772216324095, +STORE, 94116199383040, 94116199653375, +STORE, 94116199383040, 94116199788543, +STORE, 140737488347136, 140737488351231, +STORE, 140726067826688, 140737488351231, +SNULL, 140726067830783, 140737488351231, +STORE, 140726067826688, 140726067830783, +STORE, 140726067695616, 140726067830783, +STORE, 94535150673920, 94535152898047, +SNULL, 94535150784511, 94535152898047, +STORE, 94535150673920, 94535150784511, +STORE, 94535150784512, 94535152898047, +ERASE, 94535150784512, 94535152898047, +STORE, 94535152877568, 94535152889855, +STORE, 94535152889856, 94535152898047, +STORE, 140381257314304, 140381259567103, +SNULL, 140381257457663, 140381259567103, +STORE, 140381257314304, 140381257457663, +STORE, 140381257457664, 140381259567103, +ERASE, 140381257457664, 140381259567103, +STORE, 140381259554816, 140381259563007, +STORE, 140381259563008, 140381259567103, +STORE, 140726068060160, 140726068064255, +STORE, 140726068047872, 140726068060159, +STORE, 140381259526144, 140381259554815, +STORE, 140381259517952, 140381259526143, +STORE, 140381253517312, 140381257314303, +SNULL, 140381253517312, 140381255176191, +STORE, 140381255176192, 140381257314303, +STORE, 140381253517312, 
140381255176191, +SNULL, 140381257273343, 140381257314303, +STORE, 140381255176192, 140381257273343, +STORE, 140381257273344, 140381257314303, +SNULL, 140381257273344, 140381257297919, +STORE, 140381257297920, 140381257314303, +STORE, 140381257273344, 140381257297919, +ERASE, 140381257273344, 140381257297919, +STORE, 140381257273344, 140381257297919, +ERASE, 140381257297920, 140381257314303, +STORE, 140381257297920, 140381257314303, +SNULL, 140381257289727, 140381257297919, +STORE, 140381257273344, 140381257289727, +STORE, 140381257289728, 140381257297919, +SNULL, 94535152885759, 94535152889855, +STORE, 94535152877568, 94535152885759, +STORE, 94535152885760, 94535152889855, +SNULL, 140381259558911, 140381259563007, +STORE, 140381259554816, 140381259558911, +STORE, 140381259558912, 140381259563007, +ERASE, 140381259526144, 140381259554815, +STORE, 94535186296832, 94535186431999, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140729189425152, 140737488351231, +SNULL, 140729189433343, 140737488351231, +STORE, 140729189425152, 140729189433343, +STORE, 140729189294080, 140729189433343, +STORE, 94428200128512, 94428202352639, +SNULL, 94428200239103, 94428202352639, +STORE, 94428200128512, 94428200239103, +STORE, 94428200239104, 94428202352639, +ERASE, 94428200239104, 94428202352639, +STORE, 94428202332160, 94428202344447, +STORE, 94428202344448, 94428202352639, +STORE, 139707216986112, 139707219238911, +SNULL, 139707217129471, 139707219238911, +STORE, 139707216986112, 139707217129471, +STORE, 139707217129472, 139707219238911, +ERASE, 139707217129472, 139707219238911, +STORE, 139707219226624, 139707219234815, +STORE, 139707219234816, 139707219238911, +STORE, 140729189785600, 140729189789695, +STORE, 140729189773312, 140729189785599, +STORE, 139707219197952, 139707219226623, +STORE, 139707219189760, 139707219197951, +STORE, 139707213189120, 139707216986111, +SNULL, 139707213189120, 139707214847999, +STORE, 139707214848000, 
139707216986111, +STORE, 139707213189120, 139707214847999, +SNULL, 139707216945151, 139707216986111, +STORE, 139707214848000, 139707216945151, +STORE, 139707216945152, 139707216986111, +SNULL, 139707216945152, 139707216969727, +STORE, 139707216969728, 139707216986111, +STORE, 139707216945152, 139707216969727, +ERASE, 139707216945152, 139707216969727, +STORE, 139707216945152, 139707216969727, +ERASE, 139707216969728, 139707216986111, +STORE, 139707216969728, 139707216986111, +SNULL, 139707216961535, 139707216969727, +STORE, 139707216945152, 139707216961535, +STORE, 139707216961536, 139707216969727, +SNULL, 94428202340351, 94428202344447, +STORE, 94428202332160, 94428202340351, +STORE, 94428202340352, 94428202344447, +SNULL, 139707219230719, 139707219234815, +STORE, 139707219226624, 139707219230719, +STORE, 139707219230720, 139707219234815, +ERASE, 139707219197952, 139707219226623, +STORE, 94428208599040, 94428208734207, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722000953344, 140737488351231, +SNULL, 140722000961535, 140737488351231, +STORE, 140722000953344, 140722000961535, +STORE, 140722000822272, 140722000961535, +STORE, 94636494757888, 94636496982015, +SNULL, 94636494868479, 94636496982015, +STORE, 94636494757888, 94636494868479, +STORE, 94636494868480, 94636496982015, +ERASE, 94636494868480, 94636496982015, +STORE, 94636496961536, 94636496973823, +STORE, 94636496973824, 94636496982015, +STORE, 140142275100672, 140142277353471, +SNULL, 140142275244031, 140142277353471, +STORE, 140142275100672, 140142275244031, +STORE, 140142275244032, 140142277353471, +ERASE, 140142275244032, 140142277353471, +STORE, 140142277341184, 140142277349375, +STORE, 140142277349376, 140142277353471, +STORE, 140722002747392, 140722002751487, +STORE, 140722002735104, 140722002747391, +STORE, 140142277312512, 140142277341183, +STORE, 140142277304320, 140142277312511, +STORE, 140142271303680, 140142275100671, +SNULL, 140142271303680, 
140142272962559, +STORE, 140142272962560, 140142275100671, +STORE, 140142271303680, 140142272962559, +SNULL, 140142275059711, 140142275100671, +STORE, 140142272962560, 140142275059711, +STORE, 140142275059712, 140142275100671, +SNULL, 140142275059712, 140142275084287, +STORE, 140142275084288, 140142275100671, +STORE, 140142275059712, 140142275084287, +ERASE, 140142275059712, 140142275084287, +STORE, 140142275059712, 140142275084287, +ERASE, 140142275084288, 140142275100671, +STORE, 140142275084288, 140142275100671, +SNULL, 140142275076095, 140142275084287, +STORE, 140142275059712, 140142275076095, +STORE, 140142275076096, 140142275084287, +SNULL, 94636496969727, 94636496973823, +STORE, 94636496961536, 94636496969727, +STORE, 94636496969728, 94636496973823, +SNULL, 140142277345279, 140142277349375, +STORE, 140142277341184, 140142277345279, +STORE, 140142277345280, 140142277349375, +ERASE, 140142277312512, 140142277341183, +STORE, 94636516286464, 94636516421631, +STORE, 94071103692800, 94071103905791, +STORE, 94071106002944, 94071106007039, +STORE, 94071106007040, 94071106015231, +STORE, 94071106015232, 94071106027519, +STORE, 94071138521088, 94071140368383, +STORE, 140145668190208, 140145669849087, +STORE, 140145669849088, 140145671946239, +STORE, 140145671946240, 140145671962623, +STORE, 140145671962624, 140145671970815, +STORE, 140145671970816, 140145671987199, +STORE, 140145671987200, 140145671999487, +STORE, 140145671999488, 140145674092543, +STORE, 140145674092544, 140145674096639, +STORE, 140145674096640, 140145674100735, +STORE, 140145674100736, 140145674244095, +STORE, 140145674612736, 140145676296191, +STORE, 140145676296192, 140145676312575, +STORE, 140145676341248, 140145676345343, +STORE, 140145676345344, 140145676349439, +STORE, 140145676349440, 140145676353535, +STORE, 140734927740928, 140734927880191, +STORE, 140734928842752, 140734928855039, +STORE, 140734928855040, 140734928859135, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 
140737488351231, +STORE, 140722342535168, 140737488351231, +SNULL, 140722342543359, 140737488351231, +STORE, 140722342535168, 140722342543359, +STORE, 140722342404096, 140722342543359, +STORE, 94399699714048, 94399702048767, +SNULL, 94399699927039, 94399702048767, +STORE, 94399699714048, 94399699927039, +STORE, 94399699927040, 94399702048767, +ERASE, 94399699927040, 94399702048767, +STORE, 94399702024192, 94399702036479, +STORE, 94399702036480, 94399702048767, +STORE, 139811024748544, 139811027001343, +SNULL, 139811024891903, 139811027001343, +STORE, 139811024748544, 139811024891903, +STORE, 139811024891904, 139811027001343, +ERASE, 139811024891904, 139811027001343, +STORE, 139811026989056, 139811026997247, +STORE, 139811026997248, 139811027001343, +STORE, 140722342707200, 140722342711295, +STORE, 140722342694912, 140722342707199, +STORE, 139811026960384, 139811026989055, +STORE, 139811026952192, 139811026960383, +STORE, 139811022635008, 139811024748543, +SNULL, 139811022635008, 139811022647295, +STORE, 139811022647296, 139811024748543, +STORE, 139811022635008, 139811022647295, +SNULL, 139811024740351, 139811024748543, +STORE, 139811022647296, 139811024740351, +STORE, 139811024740352, 139811024748543, +ERASE, 139811024740352, 139811024748543, +STORE, 139811024740352, 139811024748543, +STORE, 139811018838016, 139811022635007, +SNULL, 139811018838016, 139811020496895, +STORE, 139811020496896, 139811022635007, +STORE, 139811018838016, 139811020496895, +SNULL, 139811022594047, 139811022635007, +STORE, 139811020496896, 139811022594047, +STORE, 139811022594048, 139811022635007, +SNULL, 139811022594048, 139811022618623, +STORE, 139811022618624, 139811022635007, +STORE, 139811022594048, 139811022618623, +ERASE, 139811022594048, 139811022618623, +STORE, 139811022594048, 139811022618623, +ERASE, 139811022618624, 139811022635007, +STORE, 139811022618624, 139811022635007, +STORE, 139811026944000, 139811026960383, +SNULL, 139811022610431, 139811022618623, +STORE, 
139811022594048, 139811022610431, +STORE, 139811022610432, 139811022618623, +SNULL, 139811024744447, 139811024748543, +STORE, 139811024740352, 139811024744447, +STORE, 139811024744448, 139811024748543, +SNULL, 94399702028287, 94399702036479, +STORE, 94399702024192, 94399702028287, +STORE, 94399702028288, 94399702036479, +SNULL, 139811026993151, 139811026997247, +STORE, 139811026989056, 139811026993151, +STORE, 139811026993152, 139811026997247, +ERASE, 139811026960384, 139811026989055, +STORE, 94399723880448, 94399724015615, +STORE, 139811025260544, 139811026943999, +STORE, 94399723880448, 94399724150783, +STORE, 94399723880448, 94399724285951, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140735364939776, 140737488351231, +SNULL, 140735364947967, 140737488351231, +STORE, 140735364939776, 140735364947967, +STORE, 140735364808704, 140735364947967, +STORE, 94421528674304, 94421531009023, +SNULL, 94421528887295, 94421531009023, +STORE, 94421528674304, 94421528887295, +STORE, 94421528887296, 94421531009023, +ERASE, 94421528887296, 94421531009023, +STORE, 94421530984448, 94421530996735, +STORE, 94421530996736, 94421531009023, +STORE, 140162004742144, 140162006994943, +SNULL, 140162004885503, 140162006994943, +STORE, 140162004742144, 140162004885503, +STORE, 140162004885504, 140162006994943, +ERASE, 140162004885504, 140162006994943, +STORE, 140162006982656, 140162006990847, +STORE, 140162006990848, 140162006994943, +STORE, 140735365402624, 140735365406719, +STORE, 140735365390336, 140735365402623, +STORE, 140162006953984, 140162006982655, +STORE, 140162006945792, 140162006953983, +STORE, 140162002628608, 140162004742143, +SNULL, 140162002628608, 140162002640895, +STORE, 140162002640896, 140162004742143, +STORE, 140162002628608, 140162002640895, +SNULL, 140162004733951, 140162004742143, +STORE, 140162002640896, 140162004733951, +STORE, 140162004733952, 140162004742143, +ERASE, 140162004733952, 140162004742143, +STORE, 
140162004733952, 140162004742143, +STORE, 140161998831616, 140162002628607, +SNULL, 140161998831616, 140162000490495, +STORE, 140162000490496, 140162002628607, +STORE, 140161998831616, 140162000490495, +SNULL, 140162002587647, 140162002628607, +STORE, 140162000490496, 140162002587647, +STORE, 140162002587648, 140162002628607, +SNULL, 140162002587648, 140162002612223, +STORE, 140162002612224, 140162002628607, +STORE, 140162002587648, 140162002612223, +ERASE, 140162002587648, 140162002612223, +STORE, 140162002587648, 140162002612223, +ERASE, 140162002612224, 140162002628607, +STORE, 140162002612224, 140162002628607, +STORE, 140162006937600, 140162006953983, +SNULL, 140162002604031, 140162002612223, +STORE, 140162002587648, 140162002604031, +STORE, 140162002604032, 140162002612223, +SNULL, 140162004738047, 140162004742143, +STORE, 140162004733952, 140162004738047, +STORE, 140162004738048, 140162004742143, +SNULL, 94421530988543, 94421530996735, +STORE, 94421530984448, 94421530988543, +STORE, 94421530988544, 94421530996735, +SNULL, 140162006986751, 140162006990847, +STORE, 140162006982656, 140162006986751, +STORE, 140162006986752, 140162006990847, +ERASE, 140162006953984, 140162006982655, +STORE, 94421551697920, 94421551833087, +STORE, 140162005254144, 140162006937599, +STORE, 94421551697920, 94421551968255, +STORE, 94421551697920, 94421552103423, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140733498486784, 140737488351231, +SNULL, 140733498494975, 140737488351231, +STORE, 140733498486784, 140733498494975, +STORE, 140733498355712, 140733498494975, +STORE, 94567985836032, 94567988170751, +SNULL, 94567986049023, 94567988170751, +STORE, 94567985836032, 94567986049023, +STORE, 94567986049024, 94567988170751, +ERASE, 94567986049024, 94567988170751, +STORE, 94567988146176, 94567988158463, +STORE, 94567988158464, 94567988170751, +STORE, 139634278572032, 139634280824831, +SNULL, 139634278715391, 139634280824831, +STORE, 
139634278572032, 139634278715391, +STORE, 139634278715392, 139634280824831, +ERASE, 139634278715392, 139634280824831, +STORE, 139634280812544, 139634280820735, +STORE, 139634280820736, 139634280824831, +STORE, 140733498544128, 140733498548223, +STORE, 140733498531840, 140733498544127, +STORE, 139634280783872, 139634280812543, +STORE, 139634280775680, 139634280783871, +STORE, 139634276458496, 139634278572031, +SNULL, 139634276458496, 139634276470783, +STORE, 139634276470784, 139634278572031, +STORE, 139634276458496, 139634276470783, +SNULL, 139634278563839, 139634278572031, +STORE, 139634276470784, 139634278563839, +STORE, 139634278563840, 139634278572031, +ERASE, 139634278563840, 139634278572031, +STORE, 139634278563840, 139634278572031, +STORE, 139634272661504, 139634276458495, +SNULL, 139634272661504, 139634274320383, +STORE, 139634274320384, 139634276458495, +STORE, 139634272661504, 139634274320383, +SNULL, 139634276417535, 139634276458495, +STORE, 139634274320384, 139634276417535, +STORE, 139634276417536, 139634276458495, +SNULL, 139634276417536, 139634276442111, +STORE, 139634276442112, 139634276458495, +STORE, 139634276417536, 139634276442111, +ERASE, 139634276417536, 139634276442111, +STORE, 139634276417536, 139634276442111, +ERASE, 139634276442112, 139634276458495, +STORE, 139634276442112, 139634276458495, +STORE, 139634280767488, 139634280783871, +SNULL, 139634276433919, 139634276442111, +STORE, 139634276417536, 139634276433919, +STORE, 139634276433920, 139634276442111, +SNULL, 139634278567935, 139634278572031, +STORE, 139634278563840, 139634278567935, +STORE, 139634278567936, 139634278572031, +SNULL, 94567988150271, 94567988158463, +STORE, 94567988146176, 94567988150271, +STORE, 94567988150272, 94567988158463, +SNULL, 139634280816639, 139634280820735, +STORE, 139634280812544, 139634280816639, +STORE, 139634280816640, 139634280820735, +ERASE, 139634280783872, 139634280812543, +STORE, 94567996379136, 94567996514303, +STORE, 139634279084032, 139634280767487, 
+STORE, 94567996379136, 94567996649471, +STORE, 94567996379136, 94567996784639, +STORE, 94567996379136, 94567996960767, +SNULL, 94567996932095, 94567996960767, +STORE, 94567996379136, 94567996932095, +STORE, 94567996932096, 94567996960767, +ERASE, 94567996932096, 94567996960767, +STORE, 94567996379136, 94567997071359, +STORE, 94567996379136, 94567997206527, +SNULL, 94567997186047, 94567997206527, +STORE, 94567996379136, 94567997186047, +STORE, 94567997186048, 94567997206527, +ERASE, 94567997186048, 94567997206527, +STORE, 94567996379136, 94567997358079, +STORE, 94567996379136, 94567997493247, +SNULL, 94567997476863, 94567997493247, +STORE, 94567996379136, 94567997476863, +STORE, 94567997476864, 94567997493247, +ERASE, 94567997476864, 94567997493247, +STORE, 94567996379136, 94567997612031, +STORE, 94567996379136, 94567997767679, +SNULL, 94567997739007, 94567997767679, +STORE, 94567996379136, 94567997739007, +STORE, 94567997739008, 94567997767679, +ERASE, 94567997739008, 94567997767679, +SNULL, 94567997698047, 94567997739007, +STORE, 94567996379136, 94567997698047, +STORE, 94567997698048, 94567997739007, +ERASE, 94567997698048, 94567997739007, +STORE, 94567996379136, 94567997853695, +STORE, 94567996379136, 94567997988863, +STORE, 94567996379136, 94567998132223, +STORE, 94567996379136, 94567998275583, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140723667759104, 140737488351231, +SNULL, 140723667767295, 140737488351231, +STORE, 140723667759104, 140723667767295, +STORE, 140723667628032, 140723667767295, +STORE, 94231598800896, 94231601135615, +SNULL, 94231599013887, 94231601135615, +STORE, 94231598800896, 94231599013887, +STORE, 94231599013888, 94231601135615, +ERASE, 94231599013888, 94231601135615, +STORE, 94231601111040, 94231601123327, +STORE, 94231601123328, 94231601135615, +STORE, 140269472649216, 140269474902015, +SNULL, 140269472792575, 140269474902015, +STORE, 140269472649216, 140269472792575, +STORE, 
140269472792576, 140269474902015, +ERASE, 140269472792576, 140269474902015, +STORE, 140269474889728, 140269474897919, +STORE, 140269474897920, 140269474902015, +STORE, 140723667836928, 140723667841023, +STORE, 140723667824640, 140723667836927, +STORE, 140269474861056, 140269474889727, +STORE, 140269474852864, 140269474861055, +STORE, 140269470535680, 140269472649215, +SNULL, 140269470535680, 140269470547967, +STORE, 140269470547968, 140269472649215, +STORE, 140269470535680, 140269470547967, +SNULL, 140269472641023, 140269472649215, +STORE, 140269470547968, 140269472641023, +STORE, 140269472641024, 140269472649215, +ERASE, 140269472641024, 140269472649215, +STORE, 140269472641024, 140269472649215, +STORE, 140269466738688, 140269470535679, +SNULL, 140269466738688, 140269468397567, +STORE, 140269468397568, 140269470535679, +STORE, 140269466738688, 140269468397567, +SNULL, 140269470494719, 140269470535679, +STORE, 140269468397568, 140269470494719, +STORE, 140269470494720, 140269470535679, +SNULL, 140269470494720, 140269470519295, +STORE, 140269470519296, 140269470535679, +STORE, 140269470494720, 140269470519295, +ERASE, 140269470494720, 140269470519295, +STORE, 140269470494720, 140269470519295, +ERASE, 140269470519296, 140269470535679, +STORE, 140269470519296, 140269470535679, +STORE, 140269474844672, 140269474861055, +SNULL, 140269470511103, 140269470519295, +STORE, 140269470494720, 140269470511103, +STORE, 140269470511104, 140269470519295, +SNULL, 140269472645119, 140269472649215, +STORE, 140269472641024, 140269472645119, +STORE, 140269472645120, 140269472649215, +SNULL, 94231601115135, 94231601123327, +STORE, 94231601111040, 94231601115135, +STORE, 94231601115136, 94231601123327, +SNULL, 140269474893823, 140269474897919, +STORE, 140269474889728, 140269474893823, +STORE, 140269474893824, 140269474897919, +ERASE, 140269474861056, 140269474889727, +STORE, 94231626592256, 94231626727423, +STORE, 140269473161216, 140269474844671, +STORE, 94231626592256, 94231626862591, 
+STORE, 94231626592256, 94231626997759, +STORE, 94327178862592, 94327179075583, +STORE, 94327181172736, 94327181176831, +STORE, 94327181176832, 94327181185023, +STORE, 94327181185024, 94327181197311, +STORE, 94327185715200, 94327186685951, +STORE, 140172071755776, 140172073414655, +STORE, 140172073414656, 140172075511807, +STORE, 140172075511808, 140172075528191, +STORE, 140172075528192, 140172075536383, +STORE, 140172075536384, 140172075552767, +STORE, 140172075552768, 140172075565055, +STORE, 140172075565056, 140172077658111, +STORE, 140172077658112, 140172077662207, +STORE, 140172077662208, 140172077666303, +STORE, 140172077666304, 140172077809663, +STORE, 140172078178304, 140172079861759, +STORE, 140172079861760, 140172079878143, +STORE, 140172079878144, 140172079906815, +STORE, 140172079906816, 140172079910911, +STORE, 140172079910912, 140172079915007, +STORE, 140172079915008, 140172079919103, +STORE, 140720358359040, 140720358494207, +STORE, 140720358498304, 140720358510591, +STORE, 140720358510592, 140720358514687, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140722548621312, 140737488351231, +SNULL, 140722548629503, 140737488351231, +STORE, 140722548621312, 140722548629503, +STORE, 140722548490240, 140722548629503, +STORE, 93949289504768, 93949291728895, +SNULL, 93949289615359, 93949291728895, +STORE, 93949289504768, 93949289615359, +STORE, 93949289615360, 93949291728895, +ERASE, 93949289615360, 93949291728895, +STORE, 93949291708416, 93949291720703, +STORE, 93949291720704, 93949291728895, +STORE, 140305861902336, 140305864155135, +SNULL, 140305862045695, 140305864155135, +STORE, 140305861902336, 140305862045695, +STORE, 140305862045696, 140305864155135, +ERASE, 140305862045696, 140305864155135, +STORE, 140305864142848, 140305864151039, +STORE, 140305864151040, 140305864155135, +STORE, 140722549821440, 140722549825535, +STORE, 140722549809152, 140722549821439, +STORE, 140305864114176, 140305864142847, +STORE, 
140305864105984, 140305864114175, +STORE, 140305858105344, 140305861902335, +SNULL, 140305858105344, 140305859764223, +STORE, 140305859764224, 140305861902335, +STORE, 140305858105344, 140305859764223, +SNULL, 140305861861375, 140305861902335, +STORE, 140305859764224, 140305861861375, +STORE, 140305861861376, 140305861902335, +SNULL, 140305861861376, 140305861885951, +STORE, 140305861885952, 140305861902335, +STORE, 140305861861376, 140305861885951, +ERASE, 140305861861376, 140305861885951, +STORE, 140305861861376, 140305861885951, +ERASE, 140305861885952, 140305861902335, +STORE, 140305861885952, 140305861902335, +SNULL, 140305861877759, 140305861885951, +STORE, 140305861861376, 140305861877759, +STORE, 140305861877760, 140305861885951, +SNULL, 93949291716607, 93949291720703, +STORE, 93949291708416, 93949291716607, +STORE, 93949291716608, 93949291720703, +SNULL, 140305864146943, 140305864151039, +STORE, 140305864142848, 140305864146943, +STORE, 140305864146944, 140305864151039, +ERASE, 140305864114176, 140305864142847, +STORE, 93949324136448, 93949324271615, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140725754908672, 140737488351231, +SNULL, 140725754916863, 140737488351231, +STORE, 140725754908672, 140725754916863, +STORE, 140725754777600, 140725754916863, +STORE, 94831184375808, 94831186599935, +SNULL, 94831184486399, 94831186599935, +STORE, 94831184375808, 94831184486399, +STORE, 94831184486400, 94831186599935, +ERASE, 94831184486400, 94831186599935, +STORE, 94831186579456, 94831186591743, +STORE, 94831186591744, 94831186599935, +STORE, 140605482479616, 140605484732415, +SNULL, 140605482622975, 140605484732415, +STORE, 140605482479616, 140605482622975, +STORE, 140605482622976, 140605484732415, +ERASE, 140605482622976, 140605484732415, +STORE, 140605484720128, 140605484728319, +STORE, 140605484728320, 140605484732415, +STORE, 140725755670528, 140725755674623, +STORE, 140725755658240, 140725755670527, +STORE, 
140605484691456, 140605484720127, +STORE, 140605484683264, 140605484691455, +STORE, 140605478682624, 140605482479615, +SNULL, 140605478682624, 140605480341503, +STORE, 140605480341504, 140605482479615, +STORE, 140605478682624, 140605480341503, +SNULL, 140605482438655, 140605482479615, +STORE, 140605480341504, 140605482438655, +STORE, 140605482438656, 140605482479615, +SNULL, 140605482438656, 140605482463231, +STORE, 140605482463232, 140605482479615, +STORE, 140605482438656, 140605482463231, +ERASE, 140605482438656, 140605482463231, +STORE, 140605482438656, 140605482463231, +ERASE, 140605482463232, 140605482479615, +STORE, 140605482463232, 140605482479615, +SNULL, 140605482455039, 140605482463231, +STORE, 140605482438656, 140605482455039, +STORE, 140605482455040, 140605482463231, +SNULL, 94831186587647, 94831186591743, +STORE, 94831186579456, 94831186587647, +STORE, 94831186587648, 94831186591743, +SNULL, 140605484724223, 140605484728319, +STORE, 140605484720128, 140605484724223, +STORE, 140605484724224, 140605484728319, +ERASE, 140605484691456, 140605484720127, +STORE, 94831217156096, 94831217291263, +STORE, 94327178862592, 94327179075583, +STORE, 94327181172736, 94327181176831, +STORE, 94327181176832, 94327181185023, +STORE, 94327181185024, 94327181197311, +STORE, 94327185715200, 94327186685951, +STORE, 140172071755776, 140172073414655, +STORE, 140172073414656, 140172075511807, +STORE, 140172075511808, 140172075528191, +STORE, 140172075528192, 140172075536383, +STORE, 140172075536384, 140172075552767, +STORE, 140172075552768, 140172075565055, +STORE, 140172075565056, 140172077658111, +STORE, 140172077658112, 140172077662207, +STORE, 140172077662208, 140172077666303, +STORE, 140172077666304, 140172077809663, +STORE, 140172078178304, 140172079861759, +STORE, 140172079861760, 140172079878143, +STORE, 140172079878144, 140172079906815, +STORE, 140172079906816, 140172079910911, +STORE, 140172079910912, 140172079915007, +STORE, 140172079915008, 140172079919103, +STORE, 
140720358359040, 140720358494207, +STORE, 140720358498304, 140720358510591, +STORE, 140720358510592, 140720358514687, +STORE, 140737488347136, 140737488351231, +STORE, 140737488343040, 140737488351231, +STORE, 140737488338944, 140737488351231, +STORE, 140734529933312, 140737488351231, +SNULL, 140734529945599, 140737488351231, +STORE, 140734529933312, 140734529945599, +STORE, 140734529802240, 140734529945599, +STORE, 4194304, 26279935, +STORE, 28372992, 28454911, +STORE, 28454912, 29806591, +STORE, 140249744060416, 140249746313215, +SNULL, 140249744203775, 140249746313215, +STORE, 140249744060416, 140249744203775, +STORE, 140249744203776, 140249746313215, +ERASE, 140249744203776, 140249746313215, +STORE, 140249746300928, 140249746309119, +STORE, 140249746309120, 140249746313215, +STORE, 140734530174976, 140734530179071, +STORE, 140734530162688, 140734530174975, +STORE, 140249746272256, 140249746300927, +STORE, 140249746264064, 140249746272255, +STORE, 140249740226560, 140249744060415, +SNULL, 140249740226560, 140249741934591, +STORE, 140249741934592, 140249744060415, +STORE, 140249740226560, 140249741934591, +SNULL, 140249744027647, 140249744060415, +STORE, 140249741934592, 140249744027647, +STORE, 140249744027648, 140249744060415, +ERASE, 140249744027648, 140249744060415, +STORE, 140249744027648, 140249744060415, +STORE, 140249738031104, 140249740226559, +SNULL, 140249738031104, 140249738125311, +STORE, 140249738125312, 140249740226559, +STORE, 140249738031104, 140249738125311, +SNULL, 140249740218367, 140249740226559, +STORE, 140249738125312, 140249740218367, +STORE, 140249740218368, 140249740226559, +ERASE, 140249740218368, 140249740226559, +STORE, 140249740218368, 140249740226559, +STORE, 140249735512064, 140249738031103, +SNULL, 140249735512064, 140249735925759, +STORE, 140249735925760, 140249738031103, +STORE, 140249735512064, 140249735925759, +SNULL, 140249738018815, 140249738031103, +STORE, 140249735925760, 140249738018815, +STORE, 140249738018816, 
140249738031103, +ERASE, 140249738018816, 140249738031103, +STORE, 140249738018816, 140249738031103, +STORE, 140249732878336, 140249735512063, +SNULL, 140249732878336, 140249733406719, +STORE, 140249733406720, 140249735512063, +STORE, 140249732878336, 140249733406719, +SNULL, 140249735503871, 140249735512063, +STORE, 140249733406720, 140249735503871, +STORE, 140249735503872, 140249735512063, +ERASE, 140249735503872, 140249735512063, +STORE, 140249735503872, 140249735512063, +STORE, 140249730764800, 140249732878335, +SNULL, 140249730764800, 140249730777087, +STORE, 140249730777088, 140249732878335, +STORE, 140249730764800, 140249730777087, +SNULL, 140249732870143, 140249732878335, +STORE, 140249730777088, 140249732870143, +STORE, 140249732870144, 140249732878335, +ERASE, 140249732870144, 140249732878335, +STORE, 140249732870144, 140249732878335, +STORE, 140249728561152, 140249730764799, +SNULL, 140249728561152, 140249728663551, +STORE, 140249728663552, 140249730764799, +STORE, 140249728561152, 140249728663551, +SNULL, 140249730756607, 140249730764799, +STORE, 140249728663552, 140249730756607, +STORE, 140249730756608, 140249730764799, +ERASE, 140249730756608, 140249730764799, +STORE, 140249730756608, 140249730764799, +STORE, 140249746255872, 140249746272255, +STORE, 140249725399040, 140249728561151, +SNULL, 140249725399040, 140249726459903, +STORE, 140249726459904, 140249728561151, +STORE, 140249725399040, 140249726459903, +SNULL, 140249728552959, 140249728561151, +STORE, 140249726459904, 140249728552959, +STORE, 140249728552960, 140249728561151, +ERASE, 140249728552960, 140249728561151, +STORE, 140249728552960, 140249728561151, +STORE, 140249721602048, 140249725399039, +SNULL, 140249721602048, 140249723260927, +STORE, 140249723260928, 140249725399039, +STORE, 140249721602048, 140249723260927, +SNULL, 140249725358079, 140249725399039, +STORE, 140249723260928, 140249725358079, +STORE, 140249725358080, 140249725399039, +SNULL, 140249725358080, 140249725382655, +STORE, 
140249725382656, 140249725399039, +STORE, 140249725358080, 140249725382655, +ERASE, 140249725358080, 140249725382655, +STORE, 140249725358080, 140249725382655, +ERASE, 140249725382656, 140249725399039, +STORE, 140249725382656, 140249725399039, +STORE, 140249746243584, 140249746272255, +SNULL, 140249725374463, 140249725382655, +STORE, 140249725358080, 140249725374463, +STORE, 140249725374464, 140249725382655, +SNULL, 140249728557055, 140249728561151, +STORE, 140249728552960, 140249728557055, +STORE, 140249728557056, 140249728561151, +SNULL, 140249730760703, 140249730764799, +STORE, 140249730756608, 140249730760703, +STORE, 140249730760704, 140249730764799, +SNULL, 140249732874239, 140249732878335, +STORE, 140249732870144, 140249732874239, +STORE, 140249732874240, 140249732878335, +SNULL, 140249735507967, 140249735512063, +STORE, 140249735503872, 140249735507967, +STORE, 140249735507968, 140249735512063, +SNULL, 140249738027007, 140249738031103, +STORE, 140249738018816, 140249738027007, +STORE, 140249738027008, 140249738031103, +SNULL, 140249740222463, 140249740226559, +STORE, 140249740218368, 140249740222463, +STORE, 140249740222464, 140249740226559, +SNULL, 140249744031743, 140249744060415, +STORE, 140249744027648, 140249744031743, +STORE, 140249744031744, 140249744060415, +SNULL, 28405759, 28454911, +STORE, 28372992, 28405759, +STORE, 28405760, 28454911, +SNULL, 140249746305023, 140249746309119, +STORE, 140249746300928, 140249746305023, +STORE, 140249746305024, 140249746309119, +ERASE, 140249746272256, 140249746300927, +STORE, 33853440, 33988607, +STORE, 140249744560128, 140249746243583, +STORE, 140249746296832, 140249746300927, +STORE, 140249744424960, 140249744560127, +STORE, 33853440, 34131967, +STORE, 140249719504896, 140249721602047, +STORE, 140249746288640, 140249746300927, +STORE, 140249746280448, 140249746300927, +STORE, 140249746243584, 140249746280447, +STORE, 140249744408576, 140249744560127, +STORE, 33853440, 34267135, +STORE, 33853440, 34422783, 
+STORE, 140249744400384, 140249744560127, +STORE, 140249744392192, 140249744560127, +STORE, 33853440, 34557951, +STORE, 33853440, 34693119, +STORE, 140249744375808, 140249744560127, +STORE, 140249744367616, 140249744560127, +STORE, 33853440, 34832383, +STORE, 140249719230464, 140249721602047, +STORE, 140249744207872, 140249744560127, +STORE, 33853440, 34971647, +SNULL, 34963455, 34971647, +STORE, 33853440, 34963455, +STORE, 34963456, 34971647, +ERASE, 34963456, 34971647, +SNULL, 34955263, 34963455, +STORE, 33853440, 34955263, +STORE, 34955264, 34963455, +ERASE, 34955264, 34963455, +SNULL, 34947071, 34955263, +STORE, 33853440, 34947071, +STORE, 34947072, 34955263, +ERASE, 34947072, 34955263, +SNULL, 34938879, 34947071, +STORE, 33853440, 34938879, +STORE, 34938880, 34947071, +ERASE, 34938880, 34947071, +STORE, 140249719214080, 140249721602047, +STORE, 140249719148544, 140249721602047, +STORE, 140249719115776, 140249721602047, +STORE, 140249717018624, 140249721602047, +STORE, 140249716953088, 140249721602047, +STORE, 33853440, 35086335, +STORE, 140249716822016, 140249721602047, +STORE, 140249716559872, 140249721602047, +STORE, 140249716551680, 140249721602047, +STORE, 140249716535296, 140249721602047, +STORE, 140249716527104, 140249721602047, +STORE, 140249716518912, 140249721602047, +STORE, 33853440, 35221503, +SNULL, 35213311, 35221503, +STORE, 33853440, 35213311, +STORE, 35213312, 35221503, +ERASE, 35213312, 35221503, +SNULL, 35205119, 35213311, +STORE, 33853440, 35205119, +STORE, 35205120, 35213311, +ERASE, 35205120, 35213311, +SNULL, 35192831, 35205119, +STORE, 33853440, 35192831, +STORE, 35192832, 35205119, +ERASE, 35192832, 35205119, +SNULL, 35176447, 35192831, +STORE, 33853440, 35176447, +STORE, 35176448, 35192831, +ERASE, 35176448, 35192831, +STORE, 140249716502528, 140249721602047, +STORE, 33853440, 35311615, +SNULL, 35307519, 35311615, +STORE, 33853440, 35307519, +STORE, 35307520, 35311615, +ERASE, 35307520, 35311615, +SNULL, 35303423, 35307519, +STORE, 
33853440, 35303423, +STORE, 35303424, 35307519, +ERASE, 35303424, 35307519, +SNULL, 35299327, 35303423, +STORE, 33853440, 35299327, +STORE, 35299328, 35303423, +ERASE, 35299328, 35303423, +SNULL, 35295231, 35299327, +STORE, 33853440, 35295231, +STORE, 35295232, 35299327, +ERASE, 35295232, 35299327, +SNULL, 35291135, 35295231, +STORE, 33853440, 35291135, +STORE, 35291136, 35295231, +ERASE, 35291136, 35295231, +SNULL, 35287039, 35291135, +STORE, 33853440, 35287039, +STORE, 35287040, 35291135, +ERASE, 35287040, 35291135, +SNULL, 35282943, 35287039, +STORE, 33853440, 35282943, +STORE, 35282944, 35287039, +ERASE, 35282944, 35287039, +STORE, 140249716486144, 140249721602047, +STORE, 140249716453376, 140249721602047, +STORE, 33853440, 35418111, +SNULL, 35401727, 35418111, +STORE, 33853440, 35401727, +STORE, 35401728, 35418111, +ERASE, 35401728, 35418111, +SNULL, 35389439, 35401727, +STORE, 33853440, 35389439, +STORE, 35389440, 35401727, +ERASE, 35389440, 35401727, +STORE, 140249714356224, 140249721602047, +STORE, 33853440, 35540991, +STORE, 140249714339840, 140249721602047, +STORE, 140249714077696, 140249721602047, +STORE, 140249714069504, 140249721602047, +STORE, 140249714061312, 140249721602047, +STORE, 33853440, 35680255, +SNULL, 35672063, 35680255, +STORE, 33853440, 35672063, +STORE, 35672064, 35680255, +ERASE, 35672064, 35680255, +SNULL, 35627007, 35672063, +STORE, 33853440, 35627007, +STORE, 35627008, 35672063, +ERASE, 35627008, 35672063, +STORE, 140249711964160, 140249721602047, +STORE, 33853440, 35762175, +SNULL, 35753983, 35762175, +STORE, 33853440, 35753983, +STORE, 35753984, 35762175, +ERASE, 35753984, 35762175, +SNULL, 35745791, 35753983, +STORE, 33853440, 35745791, +STORE, 35745792, 35753983, +ERASE, 35745792, 35753983, +STORE, 140249711955968, 140249721602047, +STORE, 140249711947776, 140249721602047, +STORE, 140249710899200, 140249721602047, +STORE, 140249710866432, 140249721602047, +STORE, 140249710600192, 140249721602047, +SNULL, 140249744424959, 
140249744560127, +STORE, 140249744207872, 140249744424959, +STORE, 140249744424960, 140249744560127, +ERASE, 140249744424960, 140249744560127, +STORE, 140249708503040, 140249721602047, +STORE, 33853440, 35885055, +STORE, 140249707978752, 140249721602047, +STORE, 140249705881600, 140249721602047, +STORE, 33853440, 36036607, +STORE, 33853440, 36175871, +STORE, 140249744551936, 140249744560127, +STORE, 140249744543744, 140249744560127, +STORE, 140249744535552, 140249744560127, +STORE, 140249744527360, 140249744560127, +STORE, 140249744519168, 140249744560127, +STORE, 140249705619456, 140249721602047, +STORE, 140249744510976, 140249744560127, +STORE, 140249744502784, 140249744560127, +STORE, 140249744494592, 140249744560127, +STORE, 140249744486400, 140249744560127, +STORE, 140249744478208, 140249744560127, +STORE, 140249744470016, 140249744560127, +STORE, 140249744461824, 140249744560127, +STORE, 140249744453632, 140249744560127, +STORE, 140249744445440, 140249744560127, +STORE, 140249744437248, 140249744560127, +STORE, 140249744429056, 140249744560127, +STORE, 140249703522304, 140249721602047, +STORE, 33853440, 36311039, +STORE, 140249703489536, 140249721602047, +STORE, 33853440, 36474879, +STORE, 140249703456768, 140249721602047, +STORE, 33853440, 36622335, +STORE, 140249703424000, 140249721602047, +STORE, 140249703391232, 140249721602047, +STORE, 33853440, 36810751, +STORE, 140249703358464, 140249721602047, +STORE, 140249703325696, 140249721602047, +SNULL, 36655103, 36810751, +STORE, 33853440, 36655103, +STORE, 36655104, 36810751, +ERASE, 36655104, 36810751, +SNULL, 36438015, 36655103, +STORE, 33853440, 36438015, +STORE, 36438016, 36655103, +ERASE, 36438016, 36655103, +STORE, 140249703317504, 140249721602047, +STORE, 140249701220352, 140249721602047, +STORE, 33853440, 36585471, +STORE, 33853440, 36782079, +STORE, 140249701212160, 140249721602047, +STORE, 140249701203968, 140249721602047, +STORE, 140249701195776, 140249721602047, +STORE, 140249701187584, 
140249721602047, +STORE, 140249701179392, 140249721602047, +STORE, 140249701171200, 140249721602047, +STORE, 140249701163008, 140249721602047, +STORE, 140249701154816, 140249721602047, +STORE, 140249701146624, 140249721602047, +STORE, 140249701138432, 140249721602047, +STORE, 140249701130240, 140249721602047, +STORE, 140249700081664, 140249721602047, +STORE, 140249700073472, 140249721602047, +STORE, 33853440, 36978687, +STORE, 140249697976320, 140249721602047, +STORE, 33853440, 37240831, +STORE, 140249695879168, 140249721602047, +STORE, 140249695870976, 140249721602047, +STORE, 140249695862784, 140249721602047, +STORE, 140249695854592, 140249721602047, +STORE, 140249695326208, 140249721602047, +SNULL, 140249710600191, 140249721602047, +STORE, 140249695326208, 140249710600191, +STORE, 140249710600192, 140249721602047, +SNULL, 140249710600192, 140249710866431, +STORE, 140249710866432, 140249721602047, +STORE, 140249710600192, 140249710866431, +ERASE, 140249710600192, 140249710866431, +STORE, 140249691131904, 140249710600191, +STORE, 33853440, 37474303, +STORE, 140249710858240, 140249721602047, +STORE, 140249710850048, 140249721602047, +STORE, 140249710841856, 140249721602047, +STORE, 140249710833664, 140249721602047, +STORE, 140249710825472, 140249721602047, +STORE, 140249710817280, 140249721602047, +STORE, 140249710809088, 140249721602047, +STORE, 140249710800896, 140249721602047, +STORE, 140249710792704, 140249721602047, +STORE, 140249710784512, 140249721602047, +STORE, 140249710776320, 140249721602047, +STORE, 140249710768128, 140249721602047, +STORE, 140249710759936, 140249721602047, +STORE, 140249710751744, 140249721602047, +STORE, 140249710743552, 140249721602047, +STORE, 140249710735360, 140249721602047, +STORE, 140249689034752, 140249710600191, +STORE, 140249710727168, 140249721602047, +STORE, 140249686937600, 140249710600191, +STORE, 33853440, 37867519, +STORE, 140249684840448, 140249710600191, +STORE, 140249710718976, 140249721602047, +STORE, 
140249682743296, 140249710600191, +STORE, 140249710710784, 140249721602047, +STORE, 140249710702592, 140249721602047, +STORE, 140249710694400, 140249721602047, +STORE, 140249710686208, 140249721602047, +STORE, 140249710678016, 140249721602047, +STORE, 140249682612224, 140249710600191, +STORE, 140249682087936, 140249710600191, +SNULL, 140249705619455, 140249710600191, +STORE, 140249682087936, 140249705619455, +STORE, 140249705619456, 140249710600191, +SNULL, 140249705619456, 140249705881599, +STORE, 140249705881600, 140249710600191, +STORE, 140249705619456, 140249705881599, +ERASE, 140249705619456, 140249705881599, +STORE, 140249679990784, 140249705619455, +STORE, 140249710669824, 140249721602047, +STORE, 140249677893632, 140249705619455, +STORE, 140249710653440, 140249721602047, +STORE, 140249710645248, 140249721602047, +STORE, 140249710637056, 140249721602047, +STORE, 140249710628864, 140249721602047, +STORE, 140249710620672, 140249721602047, +STORE, 140249710612480, 140249721602047, +STORE, 140249710604288, 140249721602047, +STORE, 140249705873408, 140249710600191, +STORE, 140249705865216, 140249710600191, +STORE, 140249705857024, 140249710600191, +STORE, 140249705848832, 140249710600191, +STORE, 140249705840640, 140249710600191, +STORE, 140249705832448, 140249710600191, +STORE, 140249705824256, 140249710600191, +STORE, 140249705816064, 140249710600191, +STORE, 140249705807872, 140249710600191, +STORE, 140249705799680, 140249710600191, +STORE, 33853440, 38129663, +SNULL, 140249744207872, 140249744367615, +STORE, 140249744367616, 140249744424959, +STORE, 140249744207872, 140249744367615, +ERASE, 140249744207872, 140249744367615, +STORE, 140249677606912, 140249705619455, +STORE, 140249675509760, 140249705619455, +SNULL, 140249677606911, 140249705619455, +STORE, 140249675509760, 140249677606911, +STORE, 140249677606912, 140249705619455, +SNULL, 140249677606912, 140249677893631, +STORE, 140249677893632, 140249705619455, +STORE, 140249677606912, 140249677893631, 
+ERASE, 140249677606912, 140249677893631, +STORE, 140249744359424, 140249744424959, +STORE, 33853440, 38391807, +STORE, 140249674981376, 140249677606911, +STORE, 140249672884224, 140249677606911, +SNULL, 140249719230463, 140249721602047, +STORE, 140249710604288, 140249719230463, +STORE, 140249719230464, 140249721602047, +SNULL, 140249719230464, 140249719504895, +STORE, 140249719504896, 140249721602047, +STORE, 140249719230464, 140249719504895, +ERASE, 140249719230464, 140249719504895, +STORE, 140249744351232, 140249744424959, +STORE, 140249744343040, 140249744424959, +STORE, 140249744334848, 140249744424959, +STORE, 140249744326656, 140249744424959, +STORE, 140249744310272, 140249744424959, +STORE, 140249744302080, 140249744424959, +STORE, 140249744285696, 140249744424959, +STORE, 140249744277504, 140249744424959, +STORE, 140249744261120, 140249744424959, +STORE, 140249744252928, 140249744424959, +STORE, 140249744220160, 140249744424959, +STORE, 140249744211968, 140249744424959, +STORE, 140249719488512, 140249721602047, +STORE, 140249744203776, 140249744424959, +STORE, 140249719472128, 140249721602047, +STORE, 140249719463936, 140249721602047, +STORE, 140249719447552, 140249721602047, +STORE, 140249719439360, 140249721602047, +STORE, 140249719406592, 140249721602047, +STORE, 140249719398400, 140249721602047, +STORE, 140249719382016, 140249721602047, +STORE, 140249719373824, 140249721602047, +STORE, 140249719357440, 140249721602047, +STORE, 140249719349248, 140249721602047, +STORE, 140249719332864, 140249721602047, +STORE, 140249719324672, 140249721602047, +STORE, 140249719291904, 140249721602047, +STORE, 140249719283712, 140249721602047, +STORE, 140249719267328, 140249721602047, +STORE, 140249719259136, 140249721602047, +STORE, 140249719242752, 140249721602047, +STORE, 140249719234560, 140249721602047, +STORE, 140249705783296, 140249710600191, +STORE, 140249705775104, 140249710600191, +STORE, 140249705742336, 140249710600191, +STORE, 140249705734144, 
140249710600191, +STORE, 140249705717760, 140249710600191, +STORE, 140249670787072, 140249677606911, +STORE, 140249705709568, 140249710600191, +STORE, 140249705693184, 140249710600191, +STORE, 140249705684992, 140249710600191, +STORE, 140249705668608, 140249710600191, +STORE, 140249705660416, 140249710600191, +STORE, 140249705627648, 140249710600191, +STORE, 140249677893632, 140249710600191, +STORE, 140249677877248, 140249710600191, +STORE, 140249677869056, 140249710600191, +STORE, 140249677852672, 140249710600191, +STORE, 140249677844480, 140249710600191, +STORE, 140249677828096, 140249710600191, +STORE, 140249668689920, 140249677606911, +STORE, 140249677819904, 140249710600191, +STORE, 140249677787136, 140249710600191, +STORE, 140249677778944, 140249710600191, +STORE, 140249677762560, 140249710600191, +STORE, 140249677754368, 140249710600191, +STORE, 140249677737984, 140249710600191, +STORE, 140249677729792, 140249710600191, +STORE, 140249677713408, 140249710600191, +STORE, 140249677705216, 140249710600191, +STORE, 140249677672448, 140249710600191, +STORE, 140249677664256, 140249710600191, +STORE, 140249677647872, 140249710600191, +STORE, 140249677639680, 140249710600191, +STORE, 140249677623296, 140249710600191, +STORE, 140249677615104, 140249710600191, +STORE, 140249668673536, 140249677606911, +STORE, 140249668673536, 140249710600191, +STORE, 140249668640768, 140249710600191, +STORE, 140249668632576, 140249710600191, +STORE, 140249668616192, 140249710600191, +STORE, 140249668608000, 140249710600191, +STORE, 140249668591616, 140249710600191, +STORE, 140249668583424, 140249710600191, +STORE, 140249668567040, 140249710600191, +STORE, 140249668558848, 140249710600191, +STORE, 140249668526080, 140249710600191, +STORE, 140249668517888, 140249710600191, +STORE, 140249668501504, 140249710600191, +STORE, 140249668493312, 140249710600191, +STORE, 140249668476928, 140249710600191, +STORE, 140249668468736, 140249710600191, +STORE, 140249668452352, 140249710600191, +STORE, 
140249668444160, 140249710600191, +STORE, 140249668411392, 140249710600191, +STORE, 140249668403200, 140249710600191, +STORE, 140249668386816, 140249710600191, +STORE, 140249668378624, 140249710600191, +STORE, 140249668362240, 140249710600191, +STORE, 140249668354048, 140249710600191, +STORE, 140249668337664, 140249710600191, +STORE, 140249668329472, 140249710600191, +STORE, 140249668296704, 140249710600191, +STORE, 140249668288512, 140249710600191, +STORE, 140249668272128, 140249710600191, +STORE, 140249668263936, 140249710600191, +STORE, 140249668247552, 140249710600191, +STORE, 140249668239360, 140249710600191, +STORE, 140249668222976, 140249710600191, +STORE, 140249668214784, 140249710600191, +STORE, 140249668182016, 140249710600191, +STORE, 140249668173824, 140249710600191, +STORE, 140249668157440, 140249710600191, +STORE, 140249668149248, 140249710600191, +STORE, 140249668132864, 140249710600191, +STORE, 140249668124672, 140249710600191, +STORE, 140249668108288, 140249710600191, +STORE, 140249668100096, 140249710600191, +STORE, 140249668067328, 140249710600191, +STORE, 140249668059136, 140249710600191, +STORE, 140249668042752, 140249710600191, +STORE, 140249668034560, 140249710600191, +STORE, 140249668018176, 140249710600191, +STORE, 140249668009984, 140249710600191, +STORE, 140249667993600, 140249710600191, +STORE, 140249667985408, 140249710600191, +STORE, 140249667952640, 140249710600191, +STORE, 140249667944448, 140249710600191, +STORE, 140249667928064, 140249710600191, +STORE, 140249667919872, 140249710600191, +STORE, 140249667903488, 140249710600191, +STORE, 140249667895296, 140249710600191, +STORE, 140249667878912, 140249710600191, +STORE, 140249667870720, 140249710600191, +STORE, 140249667837952, 140249710600191, +STORE, 140249667829760, 140249710600191, +STORE, 140249667813376, 140249710600191, +STORE, 140249667805184, 140249710600191, +STORE, 140249667788800, 140249710600191, +STORE, 140249667780608, 140249710600191, +STORE, 140249667764224, 
140249710600191, +STORE, 140249667756032, 140249710600191, +STORE, 140249667723264, 140249710600191, +STORE, 140249667715072, 140249710600191, +STORE, 140249667698688, 140249710600191, +STORE, 140249667690496, 140249710600191, +STORE, 140249667674112, 140249710600191, +STORE, 140249667665920, 140249710600191, +STORE, 140249667649536, 140249710600191, +STORE, 140249667641344, 140249710600191, +STORE, 140249667608576, 140249710600191, +STORE, 140249667600384, 140249710600191, +STORE, 140249667584000, 140249710600191, +STORE, 140249667575808, 140249710600191, +STORE, 140249667559424, 140249710600191, +STORE, 140249667551232, 140249710600191, +STORE, 140249667534848, 140249710600191, +STORE, 140249667526656, 140249710600191, +STORE, 140249667493888, 140249710600191, +STORE, 140249667485696, 140249710600191, +STORE, 140249667469312, 140249710600191, +STORE, 140249667461120, 140249710600191, +STORE, 140249667444736, 140249710600191, +STORE, 140249667436544, 140249710600191, +STORE, 140249667420160, 140249710600191, +STORE, 140249665323008, 140249710600191, +STORE, 140249665314816, 140249710600191, +STORE, 140249665282048, 140249710600191, +STORE, 140249665273856, 140249710600191, +STORE, 140249665257472, 140249710600191, +STORE, 140249665249280, 140249710600191, +STORE, 140249665232896, 140249710600191, +STORE, 140249665224704, 140249710600191, +STORE, 140249665208320, 140249710600191, +STORE, 140249665200128, 140249710600191, +STORE, 140249665167360, 140249710600191, +STORE, 140249665159168, 140249710600191, +STORE, 140249665142784, 140249710600191, +STORE, 140249665134592, 140249710600191, +STORE, 140249665118208, 140249710600191, +STORE, 140249665110016, 140249710600191, +STORE, 140249665093632, 140249710600191, +STORE, 140249665085440, 140249710600191, +STORE, 140249665052672, 140249710600191, +STORE, 140249665044480, 140249710600191, +STORE, 140249665028096, 140249710600191, +STORE, 140249665019904, 140249710600191, +STORE, 140249665003520, 140249710600191, +STORE, 
140249664995328, 140249710600191, +STORE, 140249664978944, 140249710600191, +STORE, 140249664970752, 140249710600191, +STORE, 140249664937984, 140249710600191, +STORE, 140249664929792, 140249710600191, +STORE, 140249664913408, 140249710600191, +STORE, 140249664905216, 140249710600191, +STORE, 140249664888832, 140249710600191, +STORE, 140249664880640, 140249710600191, +STORE, 140249664864256, 140249710600191, +STORE, 140249664856064, 140249710600191, +STORE, 140249664823296, 140249710600191, +STORE, 140249664815104, 140249710600191, +STORE, 140249664798720, 140249710600191, +STORE, 140249664790528, 140249710600191, +STORE, 140249664774144, 140249710600191, +STORE, 140249664765952, 140249710600191, +STORE, 140249664749568, 140249710600191, +STORE, 140249664741376, 140249710600191, +STORE, 140249664708608, 140249710600191, +STORE, 140249664700416, 140249710600191, +STORE, 140249664684032, 140249710600191, +STORE, 140249664675840, 140249710600191, +STORE, 140249664659456, 140249710600191, +STORE, 140249664651264, 140249710600191, +STORE, 140249664634880, 140249710600191, +STORE, 140249664626688, 140249710600191, +STORE, 140249664593920, 140249710600191, +STORE, 140249664585728, 140249710600191, +STORE, 140249664569344, 140249710600191, +STORE, 140249664561152, 140249710600191, +STORE, 140249664544768, 140249710600191, +STORE, 140249664536576, 140249710600191, +STORE, 140249664520192, 140249710600191, +STORE, 140249664512000, 140249710600191, +STORE, 140249664479232, 140249710600191, +STORE, 140249664471040, 140249710600191, +STORE, 140249664454656, 140249710600191, +STORE, 140249664446464, 140249710600191, +STORE, 140249664430080, 140249710600191, +STORE, 140249664421888, 140249710600191, +STORE, 140249664405504, 140249710600191, +STORE, 140249664397312, 140249710600191, +STORE, 140249664364544, 140249710600191, +STORE, 140249664356352, 140249710600191, +STORE, 140249664339968, 140249710600191, +STORE, 140249664331776, 140249710600191, +STORE, 140249664315392, 
140249710600191, +STORE, 140249664307200, 140249710600191, +STORE, 140249664290816, 140249710600191, +STORE, 140249664282624, 140249710600191, +STORE, 140249664249856, 140249710600191, +STORE, 140249664241664, 140249710600191, +STORE, 140249664225280, 140249710600191, +STORE, 140249664217088, 140249710600191, +STORE, 140249664200704, 140249710600191, +STORE, 140249664192512, 140249710600191, +STORE, 140249664176128, 140249710600191, +STORE, 140249664167936, 140249710600191, +STORE, 140249664135168, 140249710600191, +STORE, 140249664126976, 140249710600191, +STORE, 140249664110592, 140249710600191, +STORE, 140249664102400, 140249710600191, +STORE, 140249664086016, 140249710600191, +STORE, 140249664077824, 140249710600191, +STORE, 140249664061440, 140249710600191, +STORE, 140249664053248, 140249710600191, +STORE, 140249664020480, 140249710600191, +STORE, 140249664012288, 140249710600191, +STORE, 140249663995904, 140249710600191, +STORE, 140249663987712, 140249710600191, +STORE, 140249663971328, 140249710600191, +STORE, 140249663963136, 140249710600191, +STORE, 140249663946752, 140249710600191, +STORE, 140249663938560, 140249710600191, +STORE, 140249663905792, 140249710600191, +STORE, 140249663897600, 140249710600191, +STORE, 140249663881216, 140249710600191, +STORE, 140249663873024, 140249710600191, +STORE, 140249663856640, 140249710600191, +STORE, 140249663848448, 140249710600191, +STORE, 140249663832064, 140249710600191, +STORE, 140249663823872, 140249710600191, +STORE, 140249663791104, 140249710600191, +STORE, 140249663782912, 140249710600191, +STORE, 140249663766528, 140249710600191, +STORE, 140249663758336, 140249710600191, +STORE, 140249663741952, 140249710600191, +STORE, 140249663733760, 140249710600191, +STORE, 140249663717376, 140249710600191, +STORE, 140249663709184, 140249710600191, +STORE, 140249663676416, 140249710600191, +STORE, 140249663668224, 140249710600191, +STORE, 140249663651840, 140249710600191, +STORE, 140249663643648, 140249710600191, +STORE, 
140249663627264, 140249710600191, +STORE, 33853440, 38526975, +STORE, 140249663619072, 140249710600191, +STORE, 140249663602688, 140249710600191, +STORE, 140249661505536, 140249710600191, +STORE, 140249661497344, 140249710600191, +STORE, 140249661464576, 140249710600191, +STORE, 140249661456384, 140249710600191, +STORE, 140249661440000, 140249710600191, +STORE, 140249661431808, 140249710600191, +STORE, 140249661415424, 140249710600191, +STORE, 140249661407232, 140249710600191, +STORE, 140249661390848, 140249710600191, +STORE, 140249661382656, 140249710600191, +STORE, 140249661349888, 140249710600191, +STORE, 140249661341696, 140249710600191, +STORE, 140249661325312, 140249710600191, +STORE, 140249661317120, 140249710600191, +STORE, 140249661300736, 140249710600191, +STORE, 140249661292544, 140249710600191, +STORE, 140249661276160, 140249710600191, +STORE, 140249661267968, 140249710600191, +STORE, 140249661235200, 140249710600191, +STORE, 140249661227008, 140249710600191, +STORE, 140249661210624, 140249710600191, +STORE, 140249661202432, 140249710600191, +STORE, 140249661186048, 140249710600191, +STORE, 140249661177856, 140249710600191, +STORE, 140249661161472, 140249710600191, +STORE, 140249661153280, 140249710600191, +STORE, 140249661120512, 140249710600191, +STORE, 140249661112320, 140249710600191, +STORE, 140249661095936, 140249710600191, +STORE, 140249661087744, 140249710600191, +STORE, 140249661071360, 140249710600191, +STORE, 140249661063168, 140249710600191, +STORE, 140249661046784, 140249710600191, +STORE, 140249661038592, 140249710600191, +STORE, 140249661005824, 140249710600191, +STORE, 140249660997632, 140249710600191, +STORE, 140249660981248, 140249710600191, +STORE, 140249660973056, 140249710600191, +STORE, 140249660956672, 140249710600191, +STORE, 140249660948480, 140249710600191, +STORE, 140249660932096, 140249710600191, +STORE, 140249660923904, 140249710600191, +STORE, 140249660891136, 140249710600191, +STORE, 140249660882944, 140249710600191, 
+STORE, 140249660866560, 140249710600191, +STORE, 140249660858368, 140249710600191, +STORE, 140249660841984, 140249710600191, +STORE, 140249660833792, 140249710600191, +STORE, 140249660817408, 140249710600191, +STORE, 140249660809216, 140249710600191, +STORE, 140249660776448, 140249710600191, +STORE, 140249660768256, 140249710600191, +STORE, 140249660751872, 140249710600191, +STORE, 140249660743680, 140249710600191, +STORE, 140249660727296, 140249710600191, +STORE, 140249660719104, 140249710600191, +STORE, 140249660702720, 140249710600191, +STORE, 140249660694528, 140249710600191, +STORE, 140249660661760, 140249710600191, +STORE, 140249660653568, 140249710600191, +STORE, 140249660637184, 140249710600191, +STORE, 140249660628992, 140249710600191, +STORE, 140249660612608, 140249710600191, +STORE, 140249660604416, 140249710600191, +STORE, 140249660588032, 140249710600191, +STORE, 140249660579840, 140249710600191, +STORE, 140249660547072, 140249710600191, +STORE, 140249660538880, 140249710600191, +STORE, 140249660522496, 140249710600191, +STORE, 140249660514304, 140249710600191, +STORE, 140249660497920, 140249710600191, +STORE, 140249660489728, 140249710600191, +STORE, 140249660473344, 140249710600191, +STORE, 140249660465152, 140249710600191, +STORE, 140249660432384, 140249710600191, +STORE, 140249660424192, 140249710600191, +STORE, 140249660407808, 140249710600191, +STORE, 140249660399616, 140249710600191, +STORE, 140249660383232, 140249710600191, +STORE, 140249660375040, 140249710600191, +STORE, 140249660358656, 140249710600191, +STORE, 140249660350464, 140249710600191, +STORE, 140249660317696, 140249710600191, +STORE, 140249660309504, 140249710600191, +STORE, 140249660293120, 140249710600191, +STORE, 140249660284928, 140249710600191, +STORE, 140249660268544, 140249710600191, +STORE, 140249660260352, 140249710600191, +STORE, 140249660243968, 140249710600191, +STORE, 140249660235776, 140249710600191, +STORE, 140249660203008, 140249710600191, +STORE, 140249660194816, 
140249710600191, +STORE, 140249660178432, 140249710600191, +STORE, 140249660170240, 140249710600191, +STORE, 140249660153856, 140249710600191, +STORE, 140249660145664, 140249710600191, +STORE, 140249660129280, 140249710600191, +STORE, 140249660121088, 140249710600191, +STORE, 140249660088320, 140249710600191, +STORE, 140249660080128, 140249710600191, +STORE, 140249660063744, 140249710600191, +STORE, 140249660055552, 140249710600191, +STORE, 140249660039168, 140249710600191, +STORE, 140249660030976, 140249710600191, +STORE, 140249660014592, 140249710600191, +STORE, 140249660006400, 140249710600191, +STORE, 140249659973632, 140249710600191, +STORE, 140249659965440, 140249710600191, +STORE, 140249659949056, 140249710600191, +STORE, 140249659940864, 140249710600191, +STORE, 140249659924480, 140249710600191, +STORE, 140249659916288, 140249710600191, +STORE, 140249659899904, 140249710600191, +STORE, 140249659891712, 140249710600191, +STORE, 140249659858944, 140249710600191, +STORE, 140249659850752, 140249710600191, +STORE, 140249659834368, 140249710600191, +STORE, 140249659826176, 140249710600191, +STORE, 140249659809792, 140249710600191, +STORE, 140249659801600, 140249710600191, +STORE, 140249659785216, 140249710600191, +STORE, 140249657688064, 140249710600191, +STORE, 140249657679872, 140249710600191, +STORE, 140249657647104, 140249710600191, +STORE, 140249657638912, 140249710600191, +STORE, 140249657622528, 140249710600191, +STORE, 140249657614336, 140249710600191, +STORE, 140249657597952, 140249710600191, +STORE, 140249657589760, 140249710600191, +STORE, 140249657573376, 140249710600191, +STORE, 140249657565184, 140249710600191, +STORE, 140249657532416, 140249710600191, +STORE, 140249657524224, 140249710600191, +STORE, 140249657507840, 140249710600191, +STORE, 140249657499648, 140249710600191, +STORE, 140249657483264, 140249710600191, +STORE, 140249657475072, 140249710600191, +STORE, 140249657458688, 140249710600191, +STORE, 140249657450496, 140249710600191, +STORE, 
140249657417728, 140249710600191, +STORE, 140249657409536, 140249710600191, +STORE, 140249657393152, 140249710600191, +STORE, 140249657384960, 140249710600191, +STORE, 140249657368576, 140249710600191, +STORE, 140249657360384, 140249710600191, +STORE, 140249657344000, 140249710600191, +STORE, 140249657335808, 140249710600191, +STORE, 140249657303040, 140249710600191, +STORE, 140249657294848, 140249710600191, +STORE, 140249657278464, 140249710600191, +STORE, 140249657270272, 140249710600191, +STORE, 140249657253888, 140249710600191, +STORE, 140249657245696, 140249710600191, +STORE, 140249657229312, 140249710600191, +STORE, 140249657221120, 140249710600191, +STORE, 140249657188352, 140249710600191, +STORE, 140249657180160, 140249710600191, +STORE, 140249657163776, 140249710600191, +STORE, 140249657155584, 140249710600191, +STORE, 140249657139200, 140249710600191, +STORE, 140249657131008, 140249710600191, +STORE, 140249657114624, 140249710600191, +STORE, 140249657106432, 140249710600191, +STORE, 140249657073664, 140249710600191, +STORE, 140249657065472, 140249710600191, +STORE, 140249657049088, 140249710600191, +STORE, 140249657040896, 140249710600191, +STORE, 140249657024512, 140249710600191, +STORE, 140249657016320, 140249710600191, +STORE, 140249656999936, 140249710600191, +STORE, 140249656991744, 140249710600191, +STORE, 140249656958976, 140249710600191, +STORE, 140249656950784, 140249710600191, +STORE, 140249656934400, 140249710600191, +STORE, 140249656926208, 140249710600191, +STORE, 140249656909824, 140249710600191, +STORE, 140249656901632, 140249710600191, +STORE, 140249656885248, 140249710600191, +STORE, 140249656877056, 140249710600191, +STORE, 140249656844288, 140249710600191, +STORE, 140249656836096, 140249710600191, +STORE, 140249656819712, 140249710600191, +STORE, 140249656811520, 140249710600191, +STORE, 140249656795136, 140249710600191, +STORE, 33853440, 38662143, +STORE, 140249656786944, 140249710600191, +STORE, 140249656770560, 140249710600191, 
+STORE, 140249656762368, 140249710600191, +STORE, 140249656729600, 140249710600191, +STORE, 140249656721408, 140249710600191, +STORE, 140249656705024, 140249710600191, +STORE, 140249656696832, 140249710600191, +STORE, 140249656680448, 140249710600191, +STORE, 140249656672256, 140249710600191, +STORE, 140249656655872, 140249710600191, +STORE, 140249656647680, 140249710600191, +STORE, 140249656614912, 140249710600191, +STORE, 140249656606720, 140249710600191, +STORE, 140249656590336, 140249710600191, +STORE, 140249656582144, 140249710600191, +STORE, 140249656565760, 140249710600191, +STORE, 140249656557568, 140249710600191, +STORE, 140249656541184, 140249710600191, +STORE, 140249656532992, 140249710600191, +STORE, 140249656500224, 140249710600191, +STORE, 140249656492032, 140249710600191, +STORE, 140249656475648, 140249710600191, +STORE, 140249656467456, 140249710600191, +STORE, 140249656451072, 140249710600191, +STORE, 140249656442880, 140249710600191, +STORE, 140249656426496, 140249710600191, +STORE, 140249656418304, 140249710600191, +STORE, 140249656385536, 140249710600191, +STORE, 140249656377344, 140249710600191, +STORE, 140249656360960, 140249710600191, +STORE, 140249656352768, 140249710600191, +STORE, 140249656336384, 140249710600191, +STORE, 140249656328192, 140249710600191, +STORE, 140249656311808, 140249710600191, +STORE, 140249656303616, 140249710600191, +STORE, 140249656270848, 140249710600191, +STORE, 140249656262656, 140249710600191, +STORE, 140249656246272, 140249710600191, +STORE, 140249656238080, 140249710600191, +STORE, 140249656221696, 140249710600191, +STORE, 140249656213504, 140249710600191, +STORE, 140249656197120, 140249710600191, +STORE, 140249656188928, 140249710600191, +STORE, 140249656156160, 140249710600191, +STORE, 140249656147968, 140249710600191, +STORE, 140249656131584, 140249710600191, +STORE, 140249656123392, 140249710600191, +STORE, 140249656107008, 140249710600191, +STORE, 140249656098816, 140249710600191, +STORE, 140249656082432, 
140249710600191, +STORE, 140249656074240, 140249710600191, +STORE, 140249656041472, 140249710600191, +STORE, 140249656033280, 140249710600191, +STORE, 140249656016896, 140249710600191, +STORE, 140249656008704, 140249710600191, +STORE, 140249655992320, 140249710600191, +STORE, 140249655984128, 140249710600191, +STORE, 140249655967744, 140249710600191, +STORE, 140249653870592, 140249710600191, +STORE, 140249653862400, 140249710600191, +STORE, 140249653829632, 140249710600191, +STORE, 140249653821440, 140249710600191, +STORE, 140249653805056, 140249710600191, +STORE, 140249653796864, 140249710600191, +STORE, 140249653780480, 140249710600191, +STORE, 140249653772288, 140249710600191, +STORE, 140249653755904, 140249710600191, +STORE, 140249652703232, 140249710600191, +SNULL, 140249682087935, 140249710600191, +STORE, 140249652703232, 140249682087935, +STORE, 140249682087936, 140249710600191, + }; + + unsigned long set26[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140729464770560, 140737488351231, +SNULL, 140729464774655, 140737488351231, +STORE, 140729464770560, 140729464774655, +STORE, 140729464639488, 140729464774655, +STORE, 4194304, 5066751, +STORE, 7159808, 7172095, +STORE, 7172096, 7180287, +STORE, 140729465114624, 140729465118719, +STORE, 140729465102336, 140729465114623, +STORE, 30867456, 30875647, +STORE, 30867456, 31010815, +STORE, 140109040988160, 140109042671615, +STORE, 140109040959488, 140109040988159, +STORE, 140109040943104, 140109040959487, +ERASE, 140109040943104, 140109040959487, +STORE, 140109040840704, 140109040959487, +ERASE, 140109040840704, 140109040959487, +STORE, 140109040951296, 140109040959487, +ERASE, 140109040951296, 140109040959487, +STORE, 140109040955392, 140109040959487, +ERASE, 140109040955392, 140109040959487, + }; + unsigned long set27[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140726128070656, 140737488351231, +SNULL, 140726128074751, 140737488351231, +STORE, 140726128070656, 140726128074751, +STORE, 
140726127939584, 140726128074751, +STORE, 94478497189888, 94478499303423, +SNULL, 94478497202175, 94478499303423, +STORE, 94478497189888, 94478497202175, +STORE, 94478497202176, 94478499303423, +ERASE, 94478497202176, 94478499303423, +STORE, 94478499295232, 94478499303423, +STORE, 140415605723136, 140415607975935, +SNULL, 140415605866495, 140415607975935, +STORE, 140415605723136, 140415605866495, +STORE, 140415605866496, 140415607975935, +ERASE, 140415605866496, 140415607975935, +STORE, 140415607963648, 140415607971839, +STORE, 140415607971840, 140415607975935, +STORE, 140726130024448, 140726130028543, +STORE, 140726130012160, 140726130024447, +STORE, 140415607934976, 140415607963647, +STORE, 140415607926784, 140415607934975, +STORE, 140415603245056, 140415605723135, +SNULL, 140415603245056, 140415603613695, +STORE, 140415603613696, 140415605723135, +STORE, 140415603245056, 140415603613695, +SNULL, 140415605710847, 140415605723135, +STORE, 140415603613696, 140415605710847, +STORE, 140415605710848, 140415605723135, +ERASE, 140415605710848, 140415605723135, +STORE, 140415605710848, 140415605723135, +STORE, 140415599370240, 140415603245055, +SNULL, 140415599370240, 140415601111039, +STORE, 140415601111040, 140415603245055, +STORE, 140415599370240, 140415601111039, +SNULL, 140415603208191, 140415603245055, +STORE, 140415601111040, 140415603208191, +STORE, 140415603208192, 140415603245055, +ERASE, 140415603208192, 140415603245055, +STORE, 140415603208192, 140415603245055, +STORE, 140415595692032, 140415599370239, +SNULL, 140415595692032, 140415597207551, +STORE, 140415597207552, 140415599370239, +STORE, 140415595692032, 140415597207551, +SNULL, 140415599304703, 140415599370239, +STORE, 140415597207552, 140415599304703, +STORE, 140415599304704, 140415599370239, +SNULL, 140415599304704, 140415599353855, +STORE, 140415599353856, 140415599370239, +STORE, 140415599304704, 140415599353855, +ERASE, 140415599304704, 140415599353855, +STORE, 140415599304704, 140415599353855, 
+ERASE, 140415599353856, 140415599370239, +STORE, 140415599353856, 140415599370239, +STORE, 140415593500672, 140415595692031, +SNULL, 140415593500672, 140415593590783, +STORE, 140415593590784, 140415595692031, +STORE, 140415593500672, 140415593590783, +SNULL, 140415595683839, 140415595692031, +STORE, 140415593590784, 140415595683839, +STORE, 140415595683840, 140415595692031, +ERASE, 140415595683840, 140415595692031, +STORE, 140415595683840, 140415595692031, +STORE, 140415589703680, 140415593500671, +SNULL, 140415589703680, 140415591362559, +STORE, 140415591362560, 140415593500671, +STORE, 140415589703680, 140415591362559, +SNULL, 140415593459711, 140415593500671, +STORE, 140415591362560, 140415593459711, +STORE, 140415593459712, 140415593500671, +SNULL, 140415593459712, 140415593484287, +STORE, 140415593484288, 140415593500671, +STORE, 140415593459712, 140415593484287, +ERASE, 140415593459712, 140415593484287, +STORE, 140415593459712, 140415593484287, +ERASE, 140415593484288, 140415593500671, +STORE, 140415593484288, 140415593500671, +STORE, 140415587590144, 140415589703679, +SNULL, 140415587590144, 140415587602431, +STORE, 140415587602432, 140415589703679, +STORE, 140415587590144, 140415587602431, +SNULL, 140415589695487, 140415589703679, +STORE, 140415587602432, 140415589695487, +STORE, 140415589695488, 140415589703679, +ERASE, 140415589695488, 140415589703679, +STORE, 140415589695488, 140415589703679, +STORE, 140415607918592, 140415607934975, +STORE, 140415585398784, 140415587590143, +SNULL, 140415585398784, 140415585480703, +STORE, 140415585480704, 140415587590143, +STORE, 140415585398784, 140415585480703, +SNULL, 140415587573759, 140415587590143, +STORE, 140415585480704, 140415587573759, +STORE, 140415587573760, 140415587590143, +SNULL, 140415587573760, 140415587581951, +STORE, 140415587581952, 140415587590143, +STORE, 140415587573760, 140415587581951, +ERASE, 140415587573760, 140415587581951, +STORE, 140415587573760, 140415587581951, +ERASE, 140415587581952, 
140415587590143, +STORE, 140415587581952, 140415587590143, +STORE, 140415583182848, 140415585398783, +SNULL, 140415583182848, 140415583281151, +STORE, 140415583281152, 140415585398783, +STORE, 140415583182848, 140415583281151, +SNULL, 140415585374207, 140415585398783, +STORE, 140415583281152, 140415585374207, +STORE, 140415585374208, 140415585398783, +SNULL, 140415585374208, 140415585382399, +STORE, 140415585382400, 140415585398783, +STORE, 140415585374208, 140415585382399, +ERASE, 140415585374208, 140415585382399, +STORE, 140415585374208, 140415585382399, +ERASE, 140415585382400, 140415585398783, +STORE, 140415585382400, 140415585398783, +STORE, 140415580979200, 140415583182847, +SNULL, 140415580979200, 140415581081599, +STORE, 140415581081600, 140415583182847, +STORE, 140415580979200, 140415581081599, +SNULL, 140415583174655, 140415583182847, +STORE, 140415581081600, 140415583174655, +STORE, 140415583174656, 140415583182847, +ERASE, 140415583174656, 140415583182847, +STORE, 140415583174656, 140415583182847, +STORE, 140415578816512, 140415580979199, +SNULL, 140415578816512, 140415578877951, +STORE, 140415578877952, 140415580979199, +STORE, 140415578816512, 140415578877951, +SNULL, 140415580971007, 140415580979199, +STORE, 140415578877952, 140415580971007, +STORE, 140415580971008, 140415580979199, +ERASE, 140415580971008, 140415580979199, +STORE, 140415580971008, 140415580979199, +STORE, 140415576563712, 140415578816511, +SNULL, 140415576563712, 140415576715263, +STORE, 140415576715264, 140415578816511, +STORE, 140415576563712, 140415576715263, +SNULL, 140415578808319, 140415578816511, +STORE, 140415576715264, 140415578808319, +STORE, 140415578808320, 140415578816511, +ERASE, 140415578808320, 140415578816511, +STORE, 140415578808320, 140415578816511, +STORE, 140415574392832, 140415576563711, +SNULL, 140415574392832, 140415574462463, +STORE, 140415574462464, 140415576563711, +STORE, 140415574392832, 140415574462463, +SNULL, 140415576555519, 140415576563711, +STORE, 
140415574462464, 140415576555519, +STORE, 140415576555520, 140415576563711, +ERASE, 140415576555520, 140415576563711, +STORE, 140415576555520, 140415576563711, +STORE, 140415607910400, 140415607934975, +STORE, 140415571230720, 140415574392831, +SNULL, 140415571230720, 140415572291583, +STORE, 140415572291584, 140415574392831, +STORE, 140415571230720, 140415572291583, +SNULL, 140415574384639, 140415574392831, +STORE, 140415572291584, 140415574384639, +STORE, 140415574384640, 140415574392831, +ERASE, 140415574384640, 140415574392831, +STORE, 140415574384640, 140415574392831, +STORE, 140415607902208, 140415607934975, +SNULL, 140415593476095, 140415593484287, +STORE, 140415593459712, 140415593476095, +STORE, 140415593476096, 140415593484287, +SNULL, 140415574388735, 140415574392831, +STORE, 140415574384640, 140415574388735, +STORE, 140415574388736, 140415574392831, +SNULL, 140415576559615, 140415576563711, +STORE, 140415576555520, 140415576559615, +STORE, 140415576559616, 140415576563711, +SNULL, 140415589699583, 140415589703679, +STORE, 140415589695488, 140415589699583, +STORE, 140415589699584, 140415589703679, +SNULL, 140415585378303, 140415585382399, +STORE, 140415585374208, 140415585378303, +STORE, 140415585378304, 140415585382399, +SNULL, 140415578812415, 140415578816511, +STORE, 140415578808320, 140415578812415, +STORE, 140415578812416, 140415578816511, +SNULL, 140415580975103, 140415580979199, +STORE, 140415580971008, 140415580975103, +STORE, 140415580975104, 140415580979199, +SNULL, 140415583178751, 140415583182847, +STORE, 140415583174656, 140415583178751, +STORE, 140415583178752, 140415583182847, +SNULL, 140415587577855, 140415587581951, +STORE, 140415587573760, 140415587577855, +STORE, 140415587577856, 140415587581951, +SNULL, 140415595687935, 140415595692031, +STORE, 140415595683840, 140415595687935, +STORE, 140415595687936, 140415595692031, +STORE, 140415607894016, 140415607934975, +SNULL, 140415599345663, 140415599353855, +STORE, 140415599304704, 
140415599345663, +STORE, 140415599345664, 140415599353855, +SNULL, 140415603240959, 140415603245055, +STORE, 140415603208192, 140415603240959, +STORE, 140415603240960, 140415603245055, +SNULL, 140415605719039, 140415605723135, +STORE, 140415605710848, 140415605719039, +STORE, 140415605719040, 140415605723135, +SNULL, 94478499299327, 94478499303423, +STORE, 94478499295232, 94478499299327, +STORE, 94478499299328, 94478499303423, +SNULL, 140415607967743, 140415607971839, +STORE, 140415607963648, 140415607967743, +STORE, 140415607967744, 140415607971839, +ERASE, 140415607934976, 140415607963647, +STORE, 94478511173632, 94478511378431, +STORE, 140415606210560, 140415607894015, +STORE, 140415607934976, 140415607963647, +STORE, 94478511173632, 94478511513599, +STORE, 94478511173632, 94478511648767, +SNULL, 94478511615999, 94478511648767, +STORE, 94478511173632, 94478511615999, +STORE, 94478511616000, 94478511648767, +ERASE, 94478511616000, 94478511648767, +STORE, 94478511173632, 94478511751167, +SNULL, 94478511747071, 94478511751167, +STORE, 94478511173632, 94478511747071, +STORE, 94478511747072, 94478511751167, +ERASE, 94478511747072, 94478511751167, +STORE, 94478511173632, 94478511882239, +SNULL, 94478511878143, 94478511882239, +STORE, 94478511173632, 94478511878143, +STORE, 94478511878144, 94478511882239, +ERASE, 94478511878144, 94478511882239, +STORE, 94478511173632, 94478512013311, +SNULL, 94478512009215, 94478512013311, +STORE, 94478511173632, 94478512009215, +STORE, 94478512009216, 94478512013311, +ERASE, 94478512009216, 94478512013311, +STORE, 94478511173632, 94478512144383, +STORE, 94478511173632, 94478512279551, +STORE, 140415606181888, 140415606210559, +STORE, 140415569100800, 140415571230719, +SNULL, 140415569100800, 140415569129471, +STORE, 140415569129472, 140415571230719, +STORE, 140415569100800, 140415569129471, +SNULL, 140415571222527, 140415571230719, +STORE, 140415569129472, 140415571222527, +STORE, 140415571222528, 140415571230719, +ERASE, 
140415571222528, 140415571230719, +STORE, 140415571222528, 140415571230719, +STORE, 140415566905344, 140415569100799, +SNULL, 140415566905344, 140415566987263, +STORE, 140415566987264, 140415569100799, +STORE, 140415566905344, 140415566987263, +SNULL, 140415569084415, 140415569100799, +STORE, 140415566987264, 140415569084415, +STORE, 140415569084416, 140415569100799, +SNULL, 140415569084416, 140415569092607, +STORE, 140415569092608, 140415569100799, +STORE, 140415569084416, 140415569092607, +ERASE, 140415569084416, 140415569092607, +STORE, 140415569084416, 140415569092607, +ERASE, 140415569092608, 140415569100799, +STORE, 140415569092608, 140415569100799, +SNULL, 140415569088511, 140415569092607, +STORE, 140415569084416, 140415569088511, +STORE, 140415569088512, 140415569092607, +SNULL, 140415571226623, 140415571230719, +STORE, 140415571222528, 140415571226623, +STORE, 140415571226624, 140415571230719, +ERASE, 140415606181888, 140415606210559, +STORE, 140415606181888, 140415606210559, +STORE, 140415564759040, 140415566905343, +SNULL, 140415564759040, 140415564804095, +STORE, 140415564804096, 140415566905343, +STORE, 140415564759040, 140415564804095, +SNULL, 140415566897151, 140415566905343, +STORE, 140415564804096, 140415566897151, +STORE, 140415566897152, 140415566905343, +ERASE, 140415566897152, 140415566905343, +STORE, 140415566897152, 140415566905343, +STORE, 140415562588160, 140415564759039, +SNULL, 140415562588160, 140415562629119, +STORE, 140415562629120, 140415564759039, +STORE, 140415562588160, 140415562629119, +SNULL, 140415564726271, 140415564759039, +STORE, 140415562629120, 140415564726271, +STORE, 140415564726272, 140415564759039, +SNULL, 140415564726272, 140415564734463, +STORE, 140415564734464, 140415564759039, +STORE, 140415564726272, 140415564734463, +ERASE, 140415564726272, 140415564734463, +STORE, 140415564726272, 140415564734463, +ERASE, 140415564734464, 140415564759039, +STORE, 140415564734464, 140415564759039, +SNULL, 140415564730367, 
140415564734463, +STORE, 140415564726272, 140415564730367, +STORE, 140415564730368, 140415564734463, +SNULL, 140415566901247, 140415566905343, +STORE, 140415566897152, 140415566901247, +STORE, 140415566901248, 140415566905343, +ERASE, 140415606181888, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415605944320, 140415606210559, +ERASE, 140415605944320, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 94478511173632, 94478512414719, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 140415606206464, 140415606210559, +ERASE, 140415606206464, 140415606210559, +STORE, 94478511173632, 94478512652287, +STORE, 94478511173632, 94478512787455, +STORE, 94478511173632, 94478512922623, +STORE, 94478511173632, 94478513057791, +STORE, 
140415537422336, 140415562588159, +STORE, 94478511173632, 94478513192959, +STORE, 94478511173632, 94478513356799, +STORE, 94478511173632, 94478513491967, +STORE, 94478511173632, 94478513627135, +STORE, 94478511173632, 94478513790975, +STORE, 94478511173632, 94478513926143, +STORE, 94478511173632, 94478514061311, +STORE, 94478511173632, 94478514196479, +STORE, 94478511173632, 94478514331647, +STORE, 94478511173632, 94478514606079, +STORE, 94478511173632, 94478514741247, +STORE, 94478511173632, 94478514876415, +STORE, 94478511173632, 94478515011583, +STORE, 94478511173632, 94478515146751, +STORE, 94478511173632, 94478515281919, +STORE, 94478511173632, 94478515474431, +STORE, 94478511173632, 94478515609599, +STORE, 94478511173632, 94478515744767, +STORE, 140415536922624, 140415562588159, +STORE, 94478511173632, 94478515879935, +STORE, 94478511173632, 94478516015103, +STORE, 94478511173632, 94478516150271, +STORE, 94478511173632, 94478516285439, +STORE, 94478511173632, 94478516420607, +STORE, 94478511173632, 94478516555775, +STORE, 94478511173632, 94478516690943, +STORE, 94478511173632, 94478516826111, +STORE, 94478511173632, 94478516961279, +STORE, 94478511173632, 94478517231615, +STORE, 94478511173632, 94478517366783, +STORE, 94478511173632, 94478517501951, +STORE, 94478511173632, 94478517637119, +STORE, 94478511173632, 94478517772287, +STORE, 94478511173632, 94478517907455, +STORE, 94478511173632, 94478518042623, +STORE, 94478511173632, 94478518177791, +STORE, 94478511173632, 94478518312959, +STORE, 94478511173632, 94478518448127, +STORE, 140415535910912, 140415562588159, +SNULL, 140415536922623, 140415562588159, +STORE, 140415535910912, 140415536922623, +STORE, 140415536922624, 140415562588159, +SNULL, 140415536922624, 140415537422335, +STORE, 140415537422336, 140415562588159, +STORE, 140415536922624, 140415537422335, +ERASE, 140415536922624, 140415537422335, +STORE, 94478511173632, 94478518583295, +STORE, 94478511173632, 94478518718463, +STORE, 94478511173632, 
94478518853631, +STORE, 94478511173632, 94478518988799, +STORE, 94478511173632, 94478519123967, +STORE, 94478511173632, 94478519259135, +STORE, 140415509696512, 140415535910911, +ERASE, 140415537422336, 140415562588159, +STORE, 140415482433536, 140415509696511, + }; + unsigned long set28[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140722475622400, 140737488351231, +SNULL, 140722475626495, 140737488351231, +STORE, 140722475622400, 140722475626495, +STORE, 140722475491328, 140722475626495, +STORE, 93865834291200, 93865836548095, +SNULL, 93865834422271, 93865836548095, +STORE, 93865834291200, 93865834422271, +STORE, 93865834422272, 93865836548095, +ERASE, 93865834422272, 93865836548095, +STORE, 93865836519424, 93865836527615, +STORE, 93865836527616, 93865836548095, +STORE, 139918411104256, 139918413357055, +SNULL, 139918411247615, 139918413357055, +STORE, 139918411104256, 139918411247615, +STORE, 139918411247616, 139918413357055, +ERASE, 139918411247616, 139918413357055, +STORE, 139918413344768, 139918413352959, +STORE, 139918413352960, 139918413357055, +STORE, 140722476642304, 140722476646399, +STORE, 140722476630016, 140722476642303, +STORE, 139918413316096, 139918413344767, +STORE, 139918413307904, 139918413316095, +STORE, 139918408888320, 139918411104255, +SNULL, 139918408888320, 139918408986623, +STORE, 139918408986624, 139918411104255, +STORE, 139918408888320, 139918408986623, +SNULL, 139918411079679, 139918411104255, +STORE, 139918408986624, 139918411079679, +STORE, 139918411079680, 139918411104255, +SNULL, 139918411079680, 139918411087871, +STORE, 139918411087872, 139918411104255, +STORE, 139918411079680, 139918411087871, +ERASE, 139918411079680, 139918411087871, +STORE, 139918411079680, 139918411087871, +ERASE, 139918411087872, 139918411104255, +STORE, 139918411087872, 139918411104255, +STORE, 139918405091328, 139918408888319, +SNULL, 139918405091328, 139918406750207, +STORE, 139918406750208, 139918408888319, +STORE, 139918405091328, 
139918406750207, +SNULL, 139918408847359, 139918408888319, +STORE, 139918406750208, 139918408847359, +STORE, 139918408847360, 139918408888319, +SNULL, 139918408847360, 139918408871935, +STORE, 139918408871936, 139918408888319, +STORE, 139918408847360, 139918408871935, +ERASE, 139918408847360, 139918408871935, +STORE, 139918408847360, 139918408871935, +ERASE, 139918408871936, 139918408888319, +STORE, 139918408871936, 139918408888319, +STORE, 139918413299712, 139918413316095, +SNULL, 139918408863743, 139918408871935, +STORE, 139918408847360, 139918408863743, +STORE, 139918408863744, 139918408871935, +SNULL, 139918411083775, 139918411087871, +STORE, 139918411079680, 139918411083775, +STORE, 139918411083776, 139918411087871, +SNULL, 93865836523519, 93865836527615, +STORE, 93865836519424, 93865836523519, +STORE, 93865836523520, 93865836527615, +SNULL, 139918413348863, 139918413352959, +STORE, 139918413344768, 139918413348863, +STORE, 139918413348864, 139918413352959, +ERASE, 139918413316096, 139918413344767, +STORE, 93865848528896, 93865848664063, + }; + unsigned long set29[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140734467944448, 140737488351231, +SNULL, 140734467948543, 140737488351231, +STORE, 140734467944448, 140734467948543, +STORE, 140734467813376, 140734467948543, +STORE, 94880407924736, 94880410177535, +SNULL, 94880408055807, 94880410177535, +STORE, 94880407924736, 94880408055807, +STORE, 94880408055808, 94880410177535, +ERASE, 94880408055808, 94880410177535, +STORE, 94880410148864, 94880410157055, +STORE, 94880410157056, 94880410177535, +STORE, 140143367815168, 140143370067967, +SNULL, 140143367958527, 140143370067967, +STORE, 140143367815168, 140143367958527, +STORE, 140143367958528, 140143370067967, +ERASE, 140143367958528, 140143370067967, +STORE, 140143370055680, 140143370063871, +STORE, 140143370063872, 140143370067967, +STORE, 140734468329472, 140734468333567, +STORE, 140734468317184, 140734468329471, +STORE, 140143370027008, 
140143370055679, +STORE, 140143370018816, 140143370027007, +STORE, 140143365599232, 140143367815167, +SNULL, 140143365599232, 140143365697535, +STORE, 140143365697536, 140143367815167, +STORE, 140143365599232, 140143365697535, +SNULL, 140143367790591, 140143367815167, +STORE, 140143365697536, 140143367790591, +STORE, 140143367790592, 140143367815167, +SNULL, 140143367790592, 140143367798783, +STORE, 140143367798784, 140143367815167, +STORE, 140143367790592, 140143367798783, +ERASE, 140143367790592, 140143367798783, +STORE, 140143367790592, 140143367798783, +ERASE, 140143367798784, 140143367815167, +STORE, 140143367798784, 140143367815167, +STORE, 140143361802240, 140143365599231, +SNULL, 140143361802240, 140143363461119, +STORE, 140143363461120, 140143365599231, +STORE, 140143361802240, 140143363461119, +SNULL, 140143365558271, 140143365599231, +STORE, 140143363461120, 140143365558271, +STORE, 140143365558272, 140143365599231, +SNULL, 140143365558272, 140143365582847, +STORE, 140143365582848, 140143365599231, +STORE, 140143365558272, 140143365582847, +ERASE, 140143365558272, 140143365582847, +STORE, 140143365558272, 140143365582847, +ERASE, 140143365582848, 140143365599231, +STORE, 140143365582848, 140143365599231, +STORE, 140143370010624, 140143370027007, +SNULL, 140143365574655, 140143365582847, +STORE, 140143365558272, 140143365574655, +STORE, 140143365574656, 140143365582847, +SNULL, 140143367794687, 140143367798783, +STORE, 140143367790592, 140143367794687, +STORE, 140143367794688, 140143367798783, +SNULL, 94880410152959, 94880410157055, +STORE, 94880410148864, 94880410152959, +STORE, 94880410152960, 94880410157055, +SNULL, 140143370059775, 140143370063871, +STORE, 140143370055680, 140143370059775, +STORE, 140143370059776, 140143370063871, +ERASE, 140143370027008, 140143370055679, +STORE, 94880442400768, 94880442535935, +STORE, 140143353409536, 140143361802239, +SNULL, 140143353413631, 140143361802239, +STORE, 140143353409536, 140143353413631, +STORE, 
140143353413632, 140143361802239, +STORE, 140143345016832, 140143353409535, +STORE, 140143210799104, 140143345016831, +SNULL, 140143210799104, 140143239364607, +STORE, 140143239364608, 140143345016831, +STORE, 140143210799104, 140143239364607, +ERASE, 140143210799104, 140143239364607, +SNULL, 140143306473471, 140143345016831, +STORE, 140143239364608, 140143306473471, +STORE, 140143306473472, 140143345016831, +ERASE, 140143306473472, 140143345016831, +SNULL, 140143239499775, 140143306473471, +STORE, 140143239364608, 140143239499775, +STORE, 140143239499776, 140143306473471, +SNULL, 140143345020927, 140143353409535, +STORE, 140143345016832, 140143345020927, +STORE, 140143345020928, 140143353409535, +STORE, 140143336624128, 140143345016831, +SNULL, 140143336628223, 140143345016831, +STORE, 140143336624128, 140143336628223, +STORE, 140143336628224, 140143345016831, +STORE, 140143328231424, 140143336624127, +SNULL, 140143328235519, 140143336624127, +STORE, 140143328231424, 140143328235519, +STORE, 140143328235520, 140143336624127, +STORE, 140143319838720, 140143328231423, +SNULL, 140143319842815, 140143328231423, +STORE, 140143319838720, 140143319842815, +STORE, 140143319842816, 140143328231423, +STORE, 140143311446016, 140143319838719, +STORE, 140143105146880, 140143239364607, +STORE, 140143096754176, 140143105146879, +STORE, 140143029645312, 140143096754175, +ERASE, 140143029645312, 140143096754175, +STORE, 140142962536448, 140143096754175, +SNULL, 140142962536448, 140142970929151, +STORE, 140142970929152, 140143096754175, +STORE, 140142962536448, 140142970929151, +ERASE, 140142962536448, 140142970929151, +STORE, 140142962536448, 140142970929151, +STORE, 140142828318720, 140142962536447, +STORE, 140142819926016, 140142828318719, +SNULL, 140142828318720, 140142836711423, +STORE, 140142836711424, 140142962536447, +STORE, 140142828318720, 140142836711423, +ERASE, 140142828318720, 140142836711423, +SNULL, 140143172255743, 140143239364607, +STORE, 140143105146880, 
140143172255743, +STORE, 140143172255744, 140143239364607, +ERASE, 140143172255744, 140143239364607, +SNULL, 140143105282047, 140143172255743, +STORE, 140143105146880, 140143105282047, +STORE, 140143105282048, 140143172255743, +SNULL, 140143038038015, 140143096754175, +STORE, 140142970929152, 140143038038015, +STORE, 140143038038016, 140143096754175, +ERASE, 140143038038016, 140143096754175, +SNULL, 140142971064319, 140143038038015, +STORE, 140142970929152, 140142971064319, +STORE, 140142971064320, 140143038038015, +SNULL, 140142903820287, 140142962536447, +STORE, 140142836711424, 140142903820287, +STORE, 140142903820288, 140142962536447, +ERASE, 140142903820288, 140142962536447, +SNULL, 140142836846591, 140142903820287, +STORE, 140142836711424, 140142836846591, +STORE, 140142836846592, 140142903820287, +STORE, 140142685708288, 140142819926015, +SNULL, 140143311450111, 140143319838719, +STORE, 140143311446016, 140143311450111, +STORE, 140143311450112, 140143319838719, +SNULL, 140142962540543, 140142970929151, +STORE, 140142962536448, 140142962540543, +STORE, 140142962540544, 140142970929151, +SNULL, 140142685708288, 140142702493695, +STORE, 140142702493696, 140142819926015, +STORE, 140142685708288, 140142702493695, +ERASE, 140142685708288, 140142702493695, +SNULL, 140142769602559, 140142819926015, +STORE, 140142702493696, 140142769602559, +STORE, 140142769602560, 140142819926015, +ERASE, 140142769602560, 140142819926015, +SNULL, 140142702628863, 140142769602559, +STORE, 140142702493696, 140142702628863, +STORE, 140142702628864, 140142769602559, +STORE, 140143230971904, 140143239364607, +SNULL, 140143230975999, 140143239364607, +STORE, 140143230971904, 140143230975999, +STORE, 140143230976000, 140143239364607, +SNULL, 140143096758271, 140143105146879, +STORE, 140143096754176, 140143096758271, +STORE, 140143096758272, 140143105146879, +STORE, 140143222579200, 140143230971903, +SNULL, 140143222583295, 140143230971903, +STORE, 140143222579200, 140143222583295, +STORE, 
140143222583296, 140143230971903, +STORE, 140143214186496, 140143222579199, +SNULL, 140142819930111, 140142828318719, +STORE, 140142819926016, 140142819930111, +STORE, 140142819930112, 140142828318719, +STORE, 140143205793792, 140143222579199, +SNULL, 140143205793792, 140143214186495, +STORE, 140143214186496, 140143222579199, +STORE, 140143205793792, 140143214186495, +SNULL, 140143214190591, 140143222579199, +STORE, 140143214186496, 140143214190591, +STORE, 140143214190592, 140143222579199, +SNULL, 140143205797887, 140143214186495, +STORE, 140143205793792, 140143205797887, +STORE, 140143205797888, 140143214186495, +STORE, 140143197401088, 140143205793791, +SNULL, 140143197405183, 140143205793791, +STORE, 140143197401088, 140143197405183, +STORE, 140143197405184, 140143205793791, +STORE, 140143189008384, 140143197401087, +STORE, 140143180615680, 140143197401087, +STORE, 140143088361472, 140143096754175, +SNULL, 140143180619775, 140143197401087, +STORE, 140143180615680, 140143180619775, +STORE, 140143180619776, 140143197401087, +SNULL, 140143180619776, 140143189008383, +STORE, 140143189008384, 140143197401087, +STORE, 140143180619776, 140143189008383, +SNULL, 140143189012479, 140143197401087, +STORE, 140143189008384, 140143189012479, +STORE, 140143189012480, 140143197401087, +SNULL, 140143088365567, 140143096754175, +STORE, 140143088361472, 140143088365567, +STORE, 140143088365568, 140143096754175, +STORE, 140143079968768, 140143088361471, +SNULL, 140143079972863, 140143088361471, +STORE, 140143079968768, 140143079972863, +STORE, 140143079972864, 140143088361471, +STORE, 140143071576064, 140143079968767, +SNULL, 140143071580159, 140143079968767, +STORE, 140143071576064, 140143071580159, +STORE, 140143071580160, 140143079968767, +STORE, 140143063183360, 140143071576063, +STORE, 140143054790656, 140143071576063, +SNULL, 140143054794751, 140143071576063, +STORE, 140143054790656, 140143054794751, +STORE, 140143054794752, 140143071576063, +SNULL, 140143054794752, 
140143063183359, +STORE, 140143063183360, 140143071576063, +STORE, 140143054794752, 140143063183359, +SNULL, 140143063187455, 140143071576063, +STORE, 140143063183360, 140143063187455, +STORE, 140143063187456, 140143071576063, +STORE, 140143046397952, 140143054790655, +STORE, 140142954143744, 140142962536447, +STORE, 140142945751040, 140142962536447, +STORE, 140142937358336, 140142962536447, +STORE, 140142928965632, 140142962536447, +STORE, 140142568275968, 140142702493695, +SNULL, 140142635384831, 140142702493695, +STORE, 140142568275968, 140142635384831, +STORE, 140142635384832, 140142702493695, +ERASE, 140142635384832, 140142702493695, +STORE, 140142920572928, 140142962536447, +STORE, 140142912180224, 140142962536447, +STORE, 140142568275968, 140142702493695, +SNULL, 140142568275968, 140142635384831, +STORE, 140142635384832, 140142702493695, +STORE, 140142568275968, 140142635384831, +SNULL, 140142635519999, 140142702493695, +STORE, 140142635384832, 140142635519999, +STORE, 140142635520000, 140142702493695, +STORE, 140142819930112, 140142836711423, +STORE, 140142811533312, 140142819926015, +STORE, 140142434058240, 140142635384831, +SNULL, 140142501167103, 140142635384831, +STORE, 140142434058240, 140142501167103, +STORE, 140142501167104, 140142635384831, +SNULL, 140142501167104, 140142568275967, +STORE, 140142568275968, 140142635384831, +STORE, 140142501167104, 140142568275967, +ERASE, 140142501167104, 140142568275967, +STORE, 140142299840512, 140142501167103, +STORE, 140142803140608, 140142819926015, +SNULL, 140142366949375, 140142501167103, +STORE, 140142299840512, 140142366949375, +STORE, 140142366949376, 140142501167103, +SNULL, 140142366949376, 140142434058239, +STORE, 140142434058240, 140142501167103, +STORE, 140142366949376, 140142434058239, +ERASE, 140142366949376, 140142434058239, +STORE, 140142794747904, 140142819926015, +STORE, 140142786355200, 140142819926015, +STORE, 140142299840512, 140142501167103, +STORE, 140142777962496, 140142819926015, +STORE, 
140142559883264, 140142568275967, +STORE, 140142232731648, 140142501167103, +STORE, 140142551490560, 140142568275967, +SNULL, 140142777962496, 140142803140607, +STORE, 140142803140608, 140142819926015, +STORE, 140142777962496, 140142803140607, +SNULL, 140142803144703, 140142819926015, +STORE, 140142803140608, 140142803144703, +STORE, 140142803144704, 140142819926015, +STORE, 140142543097856, 140142568275967, +STORE, 140142098513920, 140142501167103, +SNULL, 140142165622783, 140142501167103, +STORE, 140142098513920, 140142165622783, +STORE, 140142165622784, 140142501167103, +SNULL, 140142165622784, 140142232731647, +STORE, 140142232731648, 140142501167103, +STORE, 140142165622784, 140142232731647, +ERASE, 140142165622784, 140142232731647, +SNULL, 140142568411135, 140142635384831, +STORE, 140142568275968, 140142568411135, +STORE, 140142568411136, 140142635384831, +STORE, 140141964296192, 140142165622783, +SNULL, 140142912180224, 140142928965631, +STORE, 140142928965632, 140142962536447, +STORE, 140142912180224, 140142928965631, +SNULL, 140142928969727, 140142962536447, +STORE, 140142928965632, 140142928969727, +STORE, 140142928969728, 140142962536447, +STORE, 140141830078464, 140142165622783, +SNULL, 140142912184319, 140142928965631, +STORE, 140142912180224, 140142912184319, +STORE, 140142912184320, 140142928965631, +SNULL, 140142232731648, 140142434058239, +STORE, 140142434058240, 140142501167103, +STORE, 140142232731648, 140142434058239, +SNULL, 140142434193407, 140142501167103, +STORE, 140142434058240, 140142434193407, +STORE, 140142434193408, 140142501167103, +SNULL, 140142232731648, 140142299840511, +STORE, 140142299840512, 140142434058239, +STORE, 140142232731648, 140142299840511, +SNULL, 140142299975679, 140142434058239, +STORE, 140142299840512, 140142299975679, +STORE, 140142299975680, 140142434058239, +SNULL, 140142928969728, 140142954143743, +STORE, 140142954143744, 140142962536447, +STORE, 140142928969728, 140142954143743, +SNULL, 140142954147839, 
140142962536447, +STORE, 140142954143744, 140142954147839, +STORE, 140142954147840, 140142962536447, +STORE, 140141830078464, 140142299840511, +SNULL, 140142543097856, 140142559883263, +STORE, 140142559883264, 140142568275967, +STORE, 140142543097856, 140142559883263, +SNULL, 140142559887359, 140142568275967, +STORE, 140142559883264, 140142559887359, +STORE, 140142559887360, 140142568275967, +STORE, 140142534705152, 140142559883263, +SNULL, 140142928969728, 140142945751039, +STORE, 140142945751040, 140142954143743, +STORE, 140142928969728, 140142945751039, +SNULL, 140142945755135, 140142954143743, +STORE, 140142945751040, 140142945755135, +STORE, 140142945755136, 140142954143743, +SNULL, 140142299975680, 140142366949375, +STORE, 140142366949376, 140142434058239, +STORE, 140142299975680, 140142366949375, +SNULL, 140142367084543, 140142434058239, +STORE, 140142366949376, 140142367084543, +STORE, 140142367084544, 140142434058239, +SNULL, 140142928969728, 140142937358335, +STORE, 140142937358336, 140142945751039, +STORE, 140142928969728, 140142937358335, +SNULL, 140142937362431, 140142945751039, +STORE, 140142937358336, 140142937362431, +STORE, 140142937362432, 140142945751039, +SNULL, 140141830078464, 140142232731647, +STORE, 140142232731648, 140142299840511, +STORE, 140141830078464, 140142232731647, +SNULL, 140142232866815, 140142299840511, +STORE, 140142232731648, 140142232866815, +STORE, 140142232866816, 140142299840511, +SNULL, 140142534705152, 140142543097855, +STORE, 140142543097856, 140142559883263, +STORE, 140142534705152, 140142543097855, +SNULL, 140142543101951, 140142559883263, +STORE, 140142543097856, 140142543101951, +STORE, 140142543101952, 140142559883263, +STORE, 140142526312448, 140142543097855, +STORE, 140142517919744, 140142543097855, +SNULL, 140141830078464, 140142098513919, +STORE, 140142098513920, 140142232731647, +STORE, 140141830078464, 140142098513919, +SNULL, 140142098649087, 140142232731647, +STORE, 140142098513920, 140142098649087, +STORE, 
140142098649088, 140142232731647, +SNULL, 140142031405055, 140142098513919, +STORE, 140141830078464, 140142031405055, +STORE, 140142031405056, 140142098513919, +ERASE, 140142031405056, 140142098513919, +SNULL, 140141830078464, 140141964296191, +STORE, 140141964296192, 140142031405055, +STORE, 140141830078464, 140141964296191, +SNULL, 140141964431359, 140142031405055, +STORE, 140141964296192, 140141964431359, +STORE, 140141964431360, 140142031405055, +STORE, 140142509527040, 140142543097855, +SNULL, 140141897187327, 140141964296191, +STORE, 140141830078464, 140141897187327, +STORE, 140141897187328, 140141964296191, +ERASE, 140141897187328, 140141964296191, +SNULL, 140141830213631, 140141897187327, +STORE, 140141830078464, 140141830213631, +STORE, 140141830213632, 140141897187327, +SNULL, 140142803144704, 140142811533311, +STORE, 140142811533312, 140142819926015, +STORE, 140142803144704, 140142811533311, +SNULL, 140142811537407, 140142819926015, +STORE, 140142811533312, 140142811537407, +STORE, 140142811537408, 140142819926015, +SNULL, 140142098649088, 140142165622783, +STORE, 140142165622784, 140142232731647, +STORE, 140142098649088, 140142165622783, +SNULL, 140142165757951, 140142232731647, +STORE, 140142165622784, 140142165757951, +STORE, 140142165757952, 140142232731647, +STORE, 140142090121216, 140142098513919, +SNULL, 140142777962496, 140142786355199, +STORE, 140142786355200, 140142803140607, +STORE, 140142777962496, 140142786355199, +SNULL, 140142786359295, 140142803140607, +STORE, 140142786355200, 140142786359295, +STORE, 140142786359296, 140142803140607, +SNULL, 140142509527040, 140142534705151, +STORE, 140142534705152, 140142543097855, +STORE, 140142509527040, 140142534705151, +SNULL, 140142534709247, 140142543097855, +STORE, 140142534705152, 140142534709247, +STORE, 140142534709248, 140142543097855, +STORE, 140142081728512, 140142098513919, +SNULL, 140142786359296, 140142794747903, +STORE, 140142794747904, 140142803140607, +STORE, 140142786359296, 
140142794747903, +SNULL, 140142794751999, 140142803140607, +STORE, 140142794747904, 140142794751999, +STORE, 140142794752000, 140142803140607, +STORE, 140142073335808, 140142098513919, +SNULL, 140142073339903, 140142098513919, +STORE, 140142073335808, 140142073339903, +STORE, 140142073339904, 140142098513919, +SNULL, 140142543101952, 140142551490559, +STORE, 140142551490560, 140142559883263, +STORE, 140142543101952, 140142551490559, +SNULL, 140142551494655, 140142559883263, +STORE, 140142551490560, 140142551494655, +STORE, 140142551494656, 140142559883263, +SNULL, 140142509527040, 140142517919743, +STORE, 140142517919744, 140142534705151, +STORE, 140142509527040, 140142517919743, +SNULL, 140142517923839, 140142534705151, +STORE, 140142517919744, 140142517923839, +STORE, 140142517923840, 140142534705151, +STORE, 140142064943104, 140142073335807, +SNULL, 140142073339904, 140142090121215, +STORE, 140142090121216, 140142098513919, +STORE, 140142073339904, 140142090121215, +SNULL, 140142090125311, 140142098513919, +STORE, 140142090121216, 140142090125311, +STORE, 140142090125312, 140142098513919, +STORE, 140142056550400, 140142073335807, +SNULL, 140142056554495, 140142073335807, +STORE, 140142056550400, 140142056554495, +STORE, 140142056554496, 140142073335807, +STORE, 140142048157696, 140142056550399, +SNULL, 140142509531135, 140142517919743, +STORE, 140142509527040, 140142509531135, +STORE, 140142509531136, 140142517919743, +SNULL, 140142777966591, 140142786355199, +STORE, 140142777962496, 140142777966591, +STORE, 140142777966592, 140142786355199, +SNULL, 140143046402047, 140143054790655, +STORE, 140143046397952, 140143046402047, +STORE, 140143046402048, 140143054790655, +SNULL, 140142912184320, 140142920572927, +STORE, 140142920572928, 140142928965631, +STORE, 140142912184320, 140142920572927, +SNULL, 140142920577023, 140142928965631, +STORE, 140142920572928, 140142920577023, +STORE, 140142920577024, 140142928965631, +STORE, 140142039764992, 140142056550399, +STORE, 
140141955903488, 140141964296191, +SNULL, 140142819930112, 140142828318719, +STORE, 140142828318720, 140142836711423, +STORE, 140142819930112, 140142828318719, +SNULL, 140142828322815, 140142836711423, +STORE, 140142828318720, 140142828322815, +STORE, 140142828322816, 140142836711423, +SNULL, 140142517923840, 140142526312447, +STORE, 140142526312448, 140142534705151, +STORE, 140142517923840, 140142526312447, +SNULL, 140142526316543, 140142534705151, +STORE, 140142526312448, 140142526316543, +STORE, 140142526316544, 140142534705151, +STORE, 140141947510784, 140141964296191, +SNULL, 140142056554496, 140142064943103, +STORE, 140142064943104, 140142073335807, +STORE, 140142056554496, 140142064943103, +SNULL, 140142064947199, 140142073335807, +STORE, 140142064943104, 140142064947199, +STORE, 140142064947200, 140142073335807, +SNULL, 140142073339904, 140142081728511, +STORE, 140142081728512, 140142090121215, +STORE, 140142073339904, 140142081728511, +SNULL, 140142081732607, 140142090121215, +STORE, 140142081728512, 140142081732607, +STORE, 140142081732608, 140142090121215, +STORE, 140141939118080, 140141964296191, +STORE, 140141930725376, 140141964296191, +STORE, 140141922332672, 140141964296191, +STORE, 140141913939968, 140141964296191, +SNULL, 140141913939968, 140141922332671, +STORE, 140141922332672, 140141964296191, +STORE, 140141913939968, 140141922332671, +SNULL, 140141922336767, 140141964296191, +STORE, 140141922332672, 140141922336767, +STORE, 140141922336768, 140141964296191, +STORE, 140141905547264, 140141922332671, +SNULL, 140141905551359, 140141922332671, +STORE, 140141905547264, 140141905551359, +STORE, 140141905551360, 140141922332671, +STORE, 140141821685760, 140141830078463, +STORE, 140141813293056, 140141830078463, +STORE, 140141804900352, 140141830078463, +STORE, 140141796507648, 140141830078463, +SNULL, 140141796511743, 140141830078463, +STORE, 140141796507648, 140141796511743, +STORE, 140141796511744, 140141830078463, +SNULL, 140141922336768, 
140141955903487, +STORE, 140141955903488, 140141964296191, +STORE, 140141922336768, 140141955903487, +SNULL, 140141955907583, 140141964296191, +STORE, 140141955903488, 140141955907583, +STORE, 140141955907584, 140141964296191, +STORE, 140141788114944, 140141796507647, +STORE, 140141779722240, 140141796507647, +SNULL, 140141779722240, 140141788114943, +STORE, 140141788114944, 140141796507647, +STORE, 140141779722240, 140141788114943, +SNULL, 140141788119039, 140141796507647, +STORE, 140141788114944, 140141788119039, +STORE, 140141788119040, 140141796507647, +SNULL, 140141922336768, 140141947510783, +STORE, 140141947510784, 140141955903487, +STORE, 140141922336768, 140141947510783, +SNULL, 140141947514879, 140141955903487, +STORE, 140141947510784, 140141947514879, +STORE, 140141947514880, 140141955903487, +SNULL, 140142039764992, 140142048157695, +STORE, 140142048157696, 140142056550399, +STORE, 140142039764992, 140142048157695, +SNULL, 140142048161791, 140142056550399, +STORE, 140142048157696, 140142048161791, +STORE, 140142048161792, 140142056550399, +SNULL, 140142039769087, 140142048157695, +STORE, 140142039764992, 140142039769087, +STORE, 140142039769088, 140142048157695, +SNULL, 140141796511744, 140141804900351, +STORE, 140141804900352, 140141830078463, +STORE, 140141796511744, 140141804900351, +SNULL, 140141804904447, 140141830078463, +STORE, 140141804900352, 140141804904447, +STORE, 140141804904448, 140141830078463, +STORE, 140141771329536, 140141788114943, +STORE, 140141762936832, 140141788114943, +STORE, 140141754544128, 140141788114943, +SNULL, 140141804904448, 140141821685759, +STORE, 140141821685760, 140141830078463, +STORE, 140141804904448, 140141821685759, +SNULL, 140141821689855, 140141830078463, +STORE, 140141821685760, 140141821689855, +STORE, 140141821689856, 140141830078463, +SNULL, 140141922336768, 140141939118079, +STORE, 140141939118080, 140141947510783, +STORE, 140141922336768, 140141939118079, +SNULL, 140141939122175, 140141947510783, +STORE, 
140141939118080, 140141939122175, +STORE, 140141939122176, 140141947510783, +SNULL, 140141905551360, 140141913939967, +STORE, 140141913939968, 140141922332671, +STORE, 140141905551360, 140141913939967, +SNULL, 140141913944063, 140141922332671, +STORE, 140141913939968, 140141913944063, +STORE, 140141913944064, 140141922332671, +STORE, 140141746151424, 140141788114943, +STORE, 140141737758720, 140141788114943, +SNULL, 140141804904448, 140141813293055, +STORE, 140141813293056, 140141821685759, +STORE, 140141804904448, 140141813293055, +SNULL, 140141813297151, 140141821685759, +STORE, 140141813293056, 140141813297151, +STORE, 140141813297152, 140141821685759, +STORE, 140141729366016, 140141788114943, +STORE, 140141720973312, 140141788114943, +STORE, 140141712580608, 140141788114943, +SNULL, 140141712584703, 140141788114943, +STORE, 140141712580608, 140141712584703, +STORE, 140141712584704, 140141788114943, +SNULL, 140141922336768, 140141930725375, +STORE, 140141930725376, 140141939118079, +STORE, 140141922336768, 140141930725375, +SNULL, 140141930729471, 140141939118079, +STORE, 140141930725376, 140141930729471, +STORE, 140141930729472, 140141939118079, +STORE, 140141704187904, 140141712580607, +SNULL, 140141704191999, 140141712580607, +STORE, 140141704187904, 140141704191999, +STORE, 140141704192000, 140141712580607, +STORE, 140141695795200, 140141704187903, +STORE, 140141687402496, 140141704187903, +SNULL, 140141712584704, 140141771329535, +STORE, 140141771329536, 140141788114943, +STORE, 140141712584704, 140141771329535, +SNULL, 140141771333631, 140141788114943, +STORE, 140141771329536, 140141771333631, +STORE, 140141771333632, 140141788114943, +SNULL, 140141771333632, 140141779722239, +STORE, 140141779722240, 140141788114943, +STORE, 140141771333632, 140141779722239, +SNULL, 140141779726335, 140141788114943, +STORE, 140141779722240, 140141779726335, +STORE, 140141779726336, 140141788114943, +STORE, 140141679009792, 140141704187903, +SNULL, 140141679013887, 
140141704187903, +STORE, 140141679009792, 140141679013887, +STORE, 140141679013888, 140141704187903, +STORE, 140141670617088, 140141679009791, +SNULL, 140141670621183, 140141679009791, +STORE, 140141670617088, 140141670621183, +STORE, 140141670621184, 140141679009791, +STORE, 140141662224384, 140141670617087, +SNULL, 140141712584704, 140141737758719, +STORE, 140141737758720, 140141771329535, +STORE, 140141712584704, 140141737758719, +SNULL, 140141737762815, 140141771329535, +STORE, 140141737758720, 140141737762815, +STORE, 140141737762816, 140141771329535, +SNULL, 140141712584704, 140141729366015, +STORE, 140141729366016, 140141737758719, +STORE, 140141712584704, 140141729366015, +SNULL, 140141729370111, 140141737758719, +STORE, 140141729366016, 140141729370111, +STORE, 140141729370112, 140141737758719, +SNULL, 140141737762816, 140141746151423, +STORE, 140141746151424, 140141771329535, +STORE, 140141737762816, 140141746151423, +SNULL, 140141746155519, 140141771329535, +STORE, 140141746151424, 140141746155519, +STORE, 140141746155520, 140141771329535, +STORE, 140141653831680, 140141670617087, +SNULL, 140141746155520, 140141762936831, +STORE, 140141762936832, 140141771329535, +STORE, 140141746155520, 140141762936831, +SNULL, 140141762940927, 140141771329535, +STORE, 140141762936832, 140141762940927, +STORE, 140141762940928, 140141771329535, +STORE, 140141645438976, 140141670617087, +SNULL, 140141645443071, 140141670617087, +STORE, 140141645438976, 140141645443071, +STORE, 140141645443072, 140141670617087, +SNULL, 140141712584704, 140141720973311, +STORE, 140141720973312, 140141729366015, +STORE, 140141712584704, 140141720973311, +SNULL, 140141720977407, 140141729366015, +STORE, 140141720973312, 140141720977407, +STORE, 140141720977408, 140141729366015, +STORE, 140141637046272, 140141645438975, +SNULL, 140141637050367, 140141645438975, +STORE, 140141637046272, 140141637050367, +STORE, 140141637050368, 140141645438975, +STORE, 140141628653568, 140141637046271, +SNULL, 
140141628657663, 140141637046271, +STORE, 140141628653568, 140141628657663, +STORE, 140141628657664, 140141637046271, +STORE, 140141620260864, 140141628653567, +SNULL, 140141679013888, 140141687402495, +STORE, 140141687402496, 140141704187903, +STORE, 140141679013888, 140141687402495, +SNULL, 140141687406591, 140141704187903, +STORE, 140141687402496, 140141687406591, +STORE, 140141687406592, 140141704187903, +SNULL, 140141746155520, 140141754544127, +STORE, 140141754544128, 140141762936831, +STORE, 140141746155520, 140141754544127, +SNULL, 140141754548223, 140141762936831, +STORE, 140141754544128, 140141754548223, +STORE, 140141754548224, 140141762936831, +SNULL, 140141687406592, 140141695795199, +STORE, 140141695795200, 140141704187903, +STORE, 140141687406592, 140141695795199, +SNULL, 140141695799295, 140141704187903, +STORE, 140141695795200, 140141695799295, +STORE, 140141695799296, 140141704187903, +STORE, 140141611868160, 140141628653567, +SNULL, 140141611872255, 140141628653567, +STORE, 140141611868160, 140141611872255, +STORE, 140141611872256, 140141628653567, +SNULL, 140141645443072, 140141662224383, +STORE, 140141662224384, 140141670617087, +STORE, 140141645443072, 140141662224383, +SNULL, 140141662228479, 140141670617087, +STORE, 140141662224384, 140141662228479, +STORE, 140141662228480, 140141670617087, +STORE, 140141603475456, 140141611868159, +SNULL, 140141603479551, 140141611868159, +STORE, 140141603475456, 140141603479551, +STORE, 140141603479552, 140141611868159, +STORE, 140141595082752, 140141603475455, +SNULL, 140141645443072, 140141653831679, +STORE, 140141653831680, 140141662224383, +STORE, 140141645443072, 140141653831679, +SNULL, 140141653835775, 140141662224383, +STORE, 140141653831680, 140141653835775, +STORE, 140141653835776, 140141662224383, +STORE, 140141586690048, 140141603475455, +SNULL, 140141611872256, 140141620260863, +STORE, 140141620260864, 140141628653567, +STORE, 140141611872256, 140141620260863, +SNULL, 140141620264959, 
140141628653567, +STORE, 140141620260864, 140141620264959, +STORE, 140141620264960, 140141628653567, +SNULL, 140141586690048, 140141595082751, +STORE, 140141595082752, 140141603475455, +STORE, 140141586690048, 140141595082751, +SNULL, 140141595086847, 140141603475455, +STORE, 140141595082752, 140141595086847, +STORE, 140141595086848, 140141603475455, +STORE, 140141578297344, 140141595082751, +SNULL, 140141578301439, 140141595082751, +STORE, 140141578297344, 140141578301439, +STORE, 140141578301440, 140141595082751, +SNULL, 140141578301440, 140141586690047, +STORE, 140141586690048, 140141595082751, +STORE, 140141578301440, 140141586690047, +SNULL, 140141586694143, 140141595082751, +STORE, 140141586690048, 140141586694143, +STORE, 140141586694144, 140141595082751, +STORE, 140143370027008, 140143370055679, +STORE, 140143309254656, 140143311446015, +SNULL, 140143309254656, 140143309344767, +STORE, 140143309344768, 140143311446015, +STORE, 140143309254656, 140143309344767, +SNULL, 140143311437823, 140143311446015, +STORE, 140143309344768, 140143311437823, +STORE, 140143311437824, 140143311446015, +ERASE, 140143311437824, 140143311446015, +STORE, 140143311437824, 140143311446015, +SNULL, 140143311441919, 140143311446015, +STORE, 140143311437824, 140143311441919, +STORE, 140143311441920, 140143311446015, +ERASE, 140143370027008, 140143370055679, +ERASE, 140142912180224, 140142912184319, +ERASE, 140142912184320, 140142920572927, +ERASE, 140142945751040, 140142945755135, +ERASE, 140142945755136, 140142954143743, +ERASE, 140142090121216, 140142090125311, +ERASE, 140142090125312, 140142098513919, +ERASE, 140142794747904, 140142794751999, +ERASE, 140142794752000, 140142803140607, +ERASE, 140141913939968, 140141913944063, +ERASE, 140141913944064, 140141922332671, +ERASE, 140141746151424, 140141746155519, +ERASE, 140141746155520, 140141754544127, +ERASE, 140142954143744, 140142954147839, +ERASE, 140142954147840, 140142962536447, +ERASE, 140142081728512, 140142081732607, +ERASE, 
140142081732608, 140142090121215, +ERASE, 140141905547264, 140141905551359, +ERASE, 140141905551360, 140141913939967, +ERASE, 140141729366016, 140141729370111, +ERASE, 140141729370112, 140141737758719, +ERASE, 140142920572928, 140142920577023, +ERASE, 140142920577024, 140142928965631, +ERASE, 140142039764992, 140142039769087, +ERASE, 140142039769088, 140142048157695, +ERASE, 140141679009792, 140141679013887, +ERASE, 140141679013888, 140141687402495, +ERASE, 140142551490560, 140142551494655, +ERASE, 140142551494656, 140142559883263, +ERASE, 140141947510784, 140141947514879, +ERASE, 140141947514880, 140141955903487, +ERASE, 140141771329536, 140141771333631, +ERASE, 140141771333632, 140141779722239, +ERASE, 140142928965632, 140142928969727, +ERASE, 140142928969728, 140142937358335, +ERASE, 140142073335808, 140142073339903, +ERASE, 140142073339904, 140142081728511, +ERASE, 140142543097856, 140142543101951, +ERASE, 140142543101952, 140142551490559, +ERASE, 140141955903488, 140141955907583, +ERASE, 140141955907584, 140141964296191, +ERASE, 140141704187904, 140141704191999, +ERASE, 140141704192000, 140141712580607, +ERASE, 140142786355200, 140142786359295, +ERASE, 140142786359296, 140142794747903, +ERASE, 140142056550400, 140142056554495, +ERASE, 140142056554496, 140142064943103, +ERASE, 140142828318720, 140142828322815, +ERASE, 140142828322816, 140142836711423, +ERASE, 140141788114944, 140141788119039, +ERASE, 140141788119040, 140141796507647, +ERASE, 140141695795200, 140141695799295, +ERASE, 140141695799296, 140141704187903, +ERASE, 140141578297344, 140141578301439, +ERASE, 140141578301440, 140141586690047, +ERASE, 140141611868160, 140141611872255, +ERASE, 140141611872256, 140141620260863, +ERASE, 140142811533312, 140142811537407, +ERASE, 140142811537408, 140142819926015, +ERASE, 140142064943104, 140142064947199, +ERASE, 140142064947200, 140142073335807, +ERASE, 140141628653568, 140141628657663, +ERASE, 140141628657664, 140141637046271, +ERASE, 140143046397952, 
140143046402047, +ERASE, 140143046402048, 140143054790655, +ERASE, 140141796507648, 140141796511743, +ERASE, 140141796511744, 140141804900351, +ERASE, 140142803140608, 140142803144703, +ERASE, 140142803144704, 140142811533311, +ERASE, 140142509527040, 140142509531135, +ERASE, 140142509531136, 140142517919743, +ERASE, 140141821685760, 140141821689855, +ERASE, 140141821689856, 140141830078463, +ERASE, 140142777962496, 140142777966591, +ERASE, 140142777966592, 140142786355199, +ERASE, 140141804900352, 140141804904447, +ERASE, 140141804904448, 140141813293055, +ERASE, 140141930725376, 140141930729471, +ERASE, 140141930729472, 140141939118079, +ERASE, 140142937358336, 140142937362431, +ERASE, 140142937362432, 140142945751039, +ERASE, 140142559883264, 140142559887359, +ERASE, 140142559887360, 140142568275967, +ERASE, 140142534705152, 140142534709247, +ERASE, 140142534709248, 140142543097855, +ERASE, 140142048157696, 140142048161791, +ERASE, 140142048161792, 140142056550399, +ERASE, 140141754544128, 140141754548223, +ERASE, 140141754548224, 140141762936831, +ERASE, 140141939118080, 140141939122175, +ERASE, 140141939122176, 140141947510783, +ERASE, 140141653831680, 140141653835775, +ERASE, 140141653835776, 140141662224383, +ERASE, 140141712580608, 140141712584703, +ERASE, 140141712584704, 140141720973311, +ERASE, 140141645438976, 140141645443071, +ERASE, 140141645443072, 140141653831679, +ERASE, 140141687402496, 140141687406591, +ERASE, 140141687406592, 140141695795199, +ERASE, 140141662224384, 140141662228479, +ERASE, 140141662228480, 140141670617087, +ERASE, 140141922332672, 140141922336767, +ERASE, 140141922336768, 140141930725375, +ERASE, 140141737758720, 140141737762815, +ERASE, 140141737762816, 140141746151423, +ERASE, 140141637046272, 140141637050367, +ERASE, 140141637050368, 140141645438975, +ERASE, 140142517919744, 140142517923839, +ERASE, 140142517923840, 140142526312447, +ERASE, 140143096754176, 140143096758271, +ERASE, 140143096758272, 140143105146879, +ERASE, 
140141595082752, 140141595086847, +ERASE, 140141595086848, 140141603475455, +ERASE, 140141762936832, 140141762940927, +ERASE, 140141762940928, 140141771329535, +ERASE, 140143311446016, 140143311450111, +ERASE, 140143311450112, 140143319838719, +ERASE, 140142526312448, 140142526316543, +ERASE, 140142526316544, 140142534705151, +ERASE, 140142819926016, 140142819930111, +ERASE, 140142819930112, 140142828318719, +ERASE, 140143180615680, 140143180619775, +ERASE, 140143180619776, 140143189008383, +ERASE, 140142962536448, 140142962540543, +ERASE, 140142962540544, 140142970929151, +ERASE, 140143214186496, 140143214190591, +ERASE, 140143214190592, 140143222579199, +ERASE, 140143088361472, 140143088365567, +ERASE, 140143088365568, 140143096754175, +ERASE, 140141586690048, 140141586694143, +ERASE, 140141586694144, 140141595082751, +ERASE, 140143230971904, 140143230975999, +ERASE, 140143230976000, 140143239364607, +ERASE, 140141779722240, 140141779726335, +ERASE, 140141779726336, 140141788114943, +ERASE, 140141670617088, 140141670621183, +ERASE, 140141670621184, 140141679009791, +ERASE, 140141813293056, 140141813297151, +ERASE, 140141813297152, 140141821685759, +ERASE, 140143222579200, 140143222583295, +ERASE, 140143222583296, 140143230971903, +ERASE, 140143189008384, 140143189012479, +ERASE, 140143189012480, 140143197401087, +ERASE, 140143071576064, 140143071580159, +ERASE, 140143071580160, 140143079968767, +ERASE, 140141620260864, 140141620264959, +ERASE, 140141620264960, 140141628653567, +ERASE, 140141603475456, 140141603479551, +ERASE, 140141603479552, 140141611868159, +ERASE, 140141720973312, 140141720977407, +ERASE, 140141720977408, 140141729366015, +ERASE, 140143079968768, 140143079972863, +ERASE, 140143079972864, 140143088361471, +ERASE, 140143205793792, 140143205797887, +ERASE, 140143205797888, 140143214186495, + }; + unsigned long set30[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140733436743680, 140737488351231, +SNULL, 140733436747775, 140737488351231, 
+STORE, 140733436743680, 140733436747775, +STORE, 140733436612608, 140733436747775, +STORE, 94630728904704, 94630731157503, +SNULL, 94630729035775, 94630731157503, +STORE, 94630728904704, 94630729035775, +STORE, 94630729035776, 94630731157503, +ERASE, 94630729035776, 94630731157503, +STORE, 94630731128832, 94630731137023, +STORE, 94630731137024, 94630731157503, +STORE, 140165750841344, 140165753094143, +SNULL, 140165750984703, 140165753094143, +STORE, 140165750841344, 140165750984703, +STORE, 140165750984704, 140165753094143, +ERASE, 140165750984704, 140165753094143, +STORE, 140165753081856, 140165753090047, +STORE, 140165753090048, 140165753094143, +STORE, 140733436887040, 140733436891135, +STORE, 140733436874752, 140733436887039, +STORE, 140165753053184, 140165753081855, +STORE, 140165753044992, 140165753053183, +STORE, 140165748625408, 140165750841343, +SNULL, 140165748625408, 140165748723711, +STORE, 140165748723712, 140165750841343, +STORE, 140165748625408, 140165748723711, +SNULL, 140165750816767, 140165750841343, +STORE, 140165748723712, 140165750816767, +STORE, 140165750816768, 140165750841343, +SNULL, 140165750816768, 140165750824959, +STORE, 140165750824960, 140165750841343, +STORE, 140165750816768, 140165750824959, +ERASE, 140165750816768, 140165750824959, +STORE, 140165750816768, 140165750824959, +ERASE, 140165750824960, 140165750841343, +STORE, 140165750824960, 140165750841343, +STORE, 140165744828416, 140165748625407, +SNULL, 140165744828416, 140165746487295, +STORE, 140165746487296, 140165748625407, +STORE, 140165744828416, 140165746487295, +SNULL, 140165748584447, 140165748625407, +STORE, 140165746487296, 140165748584447, +STORE, 140165748584448, 140165748625407, +SNULL, 140165748584448, 140165748609023, +STORE, 140165748609024, 140165748625407, +STORE, 140165748584448, 140165748609023, +ERASE, 140165748584448, 140165748609023, +STORE, 140165748584448, 140165748609023, +ERASE, 140165748609024, 140165748625407, +STORE, 140165748609024, 
140165748625407, +STORE, 140165753036800, 140165753053183, +SNULL, 140165748600831, 140165748609023, +STORE, 140165748584448, 140165748600831, +STORE, 140165748600832, 140165748609023, +SNULL, 140165750820863, 140165750824959, +STORE, 140165750816768, 140165750820863, +STORE, 140165750820864, 140165750824959, +SNULL, 94630731132927, 94630731137023, +STORE, 94630731128832, 94630731132927, +STORE, 94630731132928, 94630731137023, +SNULL, 140165753085951, 140165753090047, +STORE, 140165753081856, 140165753085951, +STORE, 140165753085952, 140165753090047, +ERASE, 140165753053184, 140165753081855, +STORE, 94630743547904, 94630743683071, +STORE, 140165736435712, 140165744828415, +SNULL, 140165736439807, 140165744828415, +STORE, 140165736435712, 140165736439807, +STORE, 140165736439808, 140165744828415, +STORE, 140165728043008, 140165736435711, +STORE, 140165593825280, 140165728043007, +SNULL, 140165593825280, 140165653725183, +STORE, 140165653725184, 140165728043007, +STORE, 140165593825280, 140165653725183, +ERASE, 140165593825280, 140165653725183, +SNULL, 140165720834047, 140165728043007, +STORE, 140165653725184, 140165720834047, +STORE, 140165720834048, 140165728043007, +ERASE, 140165720834048, 140165728043007, +SNULL, 140165653860351, 140165720834047, +STORE, 140165653725184, 140165653860351, +STORE, 140165653860352, 140165720834047, +SNULL, 140165728047103, 140165736435711, +STORE, 140165728043008, 140165728047103, +STORE, 140165728047104, 140165736435711, +STORE, 140165645332480, 140165653725183, +SNULL, 140165645336575, 140165653725183, +STORE, 140165645332480, 140165645336575, +STORE, 140165645336576, 140165653725183, +STORE, 140165636939776, 140165645332479, +SNULL, 140165636943871, 140165645332479, +STORE, 140165636939776, 140165636943871, +STORE, 140165636943872, 140165645332479, +STORE, 140165628547072, 140165636939775, +SNULL, 140165628551167, 140165636939775, +STORE, 140165628547072, 140165628551167, +STORE, 140165628551168, 140165636939775, +STORE, 
140165620154368, 140165628547071, +STORE, 140165611761664, 140165628547071, +STORE, 140165603368960, 140165628547071, +STORE, 140165469151232, 140165603368959, +SNULL, 140165469151232, 140165519507455, +STORE, 140165519507456, 140165603368959, +STORE, 140165469151232, 140165519507455, +ERASE, 140165469151232, 140165519507455, +SNULL, 140165586616319, 140165603368959, +STORE, 140165519507456, 140165586616319, +STORE, 140165586616320, 140165603368959, +ERASE, 140165586616320, 140165603368959, +STORE, 140165594976256, 140165628547071, +STORE, 140165385289728, 140165586616319, +SNULL, 140165452398591, 140165586616319, +STORE, 140165385289728, 140165452398591, +STORE, 140165452398592, 140165586616319, +SNULL, 140165452398592, 140165519507455, +STORE, 140165519507456, 140165586616319, +STORE, 140165452398592, 140165519507455, +ERASE, 140165452398592, 140165519507455, +STORE, 140165251072000, 140165452398591, +SNULL, 140165318180863, 140165452398591, +STORE, 140165251072000, 140165318180863, +STORE, 140165318180864, 140165452398591, +SNULL, 140165318180864, 140165385289727, +STORE, 140165385289728, 140165452398591, +STORE, 140165318180864, 140165385289727, +ERASE, 140165318180864, 140165385289727, +SNULL, 140165519642623, 140165586616319, +STORE, 140165519507456, 140165519642623, +STORE, 140165519642624, 140165586616319, +SNULL, 140165594976256, 140165611761663, +STORE, 140165611761664, 140165628547071, +STORE, 140165594976256, 140165611761663, +SNULL, 140165611765759, 140165628547071, +STORE, 140165611761664, 140165611765759, +STORE, 140165611765760, 140165628547071, +STORE, 140165385289728, 140165519507455, +SNULL, 140165385424895, 140165519507455, +STORE, 140165385289728, 140165385424895, +STORE, 140165385424896, 140165519507455, +SNULL, 140165594976256, 140165603368959, +STORE, 140165603368960, 140165611761663, +STORE, 140165594976256, 140165603368959, +SNULL, 140165603373055, 140165611761663, +STORE, 140165603368960, 140165603373055, +STORE, 140165603373056, 
140165611761663, +SNULL, 140165251207167, 140165318180863, +STORE, 140165251072000, 140165251207167, +STORE, 140165251207168, 140165318180863, +STORE, 140165376897024, 140165385289727, +SNULL, 140165376901119, 140165385289727, +STORE, 140165376897024, 140165376901119, +STORE, 140165376901120, 140165385289727, +SNULL, 140165385424896, 140165452398591, +STORE, 140165452398592, 140165519507455, +STORE, 140165385424896, 140165452398591, +SNULL, 140165452533759, 140165519507455, +STORE, 140165452398592, 140165452533759, +STORE, 140165452533760, 140165519507455, +STORE, 140165368504320, 140165376897023, +SNULL, 140165594980351, 140165603368959, +STORE, 140165594976256, 140165594980351, +STORE, 140165594980352, 140165603368959, +SNULL, 140165368508415, 140165376897023, +STORE, 140165368504320, 140165368508415, +STORE, 140165368508416, 140165376897023, +SNULL, 140165611765760, 140165620154367, +STORE, 140165620154368, 140165628547071, +STORE, 140165611765760, 140165620154367, +SNULL, 140165620158463, 140165628547071, +STORE, 140165620154368, 140165620158463, +STORE, 140165620158464, 140165628547071, +STORE, 140165360111616, 140165368504319, +STORE, 140165351718912, 140165368504319, +STORE, 140165343326208, 140165368504319, +SNULL, 140165343326208, 140165351718911, +STORE, 140165351718912, 140165368504319, +STORE, 140165343326208, 140165351718911, +SNULL, 140165351723007, 140165368504319, +STORE, 140165351718912, 140165351723007, +STORE, 140165351723008, 140165368504319, +SNULL, 140165343330303, 140165351718911, +STORE, 140165343326208, 140165343330303, +STORE, 140165343330304, 140165351718911, +SNULL, 140165351723008, 140165360111615, +STORE, 140165360111616, 140165368504319, +STORE, 140165351723008, 140165360111615, +SNULL, 140165360115711, 140165368504319, +STORE, 140165360111616, 140165360115711, +STORE, 140165360115712, 140165368504319, +STORE, 140165334933504, 140165343326207, +SNULL, 140165334937599, 140165343326207, +STORE, 140165334933504, 140165334937599, +STORE, 
140165334937600, 140165343326207, +STORE, 140165326540800, 140165334933503, +STORE, 140165242679296, 140165251071999, +SNULL, 140165242683391, 140165251071999, +STORE, 140165242679296, 140165242683391, +STORE, 140165242683392, 140165251071999, +STORE, 140165234286592, 140165242679295, +STORE, 140165225893888, 140165242679295, +SNULL, 140165225897983, 140165242679295, +STORE, 140165225893888, 140165225897983, +STORE, 140165225897984, 140165242679295, +SNULL, 140165225897984, 140165234286591, +STORE, 140165234286592, 140165242679295, +STORE, 140165225897984, 140165234286591, +SNULL, 140165234290687, 140165242679295, +STORE, 140165234286592, 140165234290687, +STORE, 140165234290688, 140165242679295, +SNULL, 140165326544895, 140165334933503, +STORE, 140165326540800, 140165326544895, +STORE, 140165326544896, 140165334933503, +STORE, 140165217501184, 140165225893887, +STORE, 140165209108480, 140165225893887, +SNULL, 140165209108480, 140165217501183, +STORE, 140165217501184, 140165225893887, +STORE, 140165209108480, 140165217501183, +SNULL, 140165217505279, 140165225893887, +STORE, 140165217501184, 140165217505279, +STORE, 140165217505280, 140165225893887, +SNULL, 140165209112575, 140165217501183, +STORE, 140165209108480, 140165209112575, +STORE, 140165209112576, 140165217501183, +STORE, 140165200715776, 140165209108479, +STORE, 140165066498048, 140165200715775, +SNULL, 140165066498048, 140165116854271, +STORE, 140165116854272, 140165200715775, +STORE, 140165066498048, 140165116854271, +ERASE, 140165066498048, 140165116854271, +SNULL, 140165183963135, 140165200715775, +STORE, 140165116854272, 140165183963135, +STORE, 140165183963136, 140165200715775, +ERASE, 140165183963136, 140165200715775, +SNULL, 140165116989439, 140165183963135, +STORE, 140165116854272, 140165116989439, +STORE, 140165116989440, 140165183963135, +STORE, 140165192323072, 140165209108479, +STORE, 140165108461568, 140165116854271, +STORE, 140164974243840, 140165108461567, +STORE, 140164965851136, 
140164974243839, +SNULL, 140164974243840, 140164982636543, +STORE, 140164982636544, 140165108461567, +STORE, 140164974243840, 140164982636543, +ERASE, 140164974243840, 140164982636543, +STORE, 140164965851136, 140164982636543, +STORE, 140164957458432, 140164982636543, +STORE, 140164949065728, 140164982636543, +STORE, 140164940673024, 140164982636543, +STORE, 140164806455296, 140164940673023, +STORE, 140164798062592, 140164806455295, +STORE, 140164789669888, 140164806455295, +STORE, 140164655452160, 140164789669887, +STORE, 140164647059456, 140164655452159, +STORE, 140164638666752, 140164655452159, +SNULL, 140164655452160, 140164714201087, +STORE, 140164714201088, 140164789669887, +STORE, 140164655452160, 140164714201087, +ERASE, 140164655452160, 140164714201087, +STORE, 140164705808384, 140164714201087, +STORE, 140164697415680, 140164714201087, +STORE, 140164504449024, 140164638666751, +SNULL, 140164504449024, 140164512874495, +STORE, 140164512874496, 140164638666751, +STORE, 140164504449024, 140164512874495, +ERASE, 140164504449024, 140164512874495, +STORE, 140164689022976, 140164714201087, +STORE, 140164680630272, 140164714201087, +SNULL, 140164680634367, 140164714201087, +STORE, 140164680630272, 140164680634367, +STORE, 140164680634368, 140164714201087, +STORE, 140164378656768, 140164638666751, +SNULL, 140165192323072, 140165200715775, +STORE, 140165200715776, 140165209108479, +STORE, 140165192323072, 140165200715775, +SNULL, 140165200719871, 140165209108479, +STORE, 140165200715776, 140165200719871, +STORE, 140165200719872, 140165209108479, +SNULL, 140165049745407, 140165108461567, +STORE, 140164982636544, 140165049745407, +STORE, 140165049745408, 140165108461567, +ERASE, 140165049745408, 140165108461567, +SNULL, 140164982771711, 140165049745407, +STORE, 140164982636544, 140164982771711, +STORE, 140164982771712, 140165049745407, +STORE, 140164244439040, 140164638666751, +SNULL, 140164311547903, 140164638666751, +STORE, 140164244439040, 140164311547903, +STORE, 
140164311547904, 140164638666751, +SNULL, 140164311547904, 140164378656767, +STORE, 140164378656768, 140164638666751, +STORE, 140164311547904, 140164378656767, +ERASE, 140164311547904, 140164378656767, +SNULL, 140164806455296, 140164848418815, +STORE, 140164848418816, 140164940673023, +STORE, 140164806455296, 140164848418815, +ERASE, 140164806455296, 140164848418815, +SNULL, 140164915527679, 140164940673023, +STORE, 140164848418816, 140164915527679, +STORE, 140164915527680, 140164940673023, +ERASE, 140164915527680, 140164940673023, +STORE, 140164110221312, 140164311547903, +SNULL, 140164177330175, 140164311547903, +STORE, 140164110221312, 140164177330175, +STORE, 140164177330176, 140164311547903, +SNULL, 140164177330176, 140164244439039, +STORE, 140164244439040, 140164311547903, +STORE, 140164177330176, 140164244439039, +ERASE, 140164177330176, 140164244439039, +SNULL, 140164781309951, 140164789669887, +STORE, 140164714201088, 140164781309951, +STORE, 140164781309952, 140164789669887, +ERASE, 140164781309952, 140164789669887, +STORE, 140163976003584, 140164177330175, +SNULL, 140164043112447, 140164177330175, +STORE, 140163976003584, 140164043112447, +STORE, 140164043112448, 140164177330175, +SNULL, 140164043112448, 140164110221311, +STORE, 140164110221312, 140164177330175, +STORE, 140164043112448, 140164110221311, +ERASE, 140164043112448, 140164110221311, +SNULL, 140164579983359, 140164638666751, +STORE, 140164378656768, 140164579983359, +STORE, 140164579983360, 140164638666751, +ERASE, 140164579983360, 140164638666751, +STORE, 140163841785856, 140164043112447, +SNULL, 140163908894719, 140164043112447, +STORE, 140163841785856, 140163908894719, +STORE, 140163908894720, 140164043112447, +SNULL, 140163908894720, 140163976003583, +STORE, 140163976003584, 140164043112447, +STORE, 140163908894720, 140163976003583, +ERASE, 140163908894720, 140163976003583, +SNULL, 140164940673024, 140164965851135, +STORE, 140164965851136, 140164982636543, +STORE, 140164940673024, 
140164965851135, +SNULL, 140164965855231, 140164982636543, +STORE, 140164965851136, 140164965855231, +STORE, 140164965855232, 140164982636543, +SNULL, 140164965855232, 140164974243839, +STORE, 140164974243840, 140164982636543, +STORE, 140164965855232, 140164974243839, +SNULL, 140164974247935, 140164982636543, +STORE, 140164974243840, 140164974247935, +STORE, 140164974247936, 140164982636543, +SNULL, 140164445765631, 140164579983359, +STORE, 140164378656768, 140164445765631, +STORE, 140164445765632, 140164579983359, +SNULL, 140164445765632, 140164512874495, +STORE, 140164512874496, 140164579983359, +STORE, 140164445765632, 140164512874495, +ERASE, 140164445765632, 140164512874495, +SNULL, 140164378791935, 140164445765631, +STORE, 140164378656768, 140164378791935, +STORE, 140164378791936, 140164445765631, +SNULL, 140164789673983, 140164806455295, +STORE, 140164789669888, 140164789673983, +STORE, 140164789673984, 140164806455295, +SNULL, 140164789673984, 140164798062591, +STORE, 140164798062592, 140164806455295, +STORE, 140164789673984, 140164798062591, +SNULL, 140164798066687, 140164806455295, +STORE, 140164798062592, 140164798066687, +STORE, 140164798066688, 140164806455295, +SNULL, 140164638670847, 140164655452159, +STORE, 140164638666752, 140164638670847, +STORE, 140164638670848, 140164655452159, +STORE, 140165100068864, 140165116854271, +STORE, 140165091676160, 140165116854271, +STORE, 140165083283456, 140165116854271, +SNULL, 140164244574207, 140164311547903, +STORE, 140164244439040, 140164244574207, +STORE, 140164244574208, 140164311547903, +SNULL, 140164848553983, 140164915527679, +STORE, 140164848418816, 140164848553983, +STORE, 140164848553984, 140164915527679, +SNULL, 140164110356479, 140164177330175, +STORE, 140164110221312, 140164110356479, +STORE, 140164110356480, 140164177330175, +SNULL, 140164714336255, 140164781309951, +STORE, 140164714201088, 140164714336255, +STORE, 140164714336256, 140164781309951, +SNULL, 140163976138751, 140164043112447, +STORE, 
140163976003584, 140163976138751, +STORE, 140163976138752, 140164043112447, +SNULL, 140164513009663, 140164579983359, +STORE, 140164512874496, 140164513009663, +STORE, 140164513009664, 140164579983359, +SNULL, 140163841921023, 140163908894719, +STORE, 140163841785856, 140163841921023, +STORE, 140163841921024, 140163908894719, +SNULL, 140165083283456, 140165100068863, +STORE, 140165100068864, 140165116854271, +STORE, 140165083283456, 140165100068863, +SNULL, 140165100072959, 140165116854271, +STORE, 140165100068864, 140165100072959, +STORE, 140165100072960, 140165116854271, +SNULL, 140165100072960, 140165108461567, +STORE, 140165108461568, 140165116854271, +STORE, 140165100072960, 140165108461567, +SNULL, 140165108465663, 140165116854271, +STORE, 140165108461568, 140165108465663, +STORE, 140165108465664, 140165116854271, +STORE, 140165074890752, 140165100068863, +SNULL, 140165074894847, 140165100068863, +STORE, 140165074890752, 140165074894847, +STORE, 140165074894848, 140165100068863, +STORE, 140165066498048, 140165074890751, +STORE, 140165058105344, 140165074890751, +STORE, 140164932280320, 140164965851135, +SNULL, 140165192327167, 140165200715775, +STORE, 140165192323072, 140165192327167, +STORE, 140165192327168, 140165200715775, +STORE, 140164923887616, 140164965851135, +SNULL, 140164923891711, 140164965851135, +STORE, 140164923887616, 140164923891711, +STORE, 140164923891712, 140164965851135, +SNULL, 140164680634368, 140164705808383, +STORE, 140164705808384, 140164714201087, +STORE, 140164680634368, 140164705808383, +SNULL, 140164705812479, 140164714201087, +STORE, 140164705808384, 140164705812479, +STORE, 140164705812480, 140164714201087, +SNULL, 140164680634368, 140164697415679, +STORE, 140164697415680, 140164705808383, +STORE, 140164680634368, 140164697415679, +SNULL, 140164697419775, 140164705808383, +STORE, 140164697415680, 140164697419775, +STORE, 140164697419776, 140164705808383, +STORE, 140164840026112, 140164848418815, +STORE, 140164831633408, 
140164848418815, +STORE, 140164823240704, 140164848418815, +SNULL, 140165074894848, 140165083283455, +STORE, 140165083283456, 140165100068863, +STORE, 140165074894848, 140165083283455, +SNULL, 140165083287551, 140165100068863, +STORE, 140165083283456, 140165083287551, +STORE, 140165083287552, 140165100068863, +SNULL, 140165083287552, 140165091676159, +STORE, 140165091676160, 140165100068863, +STORE, 140165083287552, 140165091676159, +SNULL, 140165091680255, 140165100068863, +STORE, 140165091676160, 140165091680255, +STORE, 140165091680256, 140165100068863, +SNULL, 140164638670848, 140164647059455, +STORE, 140164647059456, 140164655452159, +STORE, 140164638670848, 140164647059455, +SNULL, 140164647063551, 140164655452159, +STORE, 140164647059456, 140164647063551, +STORE, 140164647063552, 140164655452159, +SNULL, 140164923891712, 140164940673023, +STORE, 140164940673024, 140164965851135, +STORE, 140164923891712, 140164940673023, +SNULL, 140164940677119, 140164965851135, +STORE, 140164940673024, 140164940677119, +STORE, 140164940677120, 140164965851135, +SNULL, 140164940677120, 140164949065727, +STORE, 140164949065728, 140164965851135, +STORE, 140164940677120, 140164949065727, +SNULL, 140164949069823, 140164965851135, +STORE, 140164949065728, 140164949069823, +STORE, 140164949069824, 140164965851135, +SNULL, 140164949069824, 140164957458431, +STORE, 140164957458432, 140164965851135, +STORE, 140164949069824, 140164957458431, +SNULL, 140164957462527, 140164965851135, +STORE, 140164957458432, 140164957462527, +STORE, 140164957462528, 140164965851135, +SNULL, 140164680634368, 140164689022975, +STORE, 140164689022976, 140164697415679, +STORE, 140164680634368, 140164689022975, +SNULL, 140164689027071, 140164697415679, +STORE, 140164689022976, 140164689027071, +STORE, 140164689027072, 140164697415679, +STORE, 140164814848000, 140164848418815, +SNULL, 140165058105344, 140165066498047, +STORE, 140165066498048, 140165074890751, +STORE, 140165058105344, 140165066498047, +SNULL, 
140165066502143, 140165074890751, +STORE, 140165066498048, 140165066502143, +STORE, 140165066502144, 140165074890751, +SNULL, 140165058109439, 140165066498047, +STORE, 140165058105344, 140165058109439, +STORE, 140165058109440, 140165066498047, +STORE, 140164798066688, 140164814847999, +SNULL, 140164798066688, 140164806455295, +STORE, 140164806455296, 140164814847999, +STORE, 140164798066688, 140164806455295, +SNULL, 140164806459391, 140164814847999, +STORE, 140164806455296, 140164806459391, +STORE, 140164806459392, 140164814847999, +SNULL, 140164923891712, 140164932280319, +STORE, 140164932280320, 140164940673023, +STORE, 140164923891712, 140164932280319, +SNULL, 140164932284415, 140164940673023, +STORE, 140164932280320, 140164932284415, +STORE, 140164932284416, 140164940673023, +STORE, 140164672237568, 140164680630271, +STORE, 140164663844864, 140164680630271, +STORE, 140164647063552, 140164680630271, +SNULL, 140164647063552, 140164655452159, +STORE, 140164655452160, 140164680630271, +STORE, 140164647063552, 140164655452159, +SNULL, 140164655456255, 140164680630271, +STORE, 140164655452160, 140164655456255, +STORE, 140164655456256, 140164680630271, +STORE, 140164630274048, 140164638666751, +SNULL, 140164814852095, 140164848418815, +STORE, 140164814848000, 140164814852095, +STORE, 140164814852096, 140164848418815, +SNULL, 140164814852096, 140164831633407, +STORE, 140164831633408, 140164848418815, +STORE, 140164814852096, 140164831633407, +SNULL, 140164831637503, 140164848418815, +STORE, 140164831633408, 140164831637503, +STORE, 140164831637504, 140164848418815, +STORE, 140164621881344, 140164638666751, +SNULL, 140164831637504, 140164840026111, +STORE, 140164840026112, 140164848418815, +STORE, 140164831637504, 140164840026111, +SNULL, 140164840030207, 140164848418815, +STORE, 140164840026112, 140164840030207, +STORE, 140164840030208, 140164848418815, +STORE, 140164613488640, 140164638666751, +SNULL, 140164613492735, 140164638666751, +STORE, 140164613488640, 
140164613492735, +STORE, 140164613492736, 140164638666751, +STORE, 140164605095936, 140164613488639, +SNULL, 140164605100031, 140164613488639, +STORE, 140164605095936, 140164605100031, +STORE, 140164605100032, 140164613488639, +STORE, 140164596703232, 140164605095935, +STORE, 140164588310528, 140164605095935, +SNULL, 140164588314623, 140164605095935, +STORE, 140164588310528, 140164588314623, +STORE, 140164588314624, 140164605095935, +STORE, 140164504481792, 140164512874495, +STORE, 140164496089088, 140164512874495, +SNULL, 140164496089088, 140164504481791, +STORE, 140164504481792, 140164512874495, +STORE, 140164496089088, 140164504481791, +SNULL, 140164504485887, 140164512874495, +STORE, 140164504481792, 140164504485887, +STORE, 140164504485888, 140164512874495, +SNULL, 140164613492736, 140164630274047, +STORE, 140164630274048, 140164638666751, +STORE, 140164613492736, 140164630274047, +SNULL, 140164630278143, 140164638666751, +STORE, 140164630274048, 140164630278143, +STORE, 140164630278144, 140164638666751, +STORE, 140164487696384, 140164504481791, +STORE, 140164479303680, 140164504481791, +SNULL, 140164814852096, 140164823240703, +STORE, 140164823240704, 140164831633407, +STORE, 140164814852096, 140164823240703, +SNULL, 140164823244799, 140164831633407, +STORE, 140164823240704, 140164823244799, +STORE, 140164823244800, 140164831633407, +STORE, 140164470910976, 140164504481791, +SNULL, 140164470910976, 140164496089087, +STORE, 140164496089088, 140164504481791, +STORE, 140164470910976, 140164496089087, +SNULL, 140164496093183, 140164504481791, +STORE, 140164496089088, 140164496093183, +STORE, 140164496093184, 140164504481791, +SNULL, 140164655456256, 140164672237567, +STORE, 140164672237568, 140164680630271, +STORE, 140164655456256, 140164672237567, +SNULL, 140164672241663, 140164680630271, +STORE, 140164672237568, 140164672241663, +STORE, 140164672241664, 140164680630271, +STORE, 140164462518272, 140164496089087, +STORE, 140164454125568, 140164496089087, +SNULL, 
140164655456256, 140164663844863, +STORE, 140164663844864, 140164672237567, +STORE, 140164655456256, 140164663844863, +SNULL, 140164663848959, 140164672237567, +STORE, 140164663844864, 140164663848959, +STORE, 140164663848960, 140164672237567, +STORE, 140164370264064, 140164378656767, +STORE, 140164361871360, 140164378656767, +STORE, 140164353478656, 140164378656767, +STORE, 140164345085952, 140164378656767, +SNULL, 140164345085952, 140164353478655, +STORE, 140164353478656, 140164378656767, +STORE, 140164345085952, 140164353478655, +SNULL, 140164353482751, 140164378656767, +STORE, 140164353478656, 140164353482751, +STORE, 140164353482752, 140164378656767, +SNULL, 140164454125568, 140164487696383, +STORE, 140164487696384, 140164496089087, +STORE, 140164454125568, 140164487696383, +SNULL, 140164487700479, 140164496089087, +STORE, 140164487696384, 140164487700479, +STORE, 140164487700480, 140164496089087, +STORE, 140164336693248, 140164353478655, +SNULL, 140164336697343, 140164353478655, +STORE, 140164336693248, 140164336697343, +STORE, 140164336697344, 140164353478655, +STORE, 140164328300544, 140164336693247, +SNULL, 140164454125568, 140164479303679, +STORE, 140164479303680, 140164487696383, +STORE, 140164454125568, 140164479303679, +SNULL, 140164479307775, 140164487696383, +STORE, 140164479303680, 140164479307775, +STORE, 140164479307776, 140164487696383, +STORE, 140164319907840, 140164336693247, +STORE, 140164236046336, 140164244439039, +SNULL, 140164588314624, 140164596703231, +STORE, 140164596703232, 140164605095935, +STORE, 140164588314624, 140164596703231, +SNULL, 140164596707327, 140164605095935, +STORE, 140164596703232, 140164596707327, +STORE, 140164596707328, 140164605095935, +SNULL, 140164454125568, 140164462518271, +STORE, 140164462518272, 140164479303679, +STORE, 140164454125568, 140164462518271, +SNULL, 140164462522367, 140164479303679, +STORE, 140164462518272, 140164462522367, +STORE, 140164462522368, 140164479303679, +STORE, 140164227653632, 
140164244439039, +SNULL, 140164227657727, 140164244439039, +STORE, 140164227653632, 140164227657727, +STORE, 140164227657728, 140164244439039, +SNULL, 140164462522368, 140164470910975, +STORE, 140164470910976, 140164479303679, +STORE, 140164462522368, 140164470910975, +SNULL, 140164470915071, 140164479303679, +STORE, 140164470910976, 140164470915071, +STORE, 140164470915072, 140164479303679, +SNULL, 140164613492736, 140164621881343, +STORE, 140164621881344, 140164630274047, +STORE, 140164613492736, 140164621881343, +SNULL, 140164621885439, 140164630274047, +STORE, 140164621881344, 140164621885439, +STORE, 140164621885440, 140164630274047, +SNULL, 140164353482752, 140164370264063, +STORE, 140164370264064, 140164378656767, +STORE, 140164353482752, 140164370264063, +SNULL, 140164370268159, 140164378656767, +STORE, 140164370264064, 140164370268159, +STORE, 140164370268160, 140164378656767, +STORE, 140164219260928, 140164227653631, +SNULL, 140164319911935, 140164336693247, +STORE, 140164319907840, 140164319911935, +STORE, 140164319911936, 140164336693247, +SNULL, 140164336697344, 140164345085951, +STORE, 140164345085952, 140164353478655, +STORE, 140164336697344, 140164345085951, +SNULL, 140164345090047, 140164353478655, +STORE, 140164345085952, 140164345090047, +STORE, 140164345090048, 140164353478655, +SNULL, 140164319911936, 140164328300543, +STORE, 140164328300544, 140164336693247, +STORE, 140164319911936, 140164328300543, +SNULL, 140164328304639, 140164336693247, +STORE, 140164328300544, 140164328304639, +STORE, 140164328304640, 140164336693247, +SNULL, 140164454129663, 140164462518271, +STORE, 140164454125568, 140164454129663, +STORE, 140164454129664, 140164462518271, +STORE, 140164210868224, 140164227653631, +STORE, 140164202475520, 140164227653631, +STORE, 140164194082816, 140164227653631, +SNULL, 140164194086911, 140164227653631, +STORE, 140164194082816, 140164194086911, +STORE, 140164194086912, 140164227653631, +SNULL, 140164353482752, 140164361871359, +STORE, 
140164361871360, 140164370264063, +STORE, 140164353482752, 140164361871359, +SNULL, 140164361875455, 140164370264063, +STORE, 140164361871360, 140164361875455, +STORE, 140164361875456, 140164370264063, +SNULL, 140164227657728, 140164236046335, +STORE, 140164236046336, 140164244439039, +STORE, 140164227657728, 140164236046335, +SNULL, 140164236050431, 140164244439039, +STORE, 140164236046336, 140164236050431, +STORE, 140164236050432, 140164244439039, +STORE, 140164185690112, 140164194082815, +SNULL, 140164194086912, 140164219260927, +STORE, 140164219260928, 140164227653631, +STORE, 140164194086912, 140164219260927, +SNULL, 140164219265023, 140164227653631, +STORE, 140164219260928, 140164219265023, +STORE, 140164219265024, 140164227653631, +STORE, 140164101828608, 140164110221311, +STORE, 140164093435904, 140164110221311, +STORE, 140164085043200, 140164110221311, +SNULL, 140164085047295, 140164110221311, +STORE, 140164085043200, 140164085047295, +STORE, 140164085047296, 140164110221311, +STORE, 140164076650496, 140164085043199, +SNULL, 140164185694207, 140164194082815, +STORE, 140164185690112, 140164185694207, +STORE, 140164185694208, 140164194082815, +SNULL, 140164085047296, 140164101828607, +STORE, 140164101828608, 140164110221311, +STORE, 140164085047296, 140164101828607, +SNULL, 140164101832703, 140164110221311, +STORE, 140164101828608, 140164101832703, +STORE, 140164101832704, 140164110221311, +SNULL, 140164085047296, 140164093435903, +STORE, 140164093435904, 140164101828607, +STORE, 140164085047296, 140164093435903, +SNULL, 140164093439999, 140164101828607, +STORE, 140164093435904, 140164093439999, +STORE, 140164093440000, 140164101828607, +SNULL, 140164194086912, 140164202475519, +STORE, 140164202475520, 140164219260927, +STORE, 140164194086912, 140164202475519, +SNULL, 140164202479615, 140164219260927, +STORE, 140164202475520, 140164202479615, +STORE, 140164202479616, 140164219260927, +SNULL, 140164202479616, 140164210868223, +STORE, 140164210868224, 
140164219260927, +STORE, 140164202479616, 140164210868223, +SNULL, 140164210872319, 140164219260927, +STORE, 140164210868224, 140164210872319, +STORE, 140164210872320, 140164219260927, +SNULL, 140164076654591, 140164085043199, +STORE, 140164076650496, 140164076654591, +STORE, 140164076654592, 140164085043199, +STORE, 140164068257792, 140164076650495, +SNULL, 140164068261887, 140164076650495, +STORE, 140164068257792, 140164068261887, +STORE, 140164068261888, 140164076650495, +STORE, 140165753053184, 140165753081855, +STORE, 140165725851648, 140165728043007, +SNULL, 140165725851648, 140165725941759, +STORE, 140165725941760, 140165728043007, +STORE, 140165725851648, 140165725941759, +SNULL, 140165728034815, 140165728043007, +STORE, 140165725941760, 140165728034815, +STORE, 140165728034816, 140165728043007, +ERASE, 140165728034816, 140165728043007, +STORE, 140165728034816, 140165728043007, +SNULL, 140165728038911, 140165728043007, +STORE, 140165728034816, 140165728038911, +STORE, 140165728038912, 140165728043007, +ERASE, 140165753053184, 140165753081855, +ERASE, 140164638666752, 140164638670847, +ERASE, 140164638670848, 140164647059455, +ERASE, 140165091676160, 140165091680255, +ERASE, 140165091680256, 140165100068863, +ERASE, 140164613488640, 140164613492735, +ERASE, 140164613492736, 140164621881343, +ERASE, 140164319907840, 140164319911935, +ERASE, 140164319911936, 140164328300543, +ERASE, 140165620154368, 140165620158463, +ERASE, 140165620158464, 140165628547071, +ERASE, 140164798062592, 140164798066687, +ERASE, 140164798066688, 140164806455295, +ERASE, 140164789669888, 140164789673983, +ERASE, 140164789673984, 140164798062591, +ERASE, 140164965851136, 140164965855231, +ERASE, 140164965855232, 140164974243839, +ERASE, 140165074890752, 140165074894847, +ERASE, 140165074894848, 140165083283455, +ERASE, 140164672237568, 140164672241663, +ERASE, 140164672241664, 140164680630271, +ERASE, 140164454125568, 140164454129663, +ERASE, 140164454129664, 140164462518271, +ERASE, 
140165200715776, 140165200719871, +ERASE, 140165200719872, 140165209108479, +ERASE, 140164932280320, 140164932284415, +ERASE, 140164932284416, 140164940673023, +ERASE, 140164663844864, 140164663848959, +ERASE, 140164663848960, 140164672237567, +ERASE, 140164697415680, 140164697419775, +ERASE, 140164697419776, 140164705808383, +ERASE, 140164831633408, 140164831637503, +ERASE, 140164831637504, 140164840026111, +ERASE, 140165192323072, 140165192327167, +ERASE, 140165192327168, 140165200715775, +ERASE, 140165108461568, 140165108465663, +ERASE, 140165108465664, 140165116854271, +ERASE, 140164840026112, 140164840030207, +ERASE, 140164840030208, 140164848418815, +ERASE, 140164647059456, 140164647063551, +ERASE, 140164647063552, 140164655452159, +ERASE, 140165083283456, 140165083287551, +ERASE, 140165083287552, 140165091676159, +ERASE, 140164923887616, 140164923891711, +ERASE, 140164923891712, 140164932280319, +ERASE, 140164823240704, 140164823244799, +ERASE, 140164823244800, 140164831633407, +ERASE, 140164227653632, 140164227657727, +ERASE, 140164227657728, 140164236046335, +ERASE, 140164957458432, 140164957462527, +ERASE, 140164957462528, 140164965851135, +ERASE, 140164680630272, 140164680634367, +ERASE, 140164680634368, 140164689022975, +ERASE, 140164974243840, 140164974247935, +ERASE, 140164974247936, 140164982636543, +ERASE, 140165066498048, 140165066502143, +ERASE, 140165066502144, 140165074890751, +ERASE, 140164621881344, 140164621885439, +ERASE, 140164621885440, 140164630274047, +ERASE, 140164949065728, 140164949069823, +ERASE, 140164949069824, 140164957458431, +ERASE, 140164588310528, 140164588314623, +ERASE, 140164588314624, 140164596703231, +ERASE, 140164806455296, 140164806459391, +ERASE, 140164806459392, 140164814847999, +ERASE, 140164940673024, 140164940677119, +ERASE, 140164940677120, 140164949065727, +ERASE, 140164596703232, 140164596707327, +ERASE, 140164596707328, 140164605095935, +ERASE, 140164605095936, 140164605100031, +ERASE, 140164605100032, 
140164613488639, +ERASE, 140164655452160, 140164655456255, +ERASE, 140164655456256, 140164663844863, +ERASE, 140164705808384, 140164705812479, +ERASE, 140164705812480, 140164714201087, +ERASE, 140164689022976, 140164689027071, +ERASE, 140164689027072, 140164697415679, +ERASE, 140164630274048, 140164630278143, +ERASE, 140164630278144, 140164638666751, +ERASE, 140164479303680, 140164479307775, +ERASE, 140164479307776, 140164487696383, +ERASE, 140164236046336, 140164236050431, +ERASE, 140164236050432, 140164244439039, +ERASE, 140164085043200, 140164085047295, +ERASE, 140164085047296, 140164093435903, +ERASE, 140164345085952, 140164345090047, +ERASE, 140164345090048, 140164353478655, +ERASE, 140164101828608, 140164101832703, +ERASE, 140164101832704, 140164110221311, +ERASE, 140164370264064, 140164370268159, +ERASE, 140164370268160, 140164378656767, +ERASE, 140164336693248, 140164336697343, +ERASE, 140164336697344, 140164345085951, +ERASE, 140164194082816, 140164194086911, +ERASE, 140164194086912, 140164202475519, +ERASE, 140164353478656, 140164353482751, +ERASE, 140164353482752, 140164361871359, +ERASE, 140164210868224, 140164210872319, +ERASE, 140164210872320, 140164219260927, +ERASE, 140164814848000, 140164814852095, +ERASE, 140164814852096, 140164823240703, +ERASE, 140164504481792, 140164504485887, +ERASE, 140164504485888, 140164512874495, +ERASE, 140165100068864, 140165100072959, +ERASE, 140165100072960, 140165108461567, +ERASE, 140164361871360, 140164361875455, +ERASE, 140164361875456, 140164370264063, +ERASE, 140164470910976, 140164470915071, +ERASE, 140164470915072, 140164479303679, +ERASE, 140164076650496, 140164076654591, +ERASE, 140164076654592, 140164085043199, +ERASE, 140164202475520, 140164202479615, +ERASE, 140164202479616, 140164210868223, +ERASE, 140164462518272, 140164462522367, +ERASE, 140164462522368, 140164470910975, +ERASE, 140165351718912, 140165351723007, +ERASE, 140165351723008, 140165360111615, +ERASE, 140164328300544, 140164328304639, +ERASE, 
140164328304640, 140164336693247, +ERASE, 140164093435904, 140164093439999, +ERASE, 140164093440000, 140164101828607, +ERASE, 140165603368960, 140165603373055, +ERASE, 140165603373056, 140165611761663, +ERASE, 140165368504320, 140165368508415, +ERASE, 140165368508416, 140165376897023, +ERASE, 140165334933504, 140165334937599, +ERASE, 140165334937600, 140165343326207, +ERASE, 140165594976256, 140165594980351, +ERASE, 140165594980352, 140165603368959, +ERASE, 140164487696384, 140164487700479, +ERASE, 140164487700480, 140164496089087, +ERASE, 140164219260928, 140164219265023, +ERASE, 140164219265024, 140164227653631, +ERASE, 140164185690112, 140164185694207, +ERASE, 140164185694208, 140164194082815, +ERASE, 140164068257792, 140164068261887, +ERASE, 140164068261888, 140164076650495, +ERASE, 140165225893888, 140165225897983, +ERASE, 140165225897984, 140165234286591, +ERASE, 140165058105344, 140165058109439, + }; + unsigned long set31[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140730890784768, 140737488351231, +SNULL, 140730890788863, 140737488351231, +STORE, 140730890784768, 140730890788863, +STORE, 140730890653696, 140730890788863, +STORE, 94577123659776, 94577125912575, +SNULL, 94577123790847, 94577125912575, +STORE, 94577123659776, 94577123790847, +STORE, 94577123790848, 94577125912575, +ERASE, 94577123790848, 94577125912575, +STORE, 94577125883904, 94577125892095, +STORE, 94577125892096, 94577125912575, +STORE, 140624060407808, 140624062660607, +SNULL, 140624060551167, 140624062660607, +STORE, 140624060407808, 140624060551167, +STORE, 140624060551168, 140624062660607, +ERASE, 140624060551168, 140624062660607, +STORE, 140624062648320, 140624062656511, +STORE, 140624062656512, 140624062660607, +STORE, 140730892140544, 140730892144639, +STORE, 140730892128256, 140730892140543, +STORE, 140624062619648, 140624062648319, +STORE, 140624062611456, 140624062619647, +STORE, 140624058191872, 140624060407807, +SNULL, 140624058191872, 140624058290175, +STORE, 
140624058290176, 140624060407807, +STORE, 140624058191872, 140624058290175, +SNULL, 140624060383231, 140624060407807, +STORE, 140624058290176, 140624060383231, +STORE, 140624060383232, 140624060407807, +SNULL, 140624060383232, 140624060391423, +STORE, 140624060391424, 140624060407807, +STORE, 140624060383232, 140624060391423, +ERASE, 140624060383232, 140624060391423, +STORE, 140624060383232, 140624060391423, +ERASE, 140624060391424, 140624060407807, +STORE, 140624060391424, 140624060407807, +STORE, 140624054394880, 140624058191871, +SNULL, 140624054394880, 140624056053759, +STORE, 140624056053760, 140624058191871, +STORE, 140624054394880, 140624056053759, +SNULL, 140624058150911, 140624058191871, +STORE, 140624056053760, 140624058150911, +STORE, 140624058150912, 140624058191871, +SNULL, 140624058150912, 140624058175487, +STORE, 140624058175488, 140624058191871, +STORE, 140624058150912, 140624058175487, +ERASE, 140624058150912, 140624058175487, +STORE, 140624058150912, 140624058175487, +ERASE, 140624058175488, 140624058191871, +STORE, 140624058175488, 140624058191871, +STORE, 140624062603264, 140624062619647, +SNULL, 140624058167295, 140624058175487, +STORE, 140624058150912, 140624058167295, +STORE, 140624058167296, 140624058175487, +SNULL, 140624060387327, 140624060391423, +STORE, 140624060383232, 140624060387327, +STORE, 140624060387328, 140624060391423, +SNULL, 94577125887999, 94577125892095, +STORE, 94577125883904, 94577125887999, +STORE, 94577125888000, 94577125892095, +SNULL, 140624062652415, 140624062656511, +STORE, 140624062648320, 140624062652415, +STORE, 140624062652416, 140624062656511, +ERASE, 140624062619648, 140624062648319, +STORE, 94577157709824, 94577157844991, +STORE, 140624046002176, 140624054394879, +SNULL, 140624046006271, 140624054394879, +STORE, 140624046002176, 140624046006271, +STORE, 140624046006272, 140624054394879, +STORE, 140624037609472, 140624046002175, +STORE, 140623903391744, 140624037609471, +SNULL, 140623903391744, 140623940157439, 
+STORE, 140623940157440, 140624037609471, +STORE, 140623903391744, 140623940157439, +ERASE, 140623903391744, 140623940157439, +SNULL, 140624007266303, 140624037609471, +STORE, 140623940157440, 140624007266303, +STORE, 140624007266304, 140624037609471, +ERASE, 140624007266304, 140624037609471, +SNULL, 140623940292607, 140624007266303, +STORE, 140623940157440, 140623940292607, +STORE, 140623940292608, 140624007266303, +SNULL, 140624037613567, 140624046002175, +STORE, 140624037609472, 140624037613567, +STORE, 140624037613568, 140624046002175, +STORE, 140624029216768, 140624037609471, +SNULL, 140624029220863, 140624037609471, +STORE, 140624029216768, 140624029220863, +STORE, 140624029220864, 140624037609471, +STORE, 140624020824064, 140624029216767, +SNULL, 140624020828159, 140624029216767, +STORE, 140624020824064, 140624020828159, +STORE, 140624020828160, 140624029216767, +STORE, 140624012431360, 140624020824063, +SNULL, 140624012435455, 140624020824063, +STORE, 140624012431360, 140624012435455, +STORE, 140624012435456, 140624020824063, +STORE, 140623931764736, 140623940157439, +STORE, 140623797547008, 140623931764735, +SNULL, 140623797547008, 140623805939711, +STORE, 140623805939712, 140623931764735, +STORE, 140623797547008, 140623805939711, +ERASE, 140623797547008, 140623805939711, +SNULL, 140623873048575, 140623931764735, +STORE, 140623805939712, 140623873048575, +STORE, 140623873048576, 140623931764735, +ERASE, 140623873048576, 140623931764735, +STORE, 140623923372032, 140623940157439, +STORE, 140623914979328, 140623940157439, +STORE, 140623906586624, 140623940157439, +STORE, 140623671721984, 140623873048575, +SNULL, 140623738830847, 140623873048575, +STORE, 140623671721984, 140623738830847, +STORE, 140623738830848, 140623873048575, +SNULL, 140623738830848, 140623805939711, +STORE, 140623805939712, 140623873048575, +STORE, 140623738830848, 140623805939711, +ERASE, 140623738830848, 140623805939711, +SNULL, 140623806074879, 140623873048575, +STORE, 140623805939712, 
140623806074879, +STORE, 140623806074880, 140623873048575, +SNULL, 140623906586624, 140623931764735, +STORE, 140623931764736, 140623940157439, +STORE, 140623906586624, 140623931764735, +SNULL, 140623931768831, 140623940157439, +STORE, 140623931764736, 140623931768831, +STORE, 140623931768832, 140623940157439, +STORE, 140623537504256, 140623738830847, +SNULL, 140623537504256, 140623671721983, +STORE, 140623671721984, 140623738830847, +STORE, 140623537504256, 140623671721983, +SNULL, 140623671857151, 140623738830847, +STORE, 140623671721984, 140623671857151, +STORE, 140623671857152, 140623738830847, +SNULL, 140623604613119, 140623671721983, +STORE, 140623537504256, 140623604613119, +STORE, 140623604613120, 140623671721983, +ERASE, 140623604613120, 140623671721983, +SNULL, 140623537639423, 140623604613119, +STORE, 140623537504256, 140623537639423, +STORE, 140623537639424, 140623604613119, +STORE, 140623537639424, 140623671721983, +SNULL, 140623537639424, 140623604613119, +STORE, 140623604613120, 140623671721983, +STORE, 140623537639424, 140623604613119, +SNULL, 140623604748287, 140623671721983, +STORE, 140623604613120, 140623604748287, +STORE, 140623604748288, 140623671721983, +STORE, 140623898193920, 140623931764735, +SNULL, 140623898193920, 140623923372031, +STORE, 140623923372032, 140623931764735, +STORE, 140623898193920, 140623923372031, +SNULL, 140623923376127, 140623931764735, +STORE, 140623923372032, 140623923376127, +STORE, 140623923376128, 140623931764735, +STORE, 140623889801216, 140623923372031, +SNULL, 140623889801216, 140623898193919, +STORE, 140623898193920, 140623923372031, +STORE, 140623889801216, 140623898193919, +SNULL, 140623898198015, 140623923372031, +STORE, 140623898193920, 140623898198015, +STORE, 140623898198016, 140623923372031, +SNULL, 140623889805311, 140623898193919, +STORE, 140623889801216, 140623889805311, +STORE, 140623889805312, 140623898193919, +SNULL, 140623898198016, 140623906586623, +STORE, 140623906586624, 140623923372031, +STORE, 
140623898198016, 140623906586623, +SNULL, 140623906590719, 140623923372031, +STORE, 140623906586624, 140623906590719, +STORE, 140623906590720, 140623923372031, +STORE, 140623881408512, 140623889801215, +SNULL, 140623906590720, 140623914979327, +STORE, 140623914979328, 140623923372031, +STORE, 140623906590720, 140623914979327, +SNULL, 140623914983423, 140623923372031, +STORE, 140623914979328, 140623914983423, +STORE, 140623914983424, 140623923372031, +SNULL, 140623881412607, 140623889801215, +STORE, 140623881408512, 140623881412607, +STORE, 140623881412608, 140623889801215, +STORE, 140623797547008, 140623805939711, +STORE, 140623789154304, 140623805939711, +STORE, 140623780761600, 140623805939711, +SNULL, 140623780761600, 140623789154303, +STORE, 140623789154304, 140623805939711, +STORE, 140623780761600, 140623789154303, +SNULL, 140623789158399, 140623805939711, +STORE, 140623789154304, 140623789158399, +STORE, 140623789158400, 140623805939711, +STORE, 140623772368896, 140623789154303, +STORE, 140623763976192, 140623789154303, +SNULL, 140623763976192, 140623780761599, +STORE, 140623780761600, 140623789154303, +STORE, 140623763976192, 140623780761599, +SNULL, 140623780765695, 140623789154303, +STORE, 140623780761600, 140623780765695, +STORE, 140623780765696, 140623789154303, +SNULL, 140623789158400, 140623797547007, +STORE, 140623797547008, 140623805939711, +STORE, 140623789158400, 140623797547007, +SNULL, 140623797551103, 140623805939711, +STORE, 140623797547008, 140623797551103, +STORE, 140623797551104, 140623805939711, +SNULL, 140623763976192, 140623772368895, +STORE, 140623772368896, 140623780761599, +STORE, 140623763976192, 140623772368895, +SNULL, 140623772372991, 140623780761599, +STORE, 140623772368896, 140623772372991, +STORE, 140623772372992, 140623780761599, +SNULL, 140623763980287, 140623772368895, +STORE, 140623763976192, 140623763980287, +STORE, 140623763980288, 140623772368895, +STORE, 140623755583488, 140623763976191, +STORE, 140623747190784, 
140623763976191, +SNULL, 140623747190784, 140623755583487, +STORE, 140623755583488, 140623763976191, +STORE, 140623747190784, 140623755583487, +SNULL, 140623755587583, 140623763976191, +STORE, 140623755583488, 140623755587583, +STORE, 140623755587584, 140623763976191, +STORE, 140623529111552, 140623537504255, +SNULL, 140623747194879, 140623755583487, +STORE, 140623747190784, 140623747194879, +STORE, 140623747194880, 140623755583487, +SNULL, 140623529115647, 140623537504255, +STORE, 140623529111552, 140623529115647, +STORE, 140623529115648, 140623537504255, +STORE, 140623520718848, 140623529111551, +SNULL, 140623520722943, 140623529111551, +STORE, 140623520718848, 140623520722943, +STORE, 140623520722944, 140623529111551, +STORE, 140623512326144, 140623520718847, +STORE, 140623503933440, 140623520718847, +STORE, 140623495540736, 140623520718847, +STORE, 140623361323008, 140623495540735, +STORE, 140623227105280, 140623495540735, +STORE, 140623218712576, 140623227105279, +STORE, 140623084494848, 140623218712575, +STORE, 140623076102144, 140623084494847, +STORE, 140622941884416, 140623076102143, +SNULL, 140622941884416, 140623000633343, +STORE, 140623000633344, 140623076102143, +STORE, 140622941884416, 140623000633343, +ERASE, 140622941884416, 140623000633343, +STORE, 140622992240640, 140623000633343, +STORE, 140622983847936, 140623000633343, +STORE, 140622849630208, 140622983847935, +STORE, 140622841237504, 140622849630207, +SNULL, 140622849630208, 140622866415615, +STORE, 140622866415616, 140622983847935, +STORE, 140622849630208, 140622866415615, +ERASE, 140622849630208, 140622866415615, +STORE, 140622858022912, 140622866415615, +SNULL, 140622933524479, 140622983847935, +STORE, 140622866415616, 140622933524479, +STORE, 140622933524480, 140622983847935, +ERASE, 140622933524480, 140622983847935, +STORE, 140622975455232, 140623000633343, +STORE, 140622707019776, 140622841237503, +STORE, 140622967062528, 140623000633343, +STORE, 140622572802048, 140622841237503, +STORE, 
140622958669824, 140623000633343, +STORE, 140622438584320, 140622841237503, +STORE, 140622950277120, 140623000633343, +SNULL, 140622858027007, 140622866415615, +STORE, 140622858022912, 140622858027007, +STORE, 140622858027008, 140622866415615, +STORE, 140622941884416, 140623000633343, +STORE, 140622841237504, 140622858022911, +SNULL, 140622841237504, 140622849630207, +STORE, 140622849630208, 140622858022911, +STORE, 140622841237504, 140622849630207, +SNULL, 140622849634303, 140622858022911, +STORE, 140622849630208, 140622849634303, +STORE, 140622849634304, 140622858022911, +STORE, 140622430191616, 140622438584319, +SNULL, 140622430195711, 140622438584319, +STORE, 140622430191616, 140622430195711, +STORE, 140622430195712, 140622438584319, +SNULL, 140623361323007, 140623495540735, +STORE, 140623227105280, 140623361323007, +STORE, 140623361323008, 140623495540735, +SNULL, 140623361323008, 140623403286527, +STORE, 140623403286528, 140623495540735, +STORE, 140623361323008, 140623403286527, +ERASE, 140623361323008, 140623403286527, +SNULL, 140623470395391, 140623495540735, +STORE, 140623403286528, 140623470395391, +STORE, 140623470395392, 140623495540735, +ERASE, 140623470395392, 140623495540735, +SNULL, 140623227105280, 140623269068799, +STORE, 140623269068800, 140623361323007, +STORE, 140623227105280, 140623269068799, +ERASE, 140623227105280, 140623269068799, +SNULL, 140623084494848, 140623134851071, +STORE, 140623134851072, 140623218712575, +STORE, 140623084494848, 140623134851071, +ERASE, 140623084494848, 140623134851071, +SNULL, 140623201959935, 140623218712575, +STORE, 140623134851072, 140623201959935, +STORE, 140623201959936, 140623218712575, +ERASE, 140623201959936, 140623218712575, +SNULL, 140623067742207, 140623076102143, +STORE, 140623000633344, 140623067742207, +STORE, 140623067742208, 140623076102143, +ERASE, 140623067742208, 140623076102143, +STORE, 140622295973888, 140622430191615, +SNULL, 140622295973888, 140622329544703, +STORE, 140622329544704, 
140622430191615, +STORE, 140622295973888, 140622329544703, +ERASE, 140622295973888, 140622329544703, +SNULL, 140622866550783, 140622933524479, +STORE, 140622866415616, 140622866550783, +STORE, 140622866550784, 140622933524479, +SNULL, 140622707019775, 140622841237503, +STORE, 140622438584320, 140622707019775, +STORE, 140622707019776, 140622841237503, +SNULL, 140622707019776, 140622732197887, +STORE, 140622732197888, 140622841237503, +STORE, 140622707019776, 140622732197887, +ERASE, 140622707019776, 140622732197887, +SNULL, 140622799306751, 140622841237503, +STORE, 140622732197888, 140622799306751, +STORE, 140622799306752, 140622841237503, +ERASE, 140622799306752, 140622841237503, +SNULL, 140622572802047, 140622707019775, +STORE, 140622438584320, 140622572802047, +STORE, 140622572802048, 140622707019775, +SNULL, 140622572802048, 140622597980159, +STORE, 140622597980160, 140622707019775, +STORE, 140622572802048, 140622597980159, +ERASE, 140622572802048, 140622597980159, +SNULL, 140622438584320, 140622463762431, +STORE, 140622463762432, 140622572802047, +STORE, 140622438584320, 140622463762431, +ERASE, 140622438584320, 140622463762431, +SNULL, 140622530871295, 140622572802047, +STORE, 140622463762432, 140622530871295, +STORE, 140622530871296, 140622572802047, +ERASE, 140622530871296, 140622572802047, +STORE, 140622195326976, 140622430191615, +SNULL, 140622262435839, 140622430191615, +STORE, 140622195326976, 140622262435839, +STORE, 140622262435840, 140622430191615, +SNULL, 140622262435840, 140622329544703, +STORE, 140622329544704, 140622430191615, +STORE, 140622262435840, 140622329544703, +ERASE, 140622262435840, 140622329544703, +SNULL, 140622841241599, 140622849630207, +STORE, 140622841237504, 140622841241599, +STORE, 140622841241600, 140622849630207, +STORE, 140623487148032, 140623520718847, +STORE, 140623478755328, 140623520718847, +SNULL, 140622941884416, 140622983847935, +STORE, 140622983847936, 140623000633343, +STORE, 140622941884416, 140622983847935, +SNULL, 
140622983852031, 140623000633343, +STORE, 140622983847936, 140622983852031, +STORE, 140622983852032, 140623000633343, +STORE, 140623394893824, 140623403286527, +SNULL, 140623394897919, 140623403286527, +STORE, 140623394893824, 140623394897919, +STORE, 140623394897920, 140623403286527, +SNULL, 140623403421695, 140623470395391, +STORE, 140623403286528, 140623403421695, +STORE, 140623403421696, 140623470395391, +SNULL, 140623478755328, 140623503933439, +STORE, 140623503933440, 140623520718847, +STORE, 140623478755328, 140623503933439, +SNULL, 140623503937535, 140623520718847, +STORE, 140623503933440, 140623503937535, +STORE, 140623503937536, 140623520718847, +SNULL, 140623336177663, 140623361323007, +STORE, 140623269068800, 140623336177663, +STORE, 140623336177664, 140623361323007, +ERASE, 140623336177664, 140623361323007, +SNULL, 140623269203967, 140623336177663, +STORE, 140623269068800, 140623269203967, +STORE, 140623269203968, 140623336177663, +SNULL, 140623134986239, 140623201959935, +STORE, 140623134851072, 140623134986239, +STORE, 140623134986240, 140623201959935, +SNULL, 140623000768511, 140623067742207, +STORE, 140623000633344, 140623000768511, +STORE, 140623000768512, 140623067742207, +SNULL, 140622396653567, 140622430191615, +STORE, 140622329544704, 140622396653567, +STORE, 140622396653568, 140622430191615, +ERASE, 140622396653568, 140622430191615, +SNULL, 140622732333055, 140622799306751, +STORE, 140622732197888, 140622732333055, +STORE, 140622732333056, 140622799306751, +SNULL, 140622941884416, 140622975455231, +STORE, 140622975455232, 140622983847935, +STORE, 140622941884416, 140622975455231, +SNULL, 140622975459327, 140622983847935, +STORE, 140622975455232, 140622975459327, +STORE, 140622975459328, 140622983847935, +SNULL, 140622665089023, 140622707019775, +STORE, 140622597980160, 140622665089023, +STORE, 140622665089024, 140622707019775, +ERASE, 140622665089024, 140622707019775, +SNULL, 140622598115327, 140622665089023, +STORE, 140622597980160, 
140622598115327, +STORE, 140622598115328, 140622665089023, +SNULL, 140622463897599, 140622530871295, +STORE, 140622463762432, 140622463897599, +STORE, 140622463897600, 140622530871295, +SNULL, 140622195462143, 140622262435839, +STORE, 140622195326976, 140622195462143, +STORE, 140622195462144, 140622262435839, +STORE, 140623386501120, 140623394893823, +SNULL, 140622941884416, 140622950277119, +STORE, 140622950277120, 140622975455231, +STORE, 140622941884416, 140622950277119, +SNULL, 140622950281215, 140622975455231, +STORE, 140622950277120, 140622950281215, +STORE, 140622950281216, 140622975455231, +SNULL, 140622941888511, 140622950277119, +STORE, 140622941884416, 140622941888511, +STORE, 140622941888512, 140622950277119, +STORE, 140623378108416, 140623394893823, +SNULL, 140623478755328, 140623495540735, +STORE, 140623495540736, 140623503933439, +STORE, 140623478755328, 140623495540735, +SNULL, 140623495544831, 140623503933439, +STORE, 140623495540736, 140623495544831, +STORE, 140623495544832, 140623503933439, +SNULL, 140623478755328, 140623487148031, +STORE, 140623487148032, 140623495540735, +STORE, 140623478755328, 140623487148031, +SNULL, 140623487152127, 140623495540735, +STORE, 140623487148032, 140623487152127, +STORE, 140623487152128, 140623495540735, +SNULL, 140623218716671, 140623227105279, +STORE, 140623218712576, 140623218716671, +STORE, 140623218716672, 140623227105279, +SNULL, 140623076106239, 140623084494847, +STORE, 140623076102144, 140623076106239, +STORE, 140623076106240, 140623084494847, +SNULL, 140622329679871, 140622396653567, +STORE, 140622329544704, 140622329679871, +STORE, 140622329679872, 140622396653567, +SNULL, 140622950281216, 140622958669823, +STORE, 140622958669824, 140622975455231, +STORE, 140622950281216, 140622958669823, +SNULL, 140622958673919, 140622975455231, +STORE, 140622958669824, 140622958673919, +STORE, 140622958673920, 140622975455231, +SNULL, 140623503937536, 140623512326143, +STORE, 140623512326144, 140623520718847, +STORE, 
140623503937536, 140623512326143, +SNULL, 140623512330239, 140623520718847, +STORE, 140623512326144, 140623512330239, +STORE, 140623512330240, 140623520718847, +SNULL, 140623378108416, 140623386501119, +STORE, 140623386501120, 140623394893823, +STORE, 140623378108416, 140623386501119, +SNULL, 140623386505215, 140623394893823, +STORE, 140623386501120, 140623386505215, +STORE, 140623386505216, 140623394893823, +STORE, 140623369715712, 140623386501119, +STORE, 140623361323008, 140623386501119, +STORE, 140623352930304, 140623386501119, +SNULL, 140623352930304, 140623361323007, +STORE, 140623361323008, 140623386501119, +STORE, 140623352930304, 140623361323007, +SNULL, 140623361327103, 140623386501119, +STORE, 140623361323008, 140623361327103, +STORE, 140623361327104, 140623386501119, +SNULL, 140623478759423, 140623487148031, +STORE, 140623478755328, 140623478759423, +STORE, 140623478759424, 140623487148031, +STORE, 140623344537600, 140623361323007, +STORE, 140623260676096, 140623269068799, +SNULL, 140622958673920, 140622967062527, +STORE, 140622967062528, 140622975455231, +STORE, 140622958673920, 140622967062527, +SNULL, 140622967066623, 140622975455231, +STORE, 140622967062528, 140622967066623, +STORE, 140622967066624, 140622975455231, +STORE, 140623252283392, 140623269068799, +STORE, 140623243890688, 140623269068799, +SNULL, 140622983852032, 140622992240639, +STORE, 140622992240640, 140623000633343, +STORE, 140622983852032, 140622992240639, +SNULL, 140622992244735, 140623000633343, +STORE, 140622992240640, 140622992244735, +STORE, 140622992244736, 140623000633343, +STORE, 140623235497984, 140623269068799, +STORE, 140623218716672, 140623235497983, +STORE, 140623210319872, 140623218712575, +STORE, 140623126458368, 140623134851071, +SNULL, 140623210323967, 140623218712575, +STORE, 140623210319872, 140623210323967, +STORE, 140623210323968, 140623218712575, +SNULL, 140623218716672, 140623227105279, +STORE, 140623227105280, 140623235497983, +STORE, 140623218716672, 
140623227105279, +SNULL, 140623227109375, 140623235497983, +STORE, 140623227105280, 140623227109375, +STORE, 140623227109376, 140623235497983, +STORE, 140623118065664, 140623134851071, +STORE, 140623109672960, 140623134851071, +SNULL, 140623109677055, 140623134851071, +STORE, 140623109672960, 140623109677055, +STORE, 140623109677056, 140623134851071, +STORE, 140623101280256, 140623109672959, +STORE, 140623092887552, 140623109672959, +SNULL, 140623092887552, 140623101280255, +STORE, 140623101280256, 140623109672959, +STORE, 140623092887552, 140623101280255, +SNULL, 140623101284351, 140623109672959, +STORE, 140623101280256, 140623101284351, +STORE, 140623101284352, 140623109672959, +SNULL, 140623361327104, 140623378108415, +STORE, 140623378108416, 140623386501119, +STORE, 140623361327104, 140623378108415, +SNULL, 140623378112511, 140623386501119, +STORE, 140623378108416, 140623378112511, +STORE, 140623378112512, 140623386501119, +SNULL, 140623235497984, 140623243890687, +STORE, 140623243890688, 140623269068799, +STORE, 140623235497984, 140623243890687, +SNULL, 140623243894783, 140623269068799, +STORE, 140623243890688, 140623243894783, +STORE, 140623243894784, 140623269068799, +SNULL, 140623361327104, 140623369715711, +STORE, 140623369715712, 140623378108415, +STORE, 140623361327104, 140623369715711, +SNULL, 140623369719807, 140623378108415, +STORE, 140623369715712, 140623369719807, +STORE, 140623369719808, 140623378108415, +SNULL, 140623243894784, 140623252283391, +STORE, 140623252283392, 140623269068799, +STORE, 140623243894784, 140623252283391, +SNULL, 140623252287487, 140623269068799, +STORE, 140623252283392, 140623252287487, +STORE, 140623252287488, 140623269068799, +SNULL, 140623235502079, 140623243890687, +STORE, 140623235497984, 140623235502079, +STORE, 140623235502080, 140623243890687, +SNULL, 140623344541695, 140623361323007, +STORE, 140623344537600, 140623344541695, +STORE, 140623344541696, 140623361323007, +STORE, 140623076106240, 140623092887551, +SNULL, 
140623076106240, 140623084494847, +STORE, 140623084494848, 140623092887551, +STORE, 140623076106240, 140623084494847, +SNULL, 140623084498943, 140623092887551, +STORE, 140623084494848, 140623084498943, +STORE, 140623084498944, 140623092887551, +SNULL, 140623344541696, 140623352930303, +STORE, 140623352930304, 140623361323007, +STORE, 140623344541696, 140623352930303, +SNULL, 140623352934399, 140623361323007, +STORE, 140623352930304, 140623352934399, +STORE, 140623352934400, 140623361323007, +SNULL, 140623109677056, 140623118065663, +STORE, 140623118065664, 140623134851071, +STORE, 140623109677056, 140623118065663, +SNULL, 140623118069759, 140623134851071, +STORE, 140623118065664, 140623118069759, +STORE, 140623118069760, 140623134851071, +STORE, 140622832844800, 140622841237503, +STORE, 140622824452096, 140622841237503, +SNULL, 140622824452096, 140622832844799, +STORE, 140622832844800, 140622841237503, +STORE, 140622824452096, 140622832844799, +SNULL, 140622832848895, 140622841237503, +STORE, 140622832844800, 140622832848895, +STORE, 140622832848896, 140622841237503, +STORE, 140622816059392, 140622832844799, +SNULL, 140623092891647, 140623101280255, +STORE, 140623092887552, 140623092891647, +STORE, 140623092891648, 140623101280255, +SNULL, 140623118069760, 140623126458367, +STORE, 140623126458368, 140623134851071, +STORE, 140623118069760, 140623126458367, +SNULL, 140623126462463, 140623134851071, +STORE, 140623126458368, 140623126462463, +STORE, 140623126462464, 140623134851071, +SNULL, 140623252287488, 140623260676095, +STORE, 140623260676096, 140623269068799, +STORE, 140623252287488, 140623260676095, +SNULL, 140623260680191, 140623269068799, +STORE, 140623260676096, 140623260680191, +STORE, 140623260680192, 140623269068799, +STORE, 140622807666688, 140622832844799, +STORE, 140622723805184, 140622732197887, +STORE, 140622715412480, 140622732197887, +STORE, 140622707019776, 140622732197887, +SNULL, 140622707023871, 140622732197887, +STORE, 140622707019776, 
140622707023871, +STORE, 140622707023872, 140622732197887, +STORE, 140622698627072, 140622707019775, +STORE, 140622690234368, 140622707019775, +SNULL, 140622690238463, 140622707019775, +STORE, 140622690234368, 140622690238463, +STORE, 140622690238464, 140622707019775, +SNULL, 140622807666688, 140622816059391, +STORE, 140622816059392, 140622832844799, +STORE, 140622807666688, 140622816059391, +SNULL, 140622816063487, 140622832844799, +STORE, 140622816059392, 140622816063487, +STORE, 140622816063488, 140622832844799, +STORE, 140622681841664, 140622690234367, +STORE, 140622673448960, 140622690234367, +SNULL, 140622673453055, 140622690234367, +STORE, 140622673448960, 140622673453055, +STORE, 140622673453056, 140622690234367, +STORE, 140622589587456, 140622597980159, +SNULL, 140622807670783, 140622816059391, +STORE, 140622807666688, 140622807670783, +STORE, 140622807670784, 140622816059391, +STORE, 140622581194752, 140622597980159, +SNULL, 140622581198847, 140622597980159, +STORE, 140622581194752, 140622581198847, +STORE, 140622581198848, 140622597980159, +SNULL, 140622816063488, 140622824452095, +STORE, 140622824452096, 140622832844799, +STORE, 140622816063488, 140622824452095, +SNULL, 140622824456191, 140622832844799, +STORE, 140622824452096, 140622824456191, +STORE, 140622824456192, 140622832844799, +STORE, 140622572802048, 140622581194751, +SNULL, 140622572806143, 140622581194751, +STORE, 140622572802048, 140622572806143, +STORE, 140622572806144, 140622581194751, +STORE, 140622564409344, 140622572802047, +STORE, 140622556016640, 140622572802047, +SNULL, 140622556016640, 140622564409343, +STORE, 140622564409344, 140622572802047, +STORE, 140622556016640, 140622564409343, +SNULL, 140622564413439, 140622572802047, +STORE, 140622564409344, 140622564413439, +STORE, 140622564413440, 140622572802047, +SNULL, 140622690238464, 140622698627071, +STORE, 140622698627072, 140622707019775, +STORE, 140622690238464, 140622698627071, +SNULL, 140622698631167, 140622707019775, +STORE, 
140622698627072, 140622698631167, +STORE, 140622698631168, 140622707019775, +SNULL, 140622707023872, 140622723805183, +STORE, 140622723805184, 140622732197887, +STORE, 140622707023872, 140622723805183, +SNULL, 140622723809279, 140622732197887, +STORE, 140622723805184, 140622723809279, +STORE, 140622723809280, 140622732197887, +SNULL, 140622707023872, 140622715412479, +STORE, 140622715412480, 140622723805183, +STORE, 140622707023872, 140622715412479, +SNULL, 140622715416575, 140622723805183, +STORE, 140622715412480, 140622715416575, +STORE, 140622715416576, 140622723805183, +STORE, 140622547623936, 140622564409343, +SNULL, 140622547628031, 140622564409343, +STORE, 140622547623936, 140622547628031, +STORE, 140622547628032, 140622564409343, +STORE, 140622539231232, 140622547623935, +SNULL, 140622539235327, 140622547623935, +STORE, 140622539231232, 140622539235327, +STORE, 140622539235328, 140622547623935, +SNULL, 140622581198848, 140622589587455, +STORE, 140622589587456, 140622597980159, +STORE, 140622581198848, 140622589587455, +SNULL, 140622589591551, 140622597980159, +STORE, 140622589587456, 140622589591551, +STORE, 140622589591552, 140622597980159, +STORE, 140622455369728, 140622463762431, +SNULL, 140622455373823, 140622463762431, +STORE, 140622455369728, 140622455373823, +STORE, 140622455373824, 140622463762431, +STORE, 140622446977024, 140622455369727, +SNULL, 140622446981119, 140622455369727, +STORE, 140622446977024, 140622446981119, +STORE, 140622446981120, 140622455369727, +SNULL, 140622547628032, 140622556016639, +STORE, 140622556016640, 140622564409343, +STORE, 140622547628032, 140622556016639, +SNULL, 140622556020735, 140622564409343, +STORE, 140622556016640, 140622556020735, +STORE, 140622556020736, 140622564409343, +STORE, 140622430195712, 140622446977023, +STORE, 140622421798912, 140622430191615, +SNULL, 140622430195712, 140622438584319, +STORE, 140622438584320, 140622446977023, +STORE, 140622430195712, 140622438584319, +SNULL, 140622438588415, 
140622446977023, +STORE, 140622438584320, 140622438588415, +STORE, 140622438588416, 140622446977023, +STORE, 140622413406208, 140622430191615, +STORE, 140622405013504, 140622430191615, +SNULL, 140622405013504, 140622413406207, +STORE, 140622413406208, 140622430191615, +STORE, 140622405013504, 140622413406207, +SNULL, 140622413410303, 140622430191615, +STORE, 140622413406208, 140622413410303, +STORE, 140622413410304, 140622430191615, +SNULL, 140622673453056, 140622681841663, +STORE, 140622681841664, 140622690234367, +STORE, 140622673453056, 140622681841663, +SNULL, 140622681845759, 140622690234367, +STORE, 140622681841664, 140622681845759, +STORE, 140622681845760, 140622690234367, +STORE, 140622321152000, 140622329544703, +SNULL, 140622413410304, 140622421798911, +STORE, 140622421798912, 140622430191615, +STORE, 140622413410304, 140622421798911, +SNULL, 140622421803007, 140622430191615, +STORE, 140622421798912, 140622421803007, +STORE, 140622421803008, 140622430191615, +STORE, 140622312759296, 140622329544703, +SNULL, 140622312763391, 140622329544703, +STORE, 140622312759296, 140622312763391, +STORE, 140622312763392, 140622329544703, +SNULL, 140622405017599, 140622413406207, +STORE, 140622405013504, 140622405017599, +STORE, 140622405017600, 140622413406207, +STORE, 140622304366592, 140622312759295, +SNULL, 140622304370687, 140622312759295, +STORE, 140622304366592, 140622304370687, +STORE, 140622304370688, 140622312759295, +SNULL, 140622312763392, 140622321151999, +STORE, 140622321152000, 140622329544703, +STORE, 140622312763392, 140622321151999, +SNULL, 140622321156095, 140622329544703, +STORE, 140622321152000, 140622321156095, +STORE, 140622321156096, 140622329544703, +STORE, 140624062619648, 140624062648319, +STORE, 140624010240000, 140624012431359, +SNULL, 140624010240000, 140624010330111, +STORE, 140624010330112, 140624012431359, +STORE, 140624010240000, 140624010330111, +SNULL, 140624012423167, 140624012431359, +STORE, 140624010330112, 140624012423167, +STORE, 
140624012423168, 140624012431359, +ERASE, 140624012423168, 140624012431359, +STORE, 140624012423168, 140624012431359, +SNULL, 140624012427263, 140624012431359, +STORE, 140624012423168, 140624012427263, +STORE, 140624012427264, 140624012431359, +ERASE, 140624062619648, 140624062648319, +ERASE, 140622849630208, 140622849634303, +ERASE, 140622849634304, 140622858022911, +ERASE, 140623394893824, 140623394897919, +ERASE, 140623394897920, 140623403286527, +ERASE, 140623361323008, 140623361327103, +ERASE, 140623361327104, 140623369715711, +ERASE, 140623084494848, 140623084498943, +ERASE, 140623084498944, 140623092887551, +ERASE, 140623931764736, 140623931768831, +ERASE, 140623931768832, 140623940157439, +ERASE, 140622841237504, 140622841241599, +ERASE, 140622841241600, 140622849630207, +ERASE, 140623487148032, 140623487152127, +ERASE, 140623487152128, 140623495540735, +ERASE, 140623109672960, 140623109677055, +ERASE, 140623109677056, 140623118065663, +ERASE, 140622983847936, 140622983852031, +ERASE, 140622983852032, 140622992240639, +ERASE, 140623352930304, 140623352934399, +ERASE, 140623352934400, 140623361323007, +ERASE, 140622564409344, 140622564413439, +ERASE, 140622564413440, 140622572802047, +ERASE, 140622430191616, 140622430195711, +ERASE, 140622430195712, 140622438584319, +ERASE, 140622958669824, 140622958673919, +ERASE, 140622958673920, 140622967062527, +ERASE, 140622992240640, 140622992244735, +ERASE, 140622992244736, 140623000633343, +ERASE, 140623227105280, 140623227109375, +ERASE, 140623227109376, 140623235497983, +ERASE, 140622321152000, 140622321156095, +ERASE, 140622321156096, 140622329544703, +ERASE, 140622858022912, 140622858027007, +ERASE, 140622858027008, 140622866415615, +ERASE, 140622975455232, 140622975459327, +ERASE, 140622975459328, 140622983847935, +ERASE, 140623378108416, 140623378112511, +ERASE, 140623378112512, 140623386501119, +ERASE, 140623495540736, 140623495544831, +ERASE, 140623495544832, 140623503933439, +ERASE, 140623118065664, 
140623118069759, +ERASE, 140623118069760, 140623126458367, +ERASE, 140622572802048, 140622572806143, +ERASE, 140622572806144, 140622581194751, +ERASE, 140622421798912, 140622421803007, +ERASE, 140622421803008, 140622430191615, +ERASE, 140622967062528, 140622967066623, +ERASE, 140622967066624, 140622975455231, +ERASE, 140623252283392, 140623252287487, +ERASE, 140623252287488, 140623260676095, +ERASE, 140622673448960, 140622673453055, +ERASE, 140622673453056, 140622681841663, +ERASE, 140623076102144, 140623076106239, +ERASE, 140623076106240, 140623084494847, +ERASE, 140623101280256, 140623101284351, +ERASE, 140623101284352, 140623109672959, +ERASE, 140622715412480, 140622715416575, +ERASE, 140622715416576, 140622723805183, +ERASE, 140622405013504, 140622405017599, +ERASE, 140622405017600, 140622413406207, +ERASE, 140623478755328, 140623478759423, +ERASE, 140623478759424, 140623487148031, +ERASE, 140623906586624, 140623906590719, +ERASE, 140623906590720, 140623914979327, +ERASE, 140622950277120, 140622950281215, +ERASE, 140622950281216, 140622958669823, + }; + unsigned long set32[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140731244212224, 140737488351231, +SNULL, 140731244216319, 140737488351231, +STORE, 140731244212224, 140731244216319, +STORE, 140731244081152, 140731244216319, +STORE, 94427773984768, 94427776237567, +SNULL, 94427774115839, 94427776237567, +STORE, 94427773984768, 94427774115839, +STORE, 94427774115840, 94427776237567, +ERASE, 94427774115840, 94427776237567, +STORE, 94427776208896, 94427776217087, +STORE, 94427776217088, 94427776237567, +STORE, 140401464893440, 140401467146239, +SNULL, 140401465036799, 140401467146239, +STORE, 140401464893440, 140401465036799, +STORE, 140401465036800, 140401467146239, +ERASE, 140401465036800, 140401467146239, +STORE, 140401467133952, 140401467142143, +STORE, 140401467142144, 140401467146239, +STORE, 140731244507136, 140731244511231, +STORE, 140731244494848, 140731244507135, +STORE, 140401467105280, 
140401467133951, +STORE, 140401467097088, 140401467105279, +STORE, 140401462677504, 140401464893439, +SNULL, 140401462677504, 140401462775807, +STORE, 140401462775808, 140401464893439, +STORE, 140401462677504, 140401462775807, +SNULL, 140401464868863, 140401464893439, +STORE, 140401462775808, 140401464868863, +STORE, 140401464868864, 140401464893439, +SNULL, 140401464868864, 140401464877055, +STORE, 140401464877056, 140401464893439, +STORE, 140401464868864, 140401464877055, +ERASE, 140401464868864, 140401464877055, +STORE, 140401464868864, 140401464877055, +ERASE, 140401464877056, 140401464893439, +STORE, 140401464877056, 140401464893439, +STORE, 140401458880512, 140401462677503, +SNULL, 140401458880512, 140401460539391, +STORE, 140401460539392, 140401462677503, +STORE, 140401458880512, 140401460539391, +SNULL, 140401462636543, 140401462677503, +STORE, 140401460539392, 140401462636543, +STORE, 140401462636544, 140401462677503, +SNULL, 140401462636544, 140401462661119, +STORE, 140401462661120, 140401462677503, +STORE, 140401462636544, 140401462661119, +ERASE, 140401462636544, 140401462661119, +STORE, 140401462636544, 140401462661119, +ERASE, 140401462661120, 140401462677503, +STORE, 140401462661120, 140401462677503, +STORE, 140401467088896, 140401467105279, +SNULL, 140401462652927, 140401462661119, +STORE, 140401462636544, 140401462652927, +STORE, 140401462652928, 140401462661119, +SNULL, 140401464872959, 140401464877055, +STORE, 140401464868864, 140401464872959, +STORE, 140401464872960, 140401464877055, +SNULL, 94427776212991, 94427776217087, +STORE, 94427776208896, 94427776212991, +STORE, 94427776212992, 94427776217087, +SNULL, 140401467138047, 140401467142143, +STORE, 140401467133952, 140401467138047, +STORE, 140401467138048, 140401467142143, +ERASE, 140401467105280, 140401467133951, +STORE, 94427784683520, 94427784818687, +STORE, 140401450487808, 140401458880511, +SNULL, 140401450491903, 140401458880511, +STORE, 140401450487808, 140401450491903, +STORE, 
140401450491904, 140401458880511, +STORE, 140401442095104, 140401450487807, +STORE, 140401307877376, 140401442095103, +SNULL, 140401307877376, 140401340055551, +STORE, 140401340055552, 140401442095103, +STORE, 140401307877376, 140401340055551, +ERASE, 140401307877376, 140401340055551, +SNULL, 140401407164415, 140401442095103, +STORE, 140401340055552, 140401407164415, +STORE, 140401407164416, 140401442095103, +ERASE, 140401407164416, 140401442095103, +SNULL, 140401340190719, 140401407164415, +STORE, 140401340055552, 140401340190719, +STORE, 140401340190720, 140401407164415, +SNULL, 140401442099199, 140401450487807, +STORE, 140401442095104, 140401442099199, +STORE, 140401442099200, 140401450487807, +STORE, 140401433702400, 140401442095103, +SNULL, 140401433706495, 140401442095103, +STORE, 140401433702400, 140401433706495, +STORE, 140401433706496, 140401442095103, +STORE, 140401425309696, 140401433702399, +SNULL, 140401425313791, 140401433702399, +STORE, 140401425309696, 140401425313791, +STORE, 140401425313792, 140401433702399, +STORE, 140401416916992, 140401425309695, +SNULL, 140401416921087, 140401425309695, +STORE, 140401416916992, 140401416921087, +STORE, 140401416921088, 140401425309695, +STORE, 140401408524288, 140401416916991, +STORE, 140401205837824, 140401340055551, +SNULL, 140401272946687, 140401340055551, +STORE, 140401205837824, 140401272946687, +STORE, 140401272946688, 140401340055551, +ERASE, 140401272946688, 140401340055551, +SNULL, 140401205972991, 140401272946687, +STORE, 140401205837824, 140401205972991, +STORE, 140401205972992, 140401272946687, +STORE, 140401331662848, 140401340055551, +STORE, 140401323270144, 140401340055551, +STORE, 140401138728960, 140401205837823, +STORE, 140401314877440, 140401340055551, +SNULL, 140401408528383, 140401416916991, +STORE, 140401408524288, 140401408528383, +STORE, 140401408528384, 140401416916991, +SNULL, 140401138864127, 140401205837823, +STORE, 140401138728960, 140401138864127, +STORE, 140401138864128, 
140401205837823, +STORE, 140401004511232, 140401138728959, +SNULL, 140401071620095, 140401138728959, +STORE, 140401004511232, 140401071620095, +STORE, 140401071620096, 140401138728959, +ERASE, 140401071620096, 140401138728959, +STORE, 140400870293504, 140401071620095, +SNULL, 140400937402367, 140401071620095, +STORE, 140400870293504, 140400937402367, +STORE, 140400937402368, 140401071620095, +SNULL, 140400937402368, 140401004511231, +STORE, 140401004511232, 140401071620095, +STORE, 140400937402368, 140401004511231, +ERASE, 140400937402368, 140401004511231, +STORE, 140401306484736, 140401340055551, +SNULL, 140401306484736, 140401323270143, +STORE, 140401323270144, 140401340055551, +STORE, 140401306484736, 140401323270143, +SNULL, 140401323274239, 140401340055551, +STORE, 140401323270144, 140401323274239, +STORE, 140401323274240, 140401340055551, +SNULL, 140401004646399, 140401071620095, +STORE, 140401004511232, 140401004646399, +STORE, 140401004646400, 140401071620095, +SNULL, 140400870428671, 140400937402367, +STORE, 140400870293504, 140400870428671, +STORE, 140400870428672, 140400937402367, +SNULL, 140401306488831, 140401323270143, +STORE, 140401306484736, 140401306488831, +STORE, 140401306488832, 140401323270143, +STORE, 140401298092032, 140401306484735, +SNULL, 140401306488832, 140401314877439, +STORE, 140401314877440, 140401323270143, +STORE, 140401306488832, 140401314877439, +SNULL, 140401314881535, 140401323270143, +STORE, 140401314877440, 140401314881535, +STORE, 140401314881536, 140401323270143, +SNULL, 140401323274240, 140401331662847, +STORE, 140401331662848, 140401340055551, +STORE, 140401323274240, 140401331662847, +SNULL, 140401331666943, 140401340055551, +STORE, 140401331662848, 140401331666943, +STORE, 140401331666944, 140401340055551, +SNULL, 140401298096127, 140401306484735, +STORE, 140401298092032, 140401298096127, +STORE, 140401298096128, 140401306484735, +STORE, 140401289699328, 140401298092031, +STORE, 140401281306624, 140401298092031, +STORE, 
140401130336256, 140401138728959, +SNULL, 140401281306624, 140401289699327, +STORE, 140401289699328, 140401298092031, +STORE, 140401281306624, 140401289699327, +SNULL, 140401289703423, 140401298092031, +STORE, 140401289699328, 140401289703423, +STORE, 140401289703424, 140401298092031, +STORE, 140401121943552, 140401138728959, +STORE, 140401113550848, 140401138728959, +SNULL, 140401281310719, 140401289699327, +STORE, 140401281306624, 140401281310719, +STORE, 140401281310720, 140401289699327, +SNULL, 140401113550848, 140401121943551, +STORE, 140401121943552, 140401138728959, +STORE, 140401113550848, 140401121943551, +SNULL, 140401121947647, 140401138728959, +STORE, 140401121943552, 140401121947647, +STORE, 140401121947648, 140401138728959, +STORE, 140401105158144, 140401121943551, +SNULL, 140401121947648, 140401130336255, +STORE, 140401130336256, 140401138728959, +STORE, 140401121947648, 140401130336255, +SNULL, 140401130340351, 140401138728959, +STORE, 140401130336256, 140401130340351, +STORE, 140401130340352, 140401138728959, +STORE, 140401096765440, 140401121943551, +SNULL, 140401096765440, 140401113550847, +STORE, 140401113550848, 140401121943551, +STORE, 140401096765440, 140401113550847, +SNULL, 140401113554943, 140401121943551, +STORE, 140401113550848, 140401113554943, +STORE, 140401113554944, 140401121943551, +STORE, 140401088372736, 140401113550847, +SNULL, 140401088372736, 140401096765439, +STORE, 140401096765440, 140401113550847, +STORE, 140401088372736, 140401096765439, +SNULL, 140401096769535, 140401113550847, +STORE, 140401096765440, 140401096769535, +STORE, 140401096769536, 140401113550847, +SNULL, 140401096769536, 140401105158143, +STORE, 140401105158144, 140401113550847, +STORE, 140401096769536, 140401105158143, +SNULL, 140401105162239, 140401113550847, +STORE, 140401105158144, 140401105162239, +STORE, 140401105162240, 140401113550847, +SNULL, 140401088376831, 140401096765439, +STORE, 140401088372736, 140401088376831, +STORE, 140401088376832, 
140401096765439, +STORE, 140401079980032, 140401088372735, +STORE, 140400996118528, 140401004511231, +SNULL, 140401079984127, 140401088372735, +STORE, 140401079980032, 140401079984127, +STORE, 140401079984128, 140401088372735, +SNULL, 140400996122623, 140401004511231, +STORE, 140400996118528, 140400996122623, +STORE, 140400996122624, 140401004511231, +STORE, 140400987725824, 140400996118527, +STORE, 140400979333120, 140400996118527, +STORE, 140400803184640, 140400870293503, +SNULL, 140400803319807, 140400870293503, +STORE, 140400803184640, 140400803319807, +STORE, 140400803319808, 140400870293503, +SNULL, 140400979333120, 140400987725823, +STORE, 140400987725824, 140400996118527, +STORE, 140400979333120, 140400987725823, +SNULL, 140400987729919, 140400996118527, +STORE, 140400987725824, 140400987729919, +STORE, 140400987729920, 140400996118527, +STORE, 140400970940416, 140400987725823, +STORE, 140400962547712, 140400987725823, +STORE, 140400668966912, 140400803184639, +STORE, 140400954155008, 140400987725823, +STORE, 140400945762304, 140400987725823, +STORE, 140400660574208, 140400668966911, +STORE, 140400593465344, 140400660574207, +STORE, 140400585072640, 140400593465343, +STORE, 140400450854912, 140400585072639, +STORE, 140400442462208, 140400450854911, +STORE, 140400434069504, 140400450854911, +STORE, 140400299851776, 140400434069503, +STORE, 140400291459072, 140400299851775, +SNULL, 140400299851776, 140400333422591, +STORE, 140400333422592, 140400434069503, +STORE, 140400299851776, 140400333422591, +ERASE, 140400299851776, 140400333422591, +STORE, 140400325029888, 140400333422591, +STORE, 140400157241344, 140400291459071, +STORE, 140400316637184, 140400333422591, +STORE, 140400308244480, 140400333422591, +STORE, 140400023023616, 140400291459071, +STORE, 140400291459072, 140400333422591, +SNULL, 140400023023616, 140400064987135, +STORE, 140400064987136, 140400291459071, +STORE, 140400023023616, 140400064987135, +ERASE, 140400023023616, 140400064987135, +STORE, 
140400056594432, 140400064987135, +SNULL, 140400056598527, 140400064987135, +STORE, 140400056594432, 140400056598527, +STORE, 140400056598528, 140400064987135, +STORE, 140399989485568, 140400056594431, +SNULL, 140400291459072, 140400316637183, +STORE, 140400316637184, 140400333422591, +STORE, 140400291459072, 140400316637183, +SNULL, 140400316641279, 140400333422591, +STORE, 140400316637184, 140400316641279, +STORE, 140400316641280, 140400333422591, +STORE, 140399855267840, 140400056594431, +SNULL, 140399855267840, 140399863660543, +STORE, 140399863660544, 140400056594431, +STORE, 140399855267840, 140399863660543, +ERASE, 140399855267840, 140399863660543, +SNULL, 140400736075775, 140400803184639, +STORE, 140400668966912, 140400736075775, +STORE, 140400736075776, 140400803184639, +ERASE, 140400736075776, 140400803184639, +SNULL, 140400669102079, 140400736075775, +STORE, 140400668966912, 140400669102079, +STORE, 140400669102080, 140400736075775, +STORE, 140400669102080, 140400803184639, +SNULL, 140400669102080, 140400736075775, +STORE, 140400736075776, 140400803184639, +STORE, 140400669102080, 140400736075775, +SNULL, 140400736210943, 140400803184639, +STORE, 140400736075776, 140400736210943, +STORE, 140400736210944, 140400803184639, +ERASE, 140400593465344, 140400660574207, +SNULL, 140400450854912, 140400467640319, +STORE, 140400467640320, 140400585072639, +STORE, 140400450854912, 140400467640319, +ERASE, 140400450854912, 140400467640319, +STORE, 140399729442816, 140400056594431, +SNULL, 140400400531455, 140400434069503, +STORE, 140400333422592, 140400400531455, +STORE, 140400400531456, 140400434069503, +ERASE, 140400400531456, 140400434069503, +SNULL, 140400333557759, 140400400531455, +STORE, 140400333422592, 140400333557759, +STORE, 140400333557760, 140400400531455, +SNULL, 140400157241343, 140400291459071, +STORE, 140400064987136, 140400157241343, +STORE, 140400157241344, 140400291459071, +SNULL, 140400157241344, 140400199204863, +STORE, 140400199204864, 
140400291459071, +STORE, 140400157241344, 140400199204863, +ERASE, 140400157241344, 140400199204863, +SNULL, 140400266313727, 140400291459071, +STORE, 140400199204864, 140400266313727, +STORE, 140400266313728, 140400291459071, +ERASE, 140400266313728, 140400291459071, +SNULL, 140400132095999, 140400157241343, +STORE, 140400064987136, 140400132095999, +STORE, 140400132096000, 140400157241343, +ERASE, 140400132096000, 140400157241343, +SNULL, 140400065122303, 140400132095999, +STORE, 140400064987136, 140400065122303, +STORE, 140400065122304, 140400132095999, +SNULL, 140400945762304, 140400954155007, +STORE, 140400954155008, 140400987725823, +STORE, 140400945762304, 140400954155007, +SNULL, 140400954159103, 140400987725823, +STORE, 140400954155008, 140400954159103, +STORE, 140400954159104, 140400987725823, +SNULL, 140400434069504, 140400442462207, +STORE, 140400442462208, 140400450854911, +STORE, 140400434069504, 140400442462207, +SNULL, 140400442466303, 140400450854911, +STORE, 140400442462208, 140400442466303, +STORE, 140400442466304, 140400450854911, +SNULL, 140400291463167, 140400316637183, +STORE, 140400291459072, 140400291463167, +STORE, 140400291463168, 140400316637183, +STORE, 140400652181504, 140400668966911, +STORE, 140400643788800, 140400668966911, +SNULL, 140400291463168, 140400299851775, +STORE, 140400299851776, 140400316637183, +STORE, 140400291463168, 140400299851775, +SNULL, 140400299855871, 140400316637183, +STORE, 140400299851776, 140400299855871, +STORE, 140400299855872, 140400316637183, +STORE, 140400635396096, 140400668966911, +SNULL, 140400635396096, 140400643788799, +STORE, 140400643788800, 140400668966911, +STORE, 140400635396096, 140400643788799, +SNULL, 140400643792895, 140400668966911, +STORE, 140400643788800, 140400643792895, +STORE, 140400643792896, 140400668966911, +SNULL, 140399989485567, 140400056594431, +STORE, 140399729442816, 140399989485567, +STORE, 140399989485568, 140400056594431, +ERASE, 140399989485568, 140400056594431, +SNULL, 
140399930769407, 140399989485567, +STORE, 140399729442816, 140399930769407, +STORE, 140399930769408, 140399989485567, +ERASE, 140399930769408, 140399989485567, +SNULL, 140400945766399, 140400954155007, +STORE, 140400945762304, 140400945766399, +STORE, 140400945766400, 140400954155007, +SNULL, 140400534749183, 140400585072639, +STORE, 140400467640320, 140400534749183, +STORE, 140400534749184, 140400585072639, +ERASE, 140400534749184, 140400585072639, +SNULL, 140399796551679, 140399930769407, +STORE, 140399729442816, 140399796551679, +STORE, 140399796551680, 140399930769407, +SNULL, 140399796551680, 140399863660543, +STORE, 140399863660544, 140399930769407, +STORE, 140399796551680, 140399863660543, +ERASE, 140399796551680, 140399863660543, +SNULL, 140400199340031, 140400266313727, +STORE, 140400199204864, 140400199340031, +STORE, 140400199340032, 140400266313727, +STORE, 140400627003392, 140400643788799, +SNULL, 140400316641280, 140400325029887, +STORE, 140400325029888, 140400333422591, +STORE, 140400316641280, 140400325029887, +SNULL, 140400325033983, 140400333422591, +STORE, 140400325029888, 140400325033983, +STORE, 140400325033984, 140400333422591, +SNULL, 140400627003392, 140400635396095, +STORE, 140400635396096, 140400643788799, +STORE, 140400627003392, 140400635396095, +SNULL, 140400635400191, 140400643788799, +STORE, 140400635396096, 140400635400191, +STORE, 140400635400192, 140400643788799, +SNULL, 140400434073599, 140400442462207, +STORE, 140400434069504, 140400434073599, +STORE, 140400434073600, 140400442462207, +STORE, 140400618610688, 140400635396095, +STORE, 140400610217984, 140400635396095, +SNULL, 140400954159104, 140400962547711, +STORE, 140400962547712, 140400987725823, +STORE, 140400954159104, 140400962547711, +SNULL, 140400962551807, 140400987725823, +STORE, 140400962547712, 140400962551807, +STORE, 140400962551808, 140400987725823, +SNULL, 140400299855872, 140400308244479, +STORE, 140400308244480, 140400316637183, +STORE, 140400299855872, 
140400308244479, +SNULL, 140400308248575, 140400316637183, +STORE, 140400308244480, 140400308248575, +STORE, 140400308248576, 140400316637183, +STORE, 140400601825280, 140400635396095, +SNULL, 140400601829375, 140400635396095, +STORE, 140400601825280, 140400601829375, +STORE, 140400601829376, 140400635396095, +STORE, 140400576679936, 140400593465343, +SNULL, 140400576684031, 140400593465343, +STORE, 140400576679936, 140400576684031, +STORE, 140400576684032, 140400593465343, +SNULL, 140400643792896, 140400652181503, +STORE, 140400652181504, 140400668966911, +STORE, 140400643792896, 140400652181503, +SNULL, 140400652185599, 140400668966911, +STORE, 140400652181504, 140400652185599, +STORE, 140400652185600, 140400668966911, +STORE, 140399595225088, 140399796551679, +SNULL, 140399662333951, 140399796551679, +STORE, 140399595225088, 140399662333951, +STORE, 140399662333952, 140399796551679, +SNULL, 140399662333952, 140399729442815, +STORE, 140399729442816, 140399796551679, +STORE, 140399662333952, 140399729442815, +ERASE, 140399662333952, 140399729442815, +SNULL, 140399863795711, 140399930769407, +STORE, 140399863660544, 140399863795711, +STORE, 140399863795712, 140399930769407, +STORE, 140400568287232, 140400576679935, +SNULL, 140400568291327, 140400576679935, +STORE, 140400568287232, 140400568291327, +STORE, 140400568291328, 140400576679935, +SNULL, 140400467775487, 140400534749183, +STORE, 140400467640320, 140400467775487, +STORE, 140400467775488, 140400534749183, +SNULL, 140399729577983, 140399796551679, +STORE, 140399729442816, 140399729577983, +STORE, 140399729577984, 140399796551679, +SNULL, 140400601829376, 140400627003391, +STORE, 140400627003392, 140400635396095, +STORE, 140400601829376, 140400627003391, +SNULL, 140400627007487, 140400635396095, +STORE, 140400627003392, 140400627007487, +STORE, 140400627007488, 140400635396095, +STORE, 140400559894528, 140400568287231, +STORE, 140400551501824, 140400568287231, +STORE, 140400543109120, 140400568287231, +STORE, 
140400459247616, 140400467640319, +STORE, 140400442466304, 140400467640319, +SNULL, 140399595360255, 140399662333951, +STORE, 140399595225088, 140399595360255, +STORE, 140399595360256, 140399662333951, +SNULL, 140400962551808, 140400970940415, +STORE, 140400970940416, 140400987725823, +STORE, 140400962551808, 140400970940415, +SNULL, 140400970944511, 140400987725823, +STORE, 140400970940416, 140400970944511, +STORE, 140400970944512, 140400987725823, +SNULL, 140400652185600, 140400660574207, +STORE, 140400660574208, 140400668966911, +STORE, 140400652185600, 140400660574207, +SNULL, 140400660578303, 140400668966911, +STORE, 140400660574208, 140400660578303, +STORE, 140400660578304, 140400668966911, +SNULL, 140400576684032, 140400585072639, +STORE, 140400585072640, 140400593465343, +STORE, 140400576684032, 140400585072639, +SNULL, 140400585076735, 140400593465343, +STORE, 140400585072640, 140400585076735, +STORE, 140400585076736, 140400593465343, +STORE, 140400425676800, 140400434069503, +STORE, 140400417284096, 140400434069503, +STORE, 140400408891392, 140400434069503, +SNULL, 140400408891392, 140400417284095, +STORE, 140400417284096, 140400434069503, +STORE, 140400408891392, 140400417284095, +SNULL, 140400417288191, 140400434069503, +STORE, 140400417284096, 140400417288191, +STORE, 140400417288192, 140400434069503, +STORE, 140400283066368, 140400291459071, +SNULL, 140400601829376, 140400618610687, +STORE, 140400618610688, 140400627003391, +STORE, 140400601829376, 140400618610687, +SNULL, 140400618614783, 140400627003391, +STORE, 140400618610688, 140400618614783, +STORE, 140400618614784, 140400627003391, +SNULL, 140400601829376, 140400610217983, +STORE, 140400610217984, 140400618610687, +STORE, 140400601829376, 140400610217983, +SNULL, 140400610222079, 140400618610687, +STORE, 140400610217984, 140400610222079, +STORE, 140400610222080, 140400618610687, +STORE, 140400274673664, 140400291459071, +STORE, 140400190812160, 140400199204863, +STORE, 140400182419456, 
140400199204863, +SNULL, 140400442466304, 140400450854911, +STORE, 140400450854912, 140400467640319, +STORE, 140400442466304, 140400450854911, +SNULL, 140400450859007, 140400467640319, +STORE, 140400450854912, 140400450859007, +STORE, 140400450859008, 140400467640319, +SNULL, 140400543109120, 140400559894527, +STORE, 140400559894528, 140400568287231, +STORE, 140400543109120, 140400559894527, +SNULL, 140400559898623, 140400568287231, +STORE, 140400559894528, 140400559898623, +STORE, 140400559898624, 140400568287231, +SNULL, 140400450859008, 140400459247615, +STORE, 140400459247616, 140400467640319, +STORE, 140400450859008, 140400459247615, +SNULL, 140400459251711, 140400467640319, +STORE, 140400459247616, 140400459251711, +STORE, 140400459251712, 140400467640319, +SNULL, 140400543113215, 140400559894527, +STORE, 140400543109120, 140400543113215, +STORE, 140400543113216, 140400559894527, +SNULL, 140400970944512, 140400979333119, +STORE, 140400979333120, 140400987725823, +STORE, 140400970944512, 140400979333119, +SNULL, 140400979337215, 140400987725823, +STORE, 140400979333120, 140400979337215, +STORE, 140400979337216, 140400987725823, +STORE, 140400174026752, 140400199204863, +SNULL, 140400174030847, 140400199204863, +STORE, 140400174026752, 140400174030847, +STORE, 140400174030848, 140400199204863, +SNULL, 140400274673664, 140400283066367, +STORE, 140400283066368, 140400291459071, +STORE, 140400274673664, 140400283066367, +SNULL, 140400283070463, 140400291459071, +STORE, 140400283066368, 140400283070463, +STORE, 140400283070464, 140400291459071, +STORE, 140400165634048, 140400174026751, +SNULL, 140400165638143, 140400174026751, +STORE, 140400165634048, 140400165638143, +STORE, 140400165638144, 140400174026751, +SNULL, 140400174030848, 140400182419455, +STORE, 140400182419456, 140400199204863, +STORE, 140400174030848, 140400182419455, +SNULL, 140400182423551, 140400199204863, +STORE, 140400182419456, 140400182423551, +STORE, 140400182423552, 140400199204863, +SNULL, 
140400182423552, 140400190812159, +STORE, 140400190812160, 140400199204863, +STORE, 140400182423552, 140400190812159, +SNULL, 140400190816255, 140400199204863, +STORE, 140400190812160, 140400190816255, +STORE, 140400190816256, 140400199204863, +STORE, 140400157241344, 140400165634047, +SNULL, 140400157245439, 140400165634047, +STORE, 140400157241344, 140400157245439, +STORE, 140400157245440, 140400165634047, +SNULL, 140400408895487, 140400417284095, +STORE, 140400408891392, 140400408895487, +STORE, 140400408895488, 140400417284095, +SNULL, 140400417288192, 140400425676799, +STORE, 140400425676800, 140400434069503, +STORE, 140400417288192, 140400425676799, +SNULL, 140400425680895, 140400434069503, +STORE, 140400425676800, 140400425680895, +STORE, 140400425680896, 140400434069503, +STORE, 140400148848640, 140400157241343, +SNULL, 140400148852735, 140400157241343, +STORE, 140400148848640, 140400148852735, +STORE, 140400148852736, 140400157241343, +SNULL, 140400543113216, 140400551501823, +STORE, 140400551501824, 140400559894527, +STORE, 140400543113216, 140400551501823, +SNULL, 140400551505919, 140400559894527, +STORE, 140400551501824, 140400551505919, +STORE, 140400551505920, 140400559894527, +STORE, 140400140455936, 140400148848639, +STORE, 140400048201728, 140400056594431, +SNULL, 140400140460031, 140400148848639, +STORE, 140400140455936, 140400140460031, +STORE, 140400140460032, 140400148848639, +STORE, 140400039809024, 140400056594431, +SNULL, 140400039813119, 140400056594431, +STORE, 140400039809024, 140400039813119, +STORE, 140400039813120, 140400056594431, +STORE, 140400031416320, 140400039809023, +STORE, 140400023023616, 140400039809023, +SNULL, 140400274677759, 140400283066367, +STORE, 140400274673664, 140400274677759, +STORE, 140400274677760, 140400283066367, +STORE, 140400014630912, 140400039809023, +STORE, 140400006238208, 140400039809023, +STORE, 140399997845504, 140400039809023, +SNULL, 140399997849599, 140400039809023, +STORE, 140399997845504, 
140399997849599, +STORE, 140399997849600, 140400039809023, +STORE, 140399989452800, 140399997845503, +SNULL, 140399989456895, 140399997845503, +STORE, 140399989452800, 140399989456895, +STORE, 140399989456896, 140399997845503, +STORE, 140399981060096, 140399989452799, +SNULL, 140399981064191, 140399989452799, +STORE, 140399981060096, 140399981064191, +STORE, 140399981064192, 140399989452799, +STORE, 140399972667392, 140399981060095, +STORE, 140399964274688, 140399981060095, +SNULL, 140399964278783, 140399981060095, +STORE, 140399964274688, 140399964278783, +STORE, 140399964278784, 140399981060095, +SNULL, 140400039813120, 140400048201727, +STORE, 140400048201728, 140400056594431, +STORE, 140400039813120, 140400048201727, +SNULL, 140400048205823, 140400056594431, +STORE, 140400048201728, 140400048205823, +STORE, 140400048205824, 140400056594431, +SNULL, 140399997849600, 140400031416319, +STORE, 140400031416320, 140400039809023, +STORE, 140399997849600, 140400031416319, +SNULL, 140400031420415, 140400039809023, +STORE, 140400031416320, 140400031420415, +STORE, 140400031420416, 140400039809023, +STORE, 140399955881984, 140399964274687, +SNULL, 140399955886079, 140399964274687, +STORE, 140399955881984, 140399955886079, +STORE, 140399955886080, 140399964274687, +STORE, 140399947489280, 140399955881983, +STORE, 140399939096576, 140399955881983, +STORE, 140399855267840, 140399863660543, +SNULL, 140399939100671, 140399955881983, +STORE, 140399939096576, 140399939100671, +STORE, 140399939100672, 140399955881983, +SNULL, 140399997849600, 140400014630911, +STORE, 140400014630912, 140400031416319, +STORE, 140399997849600, 140400014630911, +SNULL, 140400014635007, 140400031416319, +STORE, 140400014630912, 140400014635007, +STORE, 140400014635008, 140400031416319, +SNULL, 140400014635008, 140400023023615, +STORE, 140400023023616, 140400031416319, +STORE, 140400014635008, 140400023023615, +SNULL, 140400023027711, 140400031416319, +STORE, 140400023023616, 140400023027711, +STORE, 
140400023027712, 140400031416319, +SNULL, 140399997849600, 140400006238207, +STORE, 140400006238208, 140400014630911, +STORE, 140399997849600, 140400006238207, +SNULL, 140400006242303, 140400014630911, +STORE, 140400006238208, 140400006242303, +STORE, 140400006242304, 140400014630911, +STORE, 140399846875136, 140399863660543, +STORE, 140399838482432, 140399863660543, +SNULL, 140399838486527, 140399863660543, +STORE, 140399838482432, 140399838486527, +STORE, 140399838486528, 140399863660543, +SNULL, 140399939100672, 140399947489279, +STORE, 140399947489280, 140399955881983, +STORE, 140399939100672, 140399947489279, +SNULL, 140399947493375, 140399955881983, +STORE, 140399947489280, 140399947493375, +STORE, 140399947493376, 140399955881983, +SNULL, 140399964278784, 140399972667391, +STORE, 140399972667392, 140399981060095, +STORE, 140399964278784, 140399972667391, +SNULL, 140399972671487, 140399981060095, +STORE, 140399972667392, 140399972671487, +STORE, 140399972671488, 140399981060095, +SNULL, 140399838486528, 140399855267839, +STORE, 140399855267840, 140399863660543, +STORE, 140399838486528, 140399855267839, +SNULL, 140399855271935, 140399863660543, +STORE, 140399855267840, 140399855271935, +STORE, 140399855271936, 140399863660543, +STORE, 140399830089728, 140399838482431, +SNULL, 140399830093823, 140399838482431, +STORE, 140399830089728, 140399830093823, +STORE, 140399830093824, 140399838482431, +STORE, 140399821697024, 140399830089727, +SNULL, 140399821701119, 140399830089727, +STORE, 140399821697024, 140399821701119, +STORE, 140399821701120, 140399830089727, +SNULL, 140399838486528, 140399846875135, +STORE, 140399846875136, 140399855267839, +STORE, 140399838486528, 140399846875135, +SNULL, 140399846879231, 140399855267839, +STORE, 140399846875136, 140399846879231, +STORE, 140399846879232, 140399855267839, +STORE, 140399813304320, 140399821697023, +STORE, 140399804911616, 140399821697023, +SNULL, 140399804915711, 140399821697023, +STORE, 140399804911616, 
140399804915711, +STORE, 140399804915712, 140399821697023, +STORE, 140399721050112, 140399729442815, +SNULL, 140399804915712, 140399813304319, +STORE, 140399813304320, 140399821697023, +STORE, 140399804915712, 140399813304319, +SNULL, 140399813308415, 140399821697023, +STORE, 140399813304320, 140399813308415, +STORE, 140399813308416, 140399821697023, +SNULL, 140399721054207, 140399729442815, +STORE, 140399721050112, 140399721054207, +STORE, 140399721054208, 140399729442815, +STORE, 140401467105280, 140401467133951, +STORE, 140401279115264, 140401281306623, +SNULL, 140401279115264, 140401279205375, +STORE, 140401279205376, 140401281306623, +STORE, 140401279115264, 140401279205375, +SNULL, 140401281298431, 140401281306623, +STORE, 140401279205376, 140401281298431, +STORE, 140401281298432, 140401281306623, +ERASE, 140401281298432, 140401281306623, +STORE, 140401281298432, 140401281306623, +SNULL, 140401281302527, 140401281306623, +STORE, 140401281298432, 140401281302527, +STORE, 140401281302528, 140401281306623, +ERASE, 140401467105280, 140401467133951, +ERASE, 140400056594432, 140400056598527, +ERASE, 140400056598528, 140400064987135, +ERASE, 140400635396096, 140400635400191, +ERASE, 140400635400192, 140400643788799, +ERASE, 140400408891392, 140400408895487, +ERASE, 140400408895488, 140400417284095, +ERASE, 140400299851776, 140400299855871, +ERASE, 140400299855872, 140400308244479, +ERASE, 140400627003392, 140400627007487, +ERASE, 140400627007488, 140400635396095, +ERASE, 140400954155008, 140400954159103, +ERASE, 140400954159104, 140400962547711, +ERASE, 140400291459072, 140400291463167, +ERASE, 140400291463168, 140400299851775, +ERASE, 140400643788800, 140400643792895, +ERASE, 140400643792896, 140400652181503, +ERASE, 140400325029888, 140400325033983, +ERASE, 140400325033984, 140400333422591, +ERASE, 140400610217984, 140400610222079, +ERASE, 140400610222080, 140400618610687, +ERASE, 140400190812160, 140400190816255, +ERASE, 140400190816256, 140400199204863, +ERASE, 
140399964274688, 140399964278783, +ERASE, 140399964278784, 140399972667391, +ERASE, 140400945762304, 140400945766399, +ERASE, 140400945766400, 140400954155007, +ERASE, 140400568287232, 140400568291327, +ERASE, 140400568291328, 140400576679935, +ERASE, 140399972667392, 140399972671487, +ERASE, 140399972671488, 140399981060095, +ERASE, 140400962547712, 140400962551807, +ERASE, 140400962551808, 140400970940415, +ERASE, 140400987725824, 140400987729919, +ERASE, 140400987729920, 140400996118527, +ERASE, 140400652181504, 140400652185599, +ERASE, 140400652185600, 140400660574207, +ERASE, 140400450854912, 140400450859007, +ERASE, 140400450859008, 140400459247615, +ERASE, 140400031416320, 140400031420415, +ERASE, 140400031420416, 140400039809023, +ERASE, 140400308244480, 140400308248575, +ERASE, 140400308248576, 140400316637183, +ERASE, 140400434069504, 140400434073599, +ERASE, 140400434073600, 140400442462207, +ERASE, 140400543109120, 140400543113215, +ERASE, 140400543113216, 140400551501823, +ERASE, 140400023023616, 140400023027711, +ERASE, 140400023027712, 140400031416319, +ERASE, 140399813304320, 140399813308415, +ERASE, 140399813308416, 140399821697023, +ERASE, 140400316637184, 140400316641279, +ERASE, 140400316641280, 140400325029887, +ERASE, 140400585072640, 140400585076735, +ERASE, 140400585076736, 140400593465343, +ERASE, 140400148848640, 140400148852735, +ERASE, 140400148852736, 140400157241343, +ERASE, 140399955881984, 140399955886079, +ERASE, 140399955886080, 140399964274687, +ERASE, 140399821697024, 140399821701119, +ERASE, 140399821701120, 140399830089727, +ERASE, 140400601825280, 140400601829375, +ERASE, 140400601829376, 140400610217983, +ERASE, 140400979333120, 140400979337215, +ERASE, 140400979337216, 140400987725823, +ERASE, 140399997845504, 140399997849599, +ERASE, 140399997849600, 140400006238207, +ERASE, 140400459247616, 140400459251711, +ERASE, 140400459251712, 140400467640319, +ERASE, 140400551501824, 140400551505919, +ERASE, 140400551505920, 
140400559894527, +ERASE, 140399939096576, 140399939100671, +ERASE, 140399939100672, 140399947489279, +ERASE, 140400442462208, 140400442466303, +ERASE, 140400442466304, 140400450854911, +ERASE, 140400576679936, 140400576684031, +ERASE, 140400576684032, 140400585072639, +ERASE, 140400559894528, 140400559898623, +ERASE, 140400559898624, 140400568287231, +ERASE, 140400417284096, 140400417288191, +ERASE, 140400417288192, 140400425676799, +ERASE, 140400283066368, 140400283070463, +ERASE, 140400283070464, 140400291459071, + }; + unsigned long set33[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140734562918400, 140737488351231, +SNULL, 140734562922495, 140737488351231, +STORE, 140734562918400, 140734562922495, +STORE, 140734562787328, 140734562922495, +STORE, 94133878984704, 94133881237503, +SNULL, 94133879115775, 94133881237503, +STORE, 94133878984704, 94133879115775, +STORE, 94133879115776, 94133881237503, +ERASE, 94133879115776, 94133881237503, +STORE, 94133881208832, 94133881217023, +STORE, 94133881217024, 94133881237503, +STORE, 140583654043648, 140583656296447, +SNULL, 140583654187007, 140583656296447, +STORE, 140583654043648, 140583654187007, +STORE, 140583654187008, 140583656296447, +ERASE, 140583654187008, 140583656296447, +STORE, 140583656284160, 140583656292351, +STORE, 140583656292352, 140583656296447, +STORE, 140734564319232, 140734564323327, +STORE, 140734564306944, 140734564319231, +STORE, 140583656255488, 140583656284159, +STORE, 140583656247296, 140583656255487, +STORE, 140583651827712, 140583654043647, +SNULL, 140583651827712, 140583651926015, +STORE, 140583651926016, 140583654043647, +STORE, 140583651827712, 140583651926015, +SNULL, 140583654019071, 140583654043647, +STORE, 140583651926016, 140583654019071, +STORE, 140583654019072, 140583654043647, +SNULL, 140583654019072, 140583654027263, +STORE, 140583654027264, 140583654043647, +STORE, 140583654019072, 140583654027263, +ERASE, 140583654019072, 140583654027263, +STORE, 140583654019072, 
140583654027263, +ERASE, 140583654027264, 140583654043647, +STORE, 140583654027264, 140583654043647, +STORE, 140583648030720, 140583651827711, +SNULL, 140583648030720, 140583649689599, +STORE, 140583649689600, 140583651827711, +STORE, 140583648030720, 140583649689599, +SNULL, 140583651786751, 140583651827711, +STORE, 140583649689600, 140583651786751, +STORE, 140583651786752, 140583651827711, +SNULL, 140583651786752, 140583651811327, +STORE, 140583651811328, 140583651827711, +STORE, 140583651786752, 140583651811327, +ERASE, 140583651786752, 140583651811327, +STORE, 140583651786752, 140583651811327, +ERASE, 140583651811328, 140583651827711, +STORE, 140583651811328, 140583651827711, +STORE, 140583656239104, 140583656255487, +SNULL, 140583651803135, 140583651811327, +STORE, 140583651786752, 140583651803135, +STORE, 140583651803136, 140583651811327, +SNULL, 140583654023167, 140583654027263, +STORE, 140583654019072, 140583654023167, +STORE, 140583654023168, 140583654027263, +SNULL, 94133881212927, 94133881217023, +STORE, 94133881208832, 94133881212927, +STORE, 94133881212928, 94133881217023, +SNULL, 140583656288255, 140583656292351, +STORE, 140583656284160, 140583656288255, +STORE, 140583656288256, 140583656292351, +ERASE, 140583656255488, 140583656284159, +STORE, 94133881733120, 94133881868287, +STORE, 140583639638016, 140583648030719, +SNULL, 140583639642111, 140583648030719, +STORE, 140583639638016, 140583639642111, +STORE, 140583639642112, 140583648030719, +STORE, 140583631245312, 140583639638015, +STORE, 140583497027584, 140583631245311, +SNULL, 140583497027584, 140583540621311, +STORE, 140583540621312, 140583631245311, +STORE, 140583497027584, 140583540621311, +ERASE, 140583497027584, 140583540621311, +SNULL, 140583607730175, 140583631245311, +STORE, 140583540621312, 140583607730175, +STORE, 140583607730176, 140583631245311, +ERASE, 140583607730176, 140583631245311, +SNULL, 140583540756479, 140583607730175, +STORE, 140583540621312, 140583540756479, +STORE, 
140583540756480, 140583607730175, +SNULL, 140583631249407, 140583639638015, +STORE, 140583631245312, 140583631249407, +STORE, 140583631249408, 140583639638015, +STORE, 140583622852608, 140583631245311, +SNULL, 140583622856703, 140583631245311, +STORE, 140583622852608, 140583622856703, +STORE, 140583622856704, 140583631245311, +STORE, 140583614459904, 140583622852607, +SNULL, 140583614463999, 140583622852607, +STORE, 140583614459904, 140583614463999, +STORE, 140583614464000, 140583622852607, +STORE, 140583532228608, 140583540621311, +SNULL, 140583532232703, 140583540621311, +STORE, 140583532228608, 140583532232703, +STORE, 140583532232704, 140583540621311, +STORE, 140583523835904, 140583532228607, +STORE, 140583515443200, 140583532228607, +STORE, 140583507050496, 140583532228607, +STORE, 140583372832768, 140583507050495, +STORE, 140583364440064, 140583372832767, +STORE, 140583230222336, 140583364440063, +STORE, 140583096004608, 140583364440063, +SNULL, 140583230222335, 140583364440063, +STORE, 140583096004608, 140583230222335, +STORE, 140583230222336, 140583364440063, +SNULL, 140583230222336, 140583272185855, +STORE, 140583272185856, 140583364440063, +STORE, 140583230222336, 140583272185855, +ERASE, 140583230222336, 140583272185855, +STORE, 140582961786880, 140583230222335, +SNULL, 140583372832768, 140583406403583, +STORE, 140583406403584, 140583507050495, +STORE, 140583372832768, 140583406403583, +ERASE, 140583372832768, 140583406403583, +SNULL, 140583473512447, 140583507050495, +STORE, 140583406403584, 140583473512447, +STORE, 140583473512448, 140583507050495, +ERASE, 140583473512448, 140583507050495, +SNULL, 140583096004607, 140583230222335, +STORE, 140582961786880, 140583096004607, +STORE, 140583096004608, 140583230222335, +SNULL, 140583096004608, 140583137968127, +STORE, 140583137968128, 140583230222335, +STORE, 140583096004608, 140583137968127, +ERASE, 140583096004608, 140583137968127, +SNULL, 140583339294719, 140583364440063, +STORE, 140583272185856, 
140583339294719, +STORE, 140583339294720, 140583364440063, +ERASE, 140583339294720, 140583364440063, +SNULL, 140583272321023, 140583339294719, +STORE, 140583272185856, 140583272321023, +STORE, 140583272321024, 140583339294719, +SNULL, 140582961786880, 140583003750399, +STORE, 140583003750400, 140583096004607, +STORE, 140582961786880, 140583003750399, +ERASE, 140582961786880, 140583003750399, + }; + + unsigned long set34[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140731327180800, 140737488351231, +SNULL, 140731327184895, 140737488351231, +STORE, 140731327180800, 140731327184895, +STORE, 140731327049728, 140731327184895, +STORE, 94632924487680, 94632926740479, +SNULL, 94632924618751, 94632926740479, +STORE, 94632924487680, 94632924618751, +STORE, 94632924618752, 94632926740479, +ERASE, 94632924618752, 94632926740479, +STORE, 94632926711808, 94632926719999, +STORE, 94632926720000, 94632926740479, +STORE, 140012544888832, 140012547141631, +SNULL, 140012545032191, 140012547141631, +STORE, 140012544888832, 140012545032191, +STORE, 140012545032192, 140012547141631, +ERASE, 140012545032192, 140012547141631, +STORE, 140012547129344, 140012547137535, +STORE, 140012547137536, 140012547141631, +STORE, 140731327725568, 140731327729663, +STORE, 140731327713280, 140731327725567, +STORE, 140012547100672, 140012547129343, +STORE, 140012547092480, 140012547100671, +STORE, 140012542672896, 140012544888831, +SNULL, 140012542672896, 140012542771199, +STORE, 140012542771200, 140012544888831, +STORE, 140012542672896, 140012542771199, +SNULL, 140012544864255, 140012544888831, +STORE, 140012542771200, 140012544864255, +STORE, 140012544864256, 140012544888831, +SNULL, 140012544864256, 140012544872447, +STORE, 140012544872448, 140012544888831, +STORE, 140012544864256, 140012544872447, +ERASE, 140012544864256, 140012544872447, +STORE, 140012544864256, 140012544872447, +ERASE, 140012544872448, 140012544888831, +STORE, 140012544872448, 140012544888831, +STORE, 140012538875904, 
140012542672895, +SNULL, 140012538875904, 140012540534783, +STORE, 140012540534784, 140012542672895, +STORE, 140012538875904, 140012540534783, +SNULL, 140012542631935, 140012542672895, +STORE, 140012540534784, 140012542631935, +STORE, 140012542631936, 140012542672895, +SNULL, 140012542631936, 140012542656511, +STORE, 140012542656512, 140012542672895, +STORE, 140012542631936, 140012542656511, +ERASE, 140012542631936, 140012542656511, +STORE, 140012542631936, 140012542656511, +ERASE, 140012542656512, 140012542672895, +STORE, 140012542656512, 140012542672895, +STORE, 140012547084288, 140012547100671, +SNULL, 140012542648319, 140012542656511, +STORE, 140012542631936, 140012542648319, +STORE, 140012542648320, 140012542656511, +SNULL, 140012544868351, 140012544872447, +STORE, 140012544864256, 140012544868351, +STORE, 140012544868352, 140012544872447, +SNULL, 94632926715903, 94632926719999, +STORE, 94632926711808, 94632926715903, +STORE, 94632926715904, 94632926719999, +SNULL, 140012547133439, 140012547137535, +STORE, 140012547129344, 140012547133439, +STORE, 140012547133440, 140012547137535, +ERASE, 140012547100672, 140012547129343, +STORE, 94632939606016, 94632939741183, +STORE, 140012530483200, 140012538875903, +SNULL, 140012530487295, 140012538875903, +STORE, 140012530483200, 140012530487295, +STORE, 140012530487296, 140012538875903, +STORE, 140012522090496, 140012530483199, +STORE, 140012387872768, 140012522090495, +SNULL, 140012387872768, 140012444188671, +STORE, 140012444188672, 140012522090495, +STORE, 140012387872768, 140012444188671, +ERASE, 140012387872768, 140012444188671, +SNULL, 140012511297535, 140012522090495, +STORE, 140012444188672, 140012511297535, +STORE, 140012511297536, 140012522090495, +ERASE, 140012511297536, 140012522090495, +SNULL, 140012444323839, 140012511297535, +STORE, 140012444188672, 140012444323839, +STORE, 140012444323840, 140012511297535, +SNULL, 140012522094591, 140012530483199, +STORE, 140012522090496, 140012522094591, +STORE, 
140012522094592, 140012530483199, +STORE, 140012513697792, 140012522090495, +SNULL, 140012513701887, 140012522090495, +STORE, 140012513697792, 140012513701887, +STORE, 140012513701888, 140012522090495, +STORE, 140012435795968, 140012444188671, +SNULL, 140012435800063, 140012444188671, +STORE, 140012435795968, 140012435800063, +STORE, 140012435800064, 140012444188671, +STORE, 140012427403264, 140012435795967, +SNULL, 140012427407359, 140012435795967, +STORE, 140012427403264, 140012427407359, +STORE, 140012427407360, 140012435795967, +STORE, 140012419010560, 140012427403263, +STORE, 140012410617856, 140012427403263, +STORE, 140012276400128, 140012410617855, +STORE, 140012268007424, 140012276400127, +STORE, 140012133789696, 140012268007423, +SNULL, 140012133789696, 140012175753215, +STORE, 140012175753216, 140012268007423, +STORE, 140012133789696, 140012175753215, +ERASE, 140012133789696, 140012175753215, +STORE, 140012041535488, 140012268007423, +SNULL, 140012108644351, 140012268007423, +STORE, 140012041535488, 140012108644351, +STORE, 140012108644352, 140012268007423, +SNULL, 140012108644352, 140012175753215, +STORE, 140012175753216, 140012268007423, +STORE, 140012108644352, 140012175753215, +ERASE, 140012108644352, 140012175753215, +SNULL, 140012276400128, 140012309970943, +STORE, 140012309970944, 140012410617855, +STORE, 140012276400128, 140012309970943, +ERASE, 140012276400128, 140012309970943, +STORE, 140012301578240, 140012309970943, +STORE, 140012041535488, 140012268007423, +SNULL, 140012242862079, 140012268007423, +STORE, 140012041535488, 140012242862079, +STORE, 140012242862080, 140012268007423, +ERASE, 140012242862080, 140012268007423, +SNULL, 140012041670655, 140012242862079, +STORE, 140012041535488, 140012041670655, +STORE, 140012041670656, 140012242862079, +SNULL, 140012041670656, 140012108644351, +STORE, 140012108644352, 140012242862079, +STORE, 140012041670656, 140012108644351, +SNULL, 140012108779519, 140012242862079, +STORE, 140012108644352, 
140012108779519, +STORE, 140012108779520, 140012242862079, +SNULL, 140012377079807, 140012410617855, +STORE, 140012309970944, 140012377079807, +STORE, 140012377079808, 140012410617855, +ERASE, 140012377079808, 140012410617855, +SNULL, 140012310106111, 140012377079807, +STORE, 140012309970944, 140012310106111, +STORE, 140012310106112, 140012377079807, +SNULL, 140012410621951, 140012427403263, +STORE, 140012410617856, 140012410621951, +STORE, 140012410621952, 140012427403263, +SNULL, 140012108779520, 140012175753215, +STORE, 140012175753216, 140012242862079, +STORE, 140012108779520, 140012175753215, +SNULL, 140012175888383, 140012242862079, +STORE, 140012175753216, 140012175888383, +STORE, 140012175888384, 140012242862079, +SNULL, 140012301582335, 140012309970943, +STORE, 140012301578240, 140012301582335, +STORE, 140012301582336, 140012309970943, +SNULL, 140012410621952, 140012419010559, +STORE, 140012419010560, 140012427403263, +STORE, 140012410621952, 140012419010559, +SNULL, 140012419014655, 140012427403263, +STORE, 140012419010560, 140012419014655, +STORE, 140012419014656, 140012427403263, +SNULL, 140012268011519, 140012276400127, +STORE, 140012268007424, 140012268011519, +STORE, 140012268011520, 140012276400127, +STORE, 140012402225152, 140012410617855, +STORE, 140012393832448, 140012410617855, +SNULL, 140012393832448, 140012402225151, +STORE, 140012402225152, 140012410617855, +STORE, 140012393832448, 140012402225151, +SNULL, 140012402229247, 140012410617855, +STORE, 140012402225152, 140012402229247, +STORE, 140012402229248, 140012410617855, +STORE, 140012385439744, 140012402225151, +SNULL, 140012385439744, 140012393832447, +STORE, 140012393832448, 140012402225151, +STORE, 140012385439744, 140012393832447, +SNULL, 140012393836543, 140012402225151, +STORE, 140012393832448, 140012393836543, +STORE, 140012393836544, 140012402225151, +STORE, 140012293185536, 140012301578239, +STORE, 140012284792832, 140012301578239, +SNULL, 140012284792832, 140012293185535, +STORE, 
140012293185536, 140012301578239, +STORE, 140012284792832, 140012293185535, +SNULL, 140012293189631, 140012301578239, +STORE, 140012293185536, 140012293189631, +STORE, 140012293189632, 140012301578239, +STORE, 140012268011520, 140012284792831, +SNULL, 140012385443839, 140012393832447, +STORE, 140012385439744, 140012385443839, +STORE, 140012385443840, 140012393832447, +STORE, 140012259614720, 140012268007423, +SNULL, 140012259618815, 140012268007423, +STORE, 140012259614720, 140012259618815, +STORE, 140012259618816, 140012268007423, +STORE, 140012251222016, 140012259614719, +SNULL, 140012251226111, 140012259614719, +STORE, 140012251222016, 140012251226111, +STORE, 140012251226112, 140012259614719, +SNULL, 140012284796927, 140012293185535, +STORE, 140012284792832, 140012284796927, +STORE, 140012284796928, 140012293185535, +SNULL, 140012268011520, 140012276400127, +STORE, 140012276400128, 140012284792831, +STORE, 140012268011520, 140012276400127, +SNULL, 140012276404223, 140012284792831, +STORE, 140012276400128, 140012276404223, +STORE, 140012276404224, 140012284792831, +STORE, 140012033142784, 140012041535487, +SNULL, 140012033146879, 140012041535487, +STORE, 140012033142784, 140012033146879, +STORE, 140012033146880, 140012041535487, +STORE, 140012024750080, 140012033142783, +STORE, 140012016357376, 140012033142783, +SNULL, 140012016357376, 140012024750079, +STORE, 140012024750080, 140012033142783, +STORE, 140012016357376, 140012024750079, +SNULL, 140012024754175, 140012033142783, +STORE, 140012024750080, 140012024754175, +STORE, 140012024754176, 140012033142783, +SNULL, 140012016361471, 140012024750079, +STORE, 140012016357376, 140012016361471, +STORE, 140012016361472, 140012024750079, +STORE, 140012007964672, 140012016357375, +SNULL, 140012007968767, 140012016357375, +STORE, 140012007964672, 140012007968767, +STORE, 140012007968768, 140012016357375, +STORE, 140011999571968, 140012007964671, +STORE, 140011991179264, 140012007964671, +STORE, 140011856961536, 
140011991179263, +STORE, 140011848568832, 140011856961535, +STORE, 140011714351104, 140011848568831, +SNULL, 140011714351104, 140011773100031, +STORE, 140011773100032, 140011848568831, +STORE, 140011714351104, 140011773100031, +ERASE, 140011714351104, 140011773100031, +STORE, 140011764707328, 140011773100031, +STORE, 140011756314624, 140011773100031, +STORE, 140011622096896, 140011756314623, +STORE, 140011613704192, 140011622096895, +STORE, 140011479486464, 140011613704191, +STORE, 140011471093760, 140011479486463, +SNULL, 140011479486464, 140011504664575, +STORE, 140011504664576, 140011613704191, +STORE, 140011479486464, 140011504664575, +ERASE, 140011479486464, 140011504664575, +STORE, 140011496271872, 140011504664575, +STORE, 140011487879168, 140011504664575, +STORE, 140011336876032, 140011471093759, +SNULL, 140011336876032, 140011370446847, +STORE, 140011370446848, 140011471093759, +STORE, 140011336876032, 140011370446847, +ERASE, 140011336876032, 140011370446847, +STORE, 140011471093760, 140011487879167, +STORE, 140011362054144, 140011370446847, +SNULL, 140011362058239, 140011370446847, +STORE, 140011362054144, 140011362058239, +STORE, 140011362058240, 140011370446847, +STORE, 140011353661440, 140011362054143, +STORE, 140011345268736, 140011362054143, +SNULL, 140011345272831, 140011362054143, +STORE, 140011345268736, 140011345272831, +STORE, 140011345272832, 140011362054143, +STORE, 140011336876032, 140011345268735, +STORE, 140011328483328, 140011345268735, +SNULL, 140011328487423, 140011345268735, +STORE, 140011328483328, 140011328487423, +STORE, 140011328487424, 140011345268735, +STORE, 140011320090624, 140011328483327, +STORE, 140011185872896, 140011320090623, +SNULL, 140011185872896, 140011236229119, +STORE, 140011236229120, 140011320090623, +STORE, 140011185872896, 140011236229119, +ERASE, 140011185872896, 140011236229119, +SNULL, 140011856961536, 140011907317759, +STORE, 140011907317760, 140011991179263, +STORE, 140011856961536, 140011907317759, +ERASE, 
140011856961536, 140011907317759, +SNULL, 140011974426623, 140011991179263, +STORE, 140011907317760, 140011974426623, +STORE, 140011974426624, 140011991179263, +ERASE, 140011974426624, 140011991179263, +SNULL, 140011840208895, 140011848568831, +STORE, 140011773100032, 140011840208895, +STORE, 140011840208896, 140011848568831, +ERASE, 140011840208896, 140011848568831, +SNULL, 140011773235199, 140011840208895, +STORE, 140011773100032, 140011773235199, +STORE, 140011773235200, 140011840208895, +STORE, 140011102011392, 140011320090623, +SNULL, 140011169120255, 140011320090623, +STORE, 140011102011392, 140011169120255, +STORE, 140011169120256, 140011320090623, +SNULL, 140011169120256, 140011236229119, +STORE, 140011236229120, 140011320090623, +STORE, 140011169120256, 140011236229119, +ERASE, 140011169120256, 140011236229119, +SNULL, 140011622096896, 140011638882303, +STORE, 140011638882304, 140011756314623, +STORE, 140011622096896, 140011638882303, +ERASE, 140011622096896, 140011638882303, +SNULL, 140011705991167, 140011756314623, +STORE, 140011638882304, 140011705991167, +STORE, 140011705991168, 140011756314623, +ERASE, 140011705991168, 140011756314623, +SNULL, 140011571773439, 140011613704191, +STORE, 140011504664576, 140011571773439, +STORE, 140011571773440, 140011613704191, +ERASE, 140011571773440, 140011613704191, +STORE, 140010967793664, 140011169120255, +SNULL, 140011034902527, 140011169120255, +STORE, 140010967793664, 140011034902527, +STORE, 140011034902528, 140011169120255, +SNULL, 140011034902528, 140011102011391, +STORE, 140011102011392, 140011169120255, +STORE, 140011034902528, 140011102011391, +ERASE, 140011034902528, 140011102011391, +STORE, 140010833575936, 140011034902527, +SNULL, 140011437555711, 140011471093759, +STORE, 140011370446848, 140011437555711, +STORE, 140011437555712, 140011471093759, +ERASE, 140011437555712, 140011471093759, +SNULL, 140011370582015, 140011437555711, +STORE, 140011370446848, 140011370582015, +STORE, 140011370582016, 
140011437555711, +STORE, 140010699358208, 140011034902527, +SNULL, 140011487883263, 140011504664575, +STORE, 140011487879168, 140011487883263, +STORE, 140011487883264, 140011504664575, +SNULL, 140011345272832, 140011353661439, +STORE, 140011353661440, 140011362054143, +STORE, 140011345272832, 140011353661439, +SNULL, 140011353665535, 140011362054143, +STORE, 140011353661440, 140011353665535, +STORE, 140011353665536, 140011362054143, +SNULL, 140011328487424, 140011336876031, +STORE, 140011336876032, 140011345268735, +STORE, 140011328487424, 140011336876031, +SNULL, 140011336880127, 140011345268735, +STORE, 140011336876032, 140011336880127, +STORE, 140011336880128, 140011345268735, +SNULL, 140011303337983, 140011320090623, +STORE, 140011236229120, 140011303337983, +STORE, 140011303337984, 140011320090623, +ERASE, 140011303337984, 140011320090623, +SNULL, 140011907452927, 140011974426623, +STORE, 140011907317760, 140011907452927, +STORE, 140011907452928, 140011974426623, +SNULL, 140011102146559, 140011169120255, +STORE, 140011102011392, 140011102146559, +STORE, 140011102146560, 140011169120255, +SNULL, 140011639017471, 140011705991167, +STORE, 140011638882304, 140011639017471, +STORE, 140011639017472, 140011705991167, +SNULL, 140011504799743, 140011571773439, +STORE, 140011504664576, 140011504799743, +STORE, 140011504799744, 140011571773439, +SNULL, 140011613708287, 140011622096895, +STORE, 140011613704192, 140011613708287, +STORE, 140011613708288, 140011622096895, +SNULL, 140010699358208, 140010967793663, +STORE, 140010967793664, 140011034902527, +STORE, 140010699358208, 140010967793663, +SNULL, 140010967928831, 140011034902527, +STORE, 140010967793664, 140010967928831, +STORE, 140010967928832, 140011034902527, +SNULL, 140010900684799, 140010967793663, +STORE, 140010699358208, 140010900684799, +STORE, 140010900684800, 140010967793663, +ERASE, 140010900684800, 140010967793663, +SNULL, 140010766467071, 140010900684799, +STORE, 140010699358208, 140010766467071, +STORE, 
140010766467072, 140010900684799, +SNULL, 140010766467072, 140010833575935, +STORE, 140010833575936, 140010900684799, +STORE, 140010766467072, 140010833575935, +ERASE, 140010766467072, 140010833575935, +SNULL, 140010699493375, 140010766467071, +STORE, 140010699358208, 140010699493375, +STORE, 140010699493376, 140010766467071, +SNULL, 140011848572927, 140011856961535, +STORE, 140011848568832, 140011848572927, +STORE, 140011848572928, 140011856961535, +STORE, 140011982786560, 140012007964671, +STORE, 140011898925056, 140011907317759, +SNULL, 140011898929151, 140011907317759, +STORE, 140011898925056, 140011898929151, +STORE, 140011898929152, 140011907317759, +SNULL, 140011320094719, 140011328483327, +STORE, 140011320090624, 140011320094719, +STORE, 140011320094720, 140011328483327, +STORE, 140011890532352, 140011898925055, +STORE, 140011882139648, 140011898925055, +SNULL, 140011882143743, 140011898925055, +STORE, 140011882139648, 140011882143743, +STORE, 140011882143744, 140011898925055, +STORE, 140011873746944, 140011882139647, +SNULL, 140011873751039, 140011882139647, +STORE, 140011873746944, 140011873751039, +STORE, 140011873751040, 140011882139647, +SNULL, 140011236364287, 140011303337983, +STORE, 140011236229120, 140011236364287, +STORE, 140011236364288, 140011303337983, +SNULL, 140011756318719, 140011773100031, +STORE, 140011756314624, 140011756318719, +STORE, 140011756318720, 140011773100031, +SNULL, 140011756318720, 140011764707327, +STORE, 140011764707328, 140011773100031, +STORE, 140011756318720, 140011764707327, +SNULL, 140011764711423, 140011773100031, +STORE, 140011764707328, 140011764711423, +STORE, 140011764711424, 140011773100031, +SNULL, 140011471097855, 140011487879167, +STORE, 140011471093760, 140011471097855, +STORE, 140011471097856, 140011487879167, +SNULL, 140010833711103, 140010900684799, +STORE, 140010833575936, 140010833711103, +STORE, 140010833711104, 140010900684799, +SNULL, 140011982790655, 140012007964671, +STORE, 140011982786560, 
140011982790655, +STORE, 140011982790656, 140012007964671, +STORE, 140011865354240, 140011873746943, +STORE, 140011848572928, 140011865354239, +SNULL, 140011848572928, 140011856961535, +STORE, 140011856961536, 140011865354239, +STORE, 140011848572928, 140011856961535, +SNULL, 140011856965631, 140011865354239, +STORE, 140011856961536, 140011856965631, +STORE, 140011856965632, 140011865354239, +STORE, 140011747921920, 140011756314623, +STORE, 140011739529216, 140011756314623, +SNULL, 140011471097856, 140011479486463, +STORE, 140011479486464, 140011487879167, +STORE, 140011471097856, 140011479486463, +SNULL, 140011479490559, 140011487879167, +STORE, 140011479486464, 140011479490559, +STORE, 140011479490560, 140011487879167, +STORE, 140011731136512, 140011756314623, +STORE, 140011722743808, 140011756314623, +SNULL, 140011982790656, 140011999571967, +STORE, 140011999571968, 140012007964671, +STORE, 140011982790656, 140011999571967, +SNULL, 140011999576063, 140012007964671, +STORE, 140011999571968, 140011999576063, +STORE, 140011999576064, 140012007964671, +STORE, 140011714351104, 140011756314623, +SNULL, 140011882143744, 140011890532351, +STORE, 140011890532352, 140011898925055, +STORE, 140011882143744, 140011890532351, +SNULL, 140011890536447, 140011898925055, +STORE, 140011890532352, 140011890536447, +STORE, 140011890536448, 140011898925055, +STORE, 140011630489600, 140011638882303, +STORE, 140011613708288, 140011638882303, +STORE, 140011605311488, 140011613704191, +STORE, 140011596918784, 140011613704191, +STORE, 140011588526080, 140011613704191, +SNULL, 140011487883264, 140011496271871, +STORE, 140011496271872, 140011504664575, +STORE, 140011487883264, 140011496271871, +SNULL, 140011496275967, 140011504664575, +STORE, 140011496271872, 140011496275967, +STORE, 140011496275968, 140011504664575, +STORE, 140011580133376, 140011613704191, +SNULL, 140011580137471, 140011613704191, +STORE, 140011580133376, 140011580137471, +STORE, 140011580137472, 140011613704191, +SNULL, 
140011982790656, 140011991179263, +STORE, 140011991179264, 140011999571967, +STORE, 140011982790656, 140011991179263, +SNULL, 140011991183359, 140011999571967, +STORE, 140011991179264, 140011991183359, +STORE, 140011991183360, 140011999571967, +SNULL, 140011865358335, 140011873746943, +STORE, 140011865354240, 140011865358335, +STORE, 140011865358336, 140011873746943, +STORE, 140011462701056, 140011471093759, +SNULL, 140011714351104, 140011739529215, +STORE, 140011739529216, 140011756314623, +STORE, 140011714351104, 140011739529215, +SNULL, 140011739533311, 140011756314623, +STORE, 140011739529216, 140011739533311, +STORE, 140011739533312, 140011756314623, +SNULL, 140011739533312, 140011747921919, +STORE, 140011747921920, 140011756314623, +STORE, 140011739533312, 140011747921919, +SNULL, 140011747926015, 140011756314623, +STORE, 140011747921920, 140011747926015, +STORE, 140011747926016, 140011756314623, +SNULL, 140011613708288, 140011630489599, +STORE, 140011630489600, 140011638882303, +STORE, 140011613708288, 140011630489599, +SNULL, 140011630493695, 140011638882303, +STORE, 140011630489600, 140011630493695, +STORE, 140011630493696, 140011638882303, +SNULL, 140011714351104, 140011722743807, +STORE, 140011722743808, 140011739529215, +STORE, 140011714351104, 140011722743807, +SNULL, 140011722747903, 140011739529215, +STORE, 140011722743808, 140011722747903, +STORE, 140011722747904, 140011739529215, +SNULL, 140011714355199, 140011722743807, +STORE, 140011714351104, 140011714355199, +STORE, 140011714355200, 140011722743807, +SNULL, 140011722747904, 140011731136511, +STORE, 140011731136512, 140011739529215, +STORE, 140011722747904, 140011731136511, +SNULL, 140011731140607, 140011739529215, +STORE, 140011731136512, 140011731140607, +STORE, 140011731140608, 140011739529215, +STORE, 140011454308352, 140011471093759, +STORE, 140011445915648, 140011471093759, +SNULL, 140011580137472, 140011588526079, +STORE, 140011588526080, 140011613704191, +STORE, 140011580137472, 
140011588526079, +SNULL, 140011588530175, 140011613704191, +STORE, 140011588526080, 140011588530175, +STORE, 140011588530176, 140011613704191, +SNULL, 140011445915648, 140011462701055, +STORE, 140011462701056, 140011471093759, +STORE, 140011445915648, 140011462701055, +SNULL, 140011462705151, 140011471093759, +STORE, 140011462701056, 140011462705151, +STORE, 140011462705152, 140011471093759, +SNULL, 140011588530176, 140011596918783, +STORE, 140011596918784, 140011613704191, +STORE, 140011588530176, 140011596918783, +SNULL, 140011596922879, 140011613704191, +STORE, 140011596918784, 140011596922879, +STORE, 140011596922880, 140011613704191, +SNULL, 140011596922880, 140011605311487, +STORE, 140011605311488, 140011613704191, +STORE, 140011596922880, 140011605311487, +SNULL, 140011605315583, 140011613704191, +STORE, 140011605311488, 140011605315583, +STORE, 140011605315584, 140011613704191, +SNULL, 140011613708288, 140011622096895, +STORE, 140011622096896, 140011630489599, +STORE, 140011613708288, 140011622096895, +SNULL, 140011622100991, 140011630489599, +STORE, 140011622096896, 140011622100991, +STORE, 140011622100992, 140011630489599, +STORE, 140011311697920, 140011320090623, +STORE, 140011227836416, 140011236229119, +STORE, 140011219443712, 140011236229119, +SNULL, 140011219447807, 140011236229119, +STORE, 140011219443712, 140011219447807, +STORE, 140011219447808, 140011236229119, +STORE, 140011211051008, 140011219443711, +STORE, 140011202658304, 140011219443711, +SNULL, 140011202662399, 140011219443711, +STORE, 140011202658304, 140011202662399, +STORE, 140011202662400, 140011219443711, +STORE, 140011194265600, 140011202658303, +STORE, 140011185872896, 140011202658303, +STORE, 140011177480192, 140011202658303, +STORE, 140011093618688, 140011102011391, +SNULL, 140011445915648, 140011454308351, +STORE, 140011454308352, 140011462701055, +STORE, 140011445915648, 140011454308351, +SNULL, 140011454312447, 140011462701055, +STORE, 140011454308352, 140011454312447, +STORE, 
140011454312448, 140011462701055, +STORE, 140011085225984, 140011102011391, +SNULL, 140011085230079, 140011102011391, +STORE, 140011085225984, 140011085230079, +STORE, 140011085230080, 140011102011391, +SNULL, 140011177484287, 140011202658303, +STORE, 140011177480192, 140011177484287, +STORE, 140011177484288, 140011202658303, +SNULL, 140011445919743, 140011454308351, +STORE, 140011445915648, 140011445919743, +STORE, 140011445919744, 140011454308351, +SNULL, 140011177484288, 140011185872895, +STORE, 140011185872896, 140011202658303, +STORE, 140011177484288, 140011185872895, +SNULL, 140011185876991, 140011202658303, +STORE, 140011185872896, 140011185876991, +STORE, 140011185876992, 140011202658303, +STORE, 140011076833280, 140011085225983, +SNULL, 140011202662400, 140011211051007, +STORE, 140011211051008, 140011219443711, +STORE, 140011202662400, 140011211051007, +SNULL, 140011211055103, 140011219443711, +STORE, 140011211051008, 140011211055103, +STORE, 140011211055104, 140011219443711, +SNULL, 140011185876992, 140011194265599, +STORE, 140011194265600, 140011202658303, +STORE, 140011185876992, 140011194265599, +SNULL, 140011194269695, 140011202658303, +STORE, 140011194265600, 140011194269695, +STORE, 140011194269696, 140011202658303, +STORE, 140011068440576, 140011085225983, +SNULL, 140011311702015, 140011320090623, +STORE, 140011311697920, 140011311702015, +STORE, 140011311702016, 140011320090623, +STORE, 140011060047872, 140011085225983, +SNULL, 140011060051967, 140011085225983, +STORE, 140011060047872, 140011060051967, +STORE, 140011060051968, 140011085225983, +STORE, 140011051655168, 140011060047871, +STORE, 140011043262464, 140011060047871, +SNULL, 140011043266559, 140011060047871, +STORE, 140011043262464, 140011043266559, +STORE, 140011043266560, 140011060047871, +SNULL, 140011219447808, 140011227836415, +STORE, 140011227836416, 140011236229119, +STORE, 140011219447808, 140011227836415, +SNULL, 140011227840511, 140011236229119, +STORE, 140011227836416, 
140011227840511, +STORE, 140011227840512, 140011236229119, +SNULL, 140011085230080, 140011093618687, +STORE, 140011093618688, 140011102011391, +STORE, 140011085230080, 140011093618687, +SNULL, 140011093622783, 140011102011391, +STORE, 140011093618688, 140011093622783, +STORE, 140011093622784, 140011102011391, +STORE, 140010959400960, 140010967793663, +STORE, 140010951008256, 140010967793663, +SNULL, 140010951008256, 140010959400959, +STORE, 140010959400960, 140010967793663, +STORE, 140010951008256, 140010959400959, +SNULL, 140010959405055, 140010967793663, +STORE, 140010959400960, 140010959405055, +STORE, 140010959405056, 140010967793663, +STORE, 140010942615552, 140010959400959, +STORE, 140010934222848, 140010959400959, +SNULL, 140011060051968, 140011076833279, +STORE, 140011076833280, 140011085225983, +STORE, 140011060051968, 140011076833279, +SNULL, 140011076837375, 140011085225983, +STORE, 140011076833280, 140011076837375, +STORE, 140011076837376, 140011085225983, +SNULL, 140011043266560, 140011051655167, +STORE, 140011051655168, 140011060047871, +STORE, 140011043266560, 140011051655167, +SNULL, 140011051659263, 140011060047871, +STORE, 140011051655168, 140011051659263, +STORE, 140011051659264, 140011060047871, +STORE, 140010925830144, 140010959400959, +SNULL, 140011060051968, 140011068440575, +STORE, 140011068440576, 140011076833279, +STORE, 140011060051968, 140011068440575, +SNULL, 140011068444671, 140011076833279, +STORE, 140011068440576, 140011068444671, +STORE, 140011068444672, 140011076833279, +STORE, 140010917437440, 140010959400959, +STORE, 140010909044736, 140010959400959, +STORE, 140010825183232, 140010833575935, +SNULL, 140010909044736, 140010942615551, +STORE, 140010942615552, 140010959400959, +STORE, 140010909044736, 140010942615551, +SNULL, 140010942619647, 140010959400959, +STORE, 140010942615552, 140010942619647, +STORE, 140010942619648, 140010959400959, +SNULL, 140010909044736, 140010934222847, +STORE, 140010934222848, 140010942615551, +STORE, 
140010909044736, 140010934222847, +SNULL, 140010934226943, 140010942615551, +STORE, 140010934222848, 140010934226943, +STORE, 140010934226944, 140010942615551, +SNULL, 140010909048831, 140010934222847, +STORE, 140010909044736, 140010909048831, +STORE, 140010909048832, 140010934222847, +STORE, 140010816790528, 140010833575935, +SNULL, 140010816794623, 140010833575935, +STORE, 140010816790528, 140010816794623, +STORE, 140010816794624, 140010833575935, +STORE, 140010808397824, 140010816790527, +SNULL, 140010942619648, 140010951008255, +STORE, 140010951008256, 140010959400959, +STORE, 140010942619648, 140010951008255, +SNULL, 140010951012351, 140010959400959, +STORE, 140010951008256, 140010951012351, +STORE, 140010951012352, 140010959400959, +STORE, 140010800005120, 140010816790527, +SNULL, 140010800009215, 140010816790527, +STORE, 140010800005120, 140010800009215, +STORE, 140010800009216, 140010816790527, +SNULL, 140010909048832, 140010925830143, +STORE, 140010925830144, 140010934222847, +STORE, 140010909048832, 140010925830143, +SNULL, 140010925834239, 140010934222847, +STORE, 140010925830144, 140010925834239, +STORE, 140010925834240, 140010934222847, +SNULL, 140010816794624, 140010825183231, +STORE, 140010825183232, 140010833575935, +STORE, 140010816794624, 140010825183231, +SNULL, 140010825187327, 140010833575935, +STORE, 140010825183232, 140010825187327, +STORE, 140010825187328, 140010833575935, +SNULL, 140010909048832, 140010917437439, +STORE, 140010917437440, 140010925830143, +STORE, 140010909048832, 140010917437439, +SNULL, 140010917441535, 140010925830143, +STORE, 140010917437440, 140010917441535, +STORE, 140010917441536, 140010925830143, +SNULL, 140010800009216, 140010808397823, +STORE, 140010808397824, 140010816790527, +STORE, 140010800009216, 140010808397823, +SNULL, 140010808401919, 140010816790527, +STORE, 140010808397824, 140010808401919, +STORE, 140010808401920, 140010816790527, +STORE, 140010791612416, 140010800005119, +SNULL, 140010791616511, 
140010800005119, +STORE, 140010791612416, 140010791616511, +STORE, 140010791616512, 140010800005119, +STORE, 140012547100672, 140012547129343, +STORE, 140012511506432, 140012513697791, +SNULL, 140012511506432, 140012511596543, +STORE, 140012511596544, 140012513697791, +STORE, 140012511506432, 140012511596543, +SNULL, 140012513689599, 140012513697791, +STORE, 140012511596544, 140012513689599, +STORE, 140012513689600, 140012513697791, +ERASE, 140012513689600, 140012513697791, +STORE, 140012513689600, 140012513697791, +SNULL, 140012513693695, 140012513697791, +STORE, 140012513689600, 140012513693695, +STORE, 140012513693696, 140012513697791, +ERASE, 140012547100672, 140012547129343, +ERASE, 140011362054144, 140011362058239, +ERASE, 140011362058240, 140011370446847, +ERASE, 140011882139648, 140011882143743, +ERASE, 140011882143744, 140011890532351, +ERASE, 140011873746944, 140011873751039, +ERASE, 140011873751040, 140011882139647, +ERASE, 140011588526080, 140011588530175, +ERASE, 140011588530176, 140011596918783, +ERASE, 140011328483328, 140011328487423, +ERASE, 140011328487424, 140011336876031, +ERASE, 140011898925056, 140011898929151, +ERASE, 140011898929152, 140011907317759, +ERASE, 140011353661440, 140011353665535, +ERASE, 140011353665536, 140011362054143, +ERASE, 140011336876032, 140011336880127, +ERASE, 140011336880128, 140011345268735, +ERASE, 140011731136512, 140011731140607, +ERASE, 140011731140608, 140011739529215, +ERASE, 140011479486464, 140011479490559, +ERASE, 140011479490560, 140011487879167, +ERASE, 140011756314624, 140011756318719, +ERASE, 140011756318720, 140011764707327, +ERASE, 140011580133376, 140011580137471, +ERASE, 140011580137472, 140011588526079, +ERASE, 140011219443712, 140011219447807, +ERASE, 140011219447808, 140011227836415, +ERASE, 140011051655168, 140011051659263, +ERASE, 140011051659264, 140011060047871, +ERASE, 140011999571968, 140011999576063, +ERASE, 140011999576064, 140012007964671, +ERASE, 140011714351104, 140011714355199, +ERASE, 
140011714355200, 140011722743807, +ERASE, 140011739529216, 140011739533311, +ERASE, 140011739533312, 140011747921919, +ERASE, 140011320090624, 140011320094719, +ERASE, 140011320094720, 140011328483327, +ERASE, 140011630489600, 140011630493695, +ERASE, 140011630493696, 140011638882303, +ERASE, 140011345268736, 140011345272831, +ERASE, 140011345272832, 140011353661439, +ERASE, 140011496271872, 140011496275967, +ERASE, 140011496275968, 140011504664575, +ERASE, 140011194265600, 140011194269695, +ERASE, 140011194269696, 140011202658303, +ERASE, 140011068440576, 140011068444671, +ERASE, 140011068444672, 140011076833279, +ERASE, 140010909044736, 140010909048831, +ERASE, 140010909048832, 140010917437439, +ERASE, 140011764707328, 140011764711423, +ERASE, 140011764711424, 140011773100031, +ERASE, 140011462701056, 140011462705151, +ERASE, 140011462705152, 140011471093759, +ERASE, 140011076833280, 140011076837375, +ERASE, 140011076837376, 140011085225983, +ERASE, 140011991179264, 140011991183359, +ERASE, 140011991183360, 140011999571967, +ERASE, 140011211051008, 140011211055103, +ERASE, 140011211055104, 140011219443711, +ERASE, 140010917437440, 140010917441535, +ERASE, 140010917441536, 140010925830143, +ERASE, 140011085225984, 140011085230079, +ERASE, 140011085230080, 140011093618687, +ERASE, 140011487879168, 140011487883263, +ERASE, 140011487883264, 140011496271871, +ERASE, 140011856961536, 140011856965631, +ERASE, 140011856965632, 140011865354239, +ERASE, 140011982786560, 140011982790655, +ERASE, 140011982790656, 140011991179263, +ERASE, 140011722743808, 140011722747903, +ERASE, 140011722747904, 140011731136511, +ERASE, 140011177480192, 140011177484287, +ERASE, 140011177484288, 140011185872895, +ERASE, 140011848568832, 140011848572927, +ERASE, 140011848572928, 140011856961535, +ERASE, 140011890532352, 140011890536447, +ERASE, 140011890536448, 140011898925055, +ERASE, 140011622096896, 140011622100991, +ERASE, 140011622100992, 140011630489599, +ERASE, 140011311697920, 
140011311702015, +ERASE, 140011311702016, 140011320090623, +ERASE, 140011471093760, 140011471097855, +ERASE, 140011471097856, 140011479486463, +ERASE, 140011605311488, 140011605315583, +ERASE, 140011605315584, 140011613704191, +ERASE, 140010791612416, 140010791616511, +ERASE, 140010791616512, 140010800005119, +ERASE, 140010959400960, 140010959405055, +ERASE, 140010959405056, 140010967793663, +ERASE, 140011185872896, 140011185876991, +ERASE, 140011185876992, 140011194265599, +ERASE, 140011454308352, 140011454312447, +ERASE, 140011454312448, 140011462701055, +ERASE, 140011596918784, 140011596922879, +ERASE, 140011596922880, 140011605311487, +ERASE, 140011060047872, 140011060051967, +ERASE, 140011060051968, 140011068440575, +ERASE, 140010925830144, 140010925834239, +ERASE, 140010925834240, 140010934222847, +ERASE, 140011747921920, 140011747926015, +ERASE, 140011747926016, 140011756314623, +ERASE, 140011202658304, 140011202662399, +ERASE, 140011202662400, 140011211051007, +ERASE, 140010800005120, 140010800009215, +ERASE, 140010800009216, 140010808397823, +ERASE, 140011093618688, 140011093622783, +ERASE, 140011093622784, 140011102011391, +ERASE, 140010808397824, 140010808401919, +ERASE, 140010808401920, 140010816790527, +ERASE, 140012419010560, 140012419014655, +ERASE, 140012419014656, 140012427403263, +ERASE, 140010934222848, 140010934226943, +ERASE, 140010934226944, 140010942615551, +ERASE, 140010942615552, 140010942619647, +ERASE, 140010942619648, 140010951008255, +ERASE, 140011613704192, 140011613708287, +ERASE, 140011613708288, 140011622096895, +ERASE, 140011865354240, 140011865358335, +ERASE, 140011865358336, 140011873746943, +ERASE, 140012301578240, 140012301582335, +ERASE, 140012301582336, 140012309970943, +ERASE, 140012393832448, 140012393836543, +ERASE, 140012393836544, 140012402225151, +ERASE, 140012410617856, 140012410621951, +ERASE, 140012410621952, 140012419010559, +ERASE, 140012402225152, 140012402229247, +ERASE, 140012402229248, 140012410617855, +ERASE, 
140012259614720, 140012259618815, +ERASE, 140012259618816, 140012268007423, +ERASE, 140012251222016, 140012251226111, +ERASE, 140012251226112, 140012259614719, +ERASE, 140012284792832, 140012284796927, +ERASE, 140012284796928, 140012293185535, +ERASE, 140011445915648, 140011445919743, +ERASE, 140011445919744, 140011454308351, +ERASE, 140010951008256, 140010951012351, +ERASE, 140010951012352, 140010959400959, +ERASE, 140011043262464, 140011043266559, +ERASE, 140011043266560, 140011051655167, +ERASE, 140010825183232, 140010825187327, +ERASE, 140010825187328, 140010833575935, +ERASE, 140012293185536, 140012293189631, +ERASE, 140012293189632, 140012301578239, +ERASE, 140012276400128, 140012276404223, +ERASE, 140012276404224, 140012284792831, +ERASE, 140012016357376, 140012016361471, +ERASE, 140012016361472, 140012024750079, +ERASE, 140012024750080, 140012024754175, +ERASE, 140012024754176, 140012033142783, +ERASE, 140011227836416, 140011227840511, +ERASE, 140011227840512, 140011236229119, +ERASE, 140010816790528, 140010816794623, +ERASE, 140010816794624, 140010825183231, +ERASE, 140012268007424, 140012268011519, +ERASE, 140012268011520, 140012276400127, +ERASE, 140012385439744, 140012385443839, +ERASE, 140012385443840, 140012393832447, +ERASE, 140012522090496, 140012522094591, +ERASE, 140012522094592, 140012530483199, +ERASE, 140012033142784, 140012033146879, +ERASE, 140012033146880, 140012041535487, + }; + unsigned long set35[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140730536939520, 140737488351231, +SNULL, 140730536943615, 140737488351231, +STORE, 140730536939520, 140730536943615, +STORE, 140730536808448, 140730536943615, +STORE, 94245239877632, 94245242130431, +SNULL, 94245240008703, 94245242130431, +STORE, 94245239877632, 94245240008703, +STORE, 94245240008704, 94245242130431, +ERASE, 94245240008704, 94245242130431, +STORE, 94245242101760, 94245242109951, +STORE, 94245242109952, 94245242130431, +STORE, 140475575263232, 140475577516031, +SNULL, 
140475575406591, 140475577516031, +STORE, 140475575263232, 140475575406591, +STORE, 140475575406592, 140475577516031, +ERASE, 140475575406592, 140475577516031, +STORE, 140475577503744, 140475577511935, +STORE, 140475577511936, 140475577516031, +STORE, 140730538164224, 140730538168319, +STORE, 140730538151936, 140730538164223, +STORE, 140475577475072, 140475577503743, +STORE, 140475577466880, 140475577475071, +STORE, 140475573047296, 140475575263231, +SNULL, 140475573047296, 140475573145599, +STORE, 140475573145600, 140475575263231, +STORE, 140475573047296, 140475573145599, +SNULL, 140475575238655, 140475575263231, +STORE, 140475573145600, 140475575238655, +STORE, 140475575238656, 140475575263231, +SNULL, 140475575238656, 140475575246847, +STORE, 140475575246848, 140475575263231, +STORE, 140475575238656, 140475575246847, +ERASE, 140475575238656, 140475575246847, +STORE, 140475575238656, 140475575246847, +ERASE, 140475575246848, 140475575263231, +STORE, 140475575246848, 140475575263231, +STORE, 140475569250304, 140475573047295, +SNULL, 140475569250304, 140475570909183, +STORE, 140475570909184, 140475573047295, +STORE, 140475569250304, 140475570909183, +SNULL, 140475573006335, 140475573047295, +STORE, 140475570909184, 140475573006335, +STORE, 140475573006336, 140475573047295, +SNULL, 140475573006336, 140475573030911, +STORE, 140475573030912, 140475573047295, +STORE, 140475573006336, 140475573030911, +ERASE, 140475573006336, 140475573030911, +STORE, 140475573006336, 140475573030911, +ERASE, 140475573030912, 140475573047295, +STORE, 140475573030912, 140475573047295, +STORE, 140475577458688, 140475577475071, +SNULL, 140475573022719, 140475573030911, +STORE, 140475573006336, 140475573022719, +STORE, 140475573022720, 140475573030911, +SNULL, 140475575242751, 140475575246847, +STORE, 140475575238656, 140475575242751, +STORE, 140475575242752, 140475575246847, +SNULL, 94245242105855, 94245242109951, +STORE, 94245242101760, 94245242105855, +STORE, 94245242105856, 
94245242109951, +SNULL, 140475577507839, 140475577511935, +STORE, 140475577503744, 140475577507839, +STORE, 140475577507840, 140475577511935, +ERASE, 140475577475072, 140475577503743, +STORE, 94245271216128, 94245271351295, +STORE, 140475560857600, 140475569250303, +SNULL, 140475560861695, 140475569250303, +STORE, 140475560857600, 140475560861695, +STORE, 140475560861696, 140475569250303, +STORE, 140475552464896, 140475560857599, +STORE, 140475418247168, 140475552464895, +SNULL, 140475418247168, 140475428241407, +STORE, 140475428241408, 140475552464895, +STORE, 140475418247168, 140475428241407, +ERASE, 140475418247168, 140475428241407, +SNULL, 140475495350271, 140475552464895, +STORE, 140475428241408, 140475495350271, +STORE, 140475495350272, 140475552464895, +ERASE, 140475495350272, 140475552464895, +SNULL, 140475428376575, 140475495350271, +STORE, 140475428241408, 140475428376575, +STORE, 140475428376576, 140475495350271, +SNULL, 140475552468991, 140475560857599, +STORE, 140475552464896, 140475552468991, +STORE, 140475552468992, 140475560857599, +STORE, 140475544072192, 140475552464895, +SNULL, 140475544076287, 140475552464895, +STORE, 140475544072192, 140475544076287, +STORE, 140475544076288, 140475552464895, +STORE, 140475535679488, 140475544072191, +SNULL, 140475535683583, 140475544072191, +STORE, 140475535679488, 140475535683583, +STORE, 140475535683584, 140475544072191, +STORE, 140475527286784, 140475535679487, +SNULL, 140475527290879, 140475535679487, +STORE, 140475527286784, 140475527290879, +STORE, 140475527290880, 140475535679487, +STORE, 140475518894080, 140475527286783, +STORE, 140475510501376, 140475527286783, +STORE, 140475502108672, 140475527286783, +STORE, 140475419848704, 140475428241407, +STORE, 140475285630976, 140475419848703, +SNULL, 140475285630976, 140475294023679, +STORE, 140475294023680, 140475419848703, +STORE, 140475285630976, 140475294023679, +ERASE, 140475285630976, 140475294023679, +STORE, 140475159805952, 140475419848703, +STORE, 
140475025588224, 140475419848703, +SNULL, 140475092697087, 140475419848703, +STORE, 140475025588224, 140475092697087, +STORE, 140475092697088, 140475419848703, +SNULL, 140475092697088, 140475159805951, +STORE, 140475159805952, 140475419848703, +STORE, 140475092697088, 140475159805951, +ERASE, 140475092697088, 140475159805951, +STORE, 140474891370496, 140475092697087, +SNULL, 140474958479359, 140475092697087, +STORE, 140474891370496, 140474958479359, +STORE, 140474958479360, 140475092697087, +SNULL, 140474958479360, 140475025588223, +STORE, 140475025588224, 140475092697087, +STORE, 140474958479360, 140475025588223, +ERASE, 140474958479360, 140475025588223, +SNULL, 140475361132543, 140475419848703, +STORE, 140475159805952, 140475361132543, +STORE, 140475361132544, 140475419848703, +ERASE, 140475361132544, 140475419848703, +SNULL, 140475159805952, 140475294023679, +STORE, 140475294023680, 140475361132543, +STORE, 140475159805952, 140475294023679, +SNULL, 140475294158847, 140475361132543, +STORE, 140475294023680, 140475294158847, +STORE, 140475294158848, 140475361132543, +SNULL, 140475226914815, 140475294023679, +STORE, 140475159805952, 140475226914815, +STORE, 140475226914816, 140475294023679, +ERASE, 140475226914816, 140475294023679, +SNULL, 140475025723391, 140475092697087, +STORE, 140475025588224, 140475025723391, +STORE, 140475025723392, 140475092697087, +SNULL, 140475159941119, 140475226914815, +STORE, 140475159805952, 140475159941119, +STORE, 140475159941120, 140475226914815, +SNULL, 140474891505663, 140474958479359, +STORE, 140474891370496, 140474891505663, +STORE, 140474891505664, 140474958479359, +SNULL, 140475502108672, 140475518894079, +STORE, 140475518894080, 140475527286783, +STORE, 140475502108672, 140475518894079, +SNULL, 140475518898175, 140475527286783, +STORE, 140475518894080, 140475518898175, +STORE, 140475518898176, 140475527286783, +STORE, 140475411456000, 140475428241407, +SNULL, 140475502112767, 140475518894079, +STORE, 140475502108672, 
140475502112767, +STORE, 140475502112768, 140475518894079, +SNULL, 140475411460095, 140475428241407, +STORE, 140475411456000, 140475411460095, +STORE, 140475411460096, 140475428241407, +SNULL, 140475411460096, 140475419848703, +STORE, 140475419848704, 140475428241407, +STORE, 140475411460096, 140475419848703, +SNULL, 140475419852799, 140475428241407, +STORE, 140475419848704, 140475419852799, +STORE, 140475419852800, 140475428241407, +STORE, 140475403063296, 140475411455999, +SNULL, 140475502112768, 140475510501375, +STORE, 140475510501376, 140475518894079, +STORE, 140475502112768, 140475510501375, +SNULL, 140475510505471, 140475518894079, +STORE, 140475510501376, 140475510505471, +STORE, 140475510505472, 140475518894079, +SNULL, 140475403067391, 140475411455999, +STORE, 140475403063296, 140475403067391, +STORE, 140475403067392, 140475411455999, +STORE, 140475394670592, 140475403063295, +SNULL, 140475394674687, 140475403063295, +STORE, 140475394670592, 140475394674687, +STORE, 140475394674688, 140475403063295, +STORE, 140475386277888, 140475394670591, +STORE, 140475377885184, 140475394670591, +STORE, 140475369492480, 140475394670591, +SNULL, 140475369496575, 140475394670591, +STORE, 140475369492480, 140475369496575, +STORE, 140475369496576, 140475394670591, +SNULL, 140475369496576, 140475377885183, +STORE, 140475377885184, 140475394670591, +STORE, 140475369496576, 140475377885183, +SNULL, 140475377889279, 140475394670591, +STORE, 140475377885184, 140475377889279, +STORE, 140475377889280, 140475394670591, +STORE, 140475285630976, 140475294023679, +SNULL, 140475377889280, 140475386277887, +STORE, 140475386277888, 140475394670591, +STORE, 140475377889280, 140475386277887, +SNULL, 140475386281983, 140475394670591, +STORE, 140475386277888, 140475386281983, +STORE, 140475386281984, 140475394670591, +SNULL, 140475285635071, 140475294023679, +STORE, 140475285630976, 140475285635071, +STORE, 140475285635072, 140475294023679, +STORE, 140475277238272, 140475285630975, +STORE, 
140475268845568, 140475285630975, +SNULL, 140475268845568, 140475277238271, +STORE, 140475277238272, 140475285630975, +STORE, 140475268845568, 140475277238271, +SNULL, 140475277242367, 140475285630975, +STORE, 140475277238272, 140475277242367, +STORE, 140475277242368, 140475285630975, +STORE, 140475260452864, 140475277238271, +SNULL, 140475260452864, 140475268845567, +STORE, 140475268845568, 140475277238271, +STORE, 140475260452864, 140475268845567, +SNULL, 140475268849663, 140475277238271, +STORE, 140475268845568, 140475268849663, +STORE, 140475268849664, 140475277238271, +SNULL, 140475260456959, 140475268845567, +STORE, 140475260452864, 140475260456959, +STORE, 140475260456960, 140475268845567, +STORE, 140475252060160, 140475260452863, +SNULL, 140475252064255, 140475260452863, +STORE, 140475252060160, 140475252064255, +STORE, 140475252064256, 140475260452863, +STORE, 140475243667456, 140475252060159, +SNULL, 140475243671551, 140475252060159, +STORE, 140475243667456, 140475243671551, +STORE, 140475243671552, 140475252060159, +STORE, 140475235274752, 140475243667455, +STORE, 140475151413248, 140475159805951, +STORE, 140474891505664, 140475025588223, +STORE, 140475143020544, 140475159805951, +SNULL, 140474891505664, 140474958479359, +STORE, 140474958479360, 140475025588223, +STORE, 140474891505664, 140474958479359, +SNULL, 140474958614527, 140475025588223, +STORE, 140474958479360, 140474958614527, +STORE, 140474958614528, 140475025588223, +STORE, 140474824261632, 140474891370495, +SNULL, 140474824396799, 140474891370495, +STORE, 140474824261632, 140474824396799, +STORE, 140474824396800, 140474891370495, +STORE, 140475134627840, 140475159805951, +STORE, 140474690043904, 140474824261631, +STORE, 140475126235136, 140475159805951, +STORE, 140475117842432, 140475159805951, +STORE, 140474622935040, 140474824261631, +STORE, 140475109449728, 140475159805951, +STORE, 140474488717312, 140474824261631, +STORE, 140475101057024, 140475159805951, +STORE, 140474480324608, 
140474488717311, +STORE, 140474413215744, 140474480324607, +STORE, 140474404823040, 140474413215743, +ERASE, 140474413215744, 140474480324607, +STORE, 140474471931904, 140474488717311, +STORE, 140474270605312, 140474404823039, +SNULL, 140475101057024, 140475126235135, +STORE, 140475126235136, 140475159805951, +STORE, 140475101057024, 140475126235135, +SNULL, 140475126239231, 140475159805951, +STORE, 140475126235136, 140475126239231, +STORE, 140475126239232, 140475159805951, +STORE, 140474463539200, 140474488717311, +STORE, 140474455146496, 140474488717311, +SNULL, 140474455150591, 140474488717311, +STORE, 140474455146496, 140474455150591, +STORE, 140474455150592, 140474488717311, +STORE, 140474446753792, 140474455146495, +SNULL, 140474446757887, 140474455146495, +STORE, 140474446753792, 140474446757887, +STORE, 140474446757888, 140474455146495, +STORE, 140474438361088, 140474446753791, +STORE, 140474429968384, 140474446753791, +SNULL, 140474429972479, 140474446753791, +STORE, 140474429968384, 140474429972479, +STORE, 140474429972480, 140474446753791, +SNULL, 140475235278847, 140475243667455, +STORE, 140475235274752, 140475235278847, +STORE, 140475235278848, 140475243667455, +SNULL, 140474757152767, 140474824261631, +STORE, 140474488717312, 140474757152767, +STORE, 140474757152768, 140474824261631, +ERASE, 140474757152768, 140474824261631, +SNULL, 140474488717312, 140474690043903, +STORE, 140474690043904, 140474757152767, +STORE, 140474488717312, 140474690043903, +SNULL, 140474690179071, 140474757152767, +STORE, 140474690043904, 140474690179071, +STORE, 140474690179072, 140474757152767, +SNULL, 140474488717312, 140474622935039, +STORE, 140474622935040, 140474690043903, +STORE, 140474488717312, 140474622935039, +SNULL, 140474623070207, 140474690043903, +STORE, 140474622935040, 140474623070207, +STORE, 140474623070208, 140474690043903, +SNULL, 140475101057024, 140475117842431, +STORE, 140475117842432, 140475126235135, +STORE, 140475101057024, 140475117842431, +SNULL, 
140475117846527, 140475126235135, +STORE, 140475117842432, 140475117846527, +STORE, 140475117846528, 140475126235135, +SNULL, 140474555826175, 140474622935039, +STORE, 140474488717312, 140474555826175, +STORE, 140474555826176, 140474622935039, +ERASE, 140474555826176, 140474622935039, +STORE, 140474136387584, 140474404823039, +SNULL, 140474136387584, 140474153172991, +STORE, 140474153172992, 140474404823039, +STORE, 140474136387584, 140474153172991, +ERASE, 140474136387584, 140474153172991, +STORE, 140474018955264, 140474404823039, +STORE, 140473884737536, 140474404823039, +SNULL, 140474086064127, 140474404823039, +STORE, 140473884737536, 140474086064127, +STORE, 140474086064128, 140474404823039, +SNULL, 140474086064128, 140474153172991, +STORE, 140474153172992, 140474404823039, +STORE, 140474086064128, 140474153172991, +ERASE, 140474086064128, 140474153172991, +STORE, 140473750519808, 140474086064127, +SNULL, 140473817628671, 140474086064127, +STORE, 140473750519808, 140473817628671, +STORE, 140473817628672, 140474086064127, +SNULL, 140473817628672, 140473884737535, +STORE, 140473884737536, 140474086064127, +STORE, 140473817628672, 140473884737535, +ERASE, 140473817628672, 140473884737535, +SNULL, 140475126239232, 140475151413247, +STORE, 140475151413248, 140475159805951, +STORE, 140475126239232, 140475151413247, +SNULL, 140475151417343, 140475159805951, +STORE, 140475151413248, 140475151417343, +STORE, 140475151417344, 140475159805951, +SNULL, 140474270605311, 140474404823039, +STORE, 140474153172992, 140474270605311, +STORE, 140474270605312, 140474404823039, +SNULL, 140474270605312, 140474287390719, +STORE, 140474287390720, 140474404823039, +STORE, 140474270605312, 140474287390719, +ERASE, 140474270605312, 140474287390719, +SNULL, 140474429972480, 140474438361087, +STORE, 140474438361088, 140474446753791, +STORE, 140474429972480, 140474438361087, +SNULL, 140474438365183, 140474446753791, +STORE, 140474438361088, 140474438365183, +STORE, 140474438365184, 
140474446753791, +STORE, 140474815868928, 140474824261631, +SNULL, 140474815873023, 140474824261631, +STORE, 140474815868928, 140474815873023, +STORE, 140474815873024, 140474824261631, +SNULL, 140474220281855, 140474270605311, +STORE, 140474153172992, 140474220281855, +STORE, 140474220281856, 140474270605311, +ERASE, 140474220281856, 140474270605311, +SNULL, 140474488852479, 140474555826175, +STORE, 140474488717312, 140474488852479, +STORE, 140474488852480, 140474555826175, +SNULL, 140475101057024, 140475109449727, +STORE, 140475109449728, 140475117842431, +STORE, 140475101057024, 140475109449727, +SNULL, 140475109453823, 140475117842431, +STORE, 140475109449728, 140475109453823, +STORE, 140475109453824, 140475117842431, +SNULL, 140473951846399, 140474086064127, +STORE, 140473884737536, 140473951846399, +STORE, 140473951846400, 140474086064127, +SNULL, 140473951846400, 140474018955263, +STORE, 140474018955264, 140474086064127, +STORE, 140473951846400, 140474018955263, +ERASE, 140473951846400, 140474018955263, +SNULL, 140473884872703, 140473951846399, +STORE, 140473884737536, 140473884872703, +STORE, 140473884872704, 140473951846399, +SNULL, 140474019090431, 140474086064127, +STORE, 140474018955264, 140474019090431, +STORE, 140474019090432, 140474086064127, +SNULL, 140473750654975, 140473817628671, +STORE, 140473750519808, 140473750654975, +STORE, 140473750654976, 140473817628671, +SNULL, 140474455150592, 140474463539199, +STORE, 140474463539200, 140474488717311, +STORE, 140474455150592, 140474463539199, +SNULL, 140474463543295, 140474488717311, +STORE, 140474463539200, 140474463543295, +STORE, 140474463543296, 140474488717311, +STORE, 140474807476224, 140474815868927, +SNULL, 140474463543296, 140474471931903, +STORE, 140474471931904, 140474488717311, +STORE, 140474463543296, 140474471931903, +SNULL, 140474471935999, 140474488717311, +STORE, 140474471931904, 140474471935999, +STORE, 140474471936000, 140474488717311, +STORE, 140474799083520, 140474815868927, +STORE, 
140474790690816, 140474815868927, +SNULL, 140474790690816, 140474799083519, +STORE, 140474799083520, 140474815868927, +STORE, 140474790690816, 140474799083519, +SNULL, 140474799087615, 140474815868927, +STORE, 140474799083520, 140474799087615, +STORE, 140474799087616, 140474815868927, +SNULL, 140474354499583, 140474404823039, +STORE, 140474287390720, 140474354499583, +STORE, 140474354499584, 140474404823039, +ERASE, 140474354499584, 140474404823039, +SNULL, 140474287525887, 140474354499583, +STORE, 140474287390720, 140474287525887, +STORE, 140474287525888, 140474354499583, +STORE, 140474782298112, 140474799083519, +STORE, 140474773905408, 140474799083519, +SNULL, 140474773909503, 140474799083519, +STORE, 140474773905408, 140474773909503, +STORE, 140474773909504, 140474799083519, +SNULL, 140475126239232, 140475134627839, +STORE, 140475134627840, 140475151413247, +STORE, 140475126239232, 140475134627839, +SNULL, 140475134631935, 140475151413247, +STORE, 140475134627840, 140475134631935, +STORE, 140475134631936, 140475151413247, +STORE, 140474765512704, 140474773905407, +STORE, 140474614542336, 140474622935039, +SNULL, 140474153308159, 140474220281855, +STORE, 140474153172992, 140474153308159, +STORE, 140474153308160, 140474220281855, +SNULL, 140474404827135, 140474413215743, +STORE, 140474404823040, 140474404827135, +STORE, 140474404827136, 140474413215743, +STORE, 140474606149632, 140474622935039, +SNULL, 140474606153727, 140474622935039, +STORE, 140474606149632, 140474606153727, +STORE, 140474606153728, 140474622935039, +STORE, 140474597756928, 140474606149631, +SNULL, 140474597761023, 140474606149631, +STORE, 140474597756928, 140474597761023, +STORE, 140474597761024, 140474606149631, +SNULL, 140475134631936, 140475143020543, +STORE, 140475143020544, 140475151413247, +STORE, 140475134631936, 140475143020543, +SNULL, 140475143024639, 140475151413247, +STORE, 140475143020544, 140475143024639, +STORE, 140475143024640, 140475151413247, +STORE, 140474589364224, 
140474597756927, +SNULL, 140474606153728, 140474614542335, +STORE, 140474614542336, 140474622935039, +STORE, 140474606153728, 140474614542335, +SNULL, 140474614546431, 140474622935039, +STORE, 140474614542336, 140474614546431, +STORE, 140474614546432, 140474622935039, +SNULL, 140474765516799, 140474773905407, +STORE, 140474765512704, 140474765516799, +STORE, 140474765516800, 140474773905407, +STORE, 140474580971520, 140474597756927, +SNULL, 140474773909504, 140474782298111, +STORE, 140474782298112, 140474799083519, +STORE, 140474773909504, 140474782298111, +SNULL, 140474782302207, 140474799083519, +STORE, 140474782298112, 140474782302207, +STORE, 140474782302208, 140474799083519, +SNULL, 140474471936000, 140474480324607, +STORE, 140474480324608, 140474488717311, +STORE, 140474471936000, 140474480324607, +SNULL, 140474480328703, 140474488717311, +STORE, 140474480324608, 140474480328703, +STORE, 140474480328704, 140474488717311, +STORE, 140474572578816, 140474597756927, +SNULL, 140474572582911, 140474597756927, +STORE, 140474572578816, 140474572582911, +STORE, 140474572582912, 140474597756927, +SNULL, 140474782302208, 140474790690815, +STORE, 140474790690816, 140474799083519, +STORE, 140474782302208, 140474790690815, +SNULL, 140474790694911, 140474799083519, +STORE, 140474790690816, 140474790694911, +STORE, 140474790694912, 140474799083519, +STORE, 140474564186112, 140474572578815, +STORE, 140474421575680, 140474429968383, +STORE, 140474396430336, 140474404823039, +SNULL, 140474396434431, 140474404823039, +STORE, 140474396430336, 140474396434431, +STORE, 140474396434432, 140474404823039, +STORE, 140474388037632, 140474396430335, +SNULL, 140474799087616, 140474807476223, +STORE, 140474807476224, 140474815868927, +STORE, 140474799087616, 140474807476223, +SNULL, 140474807480319, 140474815868927, +STORE, 140474807476224, 140474807480319, +STORE, 140474807480320, 140474815868927, +SNULL, 140475101061119, 140475109449727, +STORE, 140475101057024, 140475101061119, +STORE, 
140475101061120, 140475109449727, +STORE, 140474379644928, 140474396430335, +SNULL, 140474572582912, 140474589364223, +STORE, 140474589364224, 140474597756927, +STORE, 140474572582912, 140474589364223, +SNULL, 140474589368319, 140474597756927, +STORE, 140474589364224, 140474589368319, +STORE, 140474589368320, 140474597756927, +STORE, 140474371252224, 140474396430335, +STORE, 140474362859520, 140474396430335, +STORE, 140474278998016, 140474287390719, +STORE, 140474270605312, 140474287390719, +STORE, 140474262212608, 140474287390719, +SNULL, 140474262216703, 140474287390719, +STORE, 140474262212608, 140474262216703, +STORE, 140474262216704, 140474287390719, +STORE, 140474253819904, 140474262212607, +SNULL, 140474253823999, 140474262212607, +STORE, 140474253819904, 140474253823999, +STORE, 140474253824000, 140474262212607, +SNULL, 140474362859520, 140474388037631, +STORE, 140474388037632, 140474396430335, +STORE, 140474362859520, 140474388037631, +SNULL, 140474388041727, 140474396430335, +STORE, 140474388037632, 140474388041727, +STORE, 140474388041728, 140474396430335, +SNULL, 140474362859520, 140474379644927, +STORE, 140474379644928, 140474388037631, +STORE, 140474362859520, 140474379644927, +SNULL, 140474379649023, 140474388037631, +STORE, 140474379644928, 140474379649023, +STORE, 140474379649024, 140474388037631, +STORE, 140474245427200, 140474253819903, +STORE, 140474237034496, 140474253819903, +STORE, 140474228641792, 140474253819903, +STORE, 140474144780288, 140474153172991, +SNULL, 140474228645887, 140474253819903, +STORE, 140474228641792, 140474228645887, +STORE, 140474228645888, 140474253819903, +SNULL, 140474564190207, 140474572578815, +STORE, 140474564186112, 140474564190207, +STORE, 140474564190208, 140474572578815, +STORE, 140474136387584, 140474153172991, +SNULL, 140474362859520, 140474371252223, +STORE, 140474371252224, 140474379644927, +STORE, 140474362859520, 140474371252223, +SNULL, 140474371256319, 140474379644927, +STORE, 140474371252224, 
140474371256319, +STORE, 140474371256320, 140474379644927, +STORE, 140474127994880, 140474153172991, +STORE, 140474119602176, 140474153172991, +SNULL, 140474421579775, 140474429968383, +STORE, 140474421575680, 140474421579775, +STORE, 140474421579776, 140474429968383, +STORE, 140474111209472, 140474153172991, +SNULL, 140474111213567, 140474153172991, +STORE, 140474111209472, 140474111213567, +STORE, 140474111213568, 140474153172991, +SNULL, 140474262216704, 140474270605311, +STORE, 140474270605312, 140474287390719, +STORE, 140474262216704, 140474270605311, +SNULL, 140474270609407, 140474287390719, +STORE, 140474270605312, 140474270609407, +STORE, 140474270609408, 140474287390719, +STORE, 140474102816768, 140474111209471, +SNULL, 140474102820863, 140474111209471, +STORE, 140474102816768, 140474102820863, +STORE, 140474102820864, 140474111209471, +SNULL, 140474270609408, 140474278998015, +STORE, 140474278998016, 140474287390719, +STORE, 140474270609408, 140474278998015, +SNULL, 140474279002111, 140474287390719, +STORE, 140474278998016, 140474279002111, +STORE, 140474279002112, 140474287390719, +STORE, 140474094424064, 140474102816767, +SNULL, 140474572582912, 140474580971519, +STORE, 140474580971520, 140474589364223, +STORE, 140474572582912, 140474580971519, +SNULL, 140474580975615, 140474589364223, +STORE, 140474580971520, 140474580975615, +STORE, 140474580975616, 140474589364223, +SNULL, 140474362863615, 140474371252223, +STORE, 140474362859520, 140474362863615, +STORE, 140474362863616, 140474371252223, +STORE, 140474010562560, 140474018955263, +SNULL, 140474228645888, 140474245427199, +STORE, 140474245427200, 140474253819903, +STORE, 140474228645888, 140474245427199, +SNULL, 140474245431295, 140474253819903, +STORE, 140474245427200, 140474245431295, +STORE, 140474245431296, 140474253819903, +SNULL, 140474111213568, 140474136387583, +STORE, 140474136387584, 140474153172991, +STORE, 140474111213568, 140474136387583, +SNULL, 140474136391679, 140474153172991, +STORE, 
140474136387584, 140474136391679, +STORE, 140474136391680, 140474153172991, +STORE, 140474002169856, 140474018955263, +STORE, 140473993777152, 140474018955263, +SNULL, 140474111213568, 140474127994879, +STORE, 140474127994880, 140474136387583, +STORE, 140474111213568, 140474127994879, +SNULL, 140474127998975, 140474136387583, +STORE, 140474127994880, 140474127998975, +STORE, 140474127998976, 140474136387583, +SNULL, 140474228645888, 140474237034495, +STORE, 140474237034496, 140474245427199, +STORE, 140474228645888, 140474237034495, +SNULL, 140474237038591, 140474245427199, +STORE, 140474237034496, 140474237038591, +STORE, 140474237038592, 140474245427199, +SNULL, 140474136391680, 140474144780287, +STORE, 140474144780288, 140474153172991, +STORE, 140474136391680, 140474144780287, +SNULL, 140474144784383, 140474153172991, +STORE, 140474144780288, 140474144784383, +STORE, 140474144784384, 140474153172991, +STORE, 140473985384448, 140474018955263, +STORE, 140473976991744, 140474018955263, +STORE, 140473968599040, 140474018955263, +SNULL, 140473968603135, 140474018955263, +STORE, 140473968599040, 140473968603135, +STORE, 140473968603136, 140474018955263, +SNULL, 140474111213568, 140474119602175, +STORE, 140474119602176, 140474127994879, +STORE, 140474111213568, 140474119602175, +SNULL, 140474119606271, 140474127994879, +STORE, 140474119602176, 140474119606271, +STORE, 140474119606272, 140474127994879, +STORE, 140473960206336, 140473968599039, +SNULL, 140474094428159, 140474102816767, +STORE, 140474094424064, 140474094428159, +STORE, 140474094428160, 140474102816767, +STORE, 140473876344832, 140473884737535, +STORE, 140473867952128, 140473884737535, +STORE, 140473859559424, 140473884737535, +SNULL, 140473859563519, 140473884737535, +STORE, 140473859559424, 140473859563519, +STORE, 140473859563520, 140473884737535, +SNULL, 140473968603136, 140473993777151, +STORE, 140473993777152, 140474018955263, +STORE, 140473968603136, 140473993777151, +SNULL, 140473993781247, 
140474018955263, +STORE, 140473993777152, 140473993781247, +STORE, 140473993781248, 140474018955263, +SNULL, 140473960210431, 140473968599039, +STORE, 140473960206336, 140473960210431, +STORE, 140473960210432, 140473968599039, +SNULL, 140473993781248, 140474010562559, +STORE, 140474010562560, 140474018955263, +STORE, 140473993781248, 140474010562559, +SNULL, 140474010566655, 140474018955263, +STORE, 140474010562560, 140474010566655, +STORE, 140474010566656, 140474018955263, +SNULL, 140473968603136, 140473985384447, +STORE, 140473985384448, 140473993777151, +STORE, 140473968603136, 140473985384447, +SNULL, 140473985388543, 140473993777151, +STORE, 140473985384448, 140473985388543, +STORE, 140473985388544, 140473993777151, +SNULL, 140473993781248, 140474002169855, +STORE, 140474002169856, 140474010562559, +STORE, 140473993781248, 140474002169855, +SNULL, 140474002173951, 140474010562559, +STORE, 140474002169856, 140474002173951, +STORE, 140474002173952, 140474010562559, +STORE, 140473851166720, 140473859559423, +SNULL, 140473851170815, 140473859559423, +STORE, 140473851166720, 140473851170815, +STORE, 140473851170816, 140473859559423, +SNULL, 140473968603136, 140473976991743, +STORE, 140473976991744, 140473985384447, +STORE, 140473968603136, 140473976991743, +SNULL, 140473976995839, 140473985384447, +STORE, 140473976991744, 140473976995839, +STORE, 140473976995840, 140473985384447, +STORE, 140473842774016, 140473851166719, +SNULL, 140473859563520, 140473867952127, +STORE, 140473867952128, 140473884737535, +STORE, 140473859563520, 140473867952127, +SNULL, 140473867956223, 140473884737535, +STORE, 140473867952128, 140473867956223, +STORE, 140473867956224, 140473884737535, +SNULL, 140473867956224, 140473876344831, +STORE, 140473876344832, 140473884737535, +STORE, 140473867956224, 140473876344831, +SNULL, 140473876348927, 140473884737535, +STORE, 140473876344832, 140473876348927, +STORE, 140473876348928, 140473884737535, +STORE, 140473834381312, 140473851166719, +SNULL, 
140473834385407, 140473851166719, +STORE, 140473834381312, 140473834385407, +STORE, 140473834385408, 140473851166719, +SNULL, 140473834385408, 140473842774015, +STORE, 140473842774016, 140473851166719, +STORE, 140473834385408, 140473842774015, +SNULL, 140473842778111, 140473851166719, +STORE, 140473842774016, 140473842778111, +STORE, 140473842778112, 140473851166719, +STORE, 140473825988608, 140473834381311, +SNULL, 140473825992703, 140473834381311, +STORE, 140473825988608, 140473825992703, +STORE, 140473825992704, 140473834381311, +STORE, 140475577475072, 140475577503743, +STORE, 140475499917312, 140475502108671, +SNULL, 140475499917312, 140475500007423, +STORE, 140475500007424, 140475502108671, +STORE, 140475499917312, 140475500007423, +SNULL, 140475502100479, 140475502108671, +STORE, 140475500007424, 140475502100479, +STORE, 140475502100480, 140475502108671, +ERASE, 140475502100480, 140475502108671, +STORE, 140475502100480, 140475502108671, +SNULL, 140475502104575, 140475502108671, +STORE, 140475502100480, 140475502104575, +STORE, 140475502104576, 140475502108671, +ERASE, 140475577475072, 140475577503743, +ERASE, 140475235274752, 140475235278847, +ERASE, 140475235278848, 140475243667455, +ERASE, 140474815868928, 140474815873023, +ERASE, 140474815873024, 140474824261631, +ERASE, 140474606149632, 140474606153727, +ERASE, 140474606153728, 140474614542335, +ERASE, 140474270605312, 140474270609407, +ERASE, 140474270609408, 140474278998015, +ERASE, 140474438361088, 140474438365183, +ERASE, 140474438365184, 140474446753791, +ERASE, 140474597756928, 140474597761023, +ERASE, 140474597761024, 140474606149631, +ERASE, 140475126235136, 140475126239231, +ERASE, 140475126239232, 140475134627839, +ERASE, 140474463539200, 140474463543295, +ERASE, 140474463543296, 140474471931903, +ERASE, 140474388037632, 140474388041727, +ERASE, 140474388041728, 140474396430335, +ERASE, 140474404823040, 140474404827135, +ERASE, 140474404827136, 140474413215743, +ERASE, 140474278998016, 
140474279002111, +ERASE, 140474279002112, 140474287390719, +ERASE, 140474094424064, 140474094428159, +ERASE, 140474094428160, 140474102816767, +ERASE, 140473867952128, 140473867956223, +ERASE, 140473867956224, 140473876344831, +ERASE, 140475151413248, 140475151417343, +ERASE, 140475151417344, 140475159805951, +ERASE, 140474455146496, 140474455150591, +ERASE, 140474455150592, 140474463539199, +ERASE, 140474807476224, 140474807480319, +ERASE, 140474807480320, 140474815868927, +ERASE, 140475117842432, 140475117846527, +ERASE, 140475117846528, 140475126235135, +ERASE, 140474446753792, 140474446757887, +ERASE, 140474446757888, 140474455146495, +ERASE, 140474429968384, 140474429972479, +ERASE, 140474429972480, 140474438361087, +ERASE, 140474782298112, 140474782302207, +ERASE, 140474782302208, 140474790690815, +ERASE, 140474136387584, 140474136391679, +ERASE, 140474136391680, 140474144780287, +ERASE, 140474002169856, 140474002173951, +ERASE, 140474002173952, 140474010562559, +ERASE, 140475134627840, 140475134631935, +ERASE, 140475134631936, 140475143020543, +ERASE, 140474471931904, 140474471935999, +ERASE, 140474471936000, 140474480324607, +ERASE, 140474396430336, 140474396434431, +ERASE, 140474396434432, 140474404823039, + }; + unsigned long set36[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140723893125120, 140737488351231, +SNULL, 140723893129215, 140737488351231, +STORE, 140723893125120, 140723893129215, +STORE, 140723892994048, 140723893129215, +STORE, 94076829786112, 94076832038911, +SNULL, 94076829917183, 94076832038911, +STORE, 94076829786112, 94076829917183, +STORE, 94076829917184, 94076832038911, +ERASE, 94076829917184, 94076832038911, +STORE, 94076832010240, 94076832018431, +STORE, 94076832018432, 94076832038911, +STORE, 140122444345344, 140122446598143, +SNULL, 140122444488703, 140122446598143, +STORE, 140122444345344, 140122444488703, +STORE, 140122444488704, 140122446598143, +ERASE, 140122444488704, 140122446598143, +STORE, 140122446585856, 
140122446594047, +STORE, 140122446594048, 140122446598143, +STORE, 140723893538816, 140723893542911, +STORE, 140723893526528, 140723893538815, +STORE, 140122446557184, 140122446585855, +STORE, 140122446548992, 140122446557183, +STORE, 140122442129408, 140122444345343, +SNULL, 140122442129408, 140122442227711, +STORE, 140122442227712, 140122444345343, +STORE, 140122442129408, 140122442227711, +SNULL, 140122444320767, 140122444345343, +STORE, 140122442227712, 140122444320767, +STORE, 140122444320768, 140122444345343, +SNULL, 140122444320768, 140122444328959, +STORE, 140122444328960, 140122444345343, +STORE, 140122444320768, 140122444328959, +ERASE, 140122444320768, 140122444328959, +STORE, 140122444320768, 140122444328959, +ERASE, 140122444328960, 140122444345343, +STORE, 140122444328960, 140122444345343, +STORE, 140122438332416, 140122442129407, +SNULL, 140122438332416, 140122439991295, +STORE, 140122439991296, 140122442129407, +STORE, 140122438332416, 140122439991295, +SNULL, 140122442088447, 140122442129407, +STORE, 140122439991296, 140122442088447, +STORE, 140122442088448, 140122442129407, +SNULL, 140122442088448, 140122442113023, +STORE, 140122442113024, 140122442129407, +STORE, 140122442088448, 140122442113023, +ERASE, 140122442088448, 140122442113023, +STORE, 140122442088448, 140122442113023, +ERASE, 140122442113024, 140122442129407, +STORE, 140122442113024, 140122442129407, +STORE, 140122446540800, 140122446557183, +SNULL, 140122442104831, 140122442113023, +STORE, 140122442088448, 140122442104831, +STORE, 140122442104832, 140122442113023, +SNULL, 140122444324863, 140122444328959, +STORE, 140122444320768, 140122444324863, +STORE, 140122444324864, 140122444328959, +SNULL, 94076832014335, 94076832018431, +STORE, 94076832010240, 94076832014335, +STORE, 94076832014336, 94076832018431, +SNULL, 140122446589951, 140122446594047, +STORE, 140122446585856, 140122446589951, +STORE, 140122446589952, 140122446594047, +ERASE, 140122446557184, 140122446585855, +STORE, 
94076845723648, 94076845858815, +STORE, 140122429939712, 140122438332415, +SNULL, 140122429943807, 140122438332415, +STORE, 140122429939712, 140122429943807, +STORE, 140122429943808, 140122438332415, +STORE, 140122421547008, 140122429939711, +STORE, 140122287329280, 140122421547007, +SNULL, 140122287329280, 140122301399039, +STORE, 140122301399040, 140122421547007, +STORE, 140122287329280, 140122301399039, +ERASE, 140122287329280, 140122301399039, +SNULL, 140122368507903, 140122421547007, +STORE, 140122301399040, 140122368507903, +STORE, 140122368507904, 140122421547007, +ERASE, 140122368507904, 140122421547007, +SNULL, 140122301534207, 140122368507903, +STORE, 140122301399040, 140122301534207, +STORE, 140122301534208, 140122368507903, +SNULL, 140122421551103, 140122429939711, +STORE, 140122421547008, 140122421551103, +STORE, 140122421551104, 140122429939711, +STORE, 140122413154304, 140122421547007, +SNULL, 140122413158399, 140122421547007, +STORE, 140122413154304, 140122413158399, +STORE, 140122413158400, 140122421547007, +STORE, 140122404761600, 140122413154303, +SNULL, 140122404765695, 140122413154303, +STORE, 140122404761600, 140122404765695, +STORE, 140122404765696, 140122413154303, +STORE, 140122396368896, 140122404761599, +SNULL, 140122396372991, 140122404761599, +STORE, 140122396368896, 140122396372991, +STORE, 140122396372992, 140122404761599, +STORE, 140122387976192, 140122396368895, +STORE, 140122167181312, 140122301399039, +SNULL, 140122234290175, 140122301399039, +STORE, 140122167181312, 140122234290175, +STORE, 140122234290176, 140122301399039, +ERASE, 140122234290176, 140122301399039, +SNULL, 140122167316479, 140122234290175, +STORE, 140122167181312, 140122167316479, +STORE, 140122167316480, 140122234290175, +STORE, 140122379583488, 140122396368895, +STORE, 140122371190784, 140122396368895, +STORE, 140122167316480, 140122301399039, +STORE, 140122158788608, 140122167181311, +SNULL, 140122371190784, 140122387976191, +STORE, 140122387976192, 
140122396368895, +STORE, 140122371190784, 140122387976191, +SNULL, 140122387980287, 140122396368895, +STORE, 140122387976192, 140122387980287, +STORE, 140122387980288, 140122396368895, +SNULL, 140122167316480, 140122234290175, +STORE, 140122234290176, 140122301399039, +STORE, 140122167316480, 140122234290175, +SNULL, 140122234425343, 140122301399039, +STORE, 140122234290176, 140122234425343, +STORE, 140122234425344, 140122301399039, +STORE, 140122024570880, 140122158788607, +SNULL, 140122024570880, 140122032963583, +STORE, 140122032963584, 140122158788607, +STORE, 140122024570880, 140122032963583, +ERASE, 140122024570880, 140122032963583, +STORE, 140121898745856, 140122158788607, +STORE, 140121890353152, 140121898745855, +SNULL, 140122100072447, 140122158788607, +STORE, 140121898745856, 140122100072447, +STORE, 140122100072448, 140122158788607, +ERASE, 140122100072448, 140122158788607, +SNULL, 140121965854719, 140122100072447, +STORE, 140121898745856, 140121965854719, +STORE, 140121965854720, 140122100072447, +SNULL, 140121965854720, 140122032963583, +STORE, 140122032963584, 140122100072447, +STORE, 140121965854720, 140122032963583, +ERASE, 140121965854720, 140122032963583, +SNULL, 140121898881023, 140121965854719, +STORE, 140121898745856, 140121898881023, +STORE, 140121898881024, 140121965854719, +SNULL, 140121890357247, 140121898745855, +STORE, 140121890353152, 140121890357247, +STORE, 140121890357248, 140121898745855, +SNULL, 140122371190784, 140122379583487, +STORE, 140122379583488, 140122387976191, +STORE, 140122371190784, 140122379583487, +SNULL, 140122379587583, 140122387976191, +STORE, 140122379583488, 140122379587583, +STORE, 140122379587584, 140122387976191, +SNULL, 140122033098751, 140122100072447, +STORE, 140122032963584, 140122033098751, +STORE, 140122033098752, 140122100072447, +SNULL, 140122158792703, 140122167181311, +STORE, 140122158788608, 140122158792703, +STORE, 140122158792704, 140122167181311, +STORE, 140122150395904, 140122158788607, +STORE, 
140122142003200, 140122158788607, +SNULL, 140122142007295, 140122158788607, +STORE, 140122142003200, 140122142007295, +STORE, 140122142007296, 140122158788607, +SNULL, 140122371194879, 140122379583487, +STORE, 140122371190784, 140122371194879, +STORE, 140122371194880, 140122379583487, +SNULL, 140122142007296, 140122150395903, +STORE, 140122150395904, 140122158788607, +STORE, 140122142007296, 140122150395903, +SNULL, 140122150399999, 140122158788607, +STORE, 140122150395904, 140122150399999, +STORE, 140122150400000, 140122158788607, +STORE, 140122133610496, 140122142003199, +STORE, 140122125217792, 140122142003199, +STORE, 140122116825088, 140122142003199, +SNULL, 140122116829183, 140122142003199, +STORE, 140122116825088, 140122116829183, +STORE, 140122116829184, 140122142003199, +SNULL, 140122116829184, 140122133610495, +STORE, 140122133610496, 140122142003199, +STORE, 140122116829184, 140122133610495, +SNULL, 140122133614591, 140122142003199, +STORE, 140122133610496, 140122133614591, +STORE, 140122133614592, 140122142003199, +SNULL, 140122116829184, 140122125217791, +STORE, 140122125217792, 140122133610495, +STORE, 140122116829184, 140122125217791, +SNULL, 140122125221887, 140122133610495, +STORE, 140122125217792, 140122125221887, +STORE, 140122125221888, 140122133610495, +STORE, 140122108432384, 140122116825087, +SNULL, 140122108436479, 140122116825087, +STORE, 140122108432384, 140122108436479, +STORE, 140122108436480, 140122116825087, +STORE, 140122024570880, 140122032963583, +STORE, 140122016178176, 140122032963583, +SNULL, 140122016182271, 140122032963583, +STORE, 140122016178176, 140122016182271, +STORE, 140122016182272, 140122032963583, +SNULL, 140122016182272, 140122024570879, +STORE, 140122024570880, 140122032963583, +STORE, 140122016182272, 140122024570879, +SNULL, 140122024574975, 140122032963583, +STORE, 140122024570880, 140122024574975, +STORE, 140122024574976, 140122032963583, +STORE, 140122007785472, 140122016178175, +SNULL, 140122007789567, 
140122016178175, +STORE, 140122007785472, 140122007789567, +STORE, 140122007789568, 140122016178175, +STORE, 140121999392768, 140122007785471, +STORE, 140121991000064, 140122007785471, +SNULL, 140121991004159, 140122007785471, +STORE, 140121991000064, 140121991004159, +STORE, 140121991004160, 140122007785471, +SNULL, 140121991004160, 140121999392767, +STORE, 140121999392768, 140122007785471, +STORE, 140121991004160, 140121999392767, +SNULL, 140121999396863, 140122007785471, +STORE, 140121999392768, 140121999396863, +STORE, 140121999396864, 140122007785471, +STORE, 140121982607360, 140121991000063, +STORE, 140121823244288, 140121890353151, +ERASE, 140121823244288, 140121890353151, +STORE, 140121756135424, 140121890353151, +SNULL, 140121756135424, 140121764528127, +STORE, 140121764528128, 140121890353151, +STORE, 140121756135424, 140121764528127, +ERASE, 140121756135424, 140121764528127, +SNULL, 140121831636991, 140121890353151, +STORE, 140121764528128, 140121831636991, +STORE, 140121831636992, 140121890353151, +ERASE, 140121831636992, 140121890353151, +STORE, 140121974214656, 140121991000063, +STORE, 140121630310400, 140121831636991, +SNULL, 140121697419263, 140121831636991, +STORE, 140121630310400, 140121697419263, +STORE, 140121697419264, 140121831636991, +SNULL, 140121697419264, 140121764528127, +STORE, 140121764528128, 140121831636991, +STORE, 140121697419264, 140121764528127, +ERASE, 140121697419264, 140121764528127, +STORE, 140121881960448, 140121890353151, +STORE, 140121630310400, 140121831636991, +STORE, 140121873567744, 140121890353151, +SNULL, 140121630310400, 140121697419263, +STORE, 140121697419264, 140121831636991, +STORE, 140121630310400, 140121697419263, +SNULL, 140121697554431, 140121831636991, +STORE, 140121697419264, 140121697554431, +STORE, 140121697554432, 140121831636991, +STORE, 140121865175040, 140121890353151, +STORE, 140121856782336, 140121890353151, +STORE, 140121848389632, 140121890353151, +STORE, 140121839996928, 140121890353151, +STORE, 
140121496092672, 140121697419263, +STORE, 140121487699968, 140121496092671, +STORE, 140121420591104, 140121487699967, +STORE, 140121412198400, 140121420591103, +ERASE, 140121420591104, 140121487699967, +STORE, 140121479307264, 140121496092671, +STORE, 140121277980672, 140121412198399, +SNULL, 140121277980672, 140121294766079, +STORE, 140121294766080, 140121412198399, +STORE, 140121277980672, 140121294766079, +ERASE, 140121277980672, 140121294766079, +STORE, 140121470914560, 140121496092671, +STORE, 140121462521856, 140121496092671, +STORE, 140121160548352, 140121412198399, +STORE, 140121454129152, 140121496092671, +SNULL, 140121227657215, 140121412198399, +STORE, 140121160548352, 140121227657215, +STORE, 140121227657216, 140121412198399, +SNULL, 140121227657216, 140121294766079, +STORE, 140121294766080, 140121412198399, +STORE, 140121227657216, 140121294766079, +ERASE, 140121227657216, 140121294766079, +STORE, 140121445736448, 140121496092671, +STORE, 140121437343744, 140121496092671, +SNULL, 140121437343744, 140121445736447, +STORE, 140121445736448, 140121496092671, +STORE, 140121437343744, 140121445736447, +SNULL, 140121445740543, 140121496092671, +STORE, 140121445736448, 140121445740543, +STORE, 140121445740544, 140121496092671, +SNULL, 140121697554432, 140121764528127, +STORE, 140121764528128, 140121831636991, +STORE, 140121697554432, 140121764528127, +SNULL, 140121764663295, 140121831636991, +STORE, 140121764528128, 140121764663295, +STORE, 140121764663296, 140121831636991, +SNULL, 140121496092672, 140121630310399, +STORE, 140121630310400, 140121697419263, +STORE, 140121496092672, 140121630310399, +SNULL, 140121630445567, 140121697419263, +STORE, 140121630310400, 140121630445567, +STORE, 140121630445568, 140121697419263, +SNULL, 140121445740544, 140121454129151, +STORE, 140121454129152, 140121496092671, +STORE, 140121445740544, 140121454129151, +SNULL, 140121454133247, 140121496092671, +STORE, 140121454129152, 140121454133247, +STORE, 140121454133248, 
140121496092671, +STORE, 140121026330624, 140121227657215, +SNULL, 140121093439487, 140121227657215, +STORE, 140121026330624, 140121093439487, +STORE, 140121093439488, 140121227657215, +SNULL, 140121093439488, 140121160548351, +STORE, 140121160548352, 140121227657215, +STORE, 140121093439488, 140121160548351, +ERASE, 140121093439488, 140121160548351, +SNULL, 140121563201535, 140121630310399, +STORE, 140121496092672, 140121563201535, +STORE, 140121563201536, 140121630310399, +ERASE, 140121563201536, 140121630310399, +STORE, 140120892112896, 140121093439487, +SNULL, 140120959221759, 140121093439487, +STORE, 140120892112896, 140120959221759, +STORE, 140120959221760, 140121093439487, +SNULL, 140120959221760, 140121026330623, +STORE, 140121026330624, 140121093439487, +STORE, 140120959221760, 140121026330623, +ERASE, 140120959221760, 140121026330623, +STORE, 140120757895168, 140120959221759, +SNULL, 140121361874943, 140121412198399, +STORE, 140121294766080, 140121361874943, +STORE, 140121361874944, 140121412198399, +ERASE, 140121361874944, 140121412198399, +SNULL, 140121294901247, 140121361874943, +STORE, 140121294766080, 140121294901247, +STORE, 140121294901248, 140121361874943, +STORE, 140120623677440, 140120959221759, +SNULL, 140120690786303, 140120959221759, +STORE, 140120623677440, 140120690786303, +STORE, 140120690786304, 140120959221759, +SNULL, 140120690786304, 140120757895167, +STORE, 140120757895168, 140120959221759, +STORE, 140120690786304, 140120757895167, +ERASE, 140120690786304, 140120757895167, +SNULL, 140121160683519, 140121227657215, +STORE, 140121160548352, 140121160683519, +STORE, 140121160683520, 140121227657215, +SNULL, 140121974214656, 140121982607359, +STORE, 140121982607360, 140121991000063, +STORE, 140121974214656, 140121982607359, +SNULL, 140121982611455, 140121991000063, +STORE, 140121982607360, 140121982611455, +STORE, 140121982611456, 140121991000063, +SNULL, 140121839996928, 140121873567743, +STORE, 140121873567744, 140121890353151, +STORE, 
140121839996928, 140121873567743, +SNULL, 140121873571839, 140121890353151, +STORE, 140121873567744, 140121873571839, +STORE, 140121873571840, 140121890353151, +SNULL, 140121873571840, 140121881960447, +STORE, 140121881960448, 140121890353151, +STORE, 140121873571840, 140121881960447, +SNULL, 140121881964543, 140121890353151, +STORE, 140121881960448, 140121881964543, +STORE, 140121881964544, 140121890353151, +SNULL, 140121840001023, 140121873567743, +STORE, 140121839996928, 140121840001023, +STORE, 140121840001024, 140121873567743, +SNULL, 140121840001024, 140121865175039, +STORE, 140121865175040, 140121873567743, +STORE, 140121840001024, 140121865175039, +SNULL, 140121865179135, 140121873567743, +STORE, 140121865175040, 140121865179135, +STORE, 140121865179136, 140121873567743, +SNULL, 140121437347839, 140121445736447, +STORE, 140121437343744, 140121437347839, +STORE, 140121437347840, 140121445736447, +STORE, 140121621917696, 140121630310399, +STORE, 140121613524992, 140121630310399, +SNULL, 140121026465791, 140121093439487, +STORE, 140121026330624, 140121026465791, +STORE, 140121026465792, 140121093439487, +SNULL, 140121496227839, 140121563201535, +STORE, 140121496092672, 140121496227839, +STORE, 140121496227840, 140121563201535, +SNULL, 140120757895168, 140120892112895, +STORE, 140120892112896, 140120959221759, +STORE, 140120757895168, 140120892112895, +SNULL, 140120892248063, 140120959221759, +STORE, 140120892112896, 140120892248063, +STORE, 140120892248064, 140120959221759, +SNULL, 140120825004031, 140120892112895, +STORE, 140120757895168, 140120825004031, +STORE, 140120825004032, 140120892112895, +ERASE, 140120825004032, 140120892112895, +SNULL, 140120623812607, 140120690786303, +STORE, 140120623677440, 140120623812607, +STORE, 140120623812608, 140120690786303, +SNULL, 140120758030335, 140120825004031, +STORE, 140120757895168, 140120758030335, +STORE, 140120758030336, 140120825004031, +SNULL, 140121454133248, 140121462521855, +STORE, 140121462521856, 
140121496092671, +STORE, 140121454133248, 140121462521855, +SNULL, 140121462525951, 140121496092671, +STORE, 140121462521856, 140121462525951, +STORE, 140121462525952, 140121496092671, +STORE, 140121605132288, 140121630310399, +SNULL, 140121605136383, 140121630310399, +STORE, 140121605132288, 140121605136383, +STORE, 140121605136384, 140121630310399, +STORE, 140121596739584, 140121605132287, +SNULL, 140121605136384, 140121621917695, +STORE, 140121621917696, 140121630310399, +STORE, 140121605136384, 140121621917695, +SNULL, 140121621921791, 140121630310399, +STORE, 140121621917696, 140121621921791, +STORE, 140121621921792, 140121630310399, +STORE, 140121588346880, 140121605132287, +STORE, 140121579954176, 140121605132287, +SNULL, 140121412202495, 140121420591103, +STORE, 140121412198400, 140121412202495, +STORE, 140121412202496, 140121420591103, +SNULL, 140121974218751, 140121982607359, +STORE, 140121974214656, 140121974218751, +STORE, 140121974218752, 140121982607359, +SNULL, 140121462525952, 140121479307263, +STORE, 140121479307264, 140121496092671, +STORE, 140121462525952, 140121479307263, +SNULL, 140121479311359, 140121496092671, +STORE, 140121479307264, 140121479311359, +STORE, 140121479311360, 140121496092671, +STORE, 140121571561472, 140121605132287, +SNULL, 140121571565567, 140121605132287, +STORE, 140121571561472, 140121571565567, +STORE, 140121571565568, 140121605132287, +STORE, 140121428951040, 140121437343743, +SNULL, 140121428955135, 140121437343743, +STORE, 140121428951040, 140121428955135, +STORE, 140121428955136, 140121437343743, +SNULL, 140121840001024, 140121856782335, +STORE, 140121856782336, 140121865175039, +STORE, 140121840001024, 140121856782335, +SNULL, 140121856786431, 140121865175039, +STORE, 140121856782336, 140121856786431, +STORE, 140121856786432, 140121865175039, +STORE, 140121403805696, 140121412198399, +SNULL, 140121840001024, 140121848389631, +STORE, 140121848389632, 140121856782335, +STORE, 140121840001024, 140121848389631, +SNULL, 
140121848393727, 140121856782335, +STORE, 140121848389632, 140121848393727, +STORE, 140121848393728, 140121856782335, +SNULL, 140121479311360, 140121487699967, +STORE, 140121487699968, 140121496092671, +STORE, 140121479311360, 140121487699967, +SNULL, 140121487704063, 140121496092671, +STORE, 140121487699968, 140121487704063, +STORE, 140121487704064, 140121496092671, +STORE, 140121395412992, 140121412198399, +STORE, 140121387020288, 140121412198399, +SNULL, 140121387024383, 140121412198399, +STORE, 140121387020288, 140121387024383, +STORE, 140121387024384, 140121412198399, +SNULL, 140121605136384, 140121613524991, +STORE, 140121613524992, 140121621917695, +STORE, 140121605136384, 140121613524991, +SNULL, 140121613529087, 140121621917695, +STORE, 140121613524992, 140121613529087, +STORE, 140121613529088, 140121621917695, +SNULL, 140121462525952, 140121470914559, +STORE, 140121470914560, 140121479307263, +STORE, 140121462525952, 140121470914559, +SNULL, 140121470918655, 140121479307263, +STORE, 140121470914560, 140121470918655, +STORE, 140121470918656, 140121479307263, +STORE, 140121378627584, 140121387020287, +SNULL, 140121378631679, 140121387020287, +STORE, 140121378627584, 140121378631679, +STORE, 140121378631680, 140121387020287, +SNULL, 140121571565568, 140121596739583, +STORE, 140121596739584, 140121605132287, +STORE, 140121571565568, 140121596739583, +SNULL, 140121596743679, 140121605132287, +STORE, 140121596739584, 140121596743679, +STORE, 140121596743680, 140121605132287, +SNULL, 140121387024384, 140121403805695, +STORE, 140121403805696, 140121412198399, +STORE, 140121387024384, 140121403805695, +SNULL, 140121403809791, 140121412198399, +STORE, 140121403805696, 140121403809791, +STORE, 140121403809792, 140121412198399, +STORE, 140121370234880, 140121378627583, +SNULL, 140121387024384, 140121395412991, +STORE, 140121395412992, 140121403805695, +STORE, 140121387024384, 140121395412991, +SNULL, 140121395417087, 140121403805695, +STORE, 140121395412992, 
140121395417087, +STORE, 140121395417088, 140121403805695, +SNULL, 140121571565568, 140121588346879, +STORE, 140121588346880, 140121596739583, +STORE, 140121571565568, 140121588346879, +SNULL, 140121588350975, 140121596739583, +STORE, 140121588346880, 140121588350975, +STORE, 140121588350976, 140121596739583, +SNULL, 140121571565568, 140121579954175, +STORE, 140121579954176, 140121588346879, +STORE, 140121571565568, 140121579954175, +SNULL, 140121579958271, 140121588346879, +STORE, 140121579954176, 140121579958271, +STORE, 140121579958272, 140121588346879, +STORE, 140121286373376, 140121294766079, +STORE, 140121277980672, 140121294766079, +SNULL, 140121277980672, 140121286373375, +STORE, 140121286373376, 140121294766079, +STORE, 140121277980672, 140121286373375, +SNULL, 140121286377471, 140121294766079, +STORE, 140121286373376, 140121286377471, +STORE, 140121286377472, 140121294766079, +STORE, 140121269587968, 140121286373375, +STORE, 140121261195264, 140121286373375, +SNULL, 140121261195264, 140121269587967, +STORE, 140121269587968, 140121286373375, +STORE, 140121261195264, 140121269587967, +SNULL, 140121269592063, 140121286373375, +STORE, 140121269587968, 140121269592063, +STORE, 140121269592064, 140121286373375, +STORE, 140121252802560, 140121269587967, +SNULL, 140121252806655, 140121269587967, +STORE, 140121252802560, 140121252806655, +STORE, 140121252806656, 140121269587967, +STORE, 140121244409856, 140121252802559, +STORE, 140121236017152, 140121252802559, +SNULL, 140121236017152, 140121244409855, +STORE, 140121244409856, 140121252802559, +STORE, 140121236017152, 140121244409855, +SNULL, 140121244413951, 140121252802559, +STORE, 140121244409856, 140121244413951, +STORE, 140121244413952, 140121252802559, +SNULL, 140121370238975, 140121378627583, +STORE, 140121370234880, 140121370238975, +STORE, 140121370238976, 140121378627583, +STORE, 140121152155648, 140121160548351, +STORE, 140121143762944, 140121160548351, +STORE, 140121135370240, 140121160548351, +SNULL, 
140121135374335, 140121160548351, +STORE, 140121135370240, 140121135374335, +STORE, 140121135374336, 140121160548351, +STORE, 140121126977536, 140121135370239, +STORE, 140121118584832, 140121135370239, +STORE, 140121110192128, 140121135370239, +SNULL, 140121110192128, 140121118584831, +STORE, 140121118584832, 140121135370239, +STORE, 140121110192128, 140121118584831, +SNULL, 140121118588927, 140121135370239, +STORE, 140121118584832, 140121118588927, +STORE, 140121118588928, 140121135370239, +STORE, 140121101799424, 140121118584831, +STORE, 140121017937920, 140121026330623, +STORE, 140121009545216, 140121026330623, +SNULL, 140121009545216, 140121017937919, +STORE, 140121017937920, 140121026330623, +STORE, 140121009545216, 140121017937919, +SNULL, 140121017942015, 140121026330623, +STORE, 140121017937920, 140121017942015, +STORE, 140121017942016, 140121026330623, +SNULL, 140121269592064, 140121277980671, +STORE, 140121277980672, 140121286373375, +STORE, 140121269592064, 140121277980671, +SNULL, 140121277984767, 140121286373375, +STORE, 140121277980672, 140121277984767, +STORE, 140121277984768, 140121286373375, +STORE, 140121001152512, 140121017937919, +SNULL, 140121252806656, 140121261195263, +STORE, 140121261195264, 140121269587967, +STORE, 140121252806656, 140121261195263, +SNULL, 140121261199359, 140121269587967, +STORE, 140121261195264, 140121261199359, +STORE, 140121261199360, 140121269587967, +SNULL, 140121135374336, 140121152155647, +STORE, 140121152155648, 140121160548351, +STORE, 140121135374336, 140121152155647, +SNULL, 140121152159743, 140121160548351, +STORE, 140121152155648, 140121152159743, +STORE, 140121152159744, 140121160548351, +STORE, 140120992759808, 140121017937919, +STORE, 140120984367104, 140121017937919, +STORE, 140120975974400, 140121017937919, +SNULL, 140121101799424, 140121110192127, +STORE, 140121110192128, 140121118584831, +STORE, 140121101799424, 140121110192127, +SNULL, 140121110196223, 140121118584831, +STORE, 140121110192128, 
140121110196223, +STORE, 140121110196224, 140121118584831, +SNULL, 140121118588928, 140121126977535, +STORE, 140121126977536, 140121135370239, +STORE, 140121118588928, 140121126977535, +SNULL, 140121126981631, 140121135370239, +STORE, 140121126977536, 140121126981631, +STORE, 140121126981632, 140121135370239, +STORE, 140120967581696, 140121017937919, +STORE, 140120883720192, 140120892112895, +SNULL, 140120883724287, 140120892112895, +STORE, 140120883720192, 140120883724287, +STORE, 140120883724288, 140120892112895, +STORE, 140120875327488, 140120883720191, +SNULL, 140121101803519, 140121110192127, +STORE, 140121101799424, 140121101803519, +STORE, 140121101803520, 140121110192127, +SNULL, 140121135374336, 140121143762943, +STORE, 140121143762944, 140121152155647, +STORE, 140121135374336, 140121143762943, +SNULL, 140121143767039, 140121152155647, +STORE, 140121143762944, 140121143767039, +STORE, 140121143767040, 140121152155647, +STORE, 140120866934784, 140120883720191, +SNULL, 140120967581696, 140120984367103, +STORE, 140120984367104, 140121017937919, +STORE, 140120967581696, 140120984367103, +SNULL, 140120984371199, 140121017937919, +STORE, 140120984367104, 140120984371199, +STORE, 140120984371200, 140121017937919, +STORE, 140120858542080, 140120883720191, +SNULL, 140121236021247, 140121244409855, +STORE, 140121236017152, 140121236021247, +STORE, 140121236021248, 140121244409855, +SNULL, 140120984371200, 140121009545215, +STORE, 140121009545216, 140121017937919, +STORE, 140120984371200, 140121009545215, +SNULL, 140121009549311, 140121017937919, +STORE, 140121009545216, 140121009549311, +STORE, 140121009549312, 140121017937919, +SNULL, 140120984371200, 140120992759807, +STORE, 140120992759808, 140121009545215, +STORE, 140120984371200, 140120992759807, +SNULL, 140120992763903, 140121009545215, +STORE, 140120992759808, 140120992763903, +STORE, 140120992763904, 140121009545215, +SNULL, 140120992763904, 140121001152511, +STORE, 140121001152512, 140121009545215, +STORE, 
140120992763904, 140121001152511, +SNULL, 140121001156607, 140121009545215, +STORE, 140121001152512, 140121001156607, +STORE, 140121001156608, 140121009545215, +STORE, 140120850149376, 140120883720191, +SNULL, 140120850153471, 140120883720191, +STORE, 140120850149376, 140120850153471, +STORE, 140120850153472, 140120883720191, +SNULL, 140120967585791, 140120984367103, +STORE, 140120967581696, 140120967585791, +STORE, 140120967585792, 140120984367103, +SNULL, 140120850153472, 140120866934783, +STORE, 140120866934784, 140120883720191, +STORE, 140120850153472, 140120866934783, +SNULL, 140120866938879, 140120883720191, +STORE, 140120866934784, 140120866938879, +STORE, 140120866938880, 140120883720191, +STORE, 140120841756672, 140120850149375, +SNULL, 140120967585792, 140120975974399, +STORE, 140120975974400, 140120984367103, +STORE, 140120967585792, 140120975974399, +SNULL, 140120975978495, 140120984367103, +STORE, 140120975974400, 140120975978495, +STORE, 140120975978496, 140120984367103, +SNULL, 140120866938880, 140120875327487, +STORE, 140120875327488, 140120883720191, +STORE, 140120866938880, 140120875327487, +SNULL, 140120875331583, 140120883720191, +STORE, 140120875327488, 140120875331583, +STORE, 140120875331584, 140120883720191, +STORE, 140120833363968, 140120850149375, +STORE, 140120749502464, 140120757895167, +STORE, 140120741109760, 140120757895167, +STORE, 140120732717056, 140120757895167, +STORE, 140120724324352, 140120757895167, +SNULL, 140120724324352, 140120732717055, +STORE, 140120732717056, 140120757895167, +STORE, 140120724324352, 140120732717055, +SNULL, 140120732721151, 140120757895167, +STORE, 140120732717056, 140120732721151, +STORE, 140120732721152, 140120757895167, +STORE, 140120715931648, 140120732717055, +SNULL, 140120715935743, 140120732717055, +STORE, 140120715931648, 140120715935743, +STORE, 140120715935744, 140120732717055, +SNULL, 140120850153472, 140120858542079, +STORE, 140120858542080, 140120866934783, +STORE, 140120850153472, 
140120858542079, +SNULL, 140120858546175, 140120866934783, +STORE, 140120858542080, 140120858546175, +STORE, 140120858546176, 140120866934783, +STORE, 140120707538944, 140120715931647, +SNULL, 140120707543039, 140120715931647, +STORE, 140120707538944, 140120707543039, +STORE, 140120707543040, 140120715931647, +SNULL, 140120833368063, 140120850149375, +STORE, 140120833363968, 140120833368063, +STORE, 140120833368064, 140120850149375, +SNULL, 140120833368064, 140120841756671, +STORE, 140120841756672, 140120850149375, +STORE, 140120833368064, 140120841756671, +SNULL, 140120841760767, 140120850149375, +STORE, 140120841756672, 140120841760767, +STORE, 140120841760768, 140120850149375, +STORE, 140120699146240, 140120707538943, +SNULL, 140120715935744, 140120724324351, +STORE, 140120724324352, 140120732717055, +STORE, 140120715935744, 140120724324351, +SNULL, 140120724328447, 140120732717055, +STORE, 140120724324352, 140120724328447, +STORE, 140120724328448, 140120732717055, +SNULL, 140120732721152, 140120741109759, +STORE, 140120741109760, 140120757895167, +STORE, 140120732721152, 140120741109759, +SNULL, 140120741113855, 140120757895167, +STORE, 140120741109760, 140120741113855, +STORE, 140120741113856, 140120757895167, +SNULL, 140120741113856, 140120749502463, +STORE, 140120749502464, 140120757895167, +STORE, 140120741113856, 140120749502463, +SNULL, 140120749506559, 140120757895167, +STORE, 140120749502464, 140120749506559, +STORE, 140120749506560, 140120757895167, +SNULL, 140120699150335, 140120707538943, +STORE, 140120699146240, 140120699150335, +STORE, 140120699150336, 140120707538943, +STORE, 140122446557184, 140122446585855, +STORE, 140122368999424, 140122371190783, +SNULL, 140122368999424, 140122369089535, +STORE, 140122369089536, 140122371190783, +STORE, 140122368999424, 140122369089535, +SNULL, 140122371182591, 140122371190783, +STORE, 140122369089536, 140122371182591, +STORE, 140122371182592, 140122371190783, +ERASE, 140122371182592, 140122371190783, +STORE, 
140122371182592, 140122371190783, +SNULL, 140122371186687, 140122371190783, +STORE, 140122371182592, 140122371186687, +STORE, 140122371186688, 140122371190783, +ERASE, 140122446557184, 140122446585855, +ERASE, 140121445736448, 140121445740543, +ERASE, 140121445740544, 140121454129151, +ERASE, 140121621917696, 140121621921791, +ERASE, 140121621921792, 140121630310399, +ERASE, 140121579954176, 140121579958271, +ERASE, 140121579958272, 140121588346879, +ERASE, 140121261195264, 140121261199359, +ERASE, 140121261199360, 140121269587967, +ERASE, 140121454129152, 140121454133247, +ERASE, 140121454133248, 140121462521855, +ERASE, 140121588346880, 140121588350975, +ERASE, 140121588350976, 140121596739583, +ERASE, 140121135370240, 140121135374335, +ERASE, 140121135374336, 140121143762943, +ERASE, 140121881960448, 140121881964543, +ERASE, 140121881964544, 140121890353151, +ERASE, 140121428951040, 140121428955135, +ERASE, 140121428955136, 140121437343743, +ERASE, 140121387020288, 140121387024383, +ERASE, 140121387024384, 140121395412991, +ERASE, 140121487699968, 140121487704063, +ERASE, 140121487704064, 140121496092671, +ERASE, 140121437343744, 140121437347839, +ERASE, 140121437347840, 140121445736447, +ERASE, 140121613524992, 140121613529087, +ERASE, 140121613529088, 140121621917695, +ERASE, 140121856782336, 140121856786431, +ERASE, 140121856786432, 140121865175039, +ERASE, 140121252802560, 140121252806655, +ERASE, 140121252806656, 140121261195263, +ERASE, 140121839996928, 140121840001023, +ERASE, 140121840001024, 140121848389631, +ERASE, 140121596739584, 140121596743679, +ERASE, 140121596743680, 140121605132287, +ERASE, 140121009545216, 140121009549311, +ERASE, 140121009549312, 140121017937919, +ERASE, 140120724324352, 140120724328447, +ERASE, 140120724328448, 140120732717055, +ERASE, 140120883720192, 140120883724287, +ERASE, 140120883724288, 140120892112895, +ERASE, 140121982607360, 140121982611455, +ERASE, 140121982611456, 140121991000063, +ERASE, 140121571561472, 
140121571565567, +ERASE, 140121571565568, 140121579954175, +ERASE, 140121286373376, 140121286377471, +ERASE, 140121286377472, 140121294766079, +ERASE, 140120875327488, 140120875331583, +ERASE, 140120875331584, 140120883720191, +ERASE, 140121848389632, 140121848393727, +ERASE, 140121848393728, 140121856782335, +ERASE, 140121370234880, 140121370238975, +ERASE, 140121370238976, 140121378627583, +ERASE, 140121143762944, 140121143767039, +ERASE, 140121143767040, 140121152155647, +ERASE, 140121118584832, 140121118588927, +ERASE, 140121118588928, 140121126977535, +ERASE, 140120866934784, 140120866938879, +ERASE, 140120866938880, 140120875327487, +ERASE, 140120741109760, 140120741113855, +ERASE, 140120741113856, 140120749502463, +ERASE, 140121865175040, 140121865179135, +ERASE, 140121865179136, 140121873567743, +ERASE, 140121403805696, 140121403809791, +ERASE, 140121403809792, 140121412198399, +ERASE, 140121236017152, 140121236021247, +ERASE, 140121236021248, 140121244409855, +ERASE, 140120732717056, 140120732721151, +ERASE, 140120732721152, 140120741109759, +ERASE, 140121017937920, 140121017942015, +ERASE, 140121017942016, 140121026330623, +ERASE, 140121873567744, 140121873571839, +ERASE, 140121873571840, 140121881960447, +ERASE, 140121470914560, 140121470918655, +ERASE, 140121470918656, 140121479307263, +ERASE, 140121126977536, 140121126981631, +ERASE, 140121126981632, 140121135370239, +ERASE, 140120850149376, 140120850153471, +ERASE, 140120850153472, 140120858542079, +ERASE, 140120707538944, 140120707543039, +ERASE, 140120707543040, 140120715931647, +ERASE, 140121479307264, 140121479311359, +ERASE, 140121479311360, 140121487699967, +ERASE, 140120967581696, 140120967585791, +ERASE, 140120967585792, 140120975974399, +ERASE, 140120841756672, 140120841760767, +ERASE, 140120841760768, 140120850149375, +ERASE, 140121412198400, 140121412202495, +ERASE, 140121412202496, 140121420591103, +ERASE, 140122158788608, 140122158792703, +ERASE, 140122158792704, 140122167181311, +ERASE, 
140122142003200, 140122142007295, +ERASE, 140122142007296, 140122150395903, +ERASE, 140121101799424, 140121101803519, +ERASE, 140121101803520, 140121110192127, +ERASE, 140120858542080, 140120858546175, +ERASE, 140120858546176, 140120866934783, +ERASE, 140120833363968, 140120833368063, +ERASE, 140120833368064, 140120841756671, +ERASE, 140121277980672, 140121277984767, +ERASE, 140121277984768, 140121286373375, +ERASE, 140121001152512, 140121001156607, +ERASE, 140121001156608, 140121009545215, +ERASE, 140120749502464, 140120749506559, +ERASE, 140120749506560, 140120757895167, +ERASE, 140121605132288, 140121605136383, +ERASE, 140121605136384, 140121613524991, +ERASE, 140121378627584, 140121378631679, +ERASE, 140121378631680, 140121387020287, +ERASE, 140121110192128, 140121110196223, +ERASE, 140121110196224, 140121118584831, +ERASE, 140121462521856, 140121462525951, +ERASE, 140121462525952, 140121470914559, +ERASE, 140121395412992, 140121395417087, +ERASE, 140121395417088, 140121403805695, +ERASE, 140121152155648, 140121152159743, +ERASE, 140121152159744, 140121160548351, +ERASE, 140120992759808, 140120992763903, +ERASE, 140120992763904, 140121001152511, +ERASE, 140122387976192, 140122387980287, +ERASE, 140122387980288, 140122396368895, +ERASE, 140121890353152, 140121890357247, +ERASE, 140121890357248, 140121898745855, +ERASE, 140121269587968, 140121269592063, +ERASE, 140121269592064, 140121277980671, + }; + unsigned long set37[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140722404016128, 140737488351231, +SNULL, 140722404020223, 140737488351231, +STORE, 140722404016128, 140722404020223, +STORE, 140722403885056, 140722404020223, +STORE, 94637010001920, 94637012254719, +SNULL, 94637010132991, 94637012254719, +STORE, 94637010001920, 94637010132991, +STORE, 94637010132992, 94637012254719, +ERASE, 94637010132992, 94637012254719, +STORE, 94637012226048, 94637012234239, +STORE, 94637012234240, 94637012254719, +STORE, 139760240594944, 139760242847743, +SNULL, 
139760240738303, 139760242847743, +STORE, 139760240594944, 139760240738303, +STORE, 139760240738304, 139760242847743, +ERASE, 139760240738304, 139760242847743, +STORE, 139760242835456, 139760242843647, +STORE, 139760242843648, 139760242847743, +STORE, 140722405232640, 140722405236735, +STORE, 140722405220352, 140722405232639, +STORE, 139760242806784, 139760242835455, +STORE, 139760242798592, 139760242806783, +STORE, 139760238379008, 139760240594943, +SNULL, 139760238379008, 139760238477311, +STORE, 139760238477312, 139760240594943, +STORE, 139760238379008, 139760238477311, +SNULL, 139760240570367, 139760240594943, +STORE, 139760238477312, 139760240570367, +STORE, 139760240570368, 139760240594943, +SNULL, 139760240570368, 139760240578559, +STORE, 139760240578560, 139760240594943, +STORE, 139760240570368, 139760240578559, +ERASE, 139760240570368, 139760240578559, +STORE, 139760240570368, 139760240578559, +ERASE, 139760240578560, 139760240594943, +STORE, 139760240578560, 139760240594943, +STORE, 139760234582016, 139760238379007, +SNULL, 139760234582016, 139760236240895, +STORE, 139760236240896, 139760238379007, +STORE, 139760234582016, 139760236240895, +SNULL, 139760238338047, 139760238379007, +STORE, 139760236240896, 139760238338047, +STORE, 139760238338048, 139760238379007, +SNULL, 139760238338048, 139760238362623, +STORE, 139760238362624, 139760238379007, +STORE, 139760238338048, 139760238362623, +ERASE, 139760238338048, 139760238362623, +STORE, 139760238338048, 139760238362623, +ERASE, 139760238362624, 139760238379007, +STORE, 139760238362624, 139760238379007, +STORE, 139760242790400, 139760242806783, +SNULL, 139760238354431, 139760238362623, +STORE, 139760238338048, 139760238354431, +STORE, 139760238354432, 139760238362623, +SNULL, 139760240574463, 139760240578559, +STORE, 139760240570368, 139760240574463, +STORE, 139760240574464, 139760240578559, +SNULL, 94637012230143, 94637012234239, +STORE, 94637012226048, 94637012230143, +STORE, 94637012230144, 
94637012234239, +SNULL, 139760242839551, 139760242843647, +STORE, 139760242835456, 139760242839551, +STORE, 139760242839552, 139760242843647, +ERASE, 139760242806784, 139760242835455, +STORE, 94637033324544, 94637033459711, +STORE, 139760226189312, 139760234582015, +SNULL, 139760226193407, 139760234582015, +STORE, 139760226189312, 139760226193407, +STORE, 139760226193408, 139760234582015, +STORE, 139760217796608, 139760226189311, +STORE, 139760083578880, 139760217796607, +SNULL, 139760083578880, 139760114860031, +STORE, 139760114860032, 139760217796607, +STORE, 139760083578880, 139760114860031, +ERASE, 139760083578880, 139760114860031, +SNULL, 139760181968895, 139760217796607, +STORE, 139760114860032, 139760181968895, +STORE, 139760181968896, 139760217796607, +ERASE, 139760181968896, 139760217796607, +SNULL, 139760114995199, 139760181968895, +STORE, 139760114860032, 139760114995199, +STORE, 139760114995200, 139760181968895, +SNULL, 139760217800703, 139760226189311, +STORE, 139760217796608, 139760217800703, +STORE, 139760217800704, 139760226189311, +STORE, 139760209403904, 139760217796607, +SNULL, 139760209407999, 139760217796607, +STORE, 139760209403904, 139760209407999, +STORE, 139760209408000, 139760217796607, +STORE, 139760201011200, 139760209403903, +SNULL, 139760201015295, 139760209403903, +STORE, 139760201011200, 139760201015295, +STORE, 139760201015296, 139760209403903, +STORE, 139760192618496, 139760201011199, +SNULL, 139760192622591, 139760201011199, +STORE, 139760192618496, 139760192622591, +STORE, 139760192622592, 139760201011199, +STORE, 139760184225792, 139760192618495, +STORE, 139759980642304, 139760114860031, +STORE, 139759972249600, 139759980642303, +STORE, 139759963856896, 139759980642303, +STORE, 139759955464192, 139759980642303, +STORE, 139759888355328, 139759955464191, +SNULL, 139760047751167, 139760114860031, +STORE, 139759980642304, 139760047751167, +STORE, 139760047751168, 139760114860031, +ERASE, 139760047751168, 139760114860031, +SNULL, 
139759980777471, 139760047751167, +STORE, 139759980642304, 139759980777471, +STORE, 139759980777472, 139760047751167, +STORE, 139759980777472, 139760114860031, +SNULL, 139759980777472, 139760047751167, +STORE, 139760047751168, 139760114860031, +STORE, 139759980777472, 139760047751167, +SNULL, 139760047886335, 139760114860031, +STORE, 139760047751168, 139760047886335, +STORE, 139760047886336, 139760114860031, +STORE, 139759821246464, 139759955464191, +SNULL, 139759821246464, 139759888355327, +STORE, 139759888355328, 139759955464191, +STORE, 139759821246464, 139759888355327, +ERASE, 139759821246464, 139759888355327, +ERASE, 139759888355328, 139759955464191, + }; + unsigned long set38[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140730666221568, 140737488351231, +SNULL, 140730666225663, 140737488351231, +STORE, 140730666221568, 140730666225663, +STORE, 140730666090496, 140730666225663, +STORE, 94177584803840, 94177587056639, +SNULL, 94177584934911, 94177587056639, +STORE, 94177584803840, 94177584934911, +STORE, 94177584934912, 94177587056639, +ERASE, 94177584934912, 94177587056639, +STORE, 94177587027968, 94177587036159, +STORE, 94177587036160, 94177587056639, +STORE, 140614382714880, 140614384967679, +SNULL, 140614382858239, 140614384967679, +STORE, 140614382714880, 140614382858239, +STORE, 140614382858240, 140614384967679, +ERASE, 140614382858240, 140614384967679, +STORE, 140614384955392, 140614384963583, +STORE, 140614384963584, 140614384967679, +STORE, 140730666315776, 140730666319871, +STORE, 140730666303488, 140730666315775, +STORE, 140614384926720, 140614384955391, +STORE, 140614384918528, 140614384926719, +STORE, 140614380498944, 140614382714879, +SNULL, 140614380498944, 140614380597247, +STORE, 140614380597248, 140614382714879, +STORE, 140614380498944, 140614380597247, +SNULL, 140614382690303, 140614382714879, +STORE, 140614380597248, 140614382690303, +STORE, 140614382690304, 140614382714879, +SNULL, 140614382690304, 140614382698495, +STORE, 
140614382698496, 140614382714879, +STORE, 140614382690304, 140614382698495, +ERASE, 140614382690304, 140614382698495, +STORE, 140614382690304, 140614382698495, +ERASE, 140614382698496, 140614382714879, +STORE, 140614382698496, 140614382714879, +STORE, 140614376701952, 140614380498943, +SNULL, 140614376701952, 140614378360831, +STORE, 140614378360832, 140614380498943, +STORE, 140614376701952, 140614378360831, +SNULL, 140614380457983, 140614380498943, +STORE, 140614378360832, 140614380457983, +STORE, 140614380457984, 140614380498943, +SNULL, 140614380457984, 140614380482559, +STORE, 140614380482560, 140614380498943, +STORE, 140614380457984, 140614380482559, +ERASE, 140614380457984, 140614380482559, +STORE, 140614380457984, 140614380482559, +ERASE, 140614380482560, 140614380498943, +STORE, 140614380482560, 140614380498943, +STORE, 140614384910336, 140614384926719, +SNULL, 140614380474367, 140614380482559, +STORE, 140614380457984, 140614380474367, +STORE, 140614380474368, 140614380482559, +SNULL, 140614382694399, 140614382698495, +STORE, 140614382690304, 140614382694399, +STORE, 140614382694400, 140614382698495, +SNULL, 94177587032063, 94177587036159, +STORE, 94177587027968, 94177587032063, +STORE, 94177587032064, 94177587036159, +SNULL, 140614384959487, 140614384963583, +STORE, 140614384955392, 140614384959487, +STORE, 140614384959488, 140614384963583, +ERASE, 140614384926720, 140614384955391, +STORE, 94177619791872, 94177619927039, +STORE, 140614368309248, 140614376701951, +SNULL, 140614368313343, 140614376701951, +STORE, 140614368309248, 140614368313343, +STORE, 140614368313344, 140614376701951, +STORE, 140614359916544, 140614368309247, +STORE, 140614225698816, 140614359916543, +SNULL, 140614225698816, 140614276481023, +STORE, 140614276481024, 140614359916543, +STORE, 140614225698816, 140614276481023, +ERASE, 140614225698816, 140614276481023, +SNULL, 140614343589887, 140614359916543, +STORE, 140614276481024, 140614343589887, +STORE, 140614343589888, 140614359916543, 
+ERASE, 140614343589888, 140614359916543, +SNULL, 140614276616191, 140614343589887, +STORE, 140614276481024, 140614276616191, +STORE, 140614276616192, 140614343589887, +SNULL, 140614359920639, 140614368309247, +STORE, 140614359916544, 140614359920639, +STORE, 140614359920640, 140614368309247, +STORE, 140614351523840, 140614359916543, +SNULL, 140614351527935, 140614359916543, +STORE, 140614351523840, 140614351527935, +STORE, 140614351527936, 140614359916543, +STORE, 140614268088320, 140614276481023, +SNULL, 140614268092415, 140614276481023, +STORE, 140614268088320, 140614268092415, +STORE, 140614268092416, 140614276481023, +STORE, 140614259695616, 140614268088319, +SNULL, 140614259699711, 140614268088319, +STORE, 140614259695616, 140614259699711, +STORE, 140614259699712, 140614268088319, +STORE, 140614251302912, 140614259695615, +STORE, 140614242910208, 140614259695615, +STORE, 140614108692480, 140614242910207, +SNULL, 140614108692480, 140614142263295, +STORE, 140614142263296, 140614242910207, +STORE, 140614108692480, 140614142263295, +ERASE, 140614108692480, 140614142263295, +STORE, 140614133870592, 140614142263295, +STORE, 140613999652864, 140614133870591, +SNULL, 140613999652864, 140614008045567, +STORE, 140614008045568, 140614133870591, +STORE, 140613999652864, 140614008045567, +ERASE, 140613999652864, 140614008045567, +STORE, 140613999652864, 140614008045567, +STORE, 140613865435136, 140613999652863, +SNULL, 140613865435136, 140613873827839, +STORE, 140613873827840, 140613999652863, +STORE, 140613865435136, 140613873827839, +ERASE, 140613865435136, 140613873827839, +SNULL, 140614209372159, 140614242910207, +STORE, 140614142263296, 140614209372159, +STORE, 140614209372160, 140614242910207, +ERASE, 140614209372160, 140614242910207, +SNULL, 140614142398463, 140614209372159, +STORE, 140614142263296, 140614142398463, +STORE, 140614142398464, 140614209372159, +SNULL, 140614075154431, 140614133870591, +STORE, 140614008045568, 140614075154431, +STORE, 140614075154432, 
140614133870591, +ERASE, 140614075154432, 140614133870591, +SNULL, 140614008180735, 140614075154431, +STORE, 140614008045568, 140614008180735, +STORE, 140614008180736, 140614075154431, +SNULL, 140613940936703, 140613999652863, +STORE, 140613873827840, 140613940936703, +STORE, 140613940936704, 140613999652863, +ERASE, 140613940936704, 140613999652863, +SNULL, 140614242914303, 140614259695615, +STORE, 140614242910208, 140614242914303, +STORE, 140614242914304, 140614259695615, +STORE, 140613739610112, 140613940936703, +STORE, 140614234517504, 140614242910207, +SNULL, 140614242914304, 140614251302911, +STORE, 140614251302912, 140614259695615, +STORE, 140614242914304, 140614251302911, +SNULL, 140614251307007, 140614259695615, +STORE, 140614251302912, 140614251307007, +STORE, 140614251307008, 140614259695615, +SNULL, 140613739610112, 140613873827839, +STORE, 140613873827840, 140613940936703, +STORE, 140613739610112, 140613873827839, +SNULL, 140613873963007, 140613940936703, +STORE, 140613873827840, 140613873963007, +STORE, 140613873963008, 140613940936703, +SNULL, 140614133874687, 140614142263295, +STORE, 140614133870592, 140614133874687, +STORE, 140614133874688, 140614142263295, +SNULL, 140613806718975, 140613873827839, +STORE, 140613739610112, 140613806718975, +STORE, 140613806718976, 140613873827839, +ERASE, 140613806718976, 140613873827839, +STORE, 140614226124800, 140614242910207, +SNULL, 140613739745279, 140613806718975, +STORE, 140613739610112, 140613739745279, +STORE, 140613739745280, 140613806718975, +SNULL, 140613999656959, 140614008045567, +STORE, 140613999652864, 140613999656959, +STORE, 140613999656960, 140614008045567, +SNULL, 140614226124800, 140614234517503, +STORE, 140614234517504, 140614242910207, +STORE, 140614226124800, 140614234517503, +SNULL, 140614234521599, 140614242910207, +STORE, 140614234517504, 140614234521599, +STORE, 140614234521600, 140614242910207, +STORE, 140614217732096, 140614234517503, +STORE, 140614125477888, 140614133870591, +SNULL, 
140614125481983, 140614133870591, +STORE, 140614125477888, 140614125481983, +STORE, 140614125481984, 140614133870591, +STORE, 140614117085184, 140614125477887, +SNULL, 140614217736191, 140614234517503, +STORE, 140614217732096, 140614217736191, +STORE, 140614217736192, 140614234517503, +SNULL, 140614117089279, 140614125477887, +STORE, 140614117085184, 140614117089279, +STORE, 140614117089280, 140614125477887, +SNULL, 140614217736192, 140614226124799, +STORE, 140614226124800, 140614234517503, +STORE, 140614217736192, 140614226124799, +SNULL, 140614226128895, 140614234517503, +STORE, 140614226124800, 140614226128895, +STORE, 140614226128896, 140614234517503, +STORE, 140614108692480, 140614117085183, +STORE, 140614100299776, 140614117085183, +STORE, 140614091907072, 140614117085183, +SNULL, 140614091907072, 140614108692479, +STORE, 140614108692480, 140614117085183, +STORE, 140614091907072, 140614108692479, +SNULL, 140614108696575, 140614117085183, +STORE, 140614108692480, 140614108696575, +STORE, 140614108696576, 140614117085183, +SNULL, 140614091907072, 140614100299775, +STORE, 140614100299776, 140614108692479, +STORE, 140614091907072, 140614100299775, +SNULL, 140614100303871, 140614108692479, +STORE, 140614100299776, 140614100303871, +STORE, 140614100303872, 140614108692479, +STORE, 140614083514368, 140614100299775, +SNULL, 140614083518463, 140614100299775, +STORE, 140614083514368, 140614083518463, +STORE, 140614083518464, 140614100299775, +STORE, 140613991260160, 140613999652863, +SNULL, 140614083518464, 140614091907071, +STORE, 140614091907072, 140614100299775, +STORE, 140614083518464, 140614091907071, +SNULL, 140614091911167, 140614100299775, +STORE, 140614091907072, 140614091911167, +STORE, 140614091911168, 140614100299775, +SNULL, 140613991264255, 140613999652863, +STORE, 140613991260160, 140613991264255, +STORE, 140613991264256, 140613999652863, +STORE, 140613982867456, 140613991260159, +SNULL, 140613982871551, 140613991260159, +STORE, 140613982867456, 
140613982871551, +STORE, 140613982871552, 140613991260159, +STORE, 140613974474752, 140613982867455, +SNULL, 140613974478847, 140613982867455, +STORE, 140613974474752, 140613974478847, +STORE, 140613974478848, 140613982867455, +STORE, 140613966082048, 140613974474751, +STORE, 140613739745280, 140613873827839, +SNULL, 140613739745280, 140613806718975, +STORE, 140613806718976, 140613873827839, +STORE, 140613739745280, 140613806718975, +SNULL, 140613806854143, 140613873827839, +STORE, 140613806718976, 140613806854143, +STORE, 140613806854144, 140613873827839, +SNULL, 140613966086143, 140613974474751, +STORE, 140613966082048, 140613966086143, +STORE, 140613966086144, 140613974474751, +STORE, 140613957689344, 140613966082047, +STORE, 140613605392384, 140613739610111, +STORE, 140613949296640, 140613966082047, +STORE, 140613596999680, 140613605392383, +STORE, 140613529890816, 140613596999679, +STORE, 140613521498112, 140613529890815, +STORE, 140613513105408, 140613529890815, +STORE, 140613378887680, 140613513105407, +SNULL, 140613378887680, 140613404065791, +STORE, 140613404065792, 140613513105407, +STORE, 140613378887680, 140613404065791, +ERASE, 140613378887680, 140613404065791, +STORE, 140613395673088, 140613404065791, +STORE, 140613261455360, 140613395673087, +SNULL, 140613261455360, 140613269848063, +STORE, 140613269848064, 140613395673087, +STORE, 140613261455360, 140613269848063, +ERASE, 140613261455360, 140613269848063, +STORE, 140613261455360, 140613269848063, +STORE, 140613253062656, 140613269848063, +STORE, 140613118844928, 140613253062655, +STORE, 140613110452224, 140613118844927, +SNULL, 140613118844928, 140613135630335, +STORE, 140613135630336, 140613253062655, +STORE, 140613118844928, 140613135630335, +ERASE, 140613118844928, 140613135630335, +STORE, 140613127237632, 140613135630335, +STORE, 140613110452224, 140613135630335, +STORE, 140612976234496, 140613110452223, +STORE, 140612967841792, 140612976234495, +STORE, 140612833624064, 140612967841791, +STORE, 
140612825231360, 140612833624063, +STORE, 140612816838656, 140612833624063, +STORE, 140612682620928, 140612816838655, +STORE, 140612674228224, 140612682620927, +SNULL, 140612682620928, 140612732977151, +STORE, 140612732977152, 140612816838655, +STORE, 140612682620928, 140612732977151, +ERASE, 140612682620928, 140612732977151, +SNULL, 140613672501247, 140613739610111, +STORE, 140613605392384, 140613672501247, +STORE, 140613672501248, 140613739610111, +ERASE, 140613672501248, 140613739610111, +SNULL, 140613605527551, 140613672501247, +STORE, 140613605392384, 140613605527551, +STORE, 140613605527552, 140613672501247, +ERASE, 140613529890816, 140613596999679, +STORE, 140612540010496, 140612674228223, +SNULL, 140612540010496, 140612598759423, +STORE, 140612598759424, 140612674228223, +STORE, 140612540010496, 140612598759423, +ERASE, 140612540010496, 140612598759423, +SNULL, 140613471174655, 140613513105407, +STORE, 140613404065792, 140613471174655, +STORE, 140613471174656, 140613513105407, +ERASE, 140613471174656, 140613513105407, +SNULL, 140613404200959, 140613471174655, +STORE, 140613404065792, 140613404200959, +STORE, 140613404200960, 140613471174655, +SNULL, 140613336956927, 140613395673087, +STORE, 140613269848064, 140613336956927, +STORE, 140613336956928, 140613395673087, +ERASE, 140613336956928, 140613395673087, +SNULL, 140612833624064, 140612867194879, +STORE, 140612867194880, 140612967841791, +STORE, 140612833624064, 140612867194879, +ERASE, 140612833624064, 140612867194879, +SNULL, 140612976234496, 140613001412607, +STORE, 140613001412608, 140613110452223, +STORE, 140612976234496, 140613001412607, +ERASE, 140612976234496, 140613001412607, +SNULL, 140613202739199, 140613253062655, +STORE, 140613135630336, 140613202739199, +STORE, 140613202739200, 140613253062655, +ERASE, 140613202739200, 140613253062655, +SNULL, 140613135765503, 140613202739199, +STORE, 140613135630336, 140613135765503, +STORE, 140613135765504, 140613202739199, +SNULL, 140612816842751, 
140612833624063, +STORE, 140612816838656, 140612816842751, +STORE, 140612816842752, 140612833624063, +SNULL, 140613110456319, 140613135630335, +STORE, 140613110452224, 140613110456319, +STORE, 140613110456320, 140613135630335, +SNULL, 140613949300735, 140613966082047, +STORE, 140613949296640, 140613949300735, +STORE, 140613949300736, 140613966082047, +SNULL, 140613110456320, 140613118844927, +STORE, 140613118844928, 140613135630335, +STORE, 140613110456320, 140613118844927, +SNULL, 140613118849023, 140613135630335, +STORE, 140613118844928, 140613118849023, +STORE, 140613118849024, 140613135630335, +SNULL, 140612800086015, 140612816838655, +STORE, 140612732977152, 140612800086015, +STORE, 140612800086016, 140612816838655, +ERASE, 140612800086016, 140612816838655, +SNULL, 140613253062656, 140613261455359, +STORE, 140613261455360, 140613269848063, +STORE, 140613253062656, 140613261455359, +SNULL, 140613261459455, 140613269848063, +STORE, 140613261455360, 140613261459455, +STORE, 140613261459456, 140613269848063, +SNULL, 140612674232319, 140612682620927, +STORE, 140612674228224, 140612674232319, +STORE, 140612674232320, 140612682620927, +STORE, 140613731217408, 140613739610111, +STORE, 140613722824704, 140613739610111, +SNULL, 140613949300736, 140613957689343, +STORE, 140613957689344, 140613966082047, +STORE, 140613949300736, 140613957689343, +SNULL, 140613957693439, 140613966082047, +STORE, 140613957689344, 140613957693439, +STORE, 140613957693440, 140613966082047, +STORE, 140612464541696, 140612674228223, +SNULL, 140612531650559, 140612674228223, +STORE, 140612464541696, 140612531650559, +STORE, 140612531650560, 140612674228223, +SNULL, 140612531650560, 140612598759423, +STORE, 140612598759424, 140612674228223, +STORE, 140612531650560, 140612598759423, +ERASE, 140612531650560, 140612598759423, +SNULL, 140612665868287, 140612674228223, +STORE, 140612598759424, 140612665868287, +STORE, 140612665868288, 140612674228223, +ERASE, 140612665868288, 140612674228223, +SNULL, 
140613269983231, 140613336956927, +STORE, 140613269848064, 140613269983231, +STORE, 140613269983232, 140613336956927, +SNULL, 140612934303743, 140612967841791, +STORE, 140612867194880, 140612934303743, +STORE, 140612934303744, 140612967841791, +ERASE, 140612934303744, 140612967841791, +SNULL, 140613068521471, 140613110452223, +STORE, 140613001412608, 140613068521471, +STORE, 140613068521472, 140613110452223, +ERASE, 140613068521472, 140613110452223, +STORE, 140613714432000, 140613739610111, +SNULL, 140613001547775, 140613068521471, +STORE, 140613001412608, 140613001547775, +STORE, 140613001547776, 140613068521471, +SNULL, 140612733112319, 140612800086015, +STORE, 140612732977152, 140612733112319, +STORE, 140612733112320, 140612800086015, +SNULL, 140613513109503, 140613529890815, +STORE, 140613513105408, 140613513109503, +STORE, 140613513109504, 140613529890815, +STORE, 140613706039296, 140613739610111, +STORE, 140613697646592, 140613739610111, +STORE, 140613689253888, 140613739610111, +SNULL, 140613689257983, 140613739610111, +STORE, 140613689253888, 140613689257983, +STORE, 140613689257984, 140613739610111, +SNULL, 140613253066751, 140613261455359, +STORE, 140613253062656, 140613253066751, +STORE, 140613253066752, 140613261455359, +STORE, 140613680861184, 140613689253887, +STORE, 140613588606976, 140613605392383, +SNULL, 140613689257984, 140613731217407, +STORE, 140613731217408, 140613739610111, +STORE, 140613689257984, 140613731217407, +SNULL, 140613731221503, 140613739610111, +STORE, 140613731217408, 140613731221503, +STORE, 140613731221504, 140613739610111, +STORE, 140613580214272, 140613605392383, +SNULL, 140612464676863, 140612531650559, +STORE, 140612464541696, 140612464676863, +STORE, 140612464676864, 140612531650559, +SNULL, 140612598894591, 140612665868287, +STORE, 140612598759424, 140612598894591, +STORE, 140612598894592, 140612665868287, +SNULL, 140612867330047, 140612934303743, +STORE, 140612867194880, 140612867330047, +STORE, 140612867330048, 
140612934303743, +STORE, 140613571821568, 140613605392383, +SNULL, 140613571825663, 140613605392383, +STORE, 140613571821568, 140613571825663, +STORE, 140613571825664, 140613605392383, +SNULL, 140613689257984, 140613722824703, +STORE, 140613722824704, 140613731217407, +STORE, 140613689257984, 140613722824703, +SNULL, 140613722828799, 140613731217407, +STORE, 140613722824704, 140613722828799, +STORE, 140613722828800, 140613731217407, +SNULL, 140613689257984, 140613714431999, +STORE, 140613714432000, 140613722824703, +STORE, 140613689257984, 140613714431999, +SNULL, 140613714436095, 140613722824703, +STORE, 140613714432000, 140613714436095, +STORE, 140613714436096, 140613722824703, +SNULL, 140612816842752, 140612825231359, +STORE, 140612825231360, 140612833624063, +STORE, 140612816842752, 140612825231359, +SNULL, 140612825235455, 140612833624063, +STORE, 140612825231360, 140612825235455, +STORE, 140612825235456, 140612833624063, +SNULL, 140613395677183, 140613404065791, +STORE, 140613395673088, 140613395677183, +STORE, 140613395677184, 140613404065791, +SNULL, 140613689257984, 140613706039295, +STORE, 140613706039296, 140613714431999, +STORE, 140613689257984, 140613706039295, +SNULL, 140613706043391, 140613714431999, +STORE, 140613706039296, 140613706043391, +STORE, 140613706043392, 140613714431999, +SNULL, 140613118849024, 140613127237631, +STORE, 140613127237632, 140613135630335, +STORE, 140613118849024, 140613127237631, +SNULL, 140613127241727, 140613135630335, +STORE, 140613127237632, 140613127241727, +STORE, 140613127241728, 140613135630335, +SNULL, 140613571825664, 140613580214271, +STORE, 140613580214272, 140613605392383, +STORE, 140613571825664, 140613580214271, +SNULL, 140613580218367, 140613605392383, +STORE, 140613580214272, 140613580218367, +STORE, 140613580218368, 140613605392383, +SNULL, 140613689257984, 140613697646591, +STORE, 140613697646592, 140613706039295, +STORE, 140613689257984, 140613697646591, +SNULL, 140613697650687, 140613706039295, +STORE, 
140613697646592, 140613697650687, +STORE, 140613697650688, 140613706039295, +SNULL, 140613680865279, 140613689253887, +STORE, 140613680861184, 140613680865279, +STORE, 140613680865280, 140613689253887, +STORE, 140613563428864, 140613571821567, +SNULL, 140613563432959, 140613571821567, +STORE, 140613563428864, 140613563432959, +STORE, 140613563432960, 140613571821567, +SNULL, 140613580218368, 140613588606975, +STORE, 140613588606976, 140613605392383, +STORE, 140613580218368, 140613588606975, +SNULL, 140613588611071, 140613605392383, +STORE, 140613588606976, 140613588611071, +STORE, 140613588611072, 140613605392383, +SNULL, 140613513109504, 140613521498111, +STORE, 140613521498112, 140613529890815, +STORE, 140613513109504, 140613521498111, +SNULL, 140613521502207, 140613529890815, +STORE, 140613521498112, 140613521502207, +STORE, 140613521502208, 140613529890815, +SNULL, 140613588611072, 140613596999679, +STORE, 140613596999680, 140613605392383, +STORE, 140613588611072, 140613596999679, +SNULL, 140613597003775, 140613605392383, +STORE, 140613596999680, 140613597003775, +STORE, 140613597003776, 140613605392383, +STORE, 140613555036160, 140613563428863, +SNULL, 140613555040255, 140613563428863, +STORE, 140613555036160, 140613555040255, +STORE, 140613555040256, 140613563428863, +STORE, 140613546643456, 140613555036159, +STORE, 140613538250752, 140613555036159, +SNULL, 140613538250752, 140613546643455, +STORE, 140613546643456, 140613555036159, +STORE, 140613538250752, 140613546643455, +SNULL, 140613546647551, 140613555036159, +STORE, 140613546643456, 140613546647551, +STORE, 140613546647552, 140613555036159, +STORE, 140613504712704, 140613513105407, +STORE, 140613496320000, 140613513105407, +SNULL, 140613496324095, 140613513105407, +STORE, 140613496320000, 140613496324095, +STORE, 140613496324096, 140613513105407, +STORE, 140613487927296, 140613496319999, +SNULL, 140613487931391, 140613496319999, +STORE, 140613487927296, 140613487931391, +STORE, 140613487931392, 
140613496319999, +STORE, 140613479534592, 140613487927295, +SNULL, 140612967845887, 140612976234495, +STORE, 140612967841792, 140612967845887, +STORE, 140612967845888, 140612976234495, +STORE, 140613387280384, 140613395673087, +STORE, 140613378887680, 140613395673087, +SNULL, 140613378887680, 140613387280383, +STORE, 140613387280384, 140613395673087, +STORE, 140613378887680, 140613387280383, +SNULL, 140613387284479, 140613395673087, +STORE, 140613387280384, 140613387284479, +STORE, 140613387284480, 140613395673087, +STORE, 140613370494976, 140613387280383, +STORE, 140613362102272, 140613387280383, +SNULL, 140613479538687, 140613487927295, +STORE, 140613479534592, 140613479538687, +STORE, 140613479538688, 140613487927295, +STORE, 140613353709568, 140613387280383, +STORE, 140613345316864, 140613387280383, +STORE, 140613244669952, 140613253062655, +SNULL, 140613345320959, 140613387280383, +STORE, 140613345316864, 140613345320959, +STORE, 140613345320960, 140613387280383, +SNULL, 140613538254847, 140613546643455, +STORE, 140613538250752, 140613538254847, +STORE, 140613538254848, 140613546643455, +STORE, 140613236277248, 140613253062655, +STORE, 140613227884544, 140613253062655, +STORE, 140613219491840, 140613253062655, +STORE, 140613211099136, 140613253062655, +SNULL, 140613211103231, 140613253062655, +STORE, 140613211099136, 140613211103231, +STORE, 140613211103232, 140613253062655, +STORE, 140613102059520, 140613110452223, +STORE, 140613093666816, 140613110452223, +SNULL, 140613093670911, 140613110452223, +STORE, 140613093666816, 140613093670911, +STORE, 140613093670912, 140613110452223, +STORE, 140613085274112, 140613093666815, +SNULL, 140613496324096, 140613504712703, +STORE, 140613504712704, 140613513105407, +STORE, 140613496324096, 140613504712703, +SNULL, 140613504716799, 140613513105407, +STORE, 140613504712704, 140613504716799, +STORE, 140613504716800, 140613513105407, +SNULL, 140613345320960, 140613378887679, +STORE, 140613378887680, 140613387280383, +STORE, 
140613345320960, 140613378887679, +SNULL, 140613378891775, 140613387280383, +STORE, 140613378887680, 140613378891775, +STORE, 140613378891776, 140613387280383, +SNULL, 140613345320960, 140613362102271, +STORE, 140613362102272, 140613378887679, +STORE, 140613345320960, 140613362102271, +SNULL, 140613362106367, 140613378887679, +STORE, 140613362102272, 140613362106367, +STORE, 140613362106368, 140613378887679, +SNULL, 140613362106368, 140613370494975, +STORE, 140613370494976, 140613378887679, +STORE, 140613362106368, 140613370494975, +SNULL, 140613370499071, 140613378887679, +STORE, 140613370494976, 140613370499071, +STORE, 140613370499072, 140613378887679, +STORE, 140613076881408, 140613093666815, +STORE, 140612993019904, 140613001412607, +SNULL, 140613076885503, 140613093666815, +STORE, 140613076881408, 140613076885503, +STORE, 140613076885504, 140613093666815, +SNULL, 140613093670912, 140613102059519, +STORE, 140613102059520, 140613110452223, +STORE, 140613093670912, 140613102059519, +SNULL, 140613102063615, 140613110452223, +STORE, 140613102059520, 140613102063615, +STORE, 140613102063616, 140613110452223, +SNULL, 140613076885504, 140613085274111, +STORE, 140613085274112, 140613093666815, +STORE, 140613076885504, 140613085274111, +SNULL, 140613085278207, 140613093666815, +STORE, 140613085274112, 140613085278207, +STORE, 140613085278208, 140613093666815, +STORE, 140612984627200, 140613001412607, +STORE, 140612967845888, 140612984627199, +SNULL, 140613211103232, 140613219491839, +STORE, 140613219491840, 140613253062655, +STORE, 140613211103232, 140613219491839, +SNULL, 140613219495935, 140613253062655, +STORE, 140613219491840, 140613219495935, +STORE, 140613219495936, 140613253062655, +STORE, 140612959449088, 140612967841791, +STORE, 140612951056384, 140612967841791, +SNULL, 140612951060479, 140612967841791, +STORE, 140612951056384, 140612951060479, +STORE, 140612951060480, 140612967841791, +SNULL, 140613345320960, 140613353709567, +STORE, 140613353709568, 
140613362102271, +STORE, 140613345320960, 140613353709567, +SNULL, 140613353713663, 140613362102271, +STORE, 140613353709568, 140613353713663, +STORE, 140613353713664, 140613362102271, +SNULL, 140613219495936, 140613244669951, +STORE, 140613244669952, 140613253062655, +STORE, 140613219495936, 140613244669951, +SNULL, 140613244674047, 140613253062655, +STORE, 140613244669952, 140613244674047, +STORE, 140613244674048, 140613253062655, +STORE, 140612942663680, 140612951056383, +SNULL, 140613219495936, 140613236277247, +STORE, 140613236277248, 140613244669951, +STORE, 140613219495936, 140613236277247, +SNULL, 140613236281343, 140613244669951, +STORE, 140613236277248, 140613236281343, +STORE, 140613236281344, 140613244669951, +SNULL, 140613219495936, 140613227884543, +STORE, 140613227884544, 140613236277247, +STORE, 140613219495936, 140613227884543, +SNULL, 140613227888639, 140613236277247, +STORE, 140613227884544, 140613227888639, +STORE, 140613227888640, 140613236277247, +SNULL, 140612984627200, 140612993019903, +STORE, 140612993019904, 140613001412607, +STORE, 140612984627200, 140612993019903, +SNULL, 140612993023999, 140613001412607, +STORE, 140612993019904, 140612993023999, +STORE, 140612993024000, 140613001412607, +STORE, 140612858802176, 140612867194879, +STORE, 140612850409472, 140612867194879, +SNULL, 140612951060480, 140612959449087, +STORE, 140612959449088, 140612967841791, +STORE, 140612951060480, 140612959449087, +SNULL, 140612959453183, 140612967841791, +STORE, 140612959449088, 140612959453183, +STORE, 140612959453184, 140612967841791, +SNULL, 140612967845888, 140612976234495, +STORE, 140612976234496, 140612984627199, +STORE, 140612967845888, 140612976234495, +SNULL, 140612976238591, 140612984627199, +STORE, 140612976234496, 140612976238591, +STORE, 140612976238592, 140612984627199, +STORE, 140612842016768, 140612867194879, +SNULL, 140612842020863, 140612867194879, +STORE, 140612842016768, 140612842020863, +STORE, 140612842020864, 140612867194879, +SNULL, 
140612984631295, 140612993019903, +STORE, 140612984627200, 140612984631295, +STORE, 140612984631296, 140612993019903, +STORE, 140612825235456, 140612842016767, +STORE, 140612808445952, 140612816838655, +SNULL, 140612942667775, 140612951056383, +STORE, 140612942663680, 140612942667775, +STORE, 140612942667776, 140612951056383, +STORE, 140612724584448, 140612732977151, +SNULL, 140612724588543, 140612732977151, +STORE, 140612724584448, 140612724588543, +STORE, 140612724588544, 140612732977151, +STORE, 140612716191744, 140612724584447, +SNULL, 140612842020864, 140612850409471, +STORE, 140612850409472, 140612867194879, +STORE, 140612842020864, 140612850409471, +SNULL, 140612850413567, 140612867194879, +STORE, 140612850409472, 140612850413567, +STORE, 140612850413568, 140612867194879, +SNULL, 140612850413568, 140612858802175, +STORE, 140612858802176, 140612867194879, +STORE, 140612850413568, 140612858802175, +SNULL, 140612858806271, 140612867194879, +STORE, 140612858802176, 140612858806271, +STORE, 140612858806272, 140612867194879, +STORE, 140612707799040, 140612724584447, +SNULL, 140612707803135, 140612724584447, +STORE, 140612707799040, 140612707803135, +STORE, 140612707803136, 140612724584447, +SNULL, 140612707803136, 140612716191743, +STORE, 140612716191744, 140612724584447, +STORE, 140612707803136, 140612716191743, +SNULL, 140612716195839, 140612724584447, +STORE, 140612716191744, 140612716195839, +STORE, 140612716195840, 140612724584447, +SNULL, 140612808450047, 140612816838655, +STORE, 140612808445952, 140612808450047, +STORE, 140612808450048, 140612816838655, +SNULL, 140612825235456, 140612833624063, +STORE, 140612833624064, 140612842016767, +STORE, 140612825235456, 140612833624063, +SNULL, 140612833628159, 140612842016767, +STORE, 140612833624064, 140612833628159, +STORE, 140612833628160, 140612842016767, +STORE, 140612699406336, 140612707799039, +SNULL, 140612699410431, 140612707799039, +STORE, 140612699406336, 140612699410431, +STORE, 140612699410432, 
140612707799039, +STORE, 140614384926720, 140614384955391, +STORE, 140614349332480, 140614351523839, +SNULL, 140614349332480, 140614349422591, +STORE, 140614349422592, 140614351523839, +STORE, 140614349332480, 140614349422591, +SNULL, 140614351515647, 140614351523839, +STORE, 140614349422592, 140614351515647, +STORE, 140614351515648, 140614351523839, +ERASE, 140614351515648, 140614351523839, +STORE, 140614351515648, 140614351523839, +SNULL, 140614351519743, 140614351523839, +STORE, 140614351515648, 140614351519743, +STORE, 140614351519744, 140614351523839, +ERASE, 140614384926720, 140614384955391, +ERASE, 140613949296640, 140613949300735, +ERASE, 140613949300736, 140613957689343, +ERASE, 140613689253888, 140613689257983, +ERASE, 140613689257984, 140613697646591, +ERASE, 140613563428864, 140613563432959, +ERASE, 140613563432960, 140613571821567, +ERASE, 140613211099136, 140613211103231, +ERASE, 140613211103232, 140613219491839, +ERASE, 140614133870592, 140614133874687, +ERASE, 140614133874688, 140614142263295, +ERASE, 140612967841792, 140612967845887, +ERASE, 140612967845888, 140612976234495, +ERASE, 140613076881408, 140613076885503, +ERASE, 140613076885504, 140613085274111, +ERASE, 140612850409472, 140612850413567, +ERASE, 140612850413568, 140612858802175, +ERASE, 140613110452224, 140613110456319, +ERASE, 140613110456320, 140613118844927, +ERASE, 140613706039296, 140613706043391, +ERASE, 140613706043392, 140613714431999, +ERASE, 140613521498112, 140613521502207, +ERASE, 140613521502208, 140613529890815, +ERASE, 140613362102272, 140613362106367, +ERASE, 140613362106368, 140613370494975, +ERASE, 140613253062656, 140613253066751, +ERASE, 140613253066752, 140613261455359, +ERASE, 140612816838656, 140612816842751, +ERASE, 140612816842752, 140612825231359, +ERASE, 140613261455360, 140613261459455, +ERASE, 140613261459456, 140613269848063, +ERASE, 140613118844928, 140613118849023, +ERASE, 140613118849024, 140613127237631, +ERASE, 140613714432000, 140613714436095, +ERASE, 
140613714436096, 140613722824703, +ERASE, 140613496320000, 140613496324095, +ERASE, 140613496324096, 140613504712703, +ERASE, 140613513105408, 140613513109503, +ERASE, 140613513109504, 140613521498111, +ERASE, 140613697646592, 140613697650687, +ERASE, 140613697650688, 140613706039295, +ERASE, 140613093666816, 140613093670911, +ERASE, 140613093670912, 140613102059519, +ERASE, 140612993019904, 140612993023999, +ERASE, 140612993024000, 140613001412607, +ERASE, 140613127237632, 140613127241727, +ERASE, 140613127241728, 140613135630335, +ERASE, 140613957689344, 140613957693439, +ERASE, 140613957693440, 140613966082047, +ERASE, 140613571821568, 140613571825663, +ERASE, 140613571825664, 140613580214271, +ERASE, 140613479534592, 140613479538687, +ERASE, 140613479538688, 140613487927295, +ERASE, 140612984627200, 140612984631295, +ERASE, 140612984631296, 140612993019903, +ERASE, 140613588606976, 140613588611071, +ERASE, 140613588611072, 140613596999679, +ERASE, 140613680861184, 140613680865279, +ERASE, 140613680865280, 140613689253887, +ERASE, 140613345316864, 140613345320959, +ERASE, 140613345320960, 140613353709567, +ERASE, 140613596999680, 140613597003775, +ERASE, 140613597003776, 140613605392383, +ERASE, 140613966082048, 140613966086143, +ERASE, 140613966086144, 140613974474751, +ERASE, 140613731217408, 140613731221503, +ERASE, 140613731221504, 140613739610111, +ERASE, 140613395673088, 140613395677183, +ERASE, 140613395677184, 140613404065791, +ERASE, 140612825231360, 140612825235455, +ERASE, 140612825235456, 140612833624063, +ERASE, 140612674228224, 140612674232319, +ERASE, 140612674232320, 140612682620927, +ERASE, 140613722824704, 140613722828799, +ERASE, 140613722828800, 140613731217407, +ERASE, 140613487927296, 140613487931391, +ERASE, 140613487931392, 140613496319999, +ERASE, 140613102059520, 140613102063615, +ERASE, 140613102063616, 140613110452223, +ERASE, 140614242910208, 140614242914303, +ERASE, 140614242914304, 140614251302911, +ERASE, 140612808445952, 
140612808450047, +ERASE, 140612808450048, 140612816838655, +ERASE, 140613236277248, 140613236281343, +ERASE, 140613236281344, 140613244669951, +ERASE, 140613580214272, 140613580218367, +ERASE, 140613580218368, 140613588606975, +ERASE, 140613370494976, 140613370499071, +ERASE, 140613370499072, 140613378887679, +ERASE, 140613244669952, 140613244674047, +ERASE, 140613244674048, 140613253062655, +ERASE, 140612724584448, 140612724588543, +ERASE, 140612724588544, 140612732977151, +ERASE, 140612707799040, 140612707803135, +ERASE, 140612707803136, 140612716191743, +ERASE, 140613504712704, 140613504716799, +ERASE, 140613504716800, 140613513105407, + }; + + unsigned long set39[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140736271417344, 140737488351231, +SNULL, 140736271421439, 140737488351231, +STORE, 140736271417344, 140736271421439, +STORE, 140736271286272, 140736271421439, +STORE, 94412930822144, 94412933074943, +SNULL, 94412930953215, 94412933074943, +STORE, 94412930822144, 94412930953215, +STORE, 94412930953216, 94412933074943, +ERASE, 94412930953216, 94412933074943, +STORE, 94412933046272, 94412933054463, +STORE, 94412933054464, 94412933074943, +STORE, 140326136901632, 140326139154431, +SNULL, 140326137044991, 140326139154431, +STORE, 140326136901632, 140326137044991, +STORE, 140326137044992, 140326139154431, +ERASE, 140326137044992, 140326139154431, +STORE, 140326139142144, 140326139150335, +STORE, 140326139150336, 140326139154431, +STORE, 140736271585280, 140736271589375, +STORE, 140736271572992, 140736271585279, +STORE, 140326139113472, 140326139142143, +STORE, 140326139105280, 140326139113471, +STORE, 140326134685696, 140326136901631, +SNULL, 140326134685696, 140326134783999, +STORE, 140326134784000, 140326136901631, +STORE, 140326134685696, 140326134783999, +SNULL, 140326136877055, 140326136901631, +STORE, 140326134784000, 140326136877055, +STORE, 140326136877056, 140326136901631, +SNULL, 140326136877056, 140326136885247, +STORE, 140326136885248, 
140326136901631, +STORE, 140326136877056, 140326136885247, +ERASE, 140326136877056, 140326136885247, +STORE, 140326136877056, 140326136885247, +ERASE, 140326136885248, 140326136901631, +STORE, 140326136885248, 140326136901631, +STORE, 140326130888704, 140326134685695, +SNULL, 140326130888704, 140326132547583, +STORE, 140326132547584, 140326134685695, +STORE, 140326130888704, 140326132547583, +SNULL, 140326134644735, 140326134685695, +STORE, 140326132547584, 140326134644735, +STORE, 140326134644736, 140326134685695, +SNULL, 140326134644736, 140326134669311, +STORE, 140326134669312, 140326134685695, +STORE, 140326134644736, 140326134669311, +ERASE, 140326134644736, 140326134669311, +STORE, 140326134644736, 140326134669311, +ERASE, 140326134669312, 140326134685695, +STORE, 140326134669312, 140326134685695, +STORE, 140326139097088, 140326139113471, +SNULL, 140326134661119, 140326134669311, +STORE, 140326134644736, 140326134661119, +STORE, 140326134661120, 140326134669311, +SNULL, 140326136881151, 140326136885247, +STORE, 140326136877056, 140326136881151, +STORE, 140326136881152, 140326136885247, +SNULL, 94412933050367, 94412933054463, +STORE, 94412933046272, 94412933050367, +STORE, 94412933050368, 94412933054463, +SNULL, 140326139146239, 140326139150335, +STORE, 140326139142144, 140326139146239, +STORE, 140326139146240, 140326139150335, +ERASE, 140326139113472, 140326139142143, +STORE, 94412939493376, 94412939628543, +STORE, 140326122496000, 140326130888703, +SNULL, 140326122500095, 140326130888703, +STORE, 140326122496000, 140326122500095, +STORE, 140326122500096, 140326130888703, +STORE, 140326114103296, 140326122495999, +STORE, 140325979885568, 140326114103295, +SNULL, 140325979885568, 140326043910143, +STORE, 140326043910144, 140326114103295, +STORE, 140325979885568, 140326043910143, +ERASE, 140325979885568, 140326043910143, +SNULL, 140326111019007, 140326114103295, +STORE, 140326043910144, 140326111019007, +STORE, 140326111019008, 140326114103295, +ERASE, 
140326111019008, 140326114103295, +SNULL, 140326044045311, 140326111019007, +STORE, 140326043910144, 140326044045311, +STORE, 140326044045312, 140326111019007, +SNULL, 140326114107391, 140326122495999, +STORE, 140326114103296, 140326114107391, +STORE, 140326114107392, 140326122495999, +STORE, 140326035517440, 140326043910143, +SNULL, 140326035521535, 140326043910143, +STORE, 140326035517440, 140326035521535, +STORE, 140326035521536, 140326043910143, +STORE, 140326027124736, 140326035517439, +SNULL, 140326027128831, 140326035517439, +STORE, 140326027124736, 140326027128831, +STORE, 140326027128832, 140326035517439, +STORE, 140326018732032, 140326027124735, +SNULL, 140326018736127, 140326027124735, +STORE, 140326018732032, 140326018736127, +STORE, 140326018736128, 140326027124735, +STORE, 140326010339328, 140326018732031, +STORE, 140326001946624, 140326018732031, +STORE, 140325993553920, 140326018732031, +STORE, 140325859336192, 140325993553919, +SNULL, 140325859336192, 140325909692415, +STORE, 140325909692416, 140325993553919, +STORE, 140325859336192, 140325909692415, +ERASE, 140325859336192, 140325909692415, +SNULL, 140325976801279, 140325993553919, +STORE, 140325909692416, 140325976801279, +STORE, 140325976801280, 140325993553919, +ERASE, 140325976801280, 140325993553919, +STORE, 140325985161216, 140326018732031, +STORE, 140325775474688, 140325976801279, +STORE, 140325708365824, 140325976801279, +SNULL, 140325708500991, 140325976801279, +STORE, 140325708365824, 140325708500991, +STORE, 140325708500992, 140325976801279, +SNULL, 140325708500992, 140325909692415, +STORE, 140325909692416, 140325976801279, +STORE, 140325708500992, 140325909692415, +SNULL, 140325909827583, 140325976801279, +STORE, 140325909692416, 140325909827583, +STORE, 140325909827584, 140325976801279, +SNULL, 140325842583551, 140325909692415, +STORE, 140325708500992, 140325842583551, +STORE, 140325842583552, 140325909692415, +ERASE, 140325842583552, 140325909692415, +SNULL, 140325708500992, 
140325775474687, +STORE, 140325775474688, 140325842583551, +STORE, 140325708500992, 140325775474687, +SNULL, 140325775609855, 140325842583551, +STORE, 140325775474688, 140325775609855, +STORE, 140325775609856, 140325842583551, +STORE, 140325775609856, 140325909692415, +SNULL, 140325775609856, 140325842583551, +STORE, 140325842583552, 140325909692415, +STORE, 140325775609856, 140325842583551, +SNULL, 140325842718719, 140325909692415, +STORE, 140325842583552, 140325842718719, +STORE, 140325842718720, 140325909692415, +SNULL, 140325985161216, 140325993553919, +STORE, 140325993553920, 140326018732031, +STORE, 140325985161216, 140325993553919, +SNULL, 140325993558015, 140326018732031, +STORE, 140325993553920, 140325993558015, +STORE, 140325993558016, 140326018732031, +SNULL, 140325985165311, 140325993553919, +STORE, 140325985161216, 140325985165311, +STORE, 140325985165312, 140325993553919, +SNULL, 140325993558016, 140326001946623, +STORE, 140326001946624, 140326018732031, +STORE, 140325993558016, 140326001946623, +SNULL, 140326001950719, 140326018732031, +STORE, 140326001946624, 140326001950719, +STORE, 140326001950720, 140326018732031, +SNULL, 140326001950720, 140326010339327, +STORE, 140326010339328, 140326018732031, +STORE, 140326001950720, 140326010339327, +SNULL, 140326010343423, 140326018732031, +STORE, 140326010339328, 140326010343423, +STORE, 140326010343424, 140326018732031, +STORE, 140325699973120, 140325708365823, +STORE, 140325691580416, 140325708365823, +STORE, 140325683187712, 140325708365823, +SNULL, 140325683191807, 140325708365823, +STORE, 140325683187712, 140325683191807, +STORE, 140325683191808, 140325708365823, +SNULL, 140325683191808, 140325699973119, +STORE, 140325699973120, 140325708365823, +STORE, 140325683191808, 140325699973119, +SNULL, 140325699977215, 140325708365823, +STORE, 140325699973120, 140325699977215, +STORE, 140325699977216, 140325708365823, +STORE, 140325674795008, 140325683187711, +STORE, 140325666402304, 140325683187711, +STORE, 
140325658009600, 140325683187711, +SNULL, 140325658009600, 140325666402303, +STORE, 140325666402304, 140325683187711, +STORE, 140325658009600, 140325666402303, +SNULL, 140325666406399, 140325683187711, +STORE, 140325666402304, 140325666406399, +STORE, 140325666406400, 140325683187711, +SNULL, 140325683191808, 140325691580415, +STORE, 140325691580416, 140325699973119, +STORE, 140325683191808, 140325691580415, +SNULL, 140325691584511, 140325699973119, +STORE, 140325691580416, 140325691584511, +STORE, 140325691584512, 140325699973119, +SNULL, 140325666406400, 140325674795007, +STORE, 140325674795008, 140325683187711, +STORE, 140325666406400, 140325674795007, +SNULL, 140325674799103, 140325683187711, +STORE, 140325674795008, 140325674799103, +STORE, 140325674799104, 140325683187711, +STORE, 140325649616896, 140325666402303, +SNULL, 140325649616896, 140325658009599, +STORE, 140325658009600, 140325666402303, +STORE, 140325649616896, 140325658009599, +SNULL, 140325658013695, 140325666402303, +STORE, 140325658009600, 140325658013695, +STORE, 140325658013696, 140325666402303, +SNULL, 140325649620991, 140325658009599, +STORE, 140325649616896, 140325649620991, +STORE, 140325649620992, 140325658009599, +STORE, 140325641224192, 140325649616895, +STORE, 140325632831488, 140325649616895, +SNULL, 140325632835583, 140325649616895, +STORE, 140325632831488, 140325632835583, +STORE, 140325632835584, 140325649616895, +STORE, 140325624438784, 140325632831487, +SNULL, 140325624442879, 140325632831487, +STORE, 140325624438784, 140325624442879, +STORE, 140325624442880, 140325632831487, +SNULL, 140325632835584, 140325641224191, +STORE, 140325641224192, 140325649616895, +STORE, 140325632835584, 140325641224191, +SNULL, 140325641228287, 140325649616895, +STORE, 140325641224192, 140325641228287, +STORE, 140325641228288, 140325649616895, +STORE, 140325616046080, 140325624438783, +SNULL, 140325616050175, 140325624438783, +STORE, 140325616046080, 140325616050175, +STORE, 140325616050176, 
140325624438783, +STORE, 140325607653376, 140325616046079, +SNULL, 140325607657471, 140325616046079, +STORE, 140325607653376, 140325607657471, +STORE, 140325607657472, 140325616046079, +STORE, 140325599260672, 140325607653375, +STORE, 140325590867968, 140325607653375, +STORE, 140325456650240, 140325590867967, +SNULL, 140325456650240, 140325507039231, +STORE, 140325507039232, 140325590867967, +STORE, 140325456650240, 140325507039231, +ERASE, 140325456650240, 140325507039231, +STORE, 140325498646528, 140325507039231, +STORE, 140325364428800, 140325498646527, +SNULL, 140325364428800, 140325372821503, +STORE, 140325372821504, 140325498646527, +STORE, 140325364428800, 140325372821503, +ERASE, 140325364428800, 140325372821503, +STORE, 140325364428800, 140325372821503, +STORE, 140325356036096, 140325372821503, +STORE, 140325221818368, 140325356036095, +SNULL, 140325221818368, 140325238603775, +STORE, 140325238603776, 140325356036095, +STORE, 140325221818368, 140325238603775, +ERASE, 140325221818368, 140325238603775, +STORE, 140325230211072, 140325238603775, +STORE, 140325221818368, 140325238603775, +STORE, 140325087600640, 140325221818367, +STORE, 140325079207936, 140325087600639, +SNULL, 140325087600640, 140325104386047, +STORE, 140325104386048, 140325221818367, +STORE, 140325087600640, 140325104386047, +ERASE, 140325087600640, 140325104386047, +STORE, 140325095993344, 140325104386047, +STORE, 140325079207936, 140325104386047, +STORE, 140324944990208, 140325079207935, +SNULL, 140324944990208, 140324970168319, +STORE, 140324970168320, 140325079207935, +STORE, 140324944990208, 140324970168319, +ERASE, 140324944990208, 140324970168319, +STORE, 140324961775616, 140324970168319, +STORE, 140324953382912, 140324970168319, +STORE, 140324819165184, 140324953382911, +STORE, 140324684947456, 140324953382911, +STORE, 140324676554752, 140324684947455, +STORE, 140324668162048, 140324684947455, +STORE, 140324533944320, 140324668162047, +STORE, 140324525551616, 140324533944319, +SNULL, 
140324533944320, 140324567515135, +STORE, 140324567515136, 140324668162047, +STORE, 140324533944320, 140324567515135, +ERASE, 140324533944320, 140324567515135, +STORE, 140324559122432, 140324567515135, +STORE, 140324391333888, 140324525551615, +SNULL, 140325574148095, 140325590867967, +STORE, 140325507039232, 140325574148095, +STORE, 140325574148096, 140325590867967, +ERASE, 140325574148096, 140325590867967, +SNULL, 140325439930367, 140325498646527, +STORE, 140325372821504, 140325439930367, +STORE, 140325439930368, 140325498646527, +ERASE, 140325439930368, 140325498646527, +SNULL, 140325305712639, 140325356036095, +STORE, 140325238603776, 140325305712639, +STORE, 140325305712640, 140325356036095, +ERASE, 140325305712640, 140325356036095, +SNULL, 140325171494911, 140325221818367, +STORE, 140325104386048, 140325171494911, +STORE, 140325171494912, 140325221818367, +ERASE, 140325171494912, 140325221818367, +SNULL, 140325104521215, 140325171494911, +STORE, 140325104386048, 140325104521215, +STORE, 140325104521216, 140325171494911, +STORE, 140324257116160, 140324525551615, +SNULL, 140324257116160, 140324299079679, +STORE, 140324299079680, 140324525551615, +STORE, 140324257116160, 140324299079679, +ERASE, 140324257116160, 140324299079679, +SNULL, 140325037277183, 140325079207935, +STORE, 140324970168320, 140325037277183, +STORE, 140325037277184, 140325079207935, +ERASE, 140325037277184, 140325079207935, +SNULL, 140324819165183, 140324953382911, +STORE, 140324684947456, 140324819165183, +STORE, 140324819165184, 140324953382911, +SNULL, 140324819165184, 140324835950591, +STORE, 140324835950592, 140324953382911, +STORE, 140324819165184, 140324835950591, +ERASE, 140324819165184, 140324835950591, +SNULL, 140324903059455, 140324953382911, +STORE, 140324835950592, 140324903059455, +STORE, 140324903059456, 140324953382911, +ERASE, 140324903059456, 140324953382911, +SNULL, 140324684947456, 140324701732863, +STORE, 140324701732864, 140324819165183, +STORE, 140324684947456, 
140324701732863, +ERASE, 140324684947456, 140324701732863, +SNULL, 140324768841727, 140324819165183, +STORE, 140324701732864, 140324768841727, +STORE, 140324768841728, 140324819165183, +ERASE, 140324768841728, 140324819165183, +SNULL, 140324634623999, 140324668162047, +STORE, 140324567515136, 140324634623999, +STORE, 140324634624000, 140324668162047, +ERASE, 140324634624000, 140324668162047, +SNULL, 140324391333887, 140324525551615, +STORE, 140324299079680, 140324391333887, +STORE, 140324391333888, 140324525551615, +SNULL, 140324391333888, 140324433297407, +STORE, 140324433297408, 140324525551615, +STORE, 140324391333888, 140324433297407, +ERASE, 140324391333888, 140324433297407, +SNULL, 140325507174399, 140325574148095, +STORE, 140325507039232, 140325507174399, +STORE, 140325507174400, 140325574148095, +SNULL, 140325590867968, 140325599260671, +STORE, 140325599260672, 140325607653375, +STORE, 140325590867968, 140325599260671, +SNULL, 140325599264767, 140325607653375, +STORE, 140325599260672, 140325599264767, +STORE, 140325599264768, 140325607653375, +SNULL, 140325372956671, 140325439930367, +STORE, 140325372821504, 140325372956671, +STORE, 140325372956672, 140325439930367, +SNULL, 140324668166143, 140324684947455, +STORE, 140324668162048, 140324668166143, +STORE, 140324668166144, 140324684947455, +SNULL, 140324525555711, 140324533944319, +STORE, 140324525551616, 140324525555711, +STORE, 140324525555712, 140324533944319, +SNULL, 140324953382912, 140324961775615, +STORE, 140324961775616, 140324970168319, +STORE, 140324953382912, 140324961775615, +SNULL, 140324961779711, 140324970168319, +STORE, 140324961775616, 140324961779711, +STORE, 140324961779712, 140324970168319, +SNULL, 140325079212031, 140325104386047, +STORE, 140325079207936, 140325079212031, +STORE, 140325079212032, 140325104386047, +SNULL, 140325221818368, 140325230211071, +STORE, 140325230211072, 140325238603775, +STORE, 140325221818368, 140325230211071, +SNULL, 140325230215167, 140325238603775, +STORE, 
140325230211072, 140325230215167, +STORE, 140325230215168, 140325238603775, +SNULL, 140325356036096, 140325364428799, +STORE, 140325364428800, 140325372821503, +STORE, 140325356036096, 140325364428799, +SNULL, 140325364432895, 140325372821503, + }; + unsigned long set40[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140734309167104, 140737488351231, +SNULL, 140734309171199, 140737488351231, +STORE, 140734309167104, 140734309171199, +STORE, 140734309036032, 140734309171199, +STORE, 94270500081664, 94270502334463, +SNULL, 94270500212735, 94270502334463, +STORE, 94270500081664, 94270500212735, +STORE, 94270500212736, 94270502334463, +ERASE, 94270500212736, 94270502334463, +STORE, 94270502305792, 94270502313983, +STORE, 94270502313984, 94270502334463, +STORE, 140321935110144, 140321937362943, +SNULL, 140321935253503, 140321937362943, +STORE, 140321935110144, 140321935253503, +STORE, 140321935253504, 140321937362943, +ERASE, 140321935253504, 140321937362943, +STORE, 140321937350656, 140321937358847, +STORE, 140321937358848, 140321937362943, +STORE, 140734309625856, 140734309629951, +STORE, 140734309613568, 140734309625855, +STORE, 140321937321984, 140321937350655, +STORE, 140321937313792, 140321937321983, +STORE, 140321932894208, 140321935110143, +SNULL, 140321932894208, 140321932992511, +STORE, 140321932992512, 140321935110143, +STORE, 140321932894208, 140321932992511, +SNULL, 140321935085567, 140321935110143, +STORE, 140321932992512, 140321935085567, +STORE, 140321935085568, 140321935110143, +SNULL, 140321935085568, 140321935093759, +STORE, 140321935093760, 140321935110143, +STORE, 140321935085568, 140321935093759, +ERASE, 140321935085568, 140321935093759, +STORE, 140321935085568, 140321935093759, +ERASE, 140321935093760, 140321935110143, +STORE, 140321935093760, 140321935110143, +STORE, 140321929097216, 140321932894207, +SNULL, 140321929097216, 140321930756095, +STORE, 140321930756096, 140321932894207, +STORE, 140321929097216, 140321930756095, +SNULL, 
140321932853247, 140321932894207, +STORE, 140321930756096, 140321932853247, +STORE, 140321932853248, 140321932894207, +SNULL, 140321932853248, 140321932877823, +STORE, 140321932877824, 140321932894207, +STORE, 140321932853248, 140321932877823, +ERASE, 140321932853248, 140321932877823, +STORE, 140321932853248, 140321932877823, +ERASE, 140321932877824, 140321932894207, +STORE, 140321932877824, 140321932894207, +STORE, 140321937305600, 140321937321983, +SNULL, 140321932869631, 140321932877823, +STORE, 140321932853248, 140321932869631, +STORE, 140321932869632, 140321932877823, +SNULL, 140321935089663, 140321935093759, +STORE, 140321935085568, 140321935089663, +STORE, 140321935089664, 140321935093759, +SNULL, 94270502309887, 94270502313983, +STORE, 94270502305792, 94270502309887, +STORE, 94270502309888, 94270502313983, +SNULL, 140321937354751, 140321937358847, +STORE, 140321937350656, 140321937354751, +STORE, 140321937354752, 140321937358847, +ERASE, 140321937321984, 140321937350655, +STORE, 94270507364352, 94270507499519, +STORE, 140321920704512, 140321929097215, +SNULL, 140321920708607, 140321929097215, +STORE, 140321920704512, 140321920708607, +STORE, 140321920708608, 140321929097215, +STORE, 140321912311808, 140321920704511, +STORE, 140321778094080, 140321912311807, +SNULL, 140321778094080, 140321816051711, +STORE, 140321816051712, 140321912311807, +STORE, 140321778094080, 140321816051711, +ERASE, 140321778094080, 140321816051711, +SNULL, 140321883160575, 140321912311807, +STORE, 140321816051712, 140321883160575, +STORE, 140321883160576, 140321912311807, +ERASE, 140321883160576, 140321912311807, +SNULL, 140321816186879, 140321883160575, +STORE, 140321816051712, 140321816186879, +STORE, 140321816186880, 140321883160575, +SNULL, 140321912315903, 140321920704511, +STORE, 140321912311808, 140321912315903, +STORE, 140321912315904, 140321920704511, +STORE, 140321903919104, 140321912311807, +SNULL, 140321903923199, 140321912311807, +STORE, 140321903919104, 140321903923199, 
+STORE, 140321903923200, 140321912311807, +STORE, 140321895526400, 140321903919103, +SNULL, 140321895530495, 140321903919103, +STORE, 140321895526400, 140321895530495, +STORE, 140321895530496, 140321903919103, +STORE, 140321887133696, 140321895526399, +SNULL, 140321887137791, 140321895526399, +STORE, 140321887133696, 140321887137791, +STORE, 140321887137792, 140321895526399, +STORE, 140321807659008, 140321816051711, +STORE, 140321673441280, 140321807659007, +SNULL, 140321673441280, 140321681833983, +STORE, 140321681833984, 140321807659007, +STORE, 140321673441280, 140321681833983, +ERASE, 140321673441280, 140321681833983, +SNULL, 140321748942847, 140321807659007, +STORE, 140321681833984, 140321748942847, +STORE, 140321748942848, 140321807659007, +ERASE, 140321748942848, 140321807659007, +STORE, 140321799266304, 140321816051711, +STORE, 140321790873600, 140321816051711, +STORE, 140321782480896, 140321816051711, +STORE, 140321547616256, 140321748942847, +SNULL, 140321614725119, 140321748942847, +STORE, 140321547616256, 140321614725119, +STORE, 140321614725120, 140321748942847, +SNULL, 140321614725120, 140321681833983, +STORE, 140321681833984, 140321748942847, +STORE, 140321614725120, 140321681833983, +ERASE, 140321614725120, 140321681833983, +SNULL, 140321681969151, 140321748942847, +STORE, 140321681833984, 140321681969151, +STORE, 140321681969152, 140321748942847, +STORE, 140321547616256, 140321681833983, +SNULL, 140321547616256, 140321614725119, +STORE, 140321614725120, 140321681833983, +STORE, 140321547616256, 140321614725119, +SNULL, 140321614860287, 140321681833983, +STORE, 140321614725120, 140321614860287, +STORE, 140321614860288, 140321681833983, +SNULL, 140321547751423, 140321614725119, +STORE, 140321547616256, 140321547751423, +STORE, 140321547751424, 140321614725119, +STORE, 140321480507392, 140321547616255, +SNULL, 140321782480896, 140321799266303, +STORE, 140321799266304, 140321816051711, +STORE, 140321782480896, 140321799266303, +SNULL, 140321799270399, 
140321816051711, +STORE, 140321799266304, 140321799270399, +STORE, 140321799270400, 140321816051711, +STORE, 140321774088192, 140321799266303, +SNULL, 140321774088192, 140321790873599, +STORE, 140321790873600, 140321799266303, +STORE, 140321774088192, 140321790873599, +SNULL, 140321790877695, 140321799266303, +STORE, 140321790873600, 140321790877695, +STORE, 140321790877696, 140321799266303, +SNULL, 140321480642559, 140321547616255, +STORE, 140321480507392, 140321480642559, +STORE, 140321480642560, 140321547616255, +SNULL, 140321774088192, 140321782480895, +STORE, 140321782480896, 140321790873599, +STORE, 140321774088192, 140321782480895, +SNULL, 140321782484991, 140321790873599, +STORE, 140321782480896, 140321782484991, +STORE, 140321782484992, 140321790873599, +SNULL, 140321799270400, 140321807659007, +STORE, 140321807659008, 140321816051711, +STORE, 140321799270400, 140321807659007, +SNULL, 140321807663103, 140321816051711, +STORE, 140321807659008, 140321807663103, +STORE, 140321807663104, 140321816051711, +STORE, 140321765695488, 140321782480895, +STORE, 140321757302784, 140321782480895, +SNULL, 140321757306879, 140321782480895, +STORE, 140321757302784, 140321757306879, +STORE, 140321757306880, 140321782480895, +STORE, 140321472114688, 140321480507391, +STORE, 140321463721984, 140321480507391, +SNULL, 140321463726079, 140321480507391, +STORE, 140321463721984, 140321463726079, +STORE, 140321463726080, 140321480507391, +SNULL, 140321757306880, 140321774088191, +STORE, 140321774088192, 140321782480895, +STORE, 140321757306880, 140321774088191, +SNULL, 140321774092287, 140321782480895, +STORE, 140321774088192, 140321774092287, +STORE, 140321774092288, 140321782480895, +SNULL, 140321463726080, 140321472114687, +STORE, 140321472114688, 140321480507391, +STORE, 140321463726080, 140321472114687, +SNULL, 140321472118783, 140321480507391, +STORE, 140321472114688, 140321472118783, +STORE, 140321472118784, 140321480507391, +SNULL, 140321757306880, 140321765695487, +STORE, 
140321765695488, 140321774088191, +STORE, 140321757306880, 140321765695487, +SNULL, 140321765699583, 140321774088191, +STORE, 140321765695488, 140321765699583, +STORE, 140321765699584, 140321774088191, +STORE, 140321455329280, 140321463721983, +SNULL, 140321455333375, 140321463721983, +STORE, 140321455329280, 140321455333375, +STORE, 140321455333376, 140321463721983, +STORE, 140321446936576, 140321455329279, +STORE, 140321438543872, 140321455329279, +STORE, 140321430151168, 140321455329279, +SNULL, 140321430155263, 140321455329279, +STORE, 140321430151168, 140321430155263, +STORE, 140321430155264, 140321455329279, +SNULL, 140321430155264, 140321446936575, +STORE, 140321446936576, 140321455329279, +STORE, 140321430155264, 140321446936575, +SNULL, 140321446940671, 140321455329279, +STORE, 140321446936576, 140321446940671, +STORE, 140321446940672, 140321455329279, +SNULL, 140321430155264, 140321438543871, +STORE, 140321438543872, 140321446936575, +STORE, 140321430155264, 140321438543871, +SNULL, 140321438547967, 140321446936575, +STORE, 140321438543872, 140321438547967, +STORE, 140321438547968, 140321446936575, +STORE, 140321421758464, 140321430151167, +SNULL, 140321421762559, 140321430151167, +STORE, 140321421758464, 140321421762559, +STORE, 140321421762560, 140321430151167, +STORE, 140321413365760, 140321421758463, +SNULL, 140321413369855, 140321421758463, +STORE, 140321413365760, 140321413369855, +STORE, 140321413369856, 140321421758463, +STORE, 140321404973056, 140321413365759, +SNULL, 140321404977151, 140321413365759, +STORE, 140321404973056, 140321404977151, +STORE, 140321404977152, 140321413365759, +STORE, 140321396580352, 140321404973055, +STORE, 140321388187648, 140321404973055, +STORE, 140321253969920, 140321388187647, +SNULL, 140321253969920, 140321279180799, +STORE, 140321279180800, 140321388187647, +STORE, 140321253969920, 140321279180799, +ERASE, 140321253969920, 140321279180799, +SNULL, 140321346289663, 140321388187647, +STORE, 140321279180800, 
140321346289663, +STORE, 140321346289664, 140321388187647, +ERASE, 140321346289664, 140321388187647, +STORE, 140321144963072, 140321346289663, +STORE, 140321379794944, 140321404973055, +STORE, 140321371402240, 140321404973055, +STORE, 140321010745344, 140321346289663, +STORE, 140321363009536, 140321404973055, +SNULL, 140321077854207, 140321346289663, +STORE, 140321010745344, 140321077854207, +STORE, 140321077854208, 140321346289663, +SNULL, 140321077854208, 140321144963071, +STORE, 140321144963072, 140321346289663, +STORE, 140321077854208, 140321144963071, +ERASE, 140321077854208, 140321144963071, +STORE, 140321354616832, 140321404973055, +STORE, 140321136570368, 140321144963071, +STORE, 140320943636480, 140321077854207, +STORE, 140320876527616, 140321077854207, +STORE, 140321128177664, 140321144963071, +SNULL, 140320876662783, 140321077854207, +STORE, 140320876527616, 140320876662783, +STORE, 140320876662784, 140321077854207, +STORE, 140321119784960, 140321144963071, +STORE, 140321111392256, 140321144963071, +STORE, 140320742309888, 140320876527615, +STORE, 140321102999552, 140321144963071, +STORE, 140320608092160, 140320876527615, +SNULL, 140320675201023, 140320876527615, +STORE, 140320608092160, 140320675201023, +STORE, 140320675201024, 140320876527615, +SNULL, 140320675201024, 140320742309887, +STORE, 140320742309888, 140320876527615, +STORE, 140320675201024, 140320742309887, +ERASE, 140320675201024, 140320742309887, +STORE, 140321094606848, 140321144963071, +STORE, 140321086214144, 140321144963071, +STORE, 140320608092160, 140320876527615, +SNULL, 140320608092160, 140320675201023, +STORE, 140320675201024, 140320876527615, +STORE, 140320608092160, 140320675201023, +SNULL, 140320675336191, 140320876527615, +STORE, 140320675201024, 140320675336191, +STORE, 140320675336192, 140320876527615, +STORE, 140320599699456, 140320608092159, +STORE, 140320591306752, 140320608092159, +STORE, 140320457089024, 140320591306751, +STORE, 140320448696320, 140320457089023, +STORE, 
140320314478592, 140320448696319, +SNULL, 140321144963072, 140321279180799, +STORE, 140321279180800, 140321346289663, +STORE, 140321144963072, 140321279180799, +SNULL, 140321279315967, 140321346289663, +STORE, 140321279180800, 140321279315967, +STORE, 140321279315968, 140321346289663, +SNULL, 140321086214144, 140321136570367, +STORE, 140321136570368, 140321144963071, +STORE, 140321086214144, 140321136570367, +SNULL, 140321136574463, 140321144963071, +STORE, 140321136570368, 140321136574463, +STORE, 140321136574464, 140321144963071, +SNULL, 140321212071935, 140321279180799, +STORE, 140321144963072, 140321212071935, +STORE, 140321212071936, 140321279180799, +ERASE, 140321212071936, 140321279180799, +SNULL, 140321145098239, 140321212071935, +STORE, 140321144963072, 140321145098239, +STORE, 140321145098240, 140321212071935, +SNULL, 140320876662784, 140321010745343, +STORE, 140321010745344, 140321077854207, +STORE, 140320876662784, 140321010745343, +SNULL, 140321010880511, 140321077854207, +STORE, 140321010745344, 140321010880511, +STORE, 140321010880512, 140321077854207, +SNULL, 140321354616832, 140321379794943, +STORE, 140321379794944, 140321404973055, +STORE, 140321354616832, 140321379794943, +SNULL, 140321379799039, 140321404973055, +STORE, 140321379794944, 140321379799039, +STORE, 140321379799040, 140321404973055, +SNULL, 140320876662784, 140320943636479, +STORE, 140320943636480, 140321010745343, +STORE, 140320876662784, 140320943636479, +SNULL, 140320943771647, 140321010745343, +STORE, 140320943636480, 140320943771647, +STORE, 140320943771648, 140321010745343, +SNULL, 140320809418751, 140320876527615, +STORE, 140320675336192, 140320809418751, +STORE, 140320809418752, 140320876527615, +ERASE, 140320809418752, 140320876527615, +SNULL, 140320675336192, 140320742309887, +STORE, 140320742309888, 140320809418751, +STORE, 140320675336192, 140320742309887, +SNULL, 140320742445055, 140320809418751, +STORE, 140320742309888, 140320742445055, +STORE, 140320742445056, 
140320809418751, +SNULL, 140320608227327, 140320675201023, +STORE, 140320608092160, 140320608227327, +STORE, 140320608227328, 140320675201023, +SNULL, 140320457089024, 140320473874431, +STORE, 140320473874432, 140320591306751, +STORE, 140320457089024, 140320473874431, +ERASE, 140320457089024, 140320473874431, +SNULL, 140320540983295, 140320591306751, +STORE, 140320473874432, 140320540983295, +STORE, 140320540983296, 140320591306751, +ERASE, 140320540983296, 140320591306751, +SNULL, 140320314478592, 140320339656703, +STORE, 140320339656704, 140320448696319, +STORE, 140320314478592, 140320339656703, +ERASE, 140320314478592, 140320339656703, +SNULL, 140321086214144, 140321128177663, +STORE, 140321128177664, 140321136570367, +STORE, 140321086214144, 140321128177663, +SNULL, 140321128181759, 140321136570367, +STORE, 140321128177664, 140321128181759, +STORE, 140321128181760, 140321136570367, +SNULL, 140321354616832, 140321371402239, +STORE, 140321371402240, 140321379794943, +STORE, 140321354616832, 140321371402239, +SNULL, 140321371406335, 140321379794943, +STORE, 140321371402240, 140321371406335, +STORE, 140321371406336, 140321379794943, +SNULL, 140320591310847, 140320608092159, +STORE, 140320591306752, 140320591310847, +STORE, 140320591310848, 140320608092159, +SNULL, 140321354616832, 140321363009535, +STORE, 140321363009536, 140321371402239, +STORE, 140321354616832, 140321363009535, +SNULL, 140321363013631, 140321371402239, +STORE, 140321363009536, 140321363013631, +STORE, 140321363013632, 140321371402239, +SNULL, 140321086214144, 140321119784959, +STORE, 140321119784960, 140321128177663, +STORE, 140321086214144, 140321119784959, +SNULL, 140321119789055, 140321128177663, +STORE, 140321119784960, 140321119789055, +STORE, 140321119789056, 140321128177663, +SNULL, 140321086218239, 140321119784959, +STORE, 140321086214144, 140321086218239, +STORE, 140321086218240, 140321119784959, +SNULL, 140321086218240, 140321094606847, +STORE, 140321094606848, 140321119784959, +STORE, 
140321086218240, 140321094606847, +SNULL, 140321094610943, 140321119784959, +STORE, 140321094606848, 140321094610943, +STORE, 140321094610944, 140321119784959, +SNULL, 140320474009599, 140320540983295, +STORE, 140320473874432, 140320474009599, +STORE, 140320474009600, 140320540983295, +SNULL, 140320406765567, 140320448696319, +STORE, 140320339656704, 140320406765567, +STORE, 140320406765568, 140320448696319, +ERASE, 140320406765568, 140320448696319, +SNULL, 140320339791871, 140320406765567, +STORE, 140320339656704, 140320339791871, +STORE, 140320339791872, 140320406765567, +STORE, 140321270788096, 140321279180799, +STORE, 140321262395392, 140321279180799, +STORE, 140321254002688, 140321279180799, +SNULL, 140321254002688, 140321262395391, +STORE, 140321262395392, 140321279180799, +STORE, 140321254002688, 140321262395391, +SNULL, 140321262399487, 140321279180799, +STORE, 140321262395392, 140321262399487, +STORE, 140321262399488, 140321279180799, +STORE, 140321245609984, 140321262395391, +STORE, 140321237217280, 140321262395391, +SNULL, 140321237217280, 140321245609983, +STORE, 140321245609984, 140321262395391, +STORE, 140321237217280, 140321245609983, +SNULL, 140321245614079, 140321262395391, +STORE, 140321245609984, 140321245614079, +STORE, 140321245614080, 140321262395391, +SNULL, 140321379799040, 140321388187647, +STORE, 140321388187648, 140321404973055, +STORE, 140321379799040, 140321388187647, +SNULL, 140321388191743, 140321404973055, +STORE, 140321388187648, 140321388191743, +STORE, 140321388191744, 140321404973055, +SNULL, 140321354620927, 140321363009535, +STORE, 140321354616832, 140321354620927, +STORE, 140321354620928, 140321363009535, +SNULL, 140321388191744, 140321396580351, +STORE, 140321396580352, 140321404973055, +STORE, 140321388191744, 140321396580351, +SNULL, 140321396584447, 140321404973055, +STORE, 140321396580352, 140321396584447, +STORE, 140321396584448, 140321404973055, +SNULL, 140321094610944, 140321111392255, +STORE, 140321111392256, 
140321119784959, +STORE, 140321094610944, 140321111392255, +SNULL, 140321111396351, 140321119784959, +STORE, 140321111392256, 140321111396351, +STORE, 140321111396352, 140321119784959, +STORE, 140321228824576, 140321245609983, +SNULL, 140321094610944, 140321102999551, +STORE, 140321102999552, 140321111392255, +STORE, 140321094610944, 140321102999551, +SNULL, 140321103003647, 140321111392255, +STORE, 140321102999552, 140321103003647, +STORE, 140321103003648, 140321111392255, +STORE, 140321220431872, 140321245609983, +SNULL, 140321220435967, 140321245609983, +STORE, 140321220431872, 140321220435967, +STORE, 140321220435968, 140321245609983, +STORE, 140320868134912, 140320876527615, +SNULL, 140320868139007, 140320876527615, +STORE, 140320868134912, 140320868139007, +STORE, 140320868139008, 140320876527615, +SNULL, 140320591310848, 140320599699455, +STORE, 140320599699456, 140320608092159, +STORE, 140320591310848, 140320599699455, +SNULL, 140320599703551, 140320608092159, +STORE, 140320599699456, 140320599703551, +STORE, 140320599703552, 140320608092159, +STORE, 140320859742208, 140320868134911, +SNULL, 140321262399488, 140321270788095, +STORE, 140321270788096, 140321279180799, +STORE, 140321262399488, 140321270788095, +SNULL, 140321270792191, 140321279180799, +STORE, 140321270788096, 140321270792191, +STORE, 140321270792192, 140321279180799, +STORE, 140320851349504, 140320868134911, +STORE, 140320842956800, 140320868134911, +STORE, 140320834564096, 140320868134911, +STORE, 140320826171392, 140320868134911, +SNULL, 140320826171392, 140320834564095, +STORE, 140320834564096, 140320868134911, +STORE, 140320826171392, 140320834564095, +SNULL, 140320834568191, 140320868134911, +STORE, 140320834564096, 140320834568191, +STORE, 140320834568192, 140320868134911, +SNULL, 140321220435968, 140321228824575, +STORE, 140321228824576, 140321245609983, +STORE, 140321220435968, 140321228824575, +SNULL, 140321228828671, 140321245609983, +STORE, 140321228824576, 140321228828671, +STORE, 
140321228828672, 140321245609983, +STORE, 140320817778688, 140320834564095, +SNULL, 140320817782783, 140320834564095, +STORE, 140320817778688, 140320817782783, +STORE, 140320817782784, 140320834564095, +STORE, 140320582914048, 140320591306751, +SNULL, 140321228828672, 140321237217279, +STORE, 140321237217280, 140321245609983, +STORE, 140321228828672, 140321237217279, +SNULL, 140321237221375, 140321245609983, +STORE, 140321237217280, 140321237221375, +STORE, 140321237221376, 140321245609983, +SNULL, 140320448700415, 140320457089023, +STORE, 140320448696320, 140320448700415, +STORE, 140320448700416, 140320457089023, +SNULL, 140321245614080, 140321254002687, +STORE, 140321254002688, 140321262395391, +STORE, 140321245614080, 140321254002687, +SNULL, 140321254006783, 140321262395391, +STORE, 140321254002688, 140321254006783, +STORE, 140321254006784, 140321262395391, +STORE, 140320574521344, 140320591306751, +SNULL, 140320574525439, 140320591306751, +STORE, 140320574521344, 140320574525439, +STORE, 140320574525440, 140320591306751, +STORE, 140320566128640, 140320574521343, +SNULL, 140320566132735, 140320574521343, +STORE, 140320566128640, 140320566132735, +STORE, 140320566132736, 140320574521343, +SNULL, 140320574525440, 140320582914047, +STORE, 140320582914048, 140320591306751, +STORE, 140320574525440, 140320582914047, +SNULL, 140320582918143, 140320591306751, +STORE, 140320582914048, 140320582918143, +STORE, 140320582918144, 140320591306751, +STORE, 140320557735936, 140320566128639, +SNULL, 140320557740031, 140320566128639, +STORE, 140320557735936, 140320557740031, +STORE, 140320557740032, 140320566128639, +STORE, 140320549343232, 140320557735935, +STORE, 140320465481728, 140320473874431, +STORE, 140320448700416, 140320473874431, +SNULL, 140320834568192, 140320859742207, +STORE, 140320859742208, 140320868134911, +STORE, 140320834568192, 140320859742207, +SNULL, 140320859746303, 140320868134911, +STORE, 140320859742208, 140320859746303, +STORE, 140320859746304, 
140320868134911, +STORE, 140320440303616, 140320448696319, +STORE, 140320431910912, 140320448696319, +SNULL, 140320834568192, 140320851349503, +STORE, 140320851349504, 140320859742207, +STORE, 140320834568192, 140320851349503, +SNULL, 140320851353599, 140320859742207, +STORE, 140320851349504, 140320851353599, +STORE, 140320851353600, 140320859742207, +SNULL, 140320817782784, 140320826171391, +STORE, 140320826171392, 140320834564095, +STORE, 140320817782784, 140320826171391, +SNULL, 140320826175487, 140320834564095, +STORE, 140320826171392, 140320826175487, +STORE, 140320826175488, 140320834564095, +SNULL, 140320834568192, 140320842956799, +STORE, 140320842956800, 140320851349503, +STORE, 140320834568192, 140320842956799, +SNULL, 140320842960895, 140320851349503, +STORE, 140320842956800, 140320842960895, +STORE, 140320842960896, 140320851349503, +STORE, 140320423518208, 140320448696319, +SNULL, 140320423522303, 140320448696319, +STORE, 140320423518208, 140320423522303, +STORE, 140320423522304, 140320448696319, +STORE, 140320415125504, 140320423518207, +STORE, 140320331264000, 140320339656703, +STORE, 140320322871296, 140320339656703, +STORE, 140320314478592, 140320339656703, +SNULL, 140320314482687, 140320339656703, +STORE, 140320314478592, 140320314482687, +STORE, 140320314482688, 140320339656703, +STORE, 140320306085888, 140320314478591, +SNULL, 140320306089983, 140320314478591, +STORE, 140320306085888, 140320306089983, +STORE, 140320306089984, 140320314478591, +STORE, 140320297693184, 140320306085887, +SNULL, 140320297697279, 140320306085887, +STORE, 140320297693184, 140320297697279, +STORE, 140320297697280, 140320306085887, +STORE, 140320289300480, 140320297693183, +STORE, 140320280907776, 140320297693183, +SNULL, 140320280911871, 140320297693183, +STORE, 140320280907776, 140320280911871, +STORE, 140320280911872, 140320297693183, +SNULL, 140320423522304, 140320431910911, +STORE, 140320431910912, 140320448696319, +STORE, 140320423522304, 140320431910911, +SNULL, 
140320431915007, 140320448696319, +STORE, 140320431910912, 140320431915007, +STORE, 140320431915008, 140320448696319, +SNULL, 140320549347327, 140320557735935, +STORE, 140320549343232, 140320549347327, +STORE, 140320549347328, 140320557735935, +STORE, 140320272515072, 140320280907775, +SNULL, 140320448700416, 140320457089023, +STORE, 140320457089024, 140320473874431, +STORE, 140320448700416, 140320457089023, +SNULL, 140320457093119, 140320473874431, +STORE, 140320457089024, 140320457093119, +STORE, 140320457093120, 140320473874431, +STORE, 140320264122368, 140320280907775, +SNULL, 140320457093120, 140320465481727, +STORE, 140320465481728, 140320473874431, +STORE, 140320457093120, 140320465481727, +SNULL, 140320465485823, 140320473874431, +STORE, 140320465481728, 140320465485823, +STORE, 140320465485824, 140320473874431, +SNULL, 140320431915008, 140320440303615, +STORE, 140320440303616, 140320448696319, +STORE, 140320431915008, 140320440303615, +SNULL, 140320440307711, 140320448696319, +STORE, 140320440303616, 140320440307711, +STORE, 140320440307712, 140320448696319, +STORE, 140320255729664, 140320280907775, +STORE, 140320247336960, 140320280907775, +SNULL, 140320247341055, 140320280907775, +STORE, 140320247336960, 140320247341055, +STORE, 140320247341056, 140320280907775, +STORE, 140320238944256, 140320247336959, +STORE, 140320230551552, 140320247336959, +SNULL, 140320230551552, 140320238944255, +STORE, 140320238944256, 140320247336959, +STORE, 140320230551552, 140320238944255, +SNULL, 140320238948351, 140320247336959, +STORE, 140320238944256, 140320238948351, +STORE, 140320238948352, 140320247336959, +SNULL, 140320314482688, 140320331263999, +STORE, 140320331264000, 140320339656703, +STORE, 140320314482688, 140320331263999, +SNULL, 140320331268095, 140320339656703, +STORE, 140320331264000, 140320331268095, +STORE, 140320331268096, 140320339656703, +SNULL, 140320280911872, 140320289300479, +STORE, 140320289300480, 140320297693183, +STORE, 140320280911872, 
140320289300479, +SNULL, 140320289304575, 140320297693183, +STORE, 140320289300480, 140320289304575, +STORE, 140320289304576, 140320297693183, +SNULL, 140320415129599, 140320423518207, +STORE, 140320415125504, 140320415129599, +STORE, 140320415129600, 140320423518207, +STORE, 140320222158848, 140320238944255, +STORE, 140320213766144, 140320238944255, +STORE, 140320205373440, 140320238944255, +SNULL, 140320205377535, 140320238944255, +STORE, 140320205373440, 140320205377535, +STORE, 140320205377536, 140320238944255, +SNULL, 140320314482688, 140320322871295, +STORE, 140320322871296, 140320331263999, +STORE, 140320314482688, 140320322871295, +SNULL, 140320322875391, 140320331263999, +STORE, 140320322871296, 140320322875391, +STORE, 140320322875392, 140320331263999, +SNULL, 140320247341056, 140320272515071, +STORE, 140320272515072, 140320280907775, +STORE, 140320247341056, 140320272515071, +SNULL, 140320272519167, 140320280907775, +STORE, 140320272515072, 140320272519167, +STORE, 140320272519168, 140320280907775, +SNULL, 140320247341056, 140320264122367, +STORE, 140320264122368, 140320272515071, +STORE, 140320247341056, 140320264122367, +SNULL, 140320264126463, 140320272515071, +STORE, 140320264122368, 140320264126463, +STORE, 140320264126464, 140320272515071, +SNULL, 140320205377536, 140320230551551, +STORE, 140320230551552, 140320238944255, +STORE, 140320205377536, 140320230551551, +SNULL, 140320230555647, 140320238944255, +STORE, 140320230551552, 140320230555647, +STORE, 140320230555648, 140320238944255, +STORE, 140320196980736, 140320205373439, +SNULL, 140320196984831, 140320205373439, +STORE, 140320196980736, 140320196984831, +STORE, 140320196984832, 140320205373439, +STORE, 140320188588032, 140320196980735, +SNULL, 140320247341056, 140320255729663, +STORE, 140320255729664, 140320264122367, +STORE, 140320247341056, 140320255729663, +SNULL, 140320255733759, 140320264122367, +STORE, 140320255729664, 140320255733759, +STORE, 140320255733760, 140320264122367, +STORE, 
140320180195328, 140320196980735, +SNULL, 140320180199423, 140320196980735, +STORE, 140320180195328, 140320180199423, +STORE, 140320180199424, 140320196980735, +STORE, 140320171802624, 140320180195327, +STORE, 140320163409920, 140320180195327, +SNULL, 140320163414015, 140320180195327, +STORE, 140320163409920, 140320163414015, +STORE, 140320163414016, 140320180195327, +SNULL, 140320205377536, 140320222158847, +STORE, 140320222158848, 140320230551551, +STORE, 140320205377536, 140320222158847, +SNULL, 140320222162943, 140320230551551, +STORE, 140320222158848, 140320222162943, +STORE, 140320222162944, 140320230551551, +SNULL, 140320205377536, 140320213766143, +STORE, 140320213766144, 140320222158847, +STORE, 140320205377536, 140320213766143, +SNULL, 140320213770239, 140320222158847, +STORE, 140320213766144, 140320213770239, +STORE, 140320213770240, 140320222158847, +STORE, 140320155017216, 140320163409919, +SNULL, 140320180199424, 140320188588031, +STORE, 140320188588032, 140320196980735, +STORE, 140320180199424, 140320188588031, +SNULL, 140320188592127, 140320196980735, +STORE, 140320188588032, 140320188592127, +STORE, 140320188592128, 140320196980735, +SNULL, 140320155021311, 140320163409919, +STORE, 140320155017216, 140320155021311, +STORE, 140320155021312, 140320163409919, +SNULL, 140320163414016, 140320171802623, +STORE, 140320171802624, 140320180195327, +STORE, 140320163414016, 140320171802623, +SNULL, 140320171806719, 140320180195327, +STORE, 140320171802624, 140320171806719, +STORE, 140320171806720, 140320180195327, +STORE, 140320146624512, 140320155017215, +SNULL, 140320146628607, 140320155017215, +STORE, 140320146624512, 140320146628607, +STORE, 140320146628608, 140320155017215, +STORE, 140321937321984, 140321937350655, +STORE, 140321884942336, 140321887133695, +SNULL, 140321884942336, 140321885032447, +STORE, 140321885032448, 140321887133695, +STORE, 140321884942336, 140321885032447, +SNULL, 140321887125503, 140321887133695, +STORE, 140321885032448, 
140321887125503, +STORE, 140321887125504, 140321887133695, +ERASE, 140321887125504, 140321887133695, +STORE, 140321887125504, 140321887133695, +SNULL, 140321887129599, 140321887133695, +STORE, 140321887125504, 140321887129599, +STORE, 140321887129600, 140321887133695, +ERASE, 140321937321984, 140321937350655, +ERASE, 140321086214144, 140321086218239, +ERASE, 140321086218240, 140321094606847, +ERASE, 140321119784960, 140321119789055, +ERASE, 140321119789056, 140321128177663, +ERASE, 140321245609984, 140321245614079, +ERASE, 140321245614080, 140321254002687, +ERASE, 140320574521344, 140320574525439, +ERASE, 140320574525440, 140320582914047, +ERASE, 140320297693184, 140320297697279, +ERASE, 140320297697280, 140320306085887, +ERASE, 140321354616832, 140321354620927, +ERASE, 140321354620928, 140321363009535, +ERASE, 140320834564096, 140320834568191, +ERASE, 140320834568192, 140320842956799, +ERASE, 140320591306752, 140320591310847, +ERASE, 140320591310848, 140320599699455, +ERASE, 140321136570368, 140321136574463, +ERASE, 140321136574464, 140321144963071, +ERASE, 140321237217280, 140321237221375, +ERASE, 140321237221376, 140321245609983, +ERASE, 140321363009536, 140321363013631, +ERASE, 140321363013632, 140321371402239, +ERASE, 140320599699456, 140320599703551, +ERASE, 140320599703552, 140320608092159, +ERASE, 140321396580352, 140321396584447, +ERASE, 140321396584448, 140321404973055, +ERASE, 140320566128640, 140320566132735, +ERASE, 140320566132736, 140320574521343, +ERASE, 140321094606848, 140321094610943, +ERASE, 140321094610944, 140321102999551, +ERASE, 140320582914048, 140320582918143, +ERASE, 140320582918144, 140320591306751, +ERASE, 140320289300480, 140320289304575, +ERASE, 140320289304576, 140320297693183, +ERASE, 140320163409920, 140320163414015, + }; + unsigned long set41[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140728157171712, 140737488351231, +SNULL, 140728157175807, 140737488351231, +STORE, 140728157171712, 140728157175807, +STORE, 
140728157040640, 140728157175807, +STORE, 94376106364928, 94376108613631, +SNULL, 94376106487807, 94376108613631, +STORE, 94376106364928, 94376106487807, +STORE, 94376106487808, 94376108613631, +SNULL, 94376106487808, 94376108613631, +STORE, 94376108584960, 94376108593151, +STORE, 94376108593152, 94376108613631, +STORE, 140113496432640, 140113498685439, +SNULL, 140113496575999, 140113498685439, +STORE, 140113496432640, 140113496575999, +STORE, 140113496576000, 140113498685439, +SNULL, 140113496576000, 140113498685439, +STORE, 140113498673152, 140113498681343, +STORE, 140113498681344, 140113498685439, +STORE, 140728157609984, 140728157618175, +STORE, 140728157593600, 140728157609983, +STORE, 140113498636288, 140113498673151, +STORE, 140113498628096, 140113498636287, +STORE, 140113492635648, 140113496432639, +SNULL, 140113492635648, 140113494294527, +STORE, 140113494294528, 140113496432639, +STORE, 140113492635648, 140113494294527, +SNULL, 140113496391679, 140113496432639, +STORE, 140113494294528, 140113496391679, +STORE, 140113496391680, 140113496432639, +SNULL, 140113496391680, 140113496416255, +STORE, 140113496416256, 140113496432639, +STORE, 140113496391680, 140113496416255, +SNULL, 140113496391680, 140113496416255, +STORE, 140113496391680, 140113496416255, +SNULL, 140113496416256, 140113496432639, +STORE, 140113496416256, 140113496432639, +SNULL, 140113496408063, 140113496416255, +STORE, 140113496391680, 140113496408063, +STORE, 140113496408064, 140113496416255, +SNULL, 94376108589055, 94376108593151, +STORE, 94376108584960, 94376108589055, +STORE, 94376108589056, 94376108593151, +SNULL, 140113498677247, 140113498681343, +STORE, 140113498673152, 140113498677247, +STORE, 140113498677248, 140113498681343, +SNULL, 140113498636288, 140113498673151, +STORE, 94376135090176, 94376135094271, +STORE, 94376135090176, 94376135098367, +STORE, 94376139288576, 94376139292671, +STORE, 94376143482880, 94376143486975, +STORE, 94376147677184, 94376147681279, +STORE, 
94376151871488, 94376151875583, +STORE, 94376156065792, 94376156069887, +STORE, 94376160260096, 94376160264191, +STORE, 94376164454400, 94376164458495, +STORE, 94376168648704, 94376168652799, +STORE, 94376172843008, 94376172847103, +STORE, 94376177037312, 94376177041407, +STORE, 94376181231616, 94376181235711, +STORE, 94376185425920, 94376185430015, +STORE, 94376189620224, 94376189624319, +STORE, 94376193814528, 94376193818623, +STORE, 94376198008832, 94376198012927, +STORE, 94376202203136, 94376202207231, +STORE, 94376206397440, 94376206401535, +STORE, 94376210591744, 94376210595839, +STORE, 94376214786048, 94376214790143, +STORE, 94376218980352, 94376218984447, +STORE, 94376223174656, 94376223178751, +STORE, 94376227368960, 94376227373055, +STORE, 94376231563264, 94376231567359, +STORE, 94376235757568, 94376235761663, +STORE, 94376239951872, 94376239955967, +STORE, 94376244146176, 94376244150271, +STORE, 94376248340480, 94376248344575, +STORE, 94376252534784, 94376252538879, +STORE, 94376256729088, 94376256733183, +STORE, 94376260923392, 94376260927487, +STORE, 94376265117696, 94376265121791, +STORE, 94376269312000, 94376269316095, +STORE, 94376273506304, 94376273510399, +STORE, 94376277700608, 94376277704703, +STORE, 94376281894912, 94376281899007, +STORE, 94376286089216, 94376286093311, +STORE, 94376290283520, 94376290287615, +STORE, 94376294477824, 94376294481919, +STORE, 94376298672128, 94376298676223, +STORE, 94376302866432, 94376302870527, +STORE, 94376307060736, 94376307064831, +STORE, 94376311255040, 94376311259135, +STORE, 94376315449344, 94376315453439, +STORE, 94376319643648, 94376319647743, +STORE, 94376323837952, 94376323842047, +STORE, 94376328032256, 94376328036351, +STORE, 94376332226560, 94376332230655, +STORE, 94376336420864, 94376336424959, +STORE, 94376340615168, 94376340619263, +STORE, 94376344809472, 94376344813567, +STORE, 94376349003776, 94376349007871, +STORE, 94376353198080, 94376353202175, +STORE, 94376357392384, 94376357396479, +STORE, 
94376361586688, 94376361590783, +STORE, 94376365780992, 94376365785087, +STORE, 94376369975296, 94376369979391, +STORE, 94376374169600, 94376374173695, +STORE, 94376378363904, 94376378367999, +STORE, 94376382558208, 94376382562303, +STORE, 94376386752512, 94376386756607, +STORE, 94376390946816, 94376390950911, +STORE, 94376395141120, 94376395145215, +STORE, 94376399335424, 94376399339519, +STORE, 94376403529728, 94376403533823, +STORE, 94376407724032, 94376407728127, +STORE, 94376411918336, 94376411922431, +STORE, 94376416112640, 94376416116735, +STORE, 94376420306944, 94376420311039, +STORE, 94376424501248, 94376424505343, +STORE, 94376428695552, 94376428699647, +STORE, 94376432889856, 94376432893951, +STORE, 94376437084160, 94376437088255, +STORE, 94376441278464, 94376441282559, +STORE, 94376445472768, 94376445476863, +STORE, 94376449667072, 94376449671167, +STORE, 94376453861376, 94376453865471, +STORE, 94376458055680, 94376458059775, +STORE, 94376462249984, 94376462254079, +STORE, 94376466444288, 94376466448383, +STORE, 94376470638592, 94376470642687, +STORE, 94376474832896, 94376474836991, +STORE, 94376479027200, 94376479031295, +STORE, 94376483221504, 94376483225599, +STORE, 94376487415808, 94376487419903, +STORE, 94376491610112, 94376491614207, +STORE, 94376495804416, 94376495808511, +STORE, 94376499998720, 94376500002815, +STORE, 94376504193024, 94376504197119, +STORE, 94376508387328, 94376508391423, +STORE, 94376512581632, 94376512585727, +STORE, 94376516775936, 94376516780031, +STORE, 94376520970240, 94376520974335, +STORE, 94376525164544, 94376525168639, +STORE, 94376529358848, 94376529362943, +STORE, 94376533553152, 94376533557247, +STORE, 94376537747456, 94376537751551, +STORE, 94376541941760, 94376541945855, +STORE, 94376546136064, 94376546140159, +STORE, 94376550330368, 94376550334463, +STORE, 94376554524672, 94376554528767, +STORE, 94376558718976, 94376558723071, +STORE, 94376562913280, 94376562917375, +STORE, 94376567107584, 94376567111679, +STORE, 
94376571301888, 94376571305983, +STORE, 94376575496192, 94376575500287, +STORE, 94376579690496, 94376579694591, +STORE, 94376583884800, 94376583888895, +STORE, 94376588079104, 94376588083199, +STORE, 94376592273408, 94376592277503, +STORE, 94376596467712, 94376596471807, +STORE, 94376600662016, 94376600666111, +STORE, 94376604856320, 94376604860415, +STORE, 94376609050624, 94376609054719, +STORE, 94376613244928, 94376613249023, +STORE, 94376617439232, 94376617443327, +STORE, 94376621633536, 94376621637631, +STORE, 94376625827840, 94376625831935, +STORE, 94376630022144, 94376630026239, +STORE, 94376634216448, 94376634220543, +STORE, 94376638410752, 94376638414847, +STORE, 94376642605056, 94376642609151, +STORE, 94376646799360, 94376646803455, +STORE, 94376650993664, 94376650997759, +STORE, 94376655187968, 94376655192063, +STORE, 94376659382272, 94376659386367, +STORE, 94376663576576, 94376663580671, +STORE, 94376667770880, 94376667774975, +STORE, 94376671965184, 94376671969279, +STORE, 94376676159488, 94376676163583, +STORE, 94376680353792, 94376680357887, +STORE, 94376684548096, 94376684552191, +STORE, 94376688742400, 94376688746495, +STORE, 94376692936704, 94376692940799, +STORE, 94376697131008, 94376697135103, +STORE, 94376701325312, 94376701329407, +STORE, 94376705519616, 94376705523711, +STORE, 94376709713920, 94376709718015, +STORE, 94376713908224, 94376713912319, +STORE, 94376718102528, 94376718106623, +STORE, 94376722296832, 94376722300927, +STORE, 94376726491136, 94376726495231, +STORE, 94376730685440, 94376730689535, +STORE, 94376734879744, 94376734883839, +STORE, 94376739074048, 94376739078143, +STORE, 94376743268352, 94376743272447, +STORE, 94376747462656, 94376747466751, +STORE, 94376751656960, 94376751661055, +STORE, 94376755851264, 94376755855359, +STORE, 94376760045568, 94376760049663, +STORE, 94376764239872, 94376764243967, +STORE, 94376768434176, 94376768438271, +STORE, 94376772628480, 94376772632575, +STORE, 94376776822784, 94376776826879, +STORE, 
94376781017088, 94376781021183, +STORE, 94376785211392, 94376785215487, +STORE, 94376789405696, 94376789409791, +STORE, 94376793600000, 94376793604095, +STORE, 94376797794304, 94376797798399, +STORE, 94376801988608, 94376801992703, +STORE, 94376806182912, 94376806187007, +STORE, 94376810377216, 94376810381311, +STORE, 94376814571520, 94376814575615, +STORE, 94376818765824, 94376818769919, +STORE, 94376822960128, 94376822964223, +STORE, 94376827154432, 94376827158527, +STORE, 94376831348736, 94376831352831, +STORE, 94376835543040, 94376835547135, +STORE, 94376839737344, 94376839741439, +STORE, 94376843931648, 94376843935743, +STORE, 94376848125952, 94376848130047, +STORE, 94376852320256, 94376852324351, +STORE, 94376856514560, 94376856518655, +STORE, 94376860708864, 94376860712959, +STORE, 94376864903168, 94376864907263, +STORE, 94376869097472, 94376869101567, +STORE, 94376873291776, 94376873295871, +STORE, 94376877486080, 94376877490175, +STORE, 94376881680384, 94376881684479, +STORE, 94376885874688, 94376885878783, +STORE, 94376890068992, 94376890073087, +STORE, 94376894263296, 94376894267391, +STORE, 94376898457600, 94376898461695, +STORE, 94376902651904, 94376902655999, +STORE, 94376906846208, 94376906850303, +STORE, 94376911040512, 94376911044607, +STORE, 94376915234816, 94376915238911, +STORE, 94376919429120, 94376919433215, +STORE, 94376923623424, 94376923627519, +STORE, 94376927817728, 94376927821823, +STORE, 94376932012032, 94376932016127, +STORE, 94376936206336, 94376936210431, +STORE, 94376940400640, 94376940404735, +STORE, 94376944594944, 94376944599039, +STORE, 94376948789248, 94376948793343, +STORE, 94376952983552, 94376952987647, +STORE, 94376957177856, 94376957181951, +STORE, 94376961372160, 94376961376255, +STORE, 94376965566464, 94376965570559, +STORE, 94376969760768, 94376969764863, +STORE, 94376973955072, 94376973959167, +STORE, 94376978149376, 94376978153471, +STORE, 94376982343680, 94376982347775, +STORE, 94376986537984, 94376986542079, +STORE, 
94376990732288, 94376990736383, +STORE, 94376994926592, 94376994930687, +STORE, 94376999120896, 94376999124991, +STORE, 94377003315200, 94377003319295, +STORE, 94377007509504, 94377007513599, +STORE, 94377011703808, 94377011707903, +STORE, 94377015898112, 94377015902207, +STORE, 94377020092416, 94377020096511, +STORE, 94377024286720, 94377024290815, +STORE, 94377028481024, 94377028485119, +STORE, 94377032675328, 94377032679423, +STORE, 94377036869632, 94377036873727, +STORE, 94377041063936, 94377041068031, +STORE, 94377045258240, 94377045262335, +STORE, 94377049452544, 94377049456639, +STORE, 94377053646848, 94377053650943, +STORE, 94377057841152, 94377057845247, +STORE, 94377062035456, 94377062039551, +STORE, 94377066229760, 94377066233855, +STORE, 94377070424064, 94377070428159, +STORE, 94377074618368, 94377074622463, +STORE, 94377078812672, 94377078816767, +STORE, 94377083006976, 94377083011071, +STORE, 94377087201280, 94377087205375, +STORE, 94377091395584, 94377091399679, +STORE, 94377095589888, 94377095593983, +STORE, 94377099784192, 94377099788287, +STORE, 94377103978496, 94377103982591, +STORE, 94377108172800, 94377108176895, +STORE, 94377112367104, 94377112371199, +STORE, 94377116561408, 94377116565503, +STORE, 94377120755712, 94377120759807, +STORE, 94377124950016, 94377124954111, +STORE, 94377129144320, 94377129148415, +STORE, 94377133338624, 94377133342719, +STORE, 94377137532928, 94377137537023, +STORE, 94377141727232, 94377141731327, +STORE, 94377145921536, 94377145925631, +STORE, 94377150115840, 94377150119935, +STORE, 94377154310144, 94377154314239, +STORE, 94377158504448, 94377158508543, +STORE, 94377162698752, 94377162702847, +STORE, 94377166893056, 94377166897151, +STORE, 94377171087360, 94377171091455, +STORE, 94377175281664, 94377175285759, +STORE, 94377179475968, 94377179480063, +STORE, 94377183670272, 94377183674367, +STORE, 94377187864576, 94377187868671, +STORE, 94377192058880, 94377192062975, +STORE, 94377196253184, 94377196257279, +STORE, 
94377200447488, 94377200451583, +STORE, 94377204641792, 94377204645887, +SNULL, 94376135094271, 94376135098367, +STORE, 94376135090176, 94376135094271, +STORE, 94376135094272, 94376135098367, +SNULL, 94376135094272, 94377208836095, + }; + unsigned long set42[] = { +STORE, 314572800, 1388314623, +STORE, 1462157312, 1462169599, +STORE, 1462169600, 1462185983, +STORE, 1462185984, 1462190079, +STORE, 1462190080, 1462194175, +STORE, 1462194176, 1462198271, +STORE, 1879986176, 1881800703, +STORE, 1881800704, 1882034175, +STORE, 1882034176, 1882193919, +STORE, 1882193920, 1882406911, +STORE, 1882406912, 1882451967, +STORE, 1882451968, 1882996735, +STORE, 1882996736, 1885892607, +STORE, 1885892608, 1885896703, +STORE, 1885896704, 1885904895, +STORE, 1885904896, 1885908991, +STORE, 1885908992, 1885913087, +STORE, 1885913088, 1885966335, +STORE, 1885966336, 1886232575, +STORE, 1886232576, 1886236671, +STORE, 1886236672, 1886240767, +STORE, 1886240768, 1886244863, +STORE, 1886244864, 1886248959, +STORE, 1886248960, 1886294015, +STORE, 1886294016, 1886494719, +STORE, 1886494720, 1886498815, +STORE, 1886498816, 1886502911, +STORE, 1886502912, 1886507007, +STORE, 1886507008, 1886511103, +STORE, 1886511104, 1886556159, +STORE, 1886556160, 1886629887, +STORE, 1886629888, 1886633983, +STORE, 1886633984, 1886638079, +STORE, 1886638080, 1886642175, +STORE, 1886642176, 1886646271, +STORE, 1886646272, 1886666751, +STORE, 1886666752, 1886670847, +STORE, 1886670848, 1886674943, +STORE, 1886674944, 1886679039, +STORE, 1886679040, 1895419903, +STORE, 1895419904, 1895550975, +STORE, 1895550976, 1896148991, +STORE, 1896148992, 1897189375, +STORE, 1897189376, 1897701375, +STORE, 1897701376, 1897803775, +STORE, 1897803776, 1897816063, +STORE, 1897816064, 1899913215, +STORE, 1899913216, 1909379071, +STORE, 1909379072, 1909387263, +STORE, 1909387264, 1909391359, +STORE, 1909391360, 1909432319, +STORE, 1909432320, 1909436415, +STORE, 1909436416, 1909440511, +STORE, 1909440512, 1909460991, +STORE, 
1909460992, 1909547007, +STORE, 1909547008, 1909551103, +STORE, 1909551104, 1909555199, +STORE, 1909555200, 1909559295, +STORE, 1909559296, 1909563391, +STORE, 1909563392, 1909739519, +STORE, 1909739520, 1910566911, +STORE, 1910566912, 1910571007, +STORE, 1910571008, 1910575103, +STORE, 1910575104, 1910579199, +STORE, 1910579200, 1910583295, +STORE, 1910583296, 1910587391, +STORE, 1910587392, 1910620159, +STORE, 1910620160, 1910624255, +STORE, 1910624256, 1910628351, +STORE, 1910628352, 1910632447, +STORE, 1910632448, 1910652927, +STORE, 1910652928, 1910657023, +STORE, 1910657024, 1910661119, +STORE, 1910661120, 1910665215, +STORE, 1910665216, 1910669311, +STORE, 1910669312, 1910677503, +STORE, 1910677504, 1910681599, +STORE, 1910681600, 1910685695, +STORE, 1910685696, 1910689791, +STORE, 1910689792, 1910697983, +STORE, 1910697984, 1910702079, +STORE, 1910702080, 1910706175, +STORE, 1910706176, 1910710271, +STORE, 1910710272, 1914093567, +STORE, 1914093568, 1914097663, +STORE, 1914097664, 1969434623, +STORE, 1969434624, 1977819135, +STORE, 3290435584, 3426750463, +STORE, 3426750464, 3426754559, +STORE, 3426754560, 3426762751, +STORE, 3426762752, 3426766847, +STORE, 3426766848, 3426770943, +STORE, 3427037184, 3427061759, +STORE, 3427061760, 3427135487, +STORE, 3427135488, 3427143679, +STORE, 3427143680, 3427147775, +STORE, 3427147776, 3427209215, +STORE, 3427319808, 3432116223, +STORE, 3432116224, 3450130431, +STORE, 3450130432, 3451027455, +STORE, 3451027456, 3451031551, +STORE, 3451031552, 3451461631, +STORE, 3451736064, 3456688127, +STORE, 3456688128, 3475222527, +STORE, 3475222528, 3476119551, +STORE, 3476119552, 3476127743, +STORE, 3476127744, 3476553727, +STORE, 3476631552, 3477315583, +STORE, 3477315584, 3479949311, +STORE, 3479949312, 3480002559, +STORE, 3480002560, 3480006655, +STORE, 3480006656, 3480432639, +STORE, 3480539136, 3480543231, +STORE, 3480543232, 3480547327, +STORE, 3480547328, 3480555519, +STORE, 3480854528, 3480903679, +STORE, 3480903680, 
3480969215, +STORE, 3480969216, 3480977407, +STORE, 3480977408, 3480981503, +STORE, 3481030656, 3481092095, +STORE, 3481092096, 3481235455, +STORE, 3481235456, 3481243647, +STORE, 3481243648, 3481247743, +STORE, 3481436160, 3481444351, +STORE, 3481444352, 3481456639, +STORE, 3481456640, 3481460735, +STORE, 3481460736, 3481464831, +STORE, 3481587712, 3481645055, +STORE, 3481645056, 3481772031, +STORE, 3481772032, 3481776127, +STORE, 3481776128, 3481780223, +STORE, 3481874432, 3481935871, +STORE, 3481935872, 3482030079, +STORE, 3482030080, 3482038271, +STORE, 3482038272, 3482042367, +STORE, 3482198016, 3482230783, +STORE, 3482230784, 3482271743, +STORE, 3482271744, 3482279935, +STORE, 3482279936, 3482284031, +STORE, 3482562560, 3482566655, +STORE, 3482566656, 3482570751, +STORE, 3482570752, 3482574847, +STORE, 3482636288, 3482689535, +STORE, 3482689536, 3482746879, +STORE, 3482746880, 3482755071, +STORE, 3482755072, 3482759167, +STORE, 3482972160, 3483062271, +STORE, 3483062272, 3483242495, +STORE, 3483242496, 3483246591, +STORE, 3483246592, 3483250687, +STORE, 3483398144, 3483688959, +STORE, 3483688960, 3484114943, +STORE, 3484114944, 3484131327, +STORE, 3484131328, 3484135423, +STORE, 3484135424, 3484143615, +STORE, 3484184576, 3484475391, +STORE, 3484475392, 3485028351, +STORE, 3485028352, 3485057023, +STORE, 3485057024, 3485061119, +STORE, 3485360128, 3485364223, +STORE, 3485364224, 3485368319, +STORE, 3485368320, 3485372415, +STORE, 3485589504, 3485593599, +STORE, 3485593600, 3485597695, +STORE, 3485597696, 3485601791, +STORE, 3485913088, 3485937663, +STORE, 3485937664, 3485974527, +STORE, 3485974528, 3485982719, +STORE, 3485982720, 3485986815, +STORE, 3486052352, 3486056447, +STORE, 3486056448, 3486064639, +STORE, 3486064640, 3486068735, +STORE, 3486068736, 3486072831, +STORE, 3486294016, 3486302207, +STORE, 3486302208, 3486306303, +STORE, 3486306304, 3486310399, +STORE, 3486310400, 3486314495, +STORE, 3486670848, 3486679039, +STORE, 3486679040, 3486683135, 
+STORE, 3486683136, 3486687231, +STORE, 3486687232, 3486691327, +STORE, 3486863360, 3486871551, +STORE, 3486871552, 3486875647, +STORE, 3486875648, 3486879743, +STORE, 3486879744, 3486883839, +STORE, 3487584256, 3522543615, +STORE, 3522543616, 3523321855, +STORE, 3523321856, 3523342335, +STORE, 3523342336, 3523387391, +STORE, 3523387392, 3523391487, +STORE, 3523391488, 3523395583, +STORE, 3523477504, 3523686399, +STORE, 3523686400, 3523981311, +STORE, 3523981312, 3523997695, +STORE, 3523997696, 3524001791, +STORE, 3524177920, 3525013503, +STORE, 3525013504, 3526582271, +STORE, 3526582272, 3526606847, +STORE, 3526606848, 3526610943, +STORE, 3526610944, 3526615039, +STORE, 3526672384, 3526746111, +STORE, 3526746112, 3526860799, +STORE, 3526860800, 3526868991, +STORE, 3526868992, 3526873087, +STORE, 3527000064, 3527475199, +STORE, 3527475200, 3527479295, +STORE, 3527479296, 3527573503, +STORE, 3527573504, 3527581695, +STORE, 3527581696, 3527585791, +STORE, 3527585792, 3527606271, +STORE, 3527909376, 3527913471, +STORE, 3527913472, 3527917567, +STORE, 3527917568, 3527921663, +STORE, 3527950336, 3528011775, +STORE, 3528011776, 3528093695, +STORE, 3528093696, 3528101887, +STORE, 3528101888, 3528105983, +STORE, 3528228864, 3528241151, +STORE, 3528241152, 3528261631, +STORE, 3528261632, 3528265727, +STORE, 3528273920, 3528593407, +STORE, 3528593408, 3528609791, +STORE, 3528609792, 3528638463, +STORE, 3528638464, 3528642559, +STORE, 3528642560, 3528646655, +STORE, 3528880128, 3528912895, +STORE, 3528912896, 3528962047, +STORE, 3528962048, 3528966143, +STORE, 3528966144, 3528970239, +STORE, 3528982528, 3530293247, +STORE, 3530366976, 3530825727, +STORE, 3530825728, 3531317247, +STORE, 3531317248, 3541041151, +STORE, 3541041152, 3541303295, +STORE, 3541430272, 3566206975, +STORE, 3566206976, 3566993407, +STORE, 3567239168, 3587571711, +STORE, 3587571712, 3588284415, +STORE, 3588284416, 3588661247, +STORE, 3588661248, 3589066751, +STORE, 3589066752, 3589574655, +STORE, 
3589574656, 3590078463, +STORE, 3590078464, 3590373375, +STORE, 3590373376, 3590668287, +STORE, 3590668288, 3590963199, +STORE, 3590963200, 3591294975, +STORE, 3591294976, 3591602175, +STORE, 3591602176, 3591933951, +STORE, 3591933952, 3592241151, +STORE, 3592241152, 3592572927, +STORE, 3592572928, 3592876031, +STORE, 3592876032, 3593211903, +STORE, 3593211904, 3593547775, +STORE, 3593547776, 3593650175, +STORE, 3593650176, 3593928703, +STORE, 3593928704, 3593936895, +STORE, 3593936896, 3593940991, +STORE, 3594006528, 3594301439, +STORE, 3594301440, 3594739711, +STORE, 3594739712, 3594756095, +STORE, 3594756096, 3594760191, +STORE, 3594760192, 3594768383, +STORE, 3594952704, 3595051007, +STORE, 3595051008, 3595223039, +STORE, 3595223040, 3595227135, +STORE, 3595227136, 3595235327, +STORE, 3595431936, 3595775999, +STORE, 3595776000, 3596701695, +STORE, 3596701696, 3596742655, +STORE, 3596742656, 3596746751, +STORE, 3596746752, 3596750847, +STORE, 3596767232, 3597070335, +STORE, 3597070336, 3597402111, +STORE, 3597402112, 3598188543, +STORE, 3598262272, 3623428095, +STORE, 3623428096, 3623432191, +STORE, 3623432192, 3623436287, +STORE, 3623436288, 3623440383, +STORE, 3623616512, 3623878655, +STORE, 3624169472, 3624300543, +STORE, 3627524096, 3628523519, +STORE, 3628523520, 3629522943, +STORE, 3696631808, 3730186239, +STORE, 3730186240, 3763740671, +STORE, 3763740672, 3764027391, +STORE, 3764027392, 3765133311, +STORE, 3765133312, 3765145599, +STORE, 3765145600, 3765149695, +STORE, 3765178368, 3766022143, +STORE, 3766022144, 3768791039, +STORE, 3768791040, 3768840191, +STORE, 3768840192, 3768844287, +STORE, 3768897536, 3768913919, +STORE, 3768913920, 3768934399, +STORE, 3768934400, 3768938495, +STORE, 3769016320, 3769147391, +STORE, 3769147392, 3769233407, +STORE, 3769233408, 3769356287, +STORE, 3769356288, 3769360383, +STORE, 3769360384, 3769368575, +STORE, 3769376768, 3794542591, +STORE, 3794542592, 3794599935, +STORE, 3794599936, 3794731007, +STORE, 3794731008, 
3794735103, +STORE, 3794735104, 3794743295, +STORE, 3794849792, 3794980863, +STORE, 3794980864, 3794984959, +STORE, 3794984960, 3794989055, +STORE, 3794989056, 3794993151, +STORE, 3794993152, 3794997247, +STORE, 3795103744, 3795128319, +STORE, 3795128320, 3795165183, +STORE, 3795165184, 3795169279, +STORE, 3795169280, 3795173375, +STORE, 3795210240, 3795357695, +STORE, 3795357696, 3795365887, +STORE, 3795365888, 3795374079, +STORE, 3795374080, 3795378175, +STORE, 3795378176, 3795382271, +STORE, 3795406848, 3795738623, +STORE, 3795738624, 3795742719, +STORE, 3795742720, 3795755007, +STORE, 3795755008, 3795759103, +STORE, 3795763200, 3795894271, +STORE, 3795894272, 3796041727, +STORE, 3796041728, 3796054015, +STORE, 3796054016, 3796066303, +STORE, 3796066304, 3796070399, +STORE, 3796176896, 3796205567, +STORE, 3796205568, 3796250623, +STORE, 3796250624, 3796254719, +STORE, 3796254720, 3796258815, +STORE, 3796262912, 3796393983, +STORE, 3796393984, 3796516863, +STORE, 3796516864, 3796873215, +STORE, 3796873216, 3796885503, +STORE, 3796885504, 3796889599, +STORE, 3796963328, 3796967423, +STORE, 3796967424, 3796975615, +STORE, 3796975616, 3796979711, +STORE, 3797000192, 3797307391, +STORE, 3797307392, 3797311487, +STORE, 3797311488, 3797315583, +STORE, 3797315584, 3797323775, +STORE, 3797327872, 3797450751, +STORE, 3797450752, 3797458943, +STORE, 3797458944, 3797471231, +STORE, 3797471232, 3797475327, +STORE, 3797577728, 3797700607, +STORE, 3797700608, 3797721087, +STORE, 3797721088, 3797733375, +STORE, 3797733376, 3797741567, +STORE, 3797741568, 3797864447, +STORE, 3797864448, 3797995519, +STORE, 3797995520, 3798048767, +STORE, 3798048768, 3798179839, +STORE, 3798179840, 3798188031, +STORE, 3798188032, 3798192127, +STORE, 3798290432, 3798302719, +STORE, 3798302720, 3798323199, +STORE, 3798323200, 3798327295, +STORE, 3798327296, 3798331391, +STORE, 3798429696, 3798433791, +STORE, 3798433792, 3798552575, +STORE, 3798552576, 3798556671, +STORE, 3798556672, 3798568959, 
+STORE, 3798568960, 3798573055, +STORE, 3798573056, 3798581247, +STORE, 3798618112, 3798749183, +STORE, 3798749184, 3798855679, +STORE, 3798855680, 3798966271, +STORE, 3798966272, 3798982655, +STORE, 3798982656, 3798986751, +STORE, 3799101440, 3799171071, +STORE, 3799171072, 3799240703, +STORE, 3799240704, 3799248895, +STORE, 3799248896, 3799252991, +STORE, 3799326720, 3799650303, +STORE, 3799650304, 3800629247, +STORE, 3800629248, 3800641535, +STORE, 3800641536, 3800645631, +STORE, 3800645632, 3800649727, +STORE, 3800649728, 3800903679, +STORE, 3800903680, 3800936447, +STORE, 3800936448, 3800969215, +STORE, 3800969216, 3800981503, +STORE, 3800981504, 3800985599, +STORE, 3801001984, 3801133055, +STORE, 3801133056, 3801202687, +STORE, 3801202688, 3801591807, +STORE, 3801591808, 3801599999, +STORE, 3801600000, 3801604095, +STORE, 3801604096, 3801608191, +STORE, 3801608192, 3801739263, +STORE, 3801739264, 3801755647, +STORE, 3801755648, 3801796607, +STORE, 3801796608, 3801804799, +STORE, 3801804800, 3801808895, +STORE, 3801878528, 3801944063, +STORE, 3801944064, 3802116095, +STORE, 3802116096, 3802124287, +STORE, 3802124288, 3802128383, +STORE, 3802136576, 3803447295, +STORE, 3803492352, 3803553791, +STORE, 3803553792, 3804233727, +STORE, 3804233728, 3806068735, +STORE, 3806121984, 3806253055, +STORE, 3806253056, 3806674943, +STORE, 3806674944, 3807117311, +STORE, 3807117312, 3807379455, +STORE, 3807379456, 3807432703, +STORE, 3807432704, 3807563775, +STORE, 3807563776, 3809202175, +STORE, 3809202176, 3810250751, +STORE, 3810250752, 3827027967, +STORE, 3827027968, 3829125119, +STORE, 3829125120, 3837513727, +STORE, 3837513728, 3839610879, +STORE, 3839610880, 3847999487, +STORE, 3847999488, 3856392191, +STORE, 3856392192, 3864784895, +STORE, 3864784896, 3868983295, +STORE, 3868983296, 3885760511, +STORE, 3885760512, 3886809087, +STORE, 3886809088, 3887857663, +STORE, 3887857664, 3888119807, +STORE, 3888144384, 3888148479, +STORE, 3888148480, 3888218111, +STORE, 
3888218112, 3888222207, +STORE, 3888222208, 3888353279, +STORE, 3888353280, 3889172479, +STORE, 3889172480, 3892314111, +STORE, 3892314112, 3892576255, +STORE, 3892588544, 3892637695, +STORE, 3892637696, 3892686847, +STORE, 3892686848, 3892744191, +STORE, 3892748288, 3892785151, +STORE, 3892785152, 3895459839, +STORE, 3895459840, 3895721983, +STORE, 3895738368, 3895885823, +STORE, 3895885824, 3897081855, +STORE, 3897081856, 3906482175, +STORE, 3906482176, 3916144639, +STORE, 3916144640, 3925766143, +STORE, 3925766144, 3926974463, +STORE, 3926974464, 3928367103, +STORE, 3928367104, 3928911871, +STORE, 3928911872, 3933995007, +STORE, 3933995008, 3935830015, +STORE, 3935830016, 3935846399, +STORE, 3935879168, 3936010239, +STORE, 3936010240, 3936026623, +STORE, 3936026624, 3936034815, +STORE, 3936034816, 3936051199, +STORE, 3936051200, 3936055295, +STORE, 3936071680, 3936137215, +STORE, 3936137216, 3936202751, +STORE, 3936202752, 3936219135, +STORE, 3936235520, 3936251903, +STORE, 3936268288, 3936276479, +STORE, 3936276480, 3936284671, +STORE, 3936284672, 3936288767, +STORE, 3936288768, 3936292863, +STORE, 3936296960, 3936354303, +STORE, 3936354304, 3936616447, +STORE, 3936628736, 3936669695, +STORE, 3936669696, 3936747519, +STORE, 3936747520, 3936870399, +STORE, 3936870400, 3936874495, +STORE, 3936874496, 3936878591, +STORE, 3936882688, 3936903167, +STORE, 3936911360, 3936948223, +STORE, 3936948224, 3936964607, +STORE, 3936964608, 3937103871, +STORE, 3937103872, 3937107967, +STORE, 3937132544, 3937161215, +STORE, 3937189888, 3937255423, +STORE, 3937255424, 3938512895, +STORE, 3938512896, 3945435135, +STORE, 3945435136, 3945476095, +STORE, 3945476096, 3945484287, +STORE, 3945484288, 3945496575, +STORE, 3945500672, 3945541631, +STORE, 3945558016, 3945566207, +STORE, 3945566208, 3945594879, +STORE, 3945594880, 3945598975, +STORE, 3945598976, 3945603071, +STORE, 3945611264, 3945742335, +STORE, 3945742336, 3945844735, +STORE, 3945844736, 3945848831, +STORE, 3945848832, 
3945861119, +STORE, 3945861120, 3945865215, +STORE, 3945869312, 3945897983, +STORE, 3945897984, 3946303487, +STORE, 3946303488, 3946397695, +STORE, 3946397696, 3946569727, +STORE, 3946569728, 3946573823, +STORE, 3946573824, 3946594303, +STORE, 3946594304, 3946663935, +STORE, 3946663936, 3946708991, +STORE, 3946708992, 3946823679, +STORE, 3946823680, 3946827775, +STORE, 3946827776, 3946831871, +STORE, 3946831872, 3946860543, +STORE, 3946893312, 3946897407, +STORE, 3946897408, 3946905599, +STORE, 3946905600, 3946909695, +STORE, 3946909696, 3946913791, +STORE, 3946913792, 3946930175, +STORE, 3946930176, 3946967039, +STORE, 3946967040, 3947102207, +STORE, 3947102208, 3948412927, +STORE, 3948441600, 3948556287, +STORE, 3948556288, 3948576767, +STORE, 3948576768, 3948597247, +STORE, 3948597248, 3948605439, +STORE, 3948605440, 3948609535, +STORE, 3948609536, 3948654591, +STORE, 3948654592, 3948781567, +STORE, 3948781568, 3948822527, +STORE, 3948822528, 3948904447, +STORE, 3948904448, 3948908543, +STORE, 3948908544, 3948912639, +STORE, 3948945408, 3949043711, +STORE, 3949043712, 3949174783, +STORE, 3949174784, 3949191167, +STORE, 3949191168, 3949195263, +STORE, 3949207552, 3949252607, +STORE, 3949252608, 3949256703, +STORE, 3949256704, 3949363199, +STORE, 3949363200, 3949367295, +STORE, 3949367296, 3949379583, +STORE, 3949379584, 3949383679, +STORE, 3949383680, 3949400063, +STORE, 3949400064, 3949404159, +STORE, 3949416448, 3949481983, +STORE, 3949481984, 3949486079, +STORE, 3949486080, 3949592575, +STORE, 3949592576, 3949596671, +STORE, 3949596672, 3949621247, +STORE, 3949621248, 3949662207, +STORE, 3949662208, 3949666303, +STORE, 3949694976, 3949727743, +STORE, 3949727744, 3949731839, +STORE, 3949731840, 3949838335, +STORE, 3949838336, 3949842431, +STORE, 3949842432, 3949846527, +STORE, 3949846528, 3949854719, +STORE, 3949854720, 3949858815, +STORE, 3949858816, 3949862911, +STORE, 3949867008, 3949891583, +STORE, 3949891584, 3949928447, +STORE, 3949928448, 3949993983, 
+STORE, 3949993984, 3950043135, +STORE, 3950043136, 3950059519, +STORE, 3950059520, 3950096383, +STORE, 3950096384, 3950100479, +STORE, 3950100480, 3950104575, +STORE, 3950104576, 3950157823, +STORE, 3950157824, 3950292991, +STORE, 3950292992, 3950346239, +STORE, 3950346240, 3950477311, +STORE, 3950477312, 3950485503, +STORE, 3950485504, 3950489599, +STORE, 3950493696, 3950510079, +STORE, 3950510080, 3950661631, +STORE, 3950661632, 3951005695, +STORE, 3951005696, 3951026175, +STORE, 3951026176, 3951030271, +STORE, 3951030272, 3951054847, +STORE, 3951054848, 3951116287, +STORE, 3951116288, 3951144959, +STORE, 3951144960, 3951149055, +STORE, 3951149056, 3951194111, +STORE, 3951194112, 3951202303, +STORE, 3951202304, 3951206399, +STORE, 3951210496, 3951226879, +STORE, 3951226880, 3951329279, +STORE, 3951329280, 3951366143, +STORE, 3951366144, 3951411199, +STORE, 3951411200, 3951415295, +STORE, 3951415296, 3951419391, +STORE, 3951419392, 3951452159, +STORE, 3951452160, 3951566847, +STORE, 3951566848, 3951812607, +STORE, 3951812608, 3952173055, +STORE, 3952173056, 3952214015, +STORE, 3952214016, 3952218111, +STORE, 3952222208, 3952250879, +STORE, 3952250880, 3952369663, +STORE, 3952369664, 3952488447, +STORE, 3952488448, 3952627711, +STORE, 3952627712, 3952635903, +STORE, 3952635904, 3952639999, +STORE, 3952652288, 3952668671, +STORE, 3952668672, 3953000447, +STORE, 3953000448, 3953004543, +STORE, 3953004544, 3953008639, +STORE, 3953008640, 3953012735, +STORE, 3953012736, 3953037311, +STORE, 3953037312, 3953151999, +STORE, 3953152000, 3953291263, +STORE, 3953291264, 3953324031, +STORE, 3953324032, 3953364991, +STORE, 3953364992, 3953373183, +STORE, 3953373184, 3953377279, +STORE, 3953381376, 3953410047, +STORE, 3953410048, 3953491967, +STORE, 3953491968, 3953643519, +STORE, 3953643520, 3953651711, +STORE, 3953651712, 3953655807, +STORE, 3953659904, 3953766399, +STORE, 3953766400, 3953774591, +STORE, 3953774592, 3953786879, +STORE, 3953786880, 3953790975, +STORE, 
3953790976, 3953823743, +STORE, 3953823744, 3953963007, +STORE, 3953963008, 3954024447, +STORE, 3954024448, 3954118655, +STORE, 3954118656, 3954122751, +STORE, 3954122752, 3954126847, +STORE, 3954130944, 3954184191, +STORE, 3954184192, 3954294783, +STORE, 3954294784, 3954323455, +STORE, 3954323456, 3954393087, +STORE, 3954393088, 3954397183, +STORE, 3954397184, 3954401279, +STORE, 3954401280, 3954405375, +STORE, 3954409472, 3954528255, +STORE, 3954528256, 3954737151, +STORE, 3954737152, 3955052543, +STORE, 3955052544, 3955060735, +STORE, 3955060736, 3955064831, +STORE, 3955068928, 3955105791, +STORE, 3955105792, 3955167231, +STORE, 3955167232, 3955277823, +STORE, 3955277824, 3955310591, +STORE, 3955310592, 3955351551, +STORE, 3955351552, 3955359743, +STORE, 3955359744, 3955363839, +STORE, 3955363840, 3955392511, +STORE, 3955392512, 3955453951, +STORE, 3955453952, 3955601407, +STORE, 3955601408, 3955777535, +STORE, 3955777536, 3955982335, +STORE, 3955982336, 3956011007, +STORE, 3956011008, 3956015103, +STORE, 3956023296, 3956039679, +STORE, 3956039680, 3956125695, +STORE, 3956125696, 3956129791, +STORE, 3956129792, 3956133887, +STORE, 3956133888, 3956137983, +STORE, 3956142080, 3956449279, +STORE, 3956449280, 3956543487, +STORE, 3956543488, 3956719615, +STORE, 3956719616, 3956731903, +STORE, 3956731904, 3956735999, +STORE, 3956744192, 3956793343, +STORE, 3956793344, 3956887551, +STORE, 3956887552, 3956953087, +STORE, 3956953088, 3957035007, +STORE, 3957035008, 3957039103, +STORE, 3957039104, 3957047295, +STORE, 3957047296, 3957071871, +STORE, 3957071872, 3957231615, +STORE, 3957231616, 3957563391, +STORE, 3957563392, 3957579775, +STORE, 3957579776, 3957583871, +STORE, 3957592064, 3957608447, +STORE, 3957608448, 3957878783, +STORE, 3957878784, 3958591487, +STORE, 3958591488, 3958599679, +STORE, 3958599680, 3958607871, +STORE, 3958607872, 3958620159, +STORE, 3958620160, 3958624255, +STORE, 3958624256, 3963199487, +STORE, 3963199488, 3963285503, +STORE, 3963285504, 
3963371519, +STORE, 3963371520, 3963428863, +STORE, 3963428864, 3963555839, +STORE, 3963555840, 3963559935, +STORE, 3963559936, 3963564031, +STORE, 3963568128, 3963596799, +STORE, 3963596800, 3963682815, +STORE, 3963682816, 3963695103, +STORE, 3963695104, 3963711487, +STORE, 3963711488, 3963715583, +STORE, 3963719680, 3963752447, +STORE, 3963752448, 3963846655, +STORE, 3963846656, 3963932671, +STORE, 3963932672, 3964444671, +STORE, 3964444672, 3964448767, +STORE, 3964448768, 3965808639, +STORE, 3965808640, 3965845503, +STORE, 3965845504, 3965849599, +STORE, 3965853696, 3965935615, +STORE, 3965935616, 3966017535, +STORE, 3966017536, 3966103551, +STORE, 3966103552, 3966685183, +STORE, 3966685184, 3967705087, +STORE, 3967705088, 3967758335, +STORE, 3967758336, 3967762431, +STORE, 3967762432, 3967770623, +STORE, 3967770624, 3967799295, +STORE, 3967799296, 3967848447, +STORE, 3967848448, 3967868927, +STORE, 3967868928, 3967901695, +STORE, 3967901696, 3967905791, +STORE, 3967905792, 3967909887, +STORE, 3967909888, 3967995903, +STORE, 3967995904, 3968077823, +STORE, 3968077824, 3968159743, +STORE, 3968159744, 3968167935, +STORE, 3968167936, 3968172031, +STORE, 3968172032, 3968192511, +STORE, 3968192512, 3968196607, +STORE, 3968196608, 3968200703, +STORE, 3968208896, 3968516095, +STORE, 3968516096, 3968528383, +STORE, 3968528384, 3968552959, +STORE, 3968552960, 3968557055, +STORE, 3968561152, 3968593919, +STORE, 3968593920, 3968626687, +STORE, 3968626688, 3971153919, +STORE, 3971153920, 3973754879, +STORE, 3973754880, 3973804031, +STORE, 3973804032, 3973820415, +STORE, 3973820416, 3973832703, +STORE, 3973840896, 3973873663, +STORE, 3973873664, 3973967871, +STORE, 3973967872, 3973976063, +STORE, 3973976064, 3973984255, +STORE, 3973984256, 3973988351, +STORE, 3973988352, 3973992447, +STORE, 3973996544, 3974008831, +STORE, 3974008832, 3974045695, +STORE, 3974045696, 3974139903, +STORE, 3974139904, 3974254591, +STORE, 3974254592, 3974275071, +STORE, 3974275072, 3974291455, 
+STORE, 3974291456, 3974295551, +STORE, 3974295552, 3974373375, +STORE, 3974373376, 3974524927, +STORE, 3974524928, 3974529023, +STORE, 3974529024, 3974537215, +STORE, 3974537216, 3974541311, +STORE, 3974541312, 3974545407, +STORE, 3974545408, 3974627327, +STORE, 3974627328, 3974680575, +STORE, 3974680576, 3974811647, +STORE, 3974811648, 3974819839, +STORE, 3974819840, 3974823935, +STORE, 3974832128, 3974918143, +STORE, 3974918144, 3974963199, +STORE, 3974963200, 3975077887, +STORE, 3975077888, 3975090175, +STORE, 3975090176, 3975094271, +STORE, 3975094272, 3975102463, +STORE, 3975102464, 3975114751, +STORE, 3975114752, 3975266303, +STORE, 3975266304, 3975274495, +STORE, 3975274496, 3975286783, +STORE, 3975286784, 3975290879, +STORE, 3975290880, 3975299071, +STORE, 3975299072, 3975315455, +STORE, 3975315456, 3975430143, +STORE, 3975430144, 3975536639, +STORE, 3975536640, 3975651327, +STORE, 3975651328, 3975655423, +STORE, 3975655424, 3975659519, +STORE, 3975659520, 3975770111, +STORE, 3975770112, 3975778303, +STORE, 3975778304, 3975790591, +STORE, 3975790592, 3975794687, +STORE, 3975794688, 3975798783, +STORE, 3975798784, 3975831551, +STORE, 3975831552, 3975872511, +STORE, 3975872512, 3975987199, +STORE, 3975987200, 3976134655, +STORE, 3976134656, 3977175039, +STORE, 3977175040, 3977183231, +STORE, 3977183232, 3977191423, +STORE, 3977191424, 3977195519, +STORE, 3977199616, 3977248767, +STORE, 3977248768, 3977539583, +STORE, 3977539584, 3977965567, +STORE, 3977965568, 3977981951, +STORE, 3977981952, 3977986047, +STORE, 3977986048, 3977994239, +STORE, 3977994240, 3978002431, +STORE, 3978002432, 3978084351, +STORE, 3978084352, 3978125311, +STORE, 3978125312, 3978174463, +STORE, 3978174464, 3978178559, +STORE, 3978178560, 3978182655, +STORE, 3978182656, 3978207231, +STORE, 3978207232, 3978297343, +STORE, 3978297344, 3978301439, +STORE, 3978301440, 3978305535, +STORE, 3978305536, 3978309631, +STORE, 3978309632, 3978317823, +STORE, 3978317824, 3978625023, +STORE, 
3978625024, 3978657791, +STORE, 3978657792, 3978727423, +STORE, 3978727424, 3978735615, +STORE, 3978735616, 3978739711, +STORE, 3978739712, 3978760191, +STORE, 3978760192, 3978842111, +STORE, 3978842112, 3978850303, +STORE, 3978850304, 3978858495, +STORE, 3978858496, 3978862591, +STORE, 3978862592, 3978895359, +STORE, 3978895360, 3979014143, +STORE, 3979014144, 3979132927, +STORE, 3979132928, 3979288575, +STORE, 3979288576, 3979481087, +STORE, 3979481088, 3979489279, +STORE, 3979489280, 3979493375, +STORE, 3979497472, 3979583487, +STORE, 3979583488, 3979673599, +STORE, 3979673600, 3979718655, +STORE, 3979718656, 3979829247, +STORE, 3979829248, 3979841535, +STORE, 3979841536, 3979882495, +STORE, 3979882496, 3979964415, +STORE, 3979964416, 3980013567, +STORE, 3980013568, 3980148735, +STORE, 3980148736, 3980152831, +STORE, 3980152832, 3980320767, +STORE, 3980320768, 3980337151, +STORE, 3980337152, 3980341247, +STORE, 3980345344, 3980365823, +STORE, 3980365824, 3980423167, +STORE, 3980423168, 3980460031, +STORE, 3980460032, 3980500991, +STORE, 3980500992, 3980509183, +STORE, 3980509184, 3980513279, +STORE, 3980513280, 3980546047, +STORE, 3980546048, 3980660735, +STORE, 3980660736, 3980951551, +STORE, 3980951552, 3981500415, +STORE, 3981500416, 3981529087, +STORE, 3981529088, 3981533183, +STORE, 3981537280, 3981549567, +STORE, 3981549568, 3981598719, +STORE, 3981598720, 3981717503, +STORE, 3981717504, 3982127103, +STORE, 3982127104, 3982675967, +STORE, 3982675968, 3982733311, +STORE, 3982733312, 3982737407, +STORE, 3982741504, 3982860287, +STORE, 3982860288, 3982905343, +STORE, 3982905344, 3982966783, +STORE, 3982966784, 3982974975, +STORE, 3982974976, 3982979071, +STORE, 3982979072, 3983032319, +STORE, 3983032320, 3983085567, +STORE, 3983085568, 3983208447, +STORE, 3983208448, 3983212543, +STORE, 3983212544, 3983220735, +STORE, 3983220736, 3983224831, +STORE, 3983224832, 3983237119, +STORE, 3983237120, 3983351807, +STORE, 3983351808, 3983376383, +STORE, 3983376384, 
3983392767, +STORE, 3983392768, 3983396863, +STORE, 3983396864, 3983400959, +STORE, 3983400960, 3983417343, +STORE, 3983417344, 3983753215, +STORE, 3983753216, 3983757311, +STORE, 3983757312, 3983761407, +STORE, 3983761408, 3983765503, +STORE, 3983765504, 3983769599, +STORE, 3983769600, 3983880191, +STORE, 3983880192, 3983892479, +STORE, 3983892480, 3983900671, +STORE, 3983900672, 3983904767, +STORE, 3983904768, 3983908863, +STORE, 3983908864, 3983941631, +STORE, 3983941632, 3983990783, +STORE, 3983990784, 3984097279, +STORE, 3984097280, 3984105471, +STORE, 3984105472, 3984117759, +STORE, 3984117760, 3984121855, +STORE, 3984121856, 3984125951, +STORE, 3984125952, 3984134143, +STORE, 3984134144, 3984150527, +STORE, 3984150528, 3984416767, +STORE, 3984416768, 3984470015, +STORE, 3984470016, 3984564223, +STORE, 3984564224, 3984568319, +STORE, 3984572416, 3984629759, +STORE, 3984629760, 3984805887, +STORE, 3984805888, 3985096703, +STORE, 3985096704, 3985104895, +STORE, 3985104896, 3985108991, +STORE, 3985113088, 3986862079, +STORE, 3986862080, 3993640959, +STORE, 3993640960, 3993739263, +STORE, 3993739264, 3993743359, +STORE, 3993743360, 3993759743, +STORE, 3993759744, 3993780223, +STORE, 3993780224, 3993784319, +STORE, 3993784320, 3993792511, +STORE, 3993792512, 3993796607, +STORE, 3993796608, 3993800703, +STORE, 3993804800, 3994214399, +STORE, 3994214400, 3994218495, +STORE, 3994218496, 3994222591, +STORE, 3994222592, 3994226687, +STORE, 3994230784, 3994243071, +STORE, 3994243072, 3994255359, +STORE, 3994255360, 3994304511, +STORE, 3994304512, 3994386431, +STORE, 3994386432, 3994509311, +STORE, 3994509312, 3994521599, +STORE, 3994521600, 3994525695, +STORE, 3994529792, 3994542079, +STORE, 3994542080, 3994660863, +STORE, 3994660864, 3994705919, +STORE, 3994705920, 3994796031, +STORE, 3994796032, 3994800127, +STORE, 3994800128, 3994804223, +STORE, 3994804224, 3994812415, +STORE, 3994812416, 3994845183, +STORE, 3994845184, 3994898431, +STORE, 3994898432, 3994902527, 
+STORE, 3994902528, 3994906623, +STORE, 3994910720, 3994931199, +STORE, 3994931200, 3995181055, +STORE, 3995181056, 3995222015, +STORE, 3995222016, 3995275263, +STORE, 3995275264, 3995279359, +STORE, 3995279360, 3995283455, +STORE, 3995283456, 3995291647, +STORE, 3995291648, 3995324415, +STORE, 3995324416, 3995451391, +STORE, 3995451392, 3995697151, +STORE, 3995697152, 3996078079, +STORE, 3996078080, 3996086271, +STORE, 3996086272, 3996090367, +STORE, 3996094464, 3996119039, +STORE, 3996119040, 3996200959, +STORE, 3996200960, 3996229631, +STORE, 3996229632, 3996233727, +STORE, 3996233728, 3996282879, +STORE, 3996282880, 3996291071, +STORE, 3996291072, 3996295167, +STORE, 3996299264, 3996311551, +STORE, 3996311552, 3996430335, +STORE, 3996430336, 3996467199, +STORE, 3996467200, 3996504063, +STORE, 3996504064, 3996512255, +STORE, 3996512256, 3996516351, +STORE, 3996516352, 3996540927, +STORE, 3996540928, 3996671999, +STORE, 3996672000, 3996676095, +STORE, 3996676096, 3996684287, +STORE, 3996684288, 3996688383, +STORE, 3996688384, 3996692479, +STORE, 3996692480, 3996717055, +STORE, 3996717056, 3997048831, +STORE, 3997048832, 3997057023, +STORE, 3997057024, 3997073407, +STORE, 3997073408, 3997077503, +STORE, 3997077504, 3997081599, +STORE, 3997081600, 3997097983, +STORE, 3997097984, 3997179903, +STORE, 3997179904, 3997356031, +STORE, 3997356032, 3997650943, +STORE, 3997650944, 3997675519, +STORE, 3997675520, 3997679615, +STORE, 3997683712, 3997700095, +STORE, 3997700096, 3997745151, +STORE, 3997745152, 3997802495, +STORE, 3997802496, 3997810687, +STORE, 3997810688, 3997814783, +STORE, 3997814784, 3998064639, +STORE, 3998064640, 3998081023, +STORE, 3998081024, 3998085119, +STORE, 3998085120, 3998130175, +STORE, 3998130176, 3998134271, +STORE, 3998134272, 3998142463, +STORE, 3998142464, 3998179327, +STORE, 3998179328, 3998212095, +STORE, 3998212096, 3998326783, +STORE, 3998326784, 3998351359, +STORE, 3998351360, 3998392319, +STORE, 3998392320, 3998396415, +STORE, 
3998396416, 3998400511, +STORE, 3998400512, 3998433279, +STORE, 3998433280, 3998466047, +STORE, 3998466048, 3998613503, +STORE, 3998613504, 3998666751, +STORE, 3998666752, 3998724095, +STORE, 3998724096, 3998732287, +STORE, 3998732288, 3998736383, +STORE, 3998736384, 3998760959, +STORE, 3998760960, 3998777343, +STORE, 3998777344, 3998822399, +STORE, 3998822400, 3998826495, +STORE, 3998826496, 3998830591, +STORE, 3998830592, 3998863359, +STORE, 3998863360, 3998900223, +STORE, 3998900224, 3999043583, +STORE, 3999043584, 3999121407, +STORE, 3999121408, 3999215615, +STORE, 3999215616, 3999223807, +STORE, 3999223808, 3999227903, +STORE, 3999227904, 3999236095, +STORE, 3999236096, 3999268863, +STORE, 3999268864, 3999301631, +STORE, 3999301632, 3999354879, +STORE, 3999354880, 3999428607, +STORE, 3999428608, 3999436799, +STORE, 3999436800, 3999440895, +STORE, 3999444992, 3999461375, +STORE, 3999461376, 3999584255, +STORE, 3999584256, 3999760383, +STORE, 3999760384, 4000219135, +STORE, 4000219136, 4000235519, +STORE, 4000235520, 4000251903, +STORE, 4000251904, 4000501759, +STORE, 4000501760, 4000505855, +STORE, 4000505856, 4000509951, +STORE, 4000509952, 4000518143, +STORE, 4000518144, 4000522239, +STORE, 4000522240, 4000587775, +STORE, 4000587776, 4000645119, +STORE, 4000645120, 4000813055, +STORE, 4000813056, 4000817151, +STORE, 4000821248, 4000837631, +STORE, 4000837632, 4000870399, +STORE, 4000870400, 4000874495, +STORE, 4000874496, 4000878591, +STORE, 4000878592, 4000882687, +STORE, 4000882688, 4000886783, +STORE, 4000886784, 4000890879, +STORE, 4000890880, 4000907263, +STORE, 4000907264, 4001214463, +STORE, 4001214464, 4001558527, +STORE, 4001558528, 4002484223, +STORE, 4002484224, 4002525183, +STORE, 4002525184, 4002529279, +STORE, 4002529280, 4002533375, +STORE, 4002533376, 4002537471, +STORE, 4002537472, 4002660351, +STORE, 4002660352, 4002779135, +STORE, 4002779136, 4002791423, +STORE, 4002791424, 4002799615, +STORE, 4002799616, 4002807807, +STORE, 4002807808, 
4002811903, +STORE, 4002811904, 4002828287, +STORE, 4002828288, 4002910207, +STORE, 4002910208, 4003028991, +STORE, 4003028992, 4003037183, +STORE, 4003037184, 4003045375, +STORE, 4003045376, 4003049471, +STORE, 4003049472, 4003053567, +STORE, 4003053568, 4003057663, +STORE, 4003057664, 4003065855, +STORE, 4003065856, 4003135487, +STORE, 4003135488, 4003446783, +STORE, 4003446784, 4003450879, +STORE, 4003450880, 4003454975, +STORE, 4003454976, 4003459071, +STORE, 4003459072, 4003463167, +STORE, 4003463168, 4003495935, +STORE, 4003495936, 4003569663, +STORE, 4003569664, 4003573759, +STORE, 4003573760, 4003704831, +STORE, 4003704832, 4003708927, +STORE, 4003708928, 4003713023, +STORE, 4003713024, 4003737599, +STORE, 4003737600, 4003770367, +STORE, 4003770368, 4003876863, +STORE, 4003876864, 4003880959, +STORE, 4003880960, 4003885055, +STORE, 4003885056, 4003889151, +STORE, 4003889152, 4003893247, +STORE, 4003893248, 4003897343, +STORE, 4003897344, 4003962879, +STORE, 4003962880, 4004069375, +STORE, 4004069376, 4004093951, +STORE, 4004093952, 4004118527, +STORE, 4004118528, 4004122623, +STORE, 4004122624, 4004126719, +STORE, 4004126720, 4004155391, +STORE, 4004155392, 4004286463, +STORE, 4004286464, 4004384767, +STORE, 4004384768, 4004388863, +STORE, 4004388864, 4004646911, +STORE, 4004646912, 4004655103, +STORE, 4004655104, 4004659199, +STORE, 4004659200, 4004667391, +STORE, 4004667392, 4004683775, +STORE, 4004683776, 4004814847, +STORE, 4004814848, 4004818943, +STORE, 4004818944, 4004823039, +STORE, 4004823040, 4004827135, +STORE, 4004827136, 4004835327, +STORE, 4004835328, 4004954111, +STORE, 4004954112, 4005085183, +STORE, 4005085184, 4005306367, +STORE, 4005306368, 4005765119, +STORE, 4005765120, 4005789695, +STORE, 4005789696, 4005793791, +STORE, 4005793792, 4005801983, +STORE, 4005801984, 4005920767, +STORE, 4005920768, 4005945343, +STORE, 4005945344, 4005949439, +STORE, 4005949440, 4005986303, +STORE, 4005986304, 4005990399, +STORE, 4005990400, 4005994495, 
+STORE, 4005994496, 4006002687, +STORE, 4006002688, 4006109183, +STORE, 4006109184, 4006117375, +STORE, 4006117376, 4006121471, +STORE, 4006121472, 4006133759, +STORE, 4006133760, 4006137855, +STORE, 4006137856, 4006141951, +STORE, 4006141952, 4006150143, +STORE, 4006150144, 4006391807, +STORE, 4006391808, 4006445055, +STORE, 4006445056, 4006563839, +STORE, 4006563840, 4006572031, +STORE, 4006572032, 4006576127, +STORE, 4006576128, 4006584319, +STORE, 4006584320, 4006694911, +STORE, 4006694912, 4006739967, +STORE, 4006739968, 4006776831, +STORE, 4006776832, 4006785023, +STORE, 4006785024, 4006789119, +STORE, 4006789120, 4006797311, +STORE, 4006797312, 4006813695, +STORE, 4006813696, 4006846463, +STORE, 4006846464, 4006977535, +STORE, 4006977536, 4007006207, +STORE, 4007006208, 4007010303, +STORE, 4007010304, 4007067647, +STORE, 4007067648, 4007075839, +STORE, 4007075840, 4007084031, +STORE, 4007084032, 4007100415, +STORE, 4007100416, 4007116799, +STORE, 4007116800, 4007133183, +STORE, 4007133184, 4007153663, +STORE, 4007153664, 4007178239, +STORE, 4007178240, 4007202815, +STORE, 4007202816, 4007206911, +STORE, 4007206912, 4007272447, +STORE, 4007272448, 4007276543, +STORE, 4007276544, 4007280639, +STORE, 4007280640, 4007284735, +STORE, 4007284736, 4007292927, +STORE, 4007292928, 4007423999, +STORE, 4007424000, 4007448575, +STORE, 4007448576, 4007452671, +STORE, 4007452672, 4007505919, +STORE, 4007505920, 4007510015, +STORE, 4007510016, 4007514111, +STORE, 4007514112, 4007645183, +STORE, 4007645184, 4007776255, +STORE, 4007776256, 4007780351, +STORE, 4007780352, 4007784447, +STORE, 4007784448, 4007788543, +STORE, 4007788544, 4007809023, +STORE, 4007809024, 4007829503, +STORE, 4007829504, 4007960575, +STORE, 4007960576, 4008091647, +STORE, 4008091648, 4008296447, +STORE, 4008296448, 4008890367, +STORE, 4008890368, 4008898559, +STORE, 4008898560, 4008902655, +STORE, 4008902656, 4008996863, +STORE, 4008996864, 4009041919, +STORE, 4009041920, 4009082879, +STORE, 
4009082880, 4009091071, +STORE, 4009091072, 4009107455, +STORE, 4009107456, 4009349119, +STORE, 4009349120, 4009373695, +STORE, 4009373696, 4009414655, +STORE, 4009414656, 4009422847, +STORE, 4009422848, 4009426943, +STORE, 4009426944, 4009447423, +STORE, 4009447424, 4009471999, +STORE, 4009472000, 4009512959, +STORE, 4009512960, 4009594879, +STORE, 4009594880, 4009598975, +STORE, 4009598976, 4009697279, +STORE, 4009697280, 4009713663, +STORE, 4009713664, 4009717759, +STORE, 4009717760, 4009721855, +STORE, 4009721856, 4009730047, +STORE, 4009730048, 4009861119, +STORE, 4009861120, 4009951231, +STORE, 4009951232, 4010131455, +STORE, 4010131456, 4010135551, +STORE, 4010135552, 4010139647, +STORE, 4010139648, 4010143743, +STORE, 4010143744, 4010164223, +STORE, 4010164224, 4010295295, +STORE, 4010295296, 4010299391, +STORE, 4010299392, 4010491903, +STORE, 4010491904, 4010495999, +STORE, 4010496000, 4010668031, +STORE, 4010668032, 4011028479, +STORE, 4011028480, 4011053055, +STORE, 4011053056, 4011057151, +STORE, 4011057152, 4011118591, +STORE, 4011118592, 4011126783, +STORE, 4011126784, 4011130879, +STORE, 4011130880, 4011143167, +STORE, 4011143168, 4011147263, +STORE, 4011147264, 4011167743, +STORE, 4011167744, 4011171839, +STORE, 4011171840, 4011360255, +STORE, 4011360256, 4011364351, +STORE, 4011364352, 4011626495, +STORE, 4011626496, 4012216319, +STORE, 4012216320, 4012228607, +STORE, 4012228608, 4012232703, +STORE, 4012232704, 4012236799, +STORE, 4012236800, 4012240895, +STORE, 4012240896, 4012261375, +STORE, 4012261376, 4012392447, +STORE, 4012392448, 4012466175, +STORE, 4012466176, 4012597247, +STORE, 4012597248, 4012601343, +STORE, 4012601344, 4012605439, +STORE, 4012605440, 4012609535, +STORE, 4012609536, 4012679167, +STORE, 4012679168, 4013563903, +STORE, 4013563904, 4015366143, +STORE, 4015366144, 4015411199, +STORE, 4015411200, 4015415295, +STORE, 4015415296, 4015419391, +STORE, 4015419392, 4015542271, +STORE, 4015542272, 4015550463, +STORE, 4015550464, 
4015558655, +STORE, 4015558656, 4015562751, +STORE, 4015562752, 4015583231, +STORE, 4015583232, 4015587327, +STORE, 4015587328, 4015603711, +STORE, 4015665152, 4015669247, +STORE, 4015669248, 4015812607, +STORE, 4015812608, 4015816703, +STORE, 4015816704, 4016111615, +STORE, 4016111616, 4016467967, +STORE, 4016467968, 4016508927, +STORE, 4016508928, 4016517119, +STORE, 4016517120, 4016525311, +STORE, 4016525312, 4016586751, +STORE, 4016586752, 4016664575, +STORE, 4016664576, 4016697343, +STORE, 4016697344, 4016742399, +STORE, 4016742400, 4016746495, +STORE, 4016746496, 4016750591, +STORE, 4016750592, 4016758783, +STORE, 4016799744, 4016844799, +STORE, 4016844800, 4016902143, +STORE, 4016902144, 4016992255, +STORE, 4016992256, 4017000447, +STORE, 4017000448, 4017004543, +STORE, 4017004544, 4017008639, +STORE, 4017008640, 4017016831, +STORE, 4017016832, 4017020927, +STORE, 4017020928, 4017127423, +STORE, 4017127424, 4017131519, +STORE, 4017131520, 4017229823, +STORE, 4017229824, 4017422335, +STORE, 4017422336, 4017438719, +STORE, 4017438720, 4017442815, +STORE, 4017442816, 4017446911, +STORE, 4017446912, 4017455103, +STORE, 4017455104, 4017766399, +STORE, 4017766400, 4017909759, +STORE, 4017909760, 4018081791, +STORE, 4018081792, 4018089983, +STORE, 4018089984, 4018094079, +STORE, 4018094080, 4018098175, +STORE, 4018098176, 4018327551, +STORE, 4018327552, 4018331647, +STORE, 4018331648, 4018339839, +STORE, 4018339840, 4018348031, +STORE, 4018348032, 4018610175, +STORE, 4018610176, 4018626559, +STORE, 4018626560, 4018647039, +STORE, 4018647040, 4018651135, +STORE, 4018651136, 4018749439, +STORE, 4018749440, 4018761727, +STORE, 4018761728, 4018802687, +STORE, 4018802688, 4018806783, +STORE, 4018806784, 4018810879, +STORE, 4018810880, 4018814975, +STORE, 4018814976, 4018823167, +STORE, 4018823168, 4018954239, +STORE, 4018954240, 4019007487, +STORE, 4019007488, 4019068927, +STORE, 4019068928, 4019077119, +STORE, 4019077120, 4019081215, +STORE, 4019081216, 4019093503, 
+STORE, 4019093504, 4019208191, +STORE, 4019208192, 4019232767, +STORE, 4019232768, 4019265535, +STORE, 4019265536, 4019269631, +STORE, 4019269632, 4019277823, +STORE, 4019277824, 4019458047, +STORE, 4019458048, 4019519487, +STORE, 4019519488, 4019613695, +STORE, 4019613696, 4019621887, +STORE, 4019621888, 4019625983, +STORE, 4019625984, 4019630079, +STORE, 4019630080, 4019744767, +STORE, 4019744768, 4019822591, +STORE, 4019822592, 4019929087, +STORE, 4019929088, 4019941375, +STORE, 4019941376, 4019945471, +STORE, 4019945472, 4019961855, +STORE, 4019961856, 4019994623, +STORE, 4019994624, 4019998719, +STORE, 4019998720, 4020002815, +STORE, 4020002816, 4020006911, +STORE, 4020006912, 4020011007, +STORE, 4020011008, 4020256767, +STORE, 4020256768, 4020326399, +STORE, 4020326400, 4020457471, +STORE, 4020457472, 4020469759, +STORE, 4020469760, 4020473855, +STORE, 4020473856, 4020482047, +STORE, 4020482048, 4020711423, +STORE, 4020711424, 4020715519, +STORE, 4020715520, 4020719615, +STORE, 4020719616, 4020723711, +STORE, 4020723712, 4020805631, +STORE, 4020805632, 4021051391, +STORE, 4021051392, 4021460991, +STORE, 4021460992, 4021469183, +STORE, 4021469184, 4021473279, +STORE, 4021473280, 4021571583, +STORE, 4021571584, 4021633023, +STORE, 4021633024, 4021727231, +STORE, 4021727232, 4021735423, +STORE, 4021735424, 4021739519, +STORE, 4021739520, 4021747711, +STORE, 4021747712, 4021829631, +STORE, 4021829632, 4021866495, +STORE, 4021866496, 4021919743, +STORE, 4021919744, 4021927935, +STORE, 4021927936, 4021932031, +STORE, 4021932032, 4021944319, +STORE, 4021944320, 4022157311, +STORE, 4022157312, 4022161407, +STORE, 4022161408, 4022173695, +STORE, 4022173696, 4022177791, +STORE, 4022177792, 4022472703, +STORE, 4022472704, 4022509567, +STORE, 4022509568, 4022583295, +STORE, 4022583296, 4022587391, +STORE, 4022587392, 4022591487, +STORE, 4022591488, 4022607871, +STORE, 4022607872, 4022657023, +STORE, 4022657024, 4022722559, +STORE, 4022722560, 4022730751, +STORE, 
4022730752, 4022734847, +STORE, 4022734848, 4022865919, +STORE, 4022865920, 4022943743, +STORE, 4022943744, 4023062527, +STORE, 4023062528, 4023074815, +STORE, 4023074816, 4023078911, +STORE, 4023078912, 4023128063, +STORE, 4023128064, 4023218175, +STORE, 4023218176, 4023361535, +STORE, 4023361536, 4023373823, +STORE, 4023373824, 4023377919, +STORE, 4023377920, 4023558143, +STORE, 4023558144, 4023631871, +STORE, 4023631872, 4023816191, +STORE, 4023816192, 4023820287, +STORE, 4023820288, 4023824383, +STORE, 4023824384, 4023832575, +STORE, 4023832576, 4024078335, +STORE, 4024078336, 4024197119, +STORE, 4024197120, 4024389631, +STORE, 4024389632, 4024406015, +STORE, 4024406016, 4024410111, +STORE, 4024410112, 4024422399, +STORE, 4024422400, 4024619007, +STORE, 4024619008, 4024639487, +STORE, 4024639488, 4024655871, +STORE, 4024655872, 4024664063, +STORE, 4024664064, 4024668159, +STORE, 4024668160, 4024676351, +STORE, 4024676352, 4024905727, +STORE, 4024905728, 4024909823, +STORE, 4024909824, 4024918015, +STORE, 4024918016, 4024922111, +STORE, 4024922112, 4024930303, +STORE, 4024930304, 4025110527, +STORE, 4025110528, 4025176063, +STORE, 4025176064, 4025208831, +STORE, 4025208832, 4025212927, +STORE, 4025212928, 4025217023, +STORE, 4025217024, 4025348095, +STORE, 4025348096, 4025372671, +STORE, 4025372672, 4025458687, +STORE, 4025458688, 4025466879, +STORE, 4025466880, 4025565183, +STORE, 4025565184, 4025757695, +STORE, 4025757696, 4026249215, +STORE, 4026249216, 4026261503, +STORE, 4026261504, 4026265599, +STORE, 4026265600, 4026269695, +STORE, 4026269696, 4026302463, +STORE, 4026302464, 4026306559, +STORE, 4026306560, 4026314751, +STORE, 4026314752, 4026318847, +STORE, 4026318848, 4026322943, +STORE, 4026322944, 4026327039, +STORE, 4026327040, 4026654719, +STORE, 4026654720, 4026671103, +STORE, 4026671104, 4026720255, +STORE, 4026720256, 4026724351, +STORE, 4026724352, 4026728447, +STORE, 4026728448, 4026732543, +STORE, 4026732544, 4026863615, +STORE, 4026863616, 
4027027455, +STORE, 4027027456, 4027031551, +STORE, 4027031552, 4027514879, +STORE, 4027514880, 4027531263, +STORE, 4027531264, 4027535359, +STORE, 4027535360, 4027539455, +STORE, 4027539456, 4027785215, +STORE, 4027785216, 4027789311, +STORE, 4027789312, 4027793407, +STORE, 4027793408, 4027797503, +STORE, 4027797504, 4027863039, +STORE, 4027863040, 4027899903, +STORE, 4027899904, 4027949055, +STORE, 4027949056, 4027957247, +STORE, 4027957248, 4027961343, +STORE, 4027961344, 4027965439, +STORE, 4027965440, 4028194815, +STORE, 4028194816, 4028252159, +STORE, 4028252160, 4028338175, +STORE, 4028338176, 4028350463, +STORE, 4028350464, 4028354559, +STORE, 4028354560, 4028452863, +STORE, 4028452864, 4028489727, +STORE, 4028489728, 4028530687, +STORE, 4028530688, 4028538879, +STORE, 4028538880, 4028542975, +STORE, 4028542976, 4028551167, +STORE, 4028551168, 4028665855, +STORE, 4028665856, 4029349887, +STORE, 4029349888, 4030468095, +STORE, 4030468096, 4030513151, +STORE, 4030513152, 4030517247, +STORE, 4030517248, 4030525439, +STORE, 4030525440, 4030529535, +STORE, 4030529536, 4030758911, +STORE, 4030758912, 4030828543, +STORE, 4030828544, 4030943231, +STORE, 4030943232, 4030951423, +STORE, 4030951424, 4030955519, +STORE, 4030955520, 4030967807, +STORE, 4030967808, 4031131647, +STORE, 4031131648, 4031135743, +STORE, 4031135744, 4031139839, +STORE, 4031139840, 4031148031, +STORE, 4031148032, 4031152127, +STORE, 4031152128, 4031160319, +STORE, 4031160320, 4031504383, +STORE, 4031504384, 4031598591, +STORE, 4031598592, 4031754239, +STORE, 4031754240, 4031766527, +STORE, 4031766528, 4031770623, +STORE, 4031770624, 4031774719, +STORE, 4031774720, 4031782911, +STORE, 4031782912, 4031799295, +STORE, 4031799296, 4031856639, +STORE, 4031856640, 4031983615, +STORE, 4031983616, 4031987711, +STORE, 4031987712, 4031991807, +STORE, 4031991808, 4032270335, +STORE, 4032270336, 4032274431, +STORE, 4032274432, 4032282623, +STORE, 4032282624, 4032286719, +STORE, 4032286720, 4032290815, 
+STORE, 4032290816, 4032389119, +STORE, 4032389120, 4032397311, +STORE, 4032397312, 4032405503, +STORE, 4032405504, 4032413695, +STORE, 4032413696, 4032417791, +STORE, 4032417792, 4032565247, +STORE, 4032565248, 4032593919, +STORE, 4032593920, 4032737279, +STORE, 4032737280, 4032741375, +STORE, 4032741376, 4032745471, +STORE, 4032745472, 4032770047, +STORE, 4032770048, 4032933887, +STORE, 4032933888, 4032999423, +STORE, 4032999424, 4033032191, +STORE, 4033032192, 4033036287, +STORE, 4033036288, 4033040383, +STORE, 4033040384, 4033105919, +STORE, 4033105920, 4033396735, +STORE, 4033396736, 4033822719, +STORE, 4033822720, 4033839103, +STORE, 4033839104, 4033843199, +STORE, 4033843200, 4033851391, +STORE, 4033851392, 4033863679, +STORE, 4033863680, 4033880063, +STORE, 4033880064, 4033933311, +STORE, 4033933312, 4034023423, +STORE, 4034023424, 4034031615, +STORE, 4034031616, 4034035711, +STORE, 4034035712, 4034043903, +STORE, 4034043904, 4034142207, +STORE, 4034142208, 4034191359, +STORE, 4034191360, 4034260991, +STORE, 4034260992, 4034269183, +STORE, 4034269184, 4034273279, +STORE, 4034273280, 4034281471, +STORE, 4034281472, 4034412543, +STORE, 4034412544, 4034445311, +STORE, 4034445312, 4034490367, +STORE, 4034490368, 4034494463, +STORE, 4034494464, 4034498559, +STORE, 4034498560, 4034662399, +STORE, 4034662400, 4034666495, +STORE, 4034666496, 4034670591, +STORE, 4034670592, 4034674687, +STORE, 4034674688, 4034678783, +STORE, 4034678784, 4034682879, +STORE, 4034682880, 4034781183, +STORE, 4034781184, 4035043327, +STORE, 4035043328, 4035047423, +STORE, 4035047424, 4035055615, +STORE, 4035055616, 4035059711, +STORE, 4035059712, 4035063807, +STORE, 4035063808, 4035067903, +STORE, 4035067904, 4035100671, +STORE, 4035100672, 4035375103, +STORE, 4035375104, 4035383295, +STORE, 4035383296, 4035395583, +STORE, 4035395584, 4035399679, +STORE, 4035399680, 4035403775, +STORE, 4035403776, 4035407871, +STORE, 4035407872, 4035411967, +STORE, 4035411968, 4035477503, +STORE, 
4035477504, 4035608575, +STORE, 4035608576, 4035641343, +STORE, 4035641344, 4035682303, +STORE, 4035682304, 4035686399, +STORE, 4035686400, 4035690495, +STORE, 4035690496, 4035694591, +STORE, 4035694592, 4035743743, +STORE, 4035743744, 4035784703, +STORE, 4035784704, 4035829759, +STORE, 4035829760, 4035837951, +STORE, 4035837952, 4035842047, +STORE, 4035842048, 4035846143, +STORE, 4035846144, 4035850239, +STORE, 4035850240, 4036001791, +STORE, 4036001792, 4036005887, +STORE, 4036005888, 4036214783, +STORE, 4036214784, 4036218879, +STORE, 4036218880, 4036603903, +STORE, 4036603904, 4036648959, +STORE, 4036648960, 4036653055, +STORE, 4036653056, 4036657151, +STORE, 4036657152, 4036665343, +STORE, 4036665344, 4036780031, +STORE, 4036780032, 4036829183, +STORE, 4036829184, 4036984831, +STORE, 4036984832, 4036993023, +STORE, 4036993024, 4036997119, +STORE, 4036997120, 4037001215, +STORE, 4037001216, 4037009407, +STORE, 4037009408, 4037025791, +STORE, 4037025792, 4037095423, +STORE, 4037095424, 4037181439, +STORE, 4037181440, 4037193727, +STORE, 4037193728, 4037197823, +STORE, 4037197824, 4037206015, +STORE, 4037206016, 4037320703, +STORE, 4037320704, 4037337087, +STORE, 4037337088, 4037349375, +STORE, 4037349376, 4037357567, +STORE, 4037357568, 4037361663, +STORE, 4037369856, 4037386239, +STORE, 4037386240, 4037672959, +STORE, 4037672960, 4037689343, +STORE, 4037689344, 4037730303, +STORE, 4037730304, 4037734399, +STORE, 4037734400, 4037738495, +STORE, 4037738496, 4037742591, +STORE, 4037742592, 4037758975, +STORE, 4037758976, 4037890047, +STORE, 4037890048, 4037931007, +STORE, 4037931008, 4037976063, +STORE, 4037976064, 4037984255, +STORE, 4037984256, 4037988351, +STORE, 4037988352, 4038053887, +STORE, 4038053888, 4038184959, +STORE, 4038184960, 4038189055, +STORE, 4038189056, 4038197247, +STORE, 4038197248, 4038201343, +STORE, 4038201344, 4038205439, +STORE, 4038205440, 4038209535, +STORE, 4038217728, 4038250495, +STORE, 4038250496, 4038512639, +STORE, 4038512640, 
4038516735, +STORE, 4038516736, 4038520831, +STORE, 4038520832, 4038524927, +STORE, 4038524928, 4038529023, +STORE, 4038529024, 4038533119, +STORE, 4038541312, 4038623231, +STORE, 4038623232, 4038754303, +STORE, 4038754304, 4038885375, +STORE, 4038885376, 4038889471, +STORE, 4038897664, 4038963199, +STORE, 4038963200, 4038967295, +STORE, 4038967296, 4038983679, +STORE, 4038983680, 4039114751, +STORE, 4039114752, 4039245823, +STORE, 4039245824, 4039376895, +STORE, 4039376896, 4040687615, +STORE, 4040687616, 4040691711, +STORE, 4040691712, 4040806399, +STORE, 4040806400, 4040937471, +STORE, 4040937472, 4040941567, +STORE, 4040945664, 4040949759, +STORE, 4040949760, 4041080831, +STORE, 4041080832, 4041211903, +STORE, 4041211904, 4043046911, +STORE, 4043046912, 4043051007, +STORE, 4043051008, 4043055103, +STORE, 4043055104, 4043137023, +STORE, 4043137024, 4043141119, +STORE, 4043141120, 4043145215, +STORE, 4043145216, 4043153407, +STORE, 4043153408, 4043186175, +STORE, 4043186176, 4043317247, +STORE, 4043317248, 4043448319, +STORE, 4043448320, 4043579391, +STORE, 4043579392, 4043583487, +STORE, 4043583488, 4043599871, +STORE, 4043599872, 4043661311, +STORE, 4043661312, 4043792383, +STORE, 4043792384, 4043796479, +STORE, 4043796480, 4043800575, +STORE, 4043800576, 4043816959, +STORE, 4043816960, 4043821055, +STORE, 4043821056, 4043825151, +STORE, 4043825152, 4043829247, +STORE, 4043829248, 4043833343, +STORE, 4043833344, 4047241215, +STORE, 4047241216, 4047249407, +STORE, 4047249408, 4047253503, +STORE, 4047253504, 4047323135, +STORE, 4047323136, 4047327231, +STORE, 4047327232, 4047458303, +STORE, 4047458304, 4047589375, +STORE, 4047589376, 4047720447, +STORE, 4047720448, 4047773695, +STORE, 4047773696, 4047790079, +STORE, 4047790080, 4047921151, +STORE, 4047921152, 4048052223, +STORE, 4048052224, 4048183295, +STORE, 4048183296, 4049002495, +STORE, 4049002496, 4049133567, +STORE, 4049133568, 4049154047, +STORE, 4049154048, 4049158143, +STORE, 4049158144, 4049162239, 
+STORE, 4049162240, 4049166335, +STORE, 4049166336, 4049174527, +STORE, 4049174528, 4049182719, +STORE, 4049182720, 4049186815, +STORE, 4049186816, 4049190911, +STORE, 4049190912, 4049195007, +STORE, 4049195008, 4049203199, +STORE, 4049203200, 4049207295, +STORE, 4049207296, 4049211391, +STORE, 4049211392, 4049215487, +STORE, 4049215488, 4049219583, +STORE, 4049219584, 4049227775, +STORE, 4049227776, 4049231871, +STORE, 4049231872, 4049235967, +STORE, 4049235968, 4049244159, +STORE, 4049244160, 4049248255, +STORE, 4049248256, 4049252351, +STORE, 4049252352, 4049256447, +STORE, 4049256448, 4049268735, +STORE, 4049268736, 4049272831, +STORE, 4049272832, 4049313791, +STORE, 4049313792, 4049723391, +STORE, 4049723392, 4049727487, +STORE, 4049727488, 4049858559, +STORE, 4049858560, 4049989631, +STORE, 4049989632, 4049993727, +STORE, 4049993728, 4050026495, +STORE, 4050026496, 4050030591, +STORE, 4050030592, 4050161663, +STORE, 4050161664, 4050169855, +STORE, 4050169856, 4050223103, +STORE, 4050223104, 4050632703, +STORE, 4050632704, 4050636799, +STORE, 4050636800, 4050640895, +STORE, 4050640896, 4050644991, +STORE, 4050644992, 4050661375, +STORE, 4050661376, 4050665471, +STORE, 4050665472, 4050673663, +STORE, 4050673664, 4050677759, +STORE, 4050677760, 4050694143, +STORE, 4050694144, 4050702335, +STORE, 4050702336, 4050956287, +STORE, 4050956288, 4051963903, +STORE, 4051963904, 4051980287, +STORE, 4051980288, 4051988479, +STORE, 4051988480, 4052000767, +STORE, 4052000768, 4052004863, +STORE, 4052004864, 4052029439, +STORE, 4284014592, 4284018687, +STORE, 4284018688, 4292403199, +SNULL, 4041080832, 4041211903, +SNULL, 3795763200, 3795894271, +STORE, 3629522944, 3696631807, +SNULL, 3663077375, 3696631807, +STORE, 3629522944, 3663077375, +STORE, 3663077376, 3696631807, +SNULL, 3663077376, 3696631807, +STORE, 3663077376, 3696631807, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 
3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3626471424, 3627524095, +SNULL, 3626471424, 3626475519, +STORE, 3626475520, 3627524095, +STORE, 3626471424, 3626475519, +SNULL, 3627519999, 3627524095, +STORE, 3626475520, 3627519999, +STORE, 3627520000, 3627524095, +STORE, 3625418752, 3626475519, +SNULL, 3625418752, 3625422847, +STORE, 3625422848, 3626475519, +STORE, 3625418752, 3625422847, +SNULL, 3626467327, 3626475519, +STORE, 3625422848, 3626467327, +STORE, 3626467328, 3626475519, +STORE, 3624366080, 3625422847, +SNULL, 3624366080, 3624370175, +STORE, 3624370176, 3625422847, +STORE, 3624366080, 3624370175, +SNULL, 3625414655, 3625422847, +STORE, 3624370176, 3625414655, +STORE, 3625414656, 3625422847, +STORE, 4041191424, 4041211903, +SNULL, 4041195519, 4041211903, +STORE, 4041191424, 4041195519, +STORE, 4041195520, 4041211903, +STORE, 4041170944, 4041191423, +SNULL, 4041175039, 4041191423, +STORE, 4041170944, 4041175039, +STORE, 4041175040, 4041191423, +SNULL, 3625426943, 3626467327, +STORE, 3625422848, 3625426943, +STORE, 3625426944, 3626467327, +STORE, 4041162752, 4041170943, +SNULL, 3626479615, 3627519999, +STORE, 3626475520, 3626479615, +STORE, 3626479616, 3627519999, +STORE, 4041154560, 4041162751, +STORE, 4041154560, 4041170943, +STORE, 4041134080, 4041154559, +SNULL, 4041138175, 4041154559, +STORE, 4041134080, 4041138175, +STORE, 4041138176, 4041154559, +SNULL, 3624374271, 3625414655, +STORE, 3624370176, 3624374271, +STORE, 3624374272, 3625414655, +STORE, 4041125888, 4041134079, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +STORE, 3487174656, 3487584255, +STORE, 4041121792, 4041125887, +SNULL, 4041121792, 4041125887, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 3487174656, 3487584255, +STORE, 3222274048, 3223326719, +SNULL, 3222274048, 3222278143, +STORE, 3222278144, 
3223326719, +STORE, 3222274048, 3222278143, +SNULL, 3223322623, 3223326719, +STORE, 3222278144, 3223322623, +STORE, 3223322624, 3223326719, +STORE, 3221221376, 3222278143, +SNULL, 3221221376, 3221225471, +STORE, 3221225472, 3222278143, +STORE, 3221221376, 3221225471, +SNULL, 3222269951, 3222278143, +STORE, 3221225472, 3222269951, +STORE, 3222269952, 3222278143, +STORE, 3220168704, 3221225471, +SNULL, 3220168704, 3220172799, +STORE, 3220172800, 3221225471, +STORE, 3220168704, 3220172799, +SNULL, 3221217279, 3221225471, +STORE, 3220172800, 3221217279, +STORE, 3221217280, 3221225471, +STORE, 4041117696, 4041125887, +STORE, 4041117696, 4041134079, +STORE, 3219083264, 3220172799, +SNULL, 3219083264, 3219087359, +STORE, 3219087360, 3220172799, +STORE, 3219083264, 3219087359, +SNULL, 3220164607, 3220172799, +STORE, 3219087360, 3220164607, +STORE, 3220164608, 3220172799, +STORE, 4041109504, 4041117695, +STORE, 4041109504, 4041134079, +STORE, 3217997824, 3219087359, +SNULL, 3217997824, 3218001919, +STORE, 3218001920, 3219087359, +STORE, 3217997824, 3218001919, +SNULL, 3219079167, 3219087359, +STORE, 3218001920, 3219079167, +STORE, 3219079168, 3219087359, +STORE, 4041101312, 4041109503, +STORE, 4041101312, 4041134079, +STORE, 3216912384, 3218001919, +SNULL, 3216912384, 3216916479, +STORE, 3216916480, 3218001919, +STORE, 3216912384, 3216916479, +SNULL, 3217993727, 3218001919, +STORE, 3216916480, 3217993727, +STORE, 3217993728, 3218001919, +STORE, 4041093120, 4041101311, +STORE, 4041093120, 4041134079, +STORE, 3215826944, 3216916479, +SNULL, 3215826944, 3215831039, +STORE, 3215831040, 3216916479, +STORE, 3215826944, 3215831039, +SNULL, 3216908287, 3216916479, +STORE, 3215831040, 3216908287, +STORE, 3216908288, 3216916479, +STORE, 4016779264, 4016799743, +SNULL, 4016783359, 4016799743, +STORE, 4016779264, 4016783359, +STORE, 4016783360, 4016799743, +STORE, 4016758784, 4016779263, +SNULL, 4016762879, 4016779263, +STORE, 4016758784, 4016762879, +STORE, 4016762880, 4016779263, 
+SNULL, 3222282239, 3223322623, +STORE, 3222278144, 3222282239, +STORE, 3222282240, 3223322623, +STORE, 4041084928, 4041093119, +STORE, 4041084928, 4041134079, +SNULL, 3221229567, 3222269951, +STORE, 3221225472, 3221229567, +STORE, 3221229568, 3222269951, +STORE, 4015644672, 4015665151, +STORE, 4038889472, 4038897663, +SNULL, 4015648767, 4015665151, +STORE, 4015644672, 4015648767, +STORE, 4015648768, 4015665151, +STORE, 4015624192, 4015644671, +SNULL, 4015628287, 4015644671, +STORE, 4015624192, 4015628287, +STORE, 4015628288, 4015644671, +SNULL, 3219091455, 3220164607, +STORE, 3219087360, 3219091455, +STORE, 3219091456, 3220164607, +STORE, 4015603712, 4015624191, +SNULL, 4015607807, 4015624191, +STORE, 4015603712, 4015607807, +STORE, 4015607808, 4015624191, +SNULL, 3218006015, 3219079167, +STORE, 3218001920, 3218006015, +STORE, 3218006016, 3219079167, +STORE, 3949674496, 3949694975, +SNULL, 3949678591, 3949694975, +STORE, 3949674496, 3949678591, +STORE, 3949678592, 3949694975, +SNULL, 3216920575, 3217993727, +STORE, 3216916480, 3216920575, +STORE, 3216920576, 3217993727, +STORE, 3948924928, 3948945407, +SNULL, 3948929023, 3948945407, +STORE, 3948924928, 3948929023, +STORE, 3948929024, 3948945407, +SNULL, 3215835135, 3216908287, +STORE, 3215831040, 3215835135, +STORE, 3215835136, 3216908287, +SNULL, 3220176895, 3221217279, +STORE, 3220172800, 3220176895, +STORE, 3220176896, 3221217279, +STORE, 3214786560, 3215826943, +STORE, 3213733888, 3214786559, +SNULL, 3213733888, 3213737983, +STORE, 3213737984, 3214786559, +STORE, 3213733888, 3213737983, +SNULL, 3214782463, 3214786559, +STORE, 3213737984, 3214782463, +STORE, 3214782464, 3214786559, +STORE, 4038533120, 4038541311, +STORE, 3948421120, 3948441599, +SNULL, 3948425215, 3948441599, +STORE, 3948421120, 3948425215, +STORE, 3948425216, 3948441599, +SNULL, 3213742079, 3214782463, +STORE, 3213737984, 3213742079, +STORE, 3213742080, 3214782463, +STORE, 4038209536, 4038217727, +STORE, 3212681216, 3213737983, +SNULL, 
3212681216, 3212685311, +STORE, 3212685312, 3213737983, +STORE, 3212681216, 3212685311, +SNULL, 3213729791, 3213737983, +STORE, 3212685312, 3213729791, +STORE, 3213729792, 3213737983, +STORE, 3795763200, 3795894271, +STORE, 3946872832, 3946893311, +SNULL, 3946876927, 3946893311, +STORE, 3946872832, 3946876927, +STORE, 3946876928, 3946893311, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +STORE, 3487174656, 3487584255, +SNULL, 3212689407, 3213729791, +STORE, 3212685312, 3212689407, +STORE, 3212689408, 3213729791, +STORE, 4041080832, 4041084927, +STORE, 4040941568, 4040945663, +STORE, 4037361664, 4037369855, +STORE, 4000817152, 4000821247, +STORE, 3999440896, 3999444991, +STORE, 3212161024, 3212681215, +SNULL, 3212161024, 3212439551, +STORE, 3212439552, 3212681215, +STORE, 3212161024, 3212439551, +SNULL, 3212161024, 3212439551, +SNULL, 3212464127, 3212681215, +STORE, 3212439552, 3212464127, +STORE, 3212464128, 3212681215, +SNULL, 3212464128, 3212681215, +SNULL, 3212439552, 3212451839, +STORE, 3212451840, 3212464127, +STORE, 3212439552, 3212451839, +SNULL, 3212439552, 3212451839, +STORE, 3212439552, 3212451839, +SNULL, 3212451840, 3212455935, +STORE, 3212455936, 3212464127, +STORE, 3212451840, 3212455935, +SNULL, 3212451840, 3212455935, +STORE, 3212451840, 3212455935, +SNULL, 3212455936, 3212460031, +STORE, 3212460032, 3212464127, +STORE, 3212455936, 3212460031, +SNULL, 3212455936, 3212460031, +STORE, 3212455936, 3212460031, +SNULL, 3212460032, 3212464127, +STORE, 3212460032, 3212464127, +STORE, 3997679616, 3997683711, +SNULL, 4049235968, 4049240063, +STORE, 4049240064, 4049244159, +STORE, 4049235968, 4049240063, +SNULL, 4049240064, 4049244159, +STORE, 4049240064, 4049244159, +SNULL, 3997679616, 3997683711, +SNULL, 3999440896, 3999444991, +SNULL, 4000817152, 4000821247, +SNULL, 4040941568, 4040945663, +SNULL, 4041080832, 4041084927, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 
4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 3487174656, 3487584255, +SNULL, 3212451840, 3212455935, +STORE, 3212451840, 3212455935, +STORE, 4041080832, 4041084927, +STORE, 3623890944, 3624169471, +SNULL, 4041080832, 4041084927, +STORE, 4041080832, 4041084927, +SNULL, 4041080832, 4041084927, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +STORE, 4041080832, 4041084927, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +STORE, 3211386880, 3212439551, +SNULL, 3211386880, 3211390975, +STORE, 3211390976, 3212439551, +STORE, 3211386880, 3211390975, +SNULL, 3212435455, 3212439551, +STORE, 3211390976, 3212435455, +STORE, 3212435456, 3212439551, +STORE, 4040941568, 4040945663, +STORE, 3937169408, 3937189887, +STORE, 3623485440, 3623616511, +SNULL, 717225983, 1388314623, 
+STORE, 314572800, 717225983, +STORE, 717225984, 1388314623, +SNULL, 717225984, 1388314623, +STORE, 3937112064, 3937132543, +SNULL, 3937116159, 3937132543, +STORE, 3937112064, 3937116159, +STORE, 3937116160, 3937132543, +SNULL, 3211395071, 3212435455, +STORE, 3211390976, 3211395071, +STORE, 3211395072, 3212435455, +STORE, 4000817152, 4000821247, +STORE, 3974823936, 3974832127, +STORE, 3595284480, 3595431935, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +STORE, 3487174656, 3487584255, +STORE, 3999440896, 3999444991, +STORE, 3997679616, 3997683711, +STORE, 3996295168, 3996299263, +STORE, 3996090368, 3996094463, +STORE, 3210866688, 3211386879, +SNULL, 3210866688, 3211001855, +STORE, 3211001856, 3211386879, +STORE, 3210866688, 3211001855, +SNULL, 3210866688, 3211001855, +SNULL, 3211038719, 3211386879, +STORE, 3211001856, 3211038719, +STORE, 3211038720, 3211386879, +SNULL, 3211038720, 3211386879, +SNULL, 3211001856, 3211022335, +STORE, 3211022336, 3211038719, +STORE, 3211001856, 3211022335, +SNULL, 3211001856, 3211022335, +STORE, 3211001856, 3211022335, +SNULL, 3211022336, 3211030527, +STORE, 3211030528, 3211038719, +STORE, 3211022336, 3211030527, +SNULL, 3211022336, 3211030527, +STORE, 3211022336, 3211030527, +SNULL, 3211030528, 3211034623, +STORE, 3211034624, 3211038719, +STORE, 3211030528, 3211034623, +SNULL, 3211030528, 3211034623, +STORE, 3211030528, 3211034623, +SNULL, 3211034624, 3211038719, +STORE, 3211034624, 3211038719, +STORE, 3994906624, 3994910719, +SNULL, 4049240064, 4049244159, +STORE, 4049240064, 4049244159, +SNULL, 3994906624, 3994910719, +SNULL, 3996090368, 3996094463, +SNULL, 3996295168, 3996299263, +SNULL, 3997679616, 3997683711, +SNULL, 3999440896, 3999444991, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 3487174656, 3487584255, +SNULL, 3211022336, 3211030527, +STORE, 3211022336, 
3211030527, +STORE, 3999440896, 3999444991, +STORE, 3210199040, 3211001855, +SNULL, 3999440896, 3999444991, +STORE, 3999440896, 3999444991, +SNULL, 3999440896, 3999444991, +STORE, 3594821632, 3594952703, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 4048183296, 4048592895, +STORE, 4048592896, 4049002495, +STORE, 4048183296, 4048592895, +STORE, 4048183296, 4049002495, +SNULL, 1914101759, 1969434623, +STORE, 1914097664, 1914101759, +STORE, 1914101760, 1969434623, +STORE, 3567108096, 3567239167, +STORE, 3973832704, 3973840895, +STORE, 3209113600, 3210199039, +SNULL, 3209113600, 3209117695, +STORE, 3209117696, 3210199039, +STORE, 3209113600, 3209117695, +SNULL, 3210194943, 3210199039, +STORE, 3209117696, 3210194943, +STORE, 3210194944, 3210199039, +STORE, 3935858688, 3935879167, +SNULL, 3935862783, 3935879167, +STORE, 3935858688, 3935862783, +STORE, 3935862784, 3935879167, 
+SNULL, 3209121791, 3210194943, +STORE, 3209117696, 3209121791, +STORE, 3209121792, 3210194943, +STORE, 3528749056, 3528880127, +STORE, 3968200704, 3968208895, +STORE, 3208028160, 3209117695, +SNULL, 3208028160, 3208032255, +STORE, 3208032256, 3209117695, +STORE, 3208028160, 3208032255, +SNULL, 3209109503, 3209117695, +STORE, 3208032256, 3209109503, +STORE, 3209109504, 3209117695, +STORE, 3888123904, 3888144383, +SNULL, 3888127999, 3888144383, +STORE, 3888123904, 3888127999, +STORE, 3888128000, 3888144383, +SNULL, 3208036351, 3209109503, +STORE, 3208032256, 3208036351, +STORE, 3208036352, 3209109503, +SNULL, 3968200704, 3968208895, +SNULL, 3888123904, 3888144383, +SNULL, 3209109504, 3209113599, +STORE, 3209113600, 3209117695, +STORE, 3209109504, 3209113599, +SNULL, 3208028160, 3209113599, +STORE, 3208060928, 3209117695, +SNULL, 3208060928, 3208065023, +STORE, 3208065024, 3209117695, +STORE, 3208060928, 3208065023, +SNULL, 3209109503, 3209117695, +STORE, 3208065024, 3209109503, +STORE, 3209109504, 3209117695, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 
3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3888123904, 3888144383, +SNULL, 3888127999, 3888144383, +STORE, 3888123904, 3888127999, +STORE, 3888128000, 3888144383, +SNULL, 3208069119, 3209109503, +STORE, 3208065024, 3208069119, +STORE, 3208069120, 3209109503, +STORE, 3968200704, 3968208895, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3527778304, 3527909375, +STORE, 3999440896, 3999444991, +STORE, 3997679616, 3997683711, +STORE, 1914097664, 1914105855, +STORE, 1914105856, 1969434623, +STORE, 3957583872, 3957592063, +STORE, 3206975488, 3208065023, +SNULL, 3206975488, 3206979583, +STORE, 3206979584, 3208065023, +STORE, 3206975488, 3206979583, +SNULL, 3208056831, 3208065023, +STORE, 3206979584, 3208056831, +STORE, 3208056832, 3208065023, +STORE, 3956736000, 3956744191, +STORE, 3205890048, 3206979583, +SNULL, 3205890048, 3205894143, +STORE, 3205894144, 3206979583, +STORE, 3205890048, 3205894143, +SNULL, 3206971391, 3206979583, +STORE, 3205894144, 3206971391, +STORE, 3206971392, 3206979583, +STORE, 3806101504, 3806121983, +SNULL, 3806105599, 3806121983, +STORE, 3806101504, 3806105599, +STORE, 3806105600, 3806121983, +SNULL, 3206983679, 3208056831, +STORE, 3206979584, 3206983679, +STORE, 3206983680, 3208056831, +STORE, 3806081024, 3806101503, +SNULL, 3806085119, 3806101503, +STORE, 3806081024, 3806085119, +STORE, 3806085120, 3806101503, +SNULL, 3205898239, 3206971391, +STORE, 3205894144, 3205898239, +STORE, 3205898240, 3206971391, +STORE, 3956015104, 3956023295, +STORE, 3204804608, 
3205894143, +SNULL, 3204804608, 3204808703, +STORE, 3204808704, 3205894143, +STORE, 3204804608, 3204808703, +SNULL, 3205885951, 3205894143, +STORE, 3204808704, 3205885951, +STORE, 3205885952, 3205894143, +STORE, 3803471872, 3803492351, +STORE, 3803451392, 3803471871, +STORE, 3803451392, 3803492351, +SNULL, 3957583872, 3957592063, +SNULL, 3806101504, 3806121983, +SNULL, 3206975487, 3206979583, +STORE, 3206971392, 3206975487, +STORE, 3206975488, 3206979583, +SNULL, 3208056832, 3208060927, +STORE, 3208060928, 3208065023, +STORE, 3208056832, 3208060927, +SNULL, 3206975488, 3208060927, +STORE, 3801845760, 3801878527, +STORE, 3806101504, 3806121983, +SNULL, 3806105599, 3806121983, +STORE, 3806101504, 3806105599, +STORE, 3806105600, 3806121983, +SNULL, 3204812799, 3205885951, +STORE, 3204808704, 3204812799, +STORE, 3204812800, 3205885951, +STORE, 1914097664, 1914109951, +STORE, 1914109952, 1969434623, +STORE, 3957583872, 3957592063, +STORE, 3206971392, 3208065023, +SNULL, 3206971392, 3206979583, +STORE, 3206979584, 3208065023, +STORE, 3206971392, 3206979583, +SNULL, 3208056831, 3208065023, +STORE, 3206979584, 3208056831, +STORE, 3208056832, 3208065023, +STORE, 3801825280, 3801845759, +SNULL, 3801829375, 3801845759, +STORE, 3801825280, 3801829375, +STORE, 3801829376, 3801845759, +SNULL, 3206983679, 3208056831, +STORE, 3206979584, 3206983679, +STORE, 3206983680, 3208056831, +STORE, 3202707456, 3204804607, +SNULL, 3202707456, 3204804607, +STORE, 3202707456, 3204804607, +STORE, 3200610304, 3202707455, +SNULL, 3202707456, 3204804607, +SNULL, 3200610304, 3202707455, +STORE, 3202707456, 3204804607, +SNULL, 3202707456, 3204804607, +STORE, 3202707456, 3204804607, +SNULL, 3202707456, 3204804607, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, 
+SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3527647232, 3527778303, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +STORE, 3487059968, 3487584255, +SNULL, 3487059968, 3487301631, +STORE, 3487301632, 3487584255, +STORE, 3487059968, 3487301631, +SNULL, 3487059968, 3487301631, +SNULL, 3487563775, 3487584255, +STORE, 3487301632, 3487563775, +STORE, 3487563776, 3487584255, +SNULL, 3487563776, 3487584255, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3524046848, 3524177919, +STORE, 3487170560, 3487301631, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 
3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3487039488, 3487170559, +STORE, 3487039488, 3487301631, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3204280320, 3204804607, +SNULL, 3204280320, 3204448255, +STORE, 3204448256, 3204804607, +STORE, 3204280320, 3204448255, +SNULL, 3204280320, 3204448255, +SNULL, 3204710399, 3204804607, +STORE, 3204448256, 3204710399, +STORE, 3204710400, 3204804607, +SNULL, 3204710400, 3204804607, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3996295168, 3996299263, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +SNULL, 3996295168, 3996299263, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 
3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3486908416, 3487039487, +STORE, 3486908416, 3487301631, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3223326720, 3290435583, +SNULL, 3223326720, 3256881151, +STORE, 3256881152, 3290435583, +STORE, 3223326720, 3256881151, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +STORE, 3201826816, 3202351103, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +STORE, 3202351104, 3204448255, +SNULL, 3202351104, 3204448255, +SNULL, 3803471871, 3803492351, +STORE, 3803451392, 3803471871, +STORE, 3803471872, 3803492351, +SNULL, 3803471872, 3803492351, +SNULL, 3803451392, 3803471871, +STORE, 3798999040, 3799101439, +SNULL, 3798999040, 3799101439, +STORE, 3952644096, 3952652287, +STORE, 3203362816, 3204448255, +SNULL, 3203362816, 3203366911, 
+STORE, 3203366912, 3204448255, +STORE, 3203362816, 3203366911, +SNULL, 3204444159, 3204448255, +STORE, 3203366912, 3204444159, +STORE, 3204444160, 3204448255, +STORE, 3803471872, 3803492351, +SNULL, 3803475967, 3803492351, +STORE, 3803471872, 3803475967, +STORE, 3803475968, 3803492351, +SNULL, 3203371007, 3204444159, +STORE, 3203366912, 3203371007, +STORE, 3203371008, 3204444159, +STORE, 3199729664, 3201826815, +SNULL, 3199729664, 3201826815, +STORE, 3199729664, 3201826815, +SNULL, 3199729664, 3201826815, +STORE, 3199729664, 3201826815, +SNULL, 3199729664, 3201826815, +STORE, 3199729664, 3201826815, +SNULL, 3199729664, 3201826815, +STORE, 3199729664, 3201826815, +SNULL, 3199729664, 3201826815, +STORE, 3200774144, 3201826815, +SNULL, 3200774144, 3200778239, +STORE, 3200778240, 3201826815, +STORE, 3200774144, 3200778239, +SNULL, 3201822719, 3201826815, +STORE, 3200778240, 3201822719, +STORE, 3201822720, 3201826815, +STORE, 3803451392, 3803471871, +SNULL, 3803455487, 3803471871, +STORE, 3803451392, 3803455487, +STORE, 3803455488, 3803471871, +SNULL, 3200782335, 3201822719, +STORE, 3200778240, 3200782335, +STORE, 3200782336, 3201822719, +STORE, 3949666304, 3949674495, +STORE, 3949408256, 3949416447, +STORE, 3199688704, 3200778239, +SNULL, 3199688704, 3199692799, +STORE, 3199692800, 3200778239, +STORE, 3199688704, 3199692799, +SNULL, 3200770047, 3200778239, +STORE, 3199692800, 3200770047, +STORE, 3200770048, 3200778239, +STORE, 3799306240, 3799326719, +SNULL, 3799310335, 3799326719, +STORE, 3799306240, 3799310335, +STORE, 3799310336, 3799326719, +SNULL, 3199696895, 3200770047, +STORE, 3199692800, 3199696895, +STORE, 3199696896, 3200770047, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +STORE, 3799277568, 3799306239, +SNULL, 
3799277568, 3799306239, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +SNULL, 4041162751, 4041170943, +STORE, 4041154560, 4041162751, +STORE, 4041162752, 4041170943, +SNULL, 4041162752, 4041170943, +SNULL, 4041154560, 4041162751, +SNULL, 4041191424, 4041211903, +SNULL, 4041170944, 4041191423, +SNULL, 3626471423, 3626475519, +STORE, 3626467328, 3626471423, +STORE, 3626471424, 3626475519, +SNULL, 3626471424, 3627524095, +SNULL, 3625418751, 3625422847, +STORE, 3625414656, 3625418751, +STORE, 3625418752, 3625422847, +SNULL, 3625418752, 3626471423, +STORE, 3627393024, 3627524095, +STORE, 3627261952, 3627393023, +STORE, 3627261952, 3627524095, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +STORE, 3195494400, 3197591551, +SNULL, 3197591552, 3199688703, +SNULL, 3195494400, 3197591551, +STORE, 3197591552, 3199688703, +SNULL, 3197591552, 3199688703, +STORE, 3197591552, 3199688703, +STORE, 3195494400, 3197591551, +SNULL, 3197591552, 3199688703, +SNULL, 3195494400, 3197591551, +STORE, 3798999040, 3799101439, +SNULL, 3798999040, 3799101439, +/* + * mmap: unmapped_area_topdown: ffff9a9f14ddaa80 + * Gap was found: mt 4041162752 gap_end 4041183232 + * mmap: window was 4052029440 - 4096 size 28672 + * mmap: mas.min 4041154560 max 4041191423 mas.last 
4041191423 + * mmap: mas.index 4041162752 align mask 0 offset 0 + * mmap: rb_find_vma find on 4041162752 => ffff9a9f03d19678 (ffff9a9f03d19678) + */ + }; + + unsigned long set43[] = { +STORE, 140737488347136, 140737488351231, +STORE, 140734187720704, 140737488351231, +SNULL, 140734187724800, 140737488351231, +STORE, 140734187589632, 140734187724799, +STORE, 4194304, 6443007, +STORE, 4337664, 6443007, +STORE, 4194304, 4337663, +SNULL, 4337664, 6443007, +STORE, 6430720, 6443007, +STORE, 206158430208, 206160674815, +STORE, 206158569472, 206160674815, +STORE, 206158430208, 206158569471, +SNULL, 206158569472, 206160674815, +STORE, 206160662528, 206160670719, +STORE, 206160670720, 206160674815, +STORE, 140734188756992, 140734188765183, +STORE, 140734188740608, 140734188756991, +STORE, 140501948112896, 140501948116991, + }; + + int count = 0; + void *ptr = NULL; + + MA_STATE(mas, mt, 0, 0); + + mt_set_non_kernel(3); + check_erase2_testset(mt, set, ARRAY_SIZE(set)); + mt_set_non_kernel(0); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set2, ARRAY_SIZE(set2)); + start = 140735933894656; + MT_BUG_ON(mt, !!mt_find(mt, &start, 140735933906943UL)); + mtree_destroy(mt); + + mt_set_non_kernel(2); + mt_init_flags(mt, 0); + check_erase2_testset(mt, set3, ARRAY_SIZE(set3)); + mt_set_non_kernel(0); + mtree_destroy(mt); + + mt_init_flags(mt, 0); + check_erase2_testset(mt, set4, ARRAY_SIZE(set4)); + rcu_read_lock(); + mas_for_each(&mas, entry, ULONG_MAX) { + if (xa_is_zero(entry)) + continue; + } + rcu_read_unlock(); + rcu_barrier(); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_set_non_kernel(100); + check_erase2_testset(mt, set5, ARRAY_SIZE(set5)); + rcu_barrier(); + mt_set_non_kernel(0); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set6, ARRAY_SIZE(set6)); + rcu_barrier(); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, 
set7, ARRAY_SIZE(set7)); + rcu_barrier(); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set8, ARRAY_SIZE(set8)); + rcu_barrier(); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set9, ARRAY_SIZE(set9)); + rcu_barrier(); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set10, ARRAY_SIZE(set10)); + rcu_barrier(); + mtree_destroy(mt); + + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set11, ARRAY_SIZE(set11)); + rcu_barrier(); + mas_empty_area_rev(&mas, 12288, 140014592737280, 0x2000); + MT_BUG_ON(mt, mas.last != 140014592573439); + mtree_destroy(mt); + + mas_reset(&mas); + mas.tree = mt; + count = 0; + mas.index = 0; + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set12, ARRAY_SIZE(set12)); + rcu_barrier(); + mas_for_each(&mas, entry, ULONG_MAX) { + if (xa_is_zero(entry)) + continue; + BUG_ON(count > 12); + count++; + } + mtree_destroy(mt); + + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set13, ARRAY_SIZE(set13)); + mtree_erase(mt, 140373516443648); + rcu_read_lock(); + mas_empty_area_rev(&mas, 0, 140373518663680, 4096); + rcu_read_unlock(); + mtree_destroy(mt); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set14, ARRAY_SIZE(set14)); + rcu_barrier(); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set15, ARRAY_SIZE(set15)); + rcu_barrier(); + mtree_destroy(mt); + + /* set16 was to find a bug on limit updating at slot 0. 
*/ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set16, ARRAY_SIZE(set16)); + rcu_barrier(); + mas_empty_area_rev(&mas, 4096, 139921865637888, 0x6000); + MT_BUG_ON(mt, mas.last != 139921865547775); + mt_set_non_kernel(0); + mtree_destroy(mt); + + /* + * set17 found a bug in walking backwards and not counting nulls at + * the end. This could cause a gap to be missed if the null had any + * size. + */ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set17, ARRAY_SIZE(set17)); + rcu_barrier(); + mas_empty_area_rev(&mas, 4096, 139953197334528, 0x1000); + MT_BUG_ON(mt, mas.last != 139953197322239); +/* MT_BUG_ON(mt, mas.index != 139953197318144); */ + mt_set_non_kernel(0); + mtree_destroy(mt); + + /* + * set18 found a bug in walking backwards and not setting the max from + * the node, but using the parent node. This was only an issue if the + * next slot in the parent had what we needed. + */ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set18, ARRAY_SIZE(set18)); + rcu_barrier(); + mas_empty_area_rev(&mas, 4096, 140222972858368, 2215936); + MT_BUG_ON(mt, mas.last != 140222968475647); + /*MT_BUG_ON(mt, mas.index != 140222966259712); */ + mt_set_non_kernel(0); + mtree_destroy(mt); + + /* + * set19 found 2 bugs in prev. + * 1. If we hit root without finding anything, then there was an + * infinite loop. + * 2. The first ascending wasn't using the correct slot which may have + * caused missed entries. 
+ */ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set19, ARRAY_SIZE(set19)); + rcu_barrier(); + mas.index = 140656779083776; + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != xa_mk_value(140656779083776)); + entry = mas_prev(&mas, 0); + MT_BUG_ON(mt, entry != xa_mk_value(140656766251008)); + mt_set_non_kernel(0); + mtree_destroy(mt); + + /* + * set20 found a bug in mas_may_move_gap due to the slot being + * overwritten during the __mas_add operation and setting it to zero. + */ + mt_set_non_kernel(99); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set20, ARRAY_SIZE(set20)); + rcu_barrier(); + check_load(mt, 94849009414144, NULL); + mt_set_non_kernel(0); + mtree_destroy(mt); + + mt_set_non_kernel(99); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set21, ARRAY_SIZE(set21)); + rcu_barrier(); + mt_validate(mt); + mt_set_non_kernel(0); + mtree_destroy(mt); + + mt_set_non_kernel(999); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set22, ARRAY_SIZE(set22)); + rcu_barrier(); + mt_validate(mt); + ptr = mtree_load(mt, 140551363362816); + MT_BUG_ON(mt, ptr == mtree_load(mt, 140551363420159)); + mt_set_non_kernel(0); + mtree_destroy(mt); + + mt_set_non_kernel(99); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set23, ARRAY_SIZE(set23)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + + mt_set_non_kernel(99); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set24, ARRAY_SIZE(set24)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + mt_set_non_kernel(99); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set25, ARRAY_SIZE(set25)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + /* Split on NULL followed by delete - causes gap issues. 
*/ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set26, ARRAY_SIZE(set26)); + rcu_barrier(); + mas_empty_area_rev(&mas, 4096, 140109042671616, 409600); + MT_BUG_ON(mt, mas.last != 140109040959487); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + /* Split on NULL followed by delete - causes gap issues. */ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set27, ARRAY_SIZE(set27)); + rcu_barrier(); + MT_BUG_ON(mt, 0 != mtree_load(mt, 140415537422336)); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set28, ARRAY_SIZE(set28)); + rcu_barrier(); + mas_empty_area_rev(&mas, 4096, 139918413357056, 2097152); + /* Search for the size of gap then align it (offset 0) */ + mas.index = (mas.last + 1 - 2097152 - 0) & (~2093056); + MT_BUG_ON(mt, mas.index != 139918401601536); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + /* This test found issues with retry moving rebalanced nodes so the + * incorrect parent pivot was updated. + */ + mt_set_non_kernel(999); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set29, ARRAY_SIZE(set29)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + /* This test found issues with deleting all entries in a node when + * surrounded by entries in the next nodes, then deleting the entries + * surrounding the node filled with deleted entries. 
+ */ + mt_set_non_kernel(999); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set30, ARRAY_SIZE(set30)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + /* This test found an issue with deleting all entries in a node that was + * the end node and mas_gap incorrectly set next = curr, and curr = prev + * then moved next to the left, losing data. + */ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set31, ARRAY_SIZE(set31)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set32, ARRAY_SIZE(set32)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + +/* + * mmap: empty_area_topdown: ffff88821c9cb600 Gap was found: + * mt 140582827569152 gap_end 140582869532672 + * mmap: window was 140583656296448 - 4096 size 134217728 + * mmap: mas.min 94133881868288 max 140582961786879 mas.last 140582961786879 + * mmap: mas.index 140582827569152 align mask 0 offset 0 + * mmap: rb_find_vma find on + * 140582827569152 => ffff88821c5bad00 (ffff88821c5bad00) + */ + + /* move gap failed due to an entirely empty node */ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set33, ARRAY_SIZE(set33)); + rcu_barrier(); + mas_empty_area_rev(&mas, 4096, 140583656296448, 134217728); + MT_BUG_ON(mt, mas.last != 140583003750399); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + /* + * Incorrect gap in tree caused by mas_prev not setting the limits + * correctly while walking down. 
+ */ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set34, ARRAY_SIZE(set34)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + /* Empty leaf at the end of a parent caused incorrect gap. */ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set35, ARRAY_SIZE(set35)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + mt_set_non_kernel(99); + /* Empty leaf at the end of a parent caused incorrect gap. */ + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set36, ARRAY_SIZE(set36)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set37, ARRAY_SIZE(set37)); + rcu_barrier(); + MT_BUG_ON(mt, 0 != mtree_load(mt, 94637033459712)); + mt_validate(mt); + mtree_destroy(mt); + + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set38, ARRAY_SIZE(set38)); + rcu_barrier(); + MT_BUG_ON(mt, 0 != mtree_load(mt, 94637033459712)); + mt_validate(mt); + mtree_destroy(mt); + + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set39, ARRAY_SIZE(set39)); + rcu_barrier(); + mt_validate(mt); + mtree_destroy(mt); + + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set40, ARRAY_SIZE(set40)); + rcu_barrier(); + mt_validate(mt); + mtree_destroy(mt); + + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set41, ARRAY_SIZE(set41)); + rcu_barrier(); + mt_validate(mt); + mtree_destroy(mt); + + /* move gap failed due to an entirely empty node. 
*/ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set42, ARRAY_SIZE(set42)); + rcu_barrier(); + mas_empty_area_rev(&mas, 4096, 4052029440, 28672); + MT_BUG_ON(mt, mas.last != 4041211903); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); + + /* gap calc off by one */ + mt_set_non_kernel(99); + mas_reset(&mas); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_erase2_testset(mt, set43, ARRAY_SIZE(set43)); + rcu_barrier(); + mt_set_non_kernel(0); + mt_validate(mt); + mtree_destroy(mt); +} +/* + * check_alloc_rev_range() - Insert the address ranges below (each value + * shifted right by 12), then check that mas_empty_area_rev() finds every + * listed hole and that check_mtree_alloc_rrange() produces the expected + * address and return code for every request in req_range. + */ +static noinline void check_alloc_rev_range(struct maple_tree *mt) +{ + /* + * Generated by: + * cat /proc/self/maps | awk '{print $1}'| + * awk -F "-" '{printf "0x%s, 0x%s, ", $1, $2}' + */ + + unsigned long range[] = { + /* Inclusive , Exclusive. */ + 0x565234af2000, 0x565234af4000, + 0x565234af4000, 0x565234af9000, + 0x565234af9000, 0x565234afb000, + 0x565234afc000, 0x565234afd000, + 0x565234afd000, 0x565234afe000, + 0x565235def000, 0x565235e10000, + 0x7f36d4bfd000, 0x7f36d4ee2000, + 0x7f36d4ee2000, 0x7f36d4f04000, + 0x7f36d4f04000, 0x7f36d504c000, + 0x7f36d504c000, 0x7f36d5098000, + 0x7f36d5098000, 0x7f36d5099000, + 0x7f36d5099000, 0x7f36d509d000, + 0x7f36d509d000, 0x7f36d509f000, + 0x7f36d509f000, 0x7f36d50a5000, + 0x7f36d50b9000, 0x7f36d50db000, + 0x7f36d50db000, 0x7f36d50dc000, + 0x7f36d50dc000, 0x7f36d50fa000, + 0x7f36d50fa000, 0x7f36d5102000, + 0x7f36d5102000, 0x7f36d5103000, + 0x7f36d5103000, 0x7f36d5104000, + 0x7f36d5104000, 0x7f36d5105000, + 0x7fff5876b000, 0x7fff5878d000, + 0x7fff5878e000, 0x7fff58791000, + 0x7fff58791000, 0x7fff58793000, + }; + + unsigned long holes[] = { + /* + * Note: start of hole is INCLUSIVE + * end of hole is EXCLUSIVE + * (the same convention as the table above.)
+ * Start of hole, end of hole, size of hole (+1) + */ + 0x565234afb000, 0x565234afc000, 0x1000, + 0x565234afe000, 0x565235def000, 0x12F1000, + 0x565235e10000, 0x7f36d4bfd000, 0x28E49EDED000, + }; + + /* + * req_range consists of groups of 5 values. + * 1. min index + * 2. max index + * 3. size + * 4. number that should be returned. + * 5. return value + */ + unsigned long req_range[] = { + 0x565234af9000, /* Min */ + 0x7fff58791000, /* Max */ + 0x1000, /* Size */ + 0x7fff5878d << 12, /* First rev hole of size 0x1000 */ + 0, /* Return value success. */ + + 0x0, /* Min */ + 0x565234AF1 << 12, /* Max */ + 0x3000, /* Size */ + 0x565234AEE << 12, /* max - 3. */ + 0, /* Return value success. */ + + 0x0, /* Min */ + -1, /* Max */ + 0x1000, /* Size */ + 562949953421311 << 12,/* First rev hole of size 0x1000 */ + 0, /* Return value success. */ + + 0x0, /* Min */ + 0x7F36D510A << 12, /* Max */ + 0x4000, /* Size */ + 0x7F36D5106 << 12, /* First rev hole of size 0x4000 */ + 0, /* Return value success. */ + + /* Ascend test. */ + 0x0, + 34148798629 << 12, + 19 << 12, + 34148797418 << 12, + 0x0, + + /* Too big test.
*/ + 0x0, + 18446744073709551615UL, + 562915594369134UL << 12, + 0x0, + -EBUSY, + + }; + + int i, range_count = ARRAY_SIZE(range); + int req_range_count = ARRAY_SIZE(req_range); + unsigned long min = 0; + + MA_STATE(mas, mt, 0, 0); + + mtree_store_range(mt, MTREE_ALLOC_MAX, ULONG_MAX, XA_ZERO_ENTRY, + GFP_KERNEL); /* fill MTREE_ALLOC_MAX..ULONG_MAX with XA_ZERO_ENTRY */ +#define DEBUG_REV_RANGE 0 + for (i = 0; i < range_count; i += 2) { + /* Inclusive, Inclusive (with the -1) */ + +#if DEBUG_REV_RANGE + pr_debug("\t%s: Insert %lu-%lu\n", __func__, range[i] >> 12, + (range[i + 1] >> 12) - 1); +#endif + check_insert_range(mt, range[i] >> 12, (range[i + 1] >> 12) - 1, + xa_mk_value(range[i] >> 12), 0); + mt_validate(mt); + } + + + for (i = 0; i < ARRAY_SIZE(holes); i += 3) { /* holes[] holds (start, end, size) triples */ +#if DEBUG_REV_RANGE + pr_debug("Search from %lu-%lu for gap %lu should be at %lu\n", + min, holes[i+1]>>12, holes[i+2]>>12, + holes[i] >> 12); +#endif + MT_BUG_ON(mt, mas_empty_area_rev(&mas, min, + holes[i+1] >> 12, + holes[i+2] >> 12)); +#if DEBUG_REV_RANGE + pr_debug("Found %lu %lu\n", mas.index, mas.last); + pr_debug("gap %lu %lu\n", (holes[i] >> 12), + (holes[i+1] >> 12)); +#endif + MT_BUG_ON(mt, mas.last + 1 != (holes[i+1] >> 12)); /* the found gap must end where the hole ends */ + MT_BUG_ON(mt, mas.index != (holes[i+1] >> 12) - (holes[i+2] >> 12)); + min = holes[i+1] >> 12; /* the next search starts where this hole ends */ + mas_reset(&mas); + } + + for (i = 0; i < req_range_count; i += 5) { /* one 5-value request per pass */ +#if DEBUG_REV_RANGE + pr_debug("\tReverse request between %lu-%lu size %lu, should get %lu\n", + req_range[i] >> 12, + (req_range[i + 1] >> 12) - 1, + req_range[i+2] >> 12, + req_range[i+3] >> 12); +#endif + check_mtree_alloc_rrange(mt, + req_range[i] >> 12, /* start */ + req_range[i+1] >> 12, /* end */ + req_range[i+2] >> 12, /* size */ + req_range[i+3] >> 12, /* expected address */ + req_range[i+4], /* expected return */ + xa_mk_value(req_range[i] >> 12)); /* pointer */ + mt_validate(mt); + } + + mt_set_non_kernel(1); + mtree_erase(mt, 34148798727); /* create a deleted range.
*/ + check_mtree_alloc_rrange(mt, 0, 34359052173, 210253414, + 34148798725, 0, mt); + + mtree_destroy(mt); +} + +/* + * check_alloc_range() - Insert the address ranges below (each value shifted + * right by 12), then check that mas_empty_area() finds every listed hole and + * that check_mtree_alloc_range() produces the expected address and return + * code for every request in req_range. + */ +static noinline void check_alloc_range(struct maple_tree *mt) +{ + /* + * Generated by: + * cat /proc/self/maps|awk '{print $1}'| + * awk -F "-" '{printf "0x%s, 0x%s, ", $1, $2}' + */ + + unsigned long range[] = { + /* Inclusive , Exclusive. */ + 0x565234af2000, 0x565234af4000, + 0x565234af4000, 0x565234af9000, + 0x565234af9000, 0x565234afb000, + 0x565234afc000, 0x565234afd000, + 0x565234afd000, 0x565234afe000, + 0x565235def000, 0x565235e10000, + 0x7f36d4bfd000, 0x7f36d4ee2000, + 0x7f36d4ee2000, 0x7f36d4f04000, + 0x7f36d4f04000, 0x7f36d504c000, + 0x7f36d504c000, 0x7f36d5098000, + 0x7f36d5098000, 0x7f36d5099000, + 0x7f36d5099000, 0x7f36d509d000, + 0x7f36d509d000, 0x7f36d509f000, + 0x7f36d509f000, 0x7f36d50a5000, + 0x7f36d50b9000, 0x7f36d50db000, + 0x7f36d50db000, 0x7f36d50dc000, + 0x7f36d50dc000, 0x7f36d50fa000, + 0x7f36d50fa000, 0x7f36d5102000, + 0x7f36d5102000, 0x7f36d5103000, + 0x7f36d5103000, 0x7f36d5104000, + 0x7f36d5104000, 0x7f36d5105000, + 0x7fff5876b000, 0x7fff5878d000, + 0x7fff5878e000, 0x7fff58791000, + 0x7fff58791000, 0x7fff58793000, + }; + unsigned long holes[] = { + /* Start of hole, end of hole, size of hole (+1) */ + 0x565234afb000, 0x565234afc000, 0x1000, + 0x565234afe000, 0x565235def000, 0x12F1000, + 0x565235e10000, 0x7f36d4bfd000, 0x28E49EDED000, + }; + + /* + * req_range consists of groups of 5 values. + * 1. min index + * 2. max index + * 3. size + * 4. number that should be returned. + * 5. return value + */ + unsigned long req_range[] = { + 0x565234af9000, /* Min */ + 0x7fff58791000, /* Max */ + 0x1000, /* Size */ + 0x565234afb000, /* First hole in our data of size 1000. */ + 0, /* Return value success. */ + + 0x0, /* Min */ + 0x7fff58791000, /* Max */ + 0x1F00, /* Size */ + 0x0, /* First hole in our data of size 2000. */ + 0, /* Return value success. */ + + /* Test ascend.
*/ + 34148797436 << 12, /* Min */ + 0x7fff587AF000, /* Max */ + 0x3000, /* Size */ + 34148798629 << 12, /* Expected location */ + 0, /* Return value success. */ + + /* Test failing. */ + 34148798623 << 12, /* Min */ + 34148798683 << 12, /* Max */ + 0x15000, /* Size */ + 0, /* Expected location */ + -EBUSY, /* Return value failed. */ + + /* Test filling entire gap. */ + 34148798623 << 12, /* Min */ + 0x7fff587AF000, /* Max */ + 0x10000, /* Size */ + 34148798632 << 12, /* Expected location */ + 0, /* Return value success. */ + + /* Test walking off the end of root. */ + 0, /* Min */ + -1, /* Max */ + -1, /* Size */ + 0, /* Expected location */ + -EBUSY, /* Return value failure. */ + + /* Test looking for too large a hole across entire range. */ + 0, /* Min */ + -1, /* Max */ + 4503599618982063UL << 12, /* Size */ + 34359052178 << 12, /* Expected location */ + -EBUSY, /* Return failure. */ + }; + int i, range_count = ARRAY_SIZE(range); + int req_range_count = ARRAY_SIZE(req_range); + unsigned long min = 0x565234af2000; + /* Declared with the other locals: no declarations after statements. */ + MA_STATE(mas, mt, 0, 0); + + mtree_store_range(mt, MTREE_ALLOC_MAX, ULONG_MAX, XA_ZERO_ENTRY, + GFP_KERNEL); + for (i = 0; i < range_count; i += 2) { +#define DEBUG_ALLOC_RANGE 0 +#if DEBUG_ALLOC_RANGE + pr_debug("\tInsert %lu-%lu\n", range[i] >> 12, + (range[i + 1] >> 12) - 1); + mt_dump(mt); +#endif + check_insert_range(mt, range[i] >> 12, (range[i + 1] >> 12) - 1, + xa_mk_value(range[i] >> 12), 0); + mt_validate(mt); + } + + /* holes[] holds (start, end, size) triples; each search begins at min. */ + for (i = 0; i < ARRAY_SIZE(holes); i += 3) { + +#if DEBUG_ALLOC_RANGE + pr_debug("\tGet empty %lu-%lu size %lu (%lx-%lx)\n", min >> 12, + holes[i+1] >> 12, holes[i+2] >> 12, + min, holes[i+1]); +#endif + MT_BUG_ON(mt, mas_empty_area(&mas, min >> 12, + holes[i+1] >> 12, + holes[i+2] >> 12)); + MT_BUG_ON(mt, mas.index != holes[i] >> 12); + min = holes[i+1]; /* the next search starts where this hole ends */ + mas_reset(&mas); + } + for (i = 0; i < req_range_count; i += 5) { +#if DEBUG_ALLOC_RANGE + pr_debug("\tTest %d: %lu-%lu size %lu expected %lu (%lu-%lu)\n", + i/5,
req_range[i] >> 12, req_range[i + 1] >> 12, + req_range[i + 2] >> 12, req_range[i + 3] >> 12, + req_range[i], req_range[i+1]); +#endif + check_mtree_alloc_range(mt, + req_range[i] >> 12, /* start */ + req_range[i+1] >> 12, /* end */ + req_range[i+2] >> 12, /* size */ + req_range[i+3] >> 12, /* expected address */ + req_range[i+4], /* expected return */ + xa_mk_value(req_range[i] >> 12)); /* pointer */ + mt_validate(mt); +#if DEBUG_ALLOC_RANGE + mt_dump(mt); +#endif + } + + mtree_destroy(mt); +} + +/* + * check_ranges() - Store ranges that overlap or overwrite existing entries + * and check the tree height and the loaded values afterwards. + */ +static noinline void check_ranges(struct maple_tree *mt) +{ + int i, val, val2; + unsigned long r[] = { + 10, 15, + 20, 25, + 17, 22, /* Overlaps previous range. */ + 9, 1000, /* Huge. */ + 100, 200, + 45, 168, + 118, 128, + }; + + MT_BUG_ON(mt, !mtree_empty(mt)); + check_insert_range(mt, r[0], r[1], xa_mk_value(r[0]), 0); + check_insert_range(mt, r[2], r[3], xa_mk_value(r[2]), 0); + check_insert_range(mt, r[4], r[5], xa_mk_value(r[4]), -EEXIST); + MT_BUG_ON(mt, !mt_height(mt)); + /* Store */ + check_store_range(mt, r[4], r[5], xa_mk_value(r[4]), 0); + check_store_range(mt, r[6], r[7], xa_mk_value(r[6]), 0); + check_store_range(mt, r[8], r[9], xa_mk_value(r[8]), 0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + MT_BUG_ON(mt, mt_height(mt)); + + check_seq(mt, 50, false); + mt_set_non_kernel(4); + check_store_range(mt, 5, 47, xa_mk_value(47), 0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + /* Create tree of 1-100 */ + check_seq(mt, 100, false); + /* Store 45-168 */ + mt_set_non_kernel(10); + check_store_range(mt, r[10], r[11], xa_mk_value(r[10]), 0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + /* Create tree of 1-200 */ + check_seq(mt, 200, false); + /* Store 45-168 */ + check_store_range(mt, r[10], r[11], xa_mk_value(r[10]), 0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + check_seq(mt, 30, false); + check_store_range(mt, 6, 18, xa_mk_value(6), 0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + /* Overwrite
across multiple levels. */ + /* Create tree of 1-400 */ + check_seq(mt, 400, false); + mt_set_non_kernel(50); + /* Store 118-128 */ + check_store_range(mt, r[12], r[13], xa_mk_value(r[12]), 0); + mt_set_non_kernel(50); + mtree_test_erase(mt, 140); + mtree_test_erase(mt, 141); + mtree_test_erase(mt, 142); + mtree_test_erase(mt, 143); + mtree_test_erase(mt, 130); + mtree_test_erase(mt, 131); + mtree_test_erase(mt, 132); + mtree_test_erase(mt, 133); + mtree_test_erase(mt, 134); + mtree_test_erase(mt, 135); + check_load(mt, r[12], xa_mk_value(r[12])); + check_load(mt, r[13], xa_mk_value(r[12])); + check_load(mt, r[13] - 1, xa_mk_value(r[12])); + check_load(mt, r[13] + 1, xa_mk_value(r[13] + 1)); /* 129: first index past the stored range */ + check_load(mt, 135, NULL); /* erased above */ + check_load(mt, 140, NULL); /* erased above */ + mt_set_non_kernel(0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + + + /* Overwrite multiple levels at the end of the tree (slot 7) */ + mt_set_non_kernel(50); + check_seq(mt, 400, false); + check_store_range(mt, 353, 361, xa_mk_value(353), 0); + check_store_range(mt, 347, 352, xa_mk_value(347), 0); + + check_load(mt, 346, xa_mk_value(346)); + for (i = 347; i <= 352; i++) + check_load(mt, i, xa_mk_value(347)); + for (i = 353; i <= 361; i++) + check_load(mt, i, xa_mk_value(353)); + check_load(mt, 362, xa_mk_value(362)); + mt_set_non_kernel(0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + mt_set_non_kernel(50); + check_seq(mt, 400, false); + check_store_range(mt, 352, 364, NULL, 0); + check_store_range(mt, 351, 363, xa_mk_value(352), 0); + check_load(mt, 350, xa_mk_value(350)); + check_load(mt, 351, xa_mk_value(352)); + for (i = 352; i <= 363; i++) + check_load(mt, i, xa_mk_value(352)); + check_load(mt, 364, NULL); /* left NULL by the first store; the second one stops at 363 */ + check_load(mt, 365, xa_mk_value(365)); + mt_set_non_kernel(0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + mt_set_non_kernel(5); + check_seq(mt, 400, false); + check_store_range(mt, 352, 364, NULL, 0); + check_store_range(mt, 351, 364, xa_mk_value(352), 0); +
check_load(mt, 350, xa_mk_value(350)); + check_load(mt, 351, xa_mk_value(352)); + for (i = 352; i <= 364; i++) + check_load(mt, i, xa_mk_value(352)); + check_load(mt, 365, xa_mk_value(365)); + mt_set_non_kernel(0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + + mt_set_non_kernel(50); + check_seq(mt, 400, false); + check_store_range(mt, 362, 367, xa_mk_value(362), 0); + check_store_range(mt, 353, 361, xa_mk_value(353), 0); + mt_set_non_kernel(0); + mt_validate(mt); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + /* + * Interesting cases: + * 1. Overwrite the end of a node and end in the first entry of the next + * node. + * 2. Split a single range + * 3. Overwrite the start of a range + * 4. Overwrite the end of a range + * 5. Overwrite the entire range + * 6. Overwrite a range that causes multiple parent nodes to be + * combined + * 7. Overwrite a range that causes multiple parent nodes and part of + * root to be combined + * 8. Overwrite the whole tree + * 9. Try to overwrite the zero entry of an alloc tree. + * 10.
Write a range larger than a node's current pivot + */ + + mt_set_non_kernel(50); + for (i = 0; i <= 500; i++) { + val = i*5; + val2 = (i+1)*5; /* each range shares its last index with the start of the next one */ + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + check_store_range(mt, 2400, 2400, xa_mk_value(2400), 0); + check_store_range(mt, 2411, 2411, xa_mk_value(2411), 0); + check_store_range(mt, 2412, 2412, xa_mk_value(2412), 0); + check_store_range(mt, 2396, 2400, xa_mk_value(4052020), 0); + check_store_range(mt, 2402, 2402, xa_mk_value(2402), 0); + mtree_destroy(mt); + mt_set_non_kernel(0); + + mt_set_non_kernel(50); + for (i = 0; i <= 500; i++) { + val = i*5; + val2 = (i+1)*5; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + check_store_range(mt, 2422, 2422, xa_mk_value(2422), 0); + check_store_range(mt, 2424, 2424, xa_mk_value(2424), 0); + check_store_range(mt, 2425, 2425, xa_mk_value(2), 0); + check_store_range(mt, 2460, 2470, NULL, 0); + check_store_range(mt, 2435, 2460, xa_mk_value(2435), 0); + check_store_range(mt, 2461, 2470, xa_mk_value(2461), 0); + mt_set_non_kernel(0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + /* Test rebalance gaps */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mt_set_non_kernel(50); + for (i = 0; i <= 50; i++) { + val = i*10; + val2 = (i+1)*10; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + check_store_range(mt, 161, 161, xa_mk_value(161), 0); + check_store_range(mt, 162, 162, xa_mk_value(162), 0); + check_store_range(mt, 163, 163, xa_mk_value(163), 0); + check_store_range(mt, 240, 249, NULL, 0); + mtree_erase(mt, 200); + mtree_erase(mt, 210); + mtree_erase(mt, 220); + mtree_erase(mt, 230); + mt_set_non_kernel(0); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= 500; i++) { + val = i*10; + val2 = (i+1)*10; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + check_store_range(mt, 4600, 4959, xa_mk_value(1), 0); + mt_validate(mt); + MT_BUG_ON(mt, !mt_height(mt)); +
mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= 500; i++) { + val = i*10; + val2 = (i+1)*10; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + check_store_range(mt, 4811, 4811, xa_mk_value(4811), 0); + check_store_range(mt, 4812, 4812, xa_mk_value(4812), 0); + check_store_range(mt, 4861, 4861, xa_mk_value(4861), 0); + check_store_range(mt, 4862, 4862, xa_mk_value(4862), 0); + check_store_range(mt, 4842, 4849, NULL, 0); + mt_validate(mt); + MT_BUG_ON(mt, !mt_height(mt)); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= 1300; i++) { + val = i*10; + val2 = (i+1)*10; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + MT_BUG_ON(mt, mt_height(mt) >= 4); /* the fill alone must keep the height below 4 */ + } + /* Cause a 3 child split all the way up the tree. */ + for (i = 5; i < 215; i += 10) + check_store_range(mt, 11450 + i, 11450 + i + 1, NULL, 0); + for (i = 5; i < 65; i += 10) + check_store_range(mt, 11770 + i, 11770 + i + 1, NULL, 0); + + MT_BUG_ON(mt, mt_height(mt) >= 4); + for (i = 5; i < 45; i += 10) + check_store_range(mt, 11700 + i, 11700 + i + 1, NULL, 0); + MT_BUG_ON(mt, mt_height(mt) < 4); /* the last batch of NULL stores must have grown the tree */ + mtree_destroy(mt); + + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= 1200; i++) { + val = i*10; + val2 = (i+1)*10; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + MT_BUG_ON(mt, mt_height(mt) >= 4); + } + /* Fill parents and leaves before split. */ + for (i = 5; i < 455; i += 10) + check_store_range(mt, 7800 + i, 7800 + i + 1, NULL, 0); + + for (i = 1; i < 16; i++) + check_store_range(mt, 8185 + i, 8185 + i + 1, + xa_mk_value(8185+i), 0); + MT_BUG_ON(mt, mt_height(mt) >= 4); + /* triple split across multiple levels.
*/ + check_store_range(mt, 8184, 8184, xa_mk_value(8184), 0); + MT_BUG_ON(mt, mt_height(mt) != 4); +} + +/* + * check_next_entry() - Walk to index 0, then step forward with mas_next() + * and expect xa_mk_value(i) for every i up to the limit and NULL once the + * limit has been passed. + */ +static noinline void check_next_entry(struct maple_tree *mt) +{ + void *entry = NULL; + unsigned long limit = 30, i = 0; + /* Declared with the other locals: no declarations after statements. */ + MA_STATE(mas, mt, i, i); + + MT_BUG_ON(mt, !mtree_empty(mt)); + + check_seq(mt, limit, false); + rcu_read_lock(); + + /* Check the first one and get ma_state in the correct state. */ + MT_BUG_ON(mt, mas_walk(&mas) != xa_mk_value(i)); + /* The increment lives here so the assertion argument has no side effect on i. */ + for (i++; i <= limit + 1; i++) { + entry = mas_next(&mas, limit); + if (i > limit) + MT_BUG_ON(mt, entry != NULL); + else + MT_BUG_ON(mt, xa_mk_value(i) != entry); + } + rcu_read_unlock(); + mtree_destroy(mt); +} + +/* + * check_prev_entry() - Check that mas_prev() returns the entry preceding + * the current one, and NULL when that entry lies below the given minimum. + */ +static noinline void check_prev_entry(struct maple_tree *mt) +{ + unsigned long index = 16; + void *value; + int i; + + MA_STATE(mas, mt, index, index); + + MT_BUG_ON(mt, !mtree_empty(mt)); + check_seq(mt, 30, false); + + rcu_read_lock(); + value = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, value != xa_mk_value(index)); + value = mas_prev(&mas, 0); + MT_BUG_ON(mt, value != xa_mk_value(index - 1)); + rcu_read_unlock(); + mtree_destroy(mt); + + /* Check limits on prev */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + for (i = 0; i <= index; i++) { + mas_set_range(&mas, i*10, i*10+5); + mas_store_gfp(&mas, xa_mk_value(i), GFP_KERNEL); + } + + mas_set(&mas, 20); + value = mas_walk(&mas); + MT_BUG_ON(mt, value != xa_mk_value(2)); + + /* The preceding entry covers 10-15, which is below the minimum of 19. */ + value = mas_prev(&mas, 19); + MT_BUG_ON(mt, value != NULL); + + mas_set(&mas, 80); + value = mas_walk(&mas); + MT_BUG_ON(mt, value != xa_mk_value(8)); + + /* The preceding entry covers 70-75, which is below the minimum of 76. */ + value = mas_prev(&mas, 76); + MT_BUG_ON(mt, value != NULL); + + mas_unlock(&mas); +} + +/* + * check_root_expand() - Store a single entry at index 0 or 1 of an empty + * tree and check what mas_walk(), mas_next() and mas_prev() report. + */ +static noinline void check_root_expand(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 0); + void *ptr; + + + mas_lock(&mas); + mas_set(&mas, 3); + ptr = mas_walk(&mas); + MT_BUG_ON(mt, ptr != NULL); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + + ptr = &check_prev_entry; + mas_set(&mas,
1); + mas_store_gfp(&mas, ptr, GFP_KERNEL); + + /* Only index 1 holds the pointer; 0 and 2 must walk to NULL. */ mas_set(&mas, 0); + ptr = mas_walk(&mas); + MT_BUG_ON(mt, ptr != NULL); + + mas_set(&mas, 1); + ptr = mas_walk(&mas); + MT_BUG_ON(mt, ptr != &check_prev_entry); + + mas_set(&mas, 2); + ptr = mas_walk(&mas); + MT_BUG_ON(mt, ptr != NULL); + mas_unlock(&mas); + mtree_destroy(mt); + + + /* Store at index 0 only: a walk at 5 must report NULL over 1 - ULONG_MAX. */ mt_init_flags(mt, 0); + mas_lock(&mas); + + mas_set(&mas, 0); + ptr = &check_prev_entry; + mas_store_gfp(&mas, ptr, GFP_KERNEL); + + mas_set(&mas, 5); + ptr = mas_walk(&mas); + MT_BUG_ON(mt, ptr != NULL); + MT_BUG_ON(mt, mas.index != 1); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + + mas_set_range(&mas, 0, 100); + ptr = mas_walk(&mas); + MT_BUG_ON(mt, ptr != &check_prev_entry); + MT_BUG_ON(mt, mas.last != 0); + mas_unlock(&mas); + mtree_destroy(mt); + + /* Same store at index 0, with bit 0 set in the stored pointer value. */ mt_init_flags(mt, 0); + mas_lock(&mas); + + mas_set(&mas, 0); + ptr = (void *)((unsigned long) check_prev_entry | 1UL); + mas_store_gfp(&mas, ptr, GFP_KERNEL); + ptr = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, ptr != NULL); + /* NOTE(review): joined with &&, so this fires only when both fields are wrong; confirm that is intended. */ MT_BUG_ON(mt, (mas.index != 1) && (mas.last != ULONG_MAX)); + + mas_set(&mas, 1); + ptr = mas_prev(&mas, 0); + MT_BUG_ON(mt, (mas.index != 0) && (mas.last != 0)); + MT_BUG_ON(mt, ptr != (void *)((unsigned long) check_prev_entry | 1UL)); + + mas_unlock(&mas); + + mtree_destroy(mt); + + /* Same again, with bit 1 set in the stored pointer value. */ mt_init_flags(mt, 0); + mas_lock(&mas); + mas_set(&mas, 0); + ptr = (void *)((unsigned long) check_prev_entry | 2UL); + mas_store_gfp(&mas, ptr, GFP_KERNEL); + ptr = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, ptr != NULL); + MT_BUG_ON(mt, (mas.index != 1) && (mas.last != ULONG_MAX)); + + mas_set(&mas, 1); + ptr = mas_prev(&mas, 0); + MT_BUG_ON(mt, (mas.index != 0) && (mas.last != 0)); + MT_BUG_ON(mt, ptr != (void *)((unsigned long) check_prev_entry | 2UL)); + + + mas_unlock(&mas); +} + +/* Exercise mas_preallocate() together with mas_destroy(), mas_pop_node(), mas_push_node() and mas_store_prealloc(), checking mas_allocated() after each step. */ static noinline void check_prealloc(struct maple_tree *mt) +{ + unsigned long i, max = 100; + unsigned long allocated; + unsigned char height; + struct maple_node *mn; + void *ptr = check_prealloc; +
MA_STATE(mas, mt, 10, 20); + + mt_set_non_kernel(1000); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + /* Preallocate, then destroy: the request must yield 1 + height * 3 nodes and none may remain afterwards. */ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + mas_destroy(&mas); + allocated = mas_allocated(&mas); + MT_BUG_ON(mt, allocated != 0); + + /* Preallocate twice in a row before destroying. */ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + mas_destroy(&mas); + allocated = mas_allocated(&mas); + MT_BUG_ON(mt, allocated != 0); + + + /* Pop one node and free it, preallocate again, then destroy. */ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); + ma_free_rcu(mn); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + mas_destroy(&mas); + allocated = mas_allocated(&mas); + MT_BUG_ON(mt, allocated != 0); + + /* As above, but the popped node is freed only after mas_destroy(). */ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + mas_destroy(&mas); + allocated = mas_allocated(&mas); + MT_BUG_ON(mt, allocated != 0); + ma_free_rcu(mn); + + /* Pop one node and push it back: the count must return to its earlier value. */ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + mn = mas_pop_node(&mas); + MT_BUG_ON(mt, mas_allocated(&mas)
!= allocated - 1); + mas_push_node(&mas, mn); + MT_BUG_ON(mt, mas_allocated(&mas) != allocated); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + mas_destroy(&mas); + allocated = mas_allocated(&mas); + MT_BUG_ON(mt, allocated != 0); + + /* Consume the preallocation with mas_store_prealloc(): nothing may remain allocated. */ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + mas_store_prealloc(&mas, ptr); + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + + /* Two preallocate-and-store rounds back to back. */ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + mas_store_prealloc(&mas, ptr); + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + mas_store_prealloc(&mas, ptr); + + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + mas_store_prealloc(&mas, ptr); + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + /* The next preallocation is expected to fail (non-zero return) and leave nothing allocated. */ mt_set_non_kernel(1); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated != 0); + mas_destroy(&mas); + + + /* Repeat the successful store followed by the expected failure. */ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); + allocated = mas_allocated(&mas); + height = mas_mt_height(&mas); + MT_BUG_ON(mt, allocated == 0); + MT_BUG_ON(mt, allocated != 1 + height * 3); + mas_store_prealloc(&mas, ptr); + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + mt_set_non_kernel(1); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); + allocated = mas_allocated(&mas); + height =
mas_mt_height(&mas); + MT_BUG_ON(mt, allocated != 0); +} + +/* Repeatedly fill the tree with ranges [i*10, i*10+5], store NULL over a wide range with mas_store_gfp(), and check that an index inside the overwritten range walks to NULL. */ static noinline void check_spanning_write(struct maple_tree *mt) +{ + unsigned long i, max = 5000; + MA_STATE(mas, mt, 1200, 2380); + + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + /* First case uses the range from the MA_STATE initialiser: 1200 - 2380. */ mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 1205); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* Refill starting at i = 1, so indices below 10 stay empty. */ for (i = 1; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + mtree_lock(mt); + mas_set_range(&mas, 9, 50006); /* Will expand to 0 - ULONG_MAX */ + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 1205); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mt_validate(mt); + mtree_destroy(mt); + + /* Test spanning store that requires a right cousin rebalance */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + mas_set_range(&mas, 0, 12900); /* Spans more than 2 levels */ + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 1205); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* Test non-alloc tree spanning store */ + mt_init_flags(mt, 0); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + mas_set_range(&mas, 0, 300); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 15); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* Test spanning store that requires a right sibling rebalance */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + mas_set_range(&mas, 0, 12865); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 15); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* Test spanning store that requires a left
sibling rebalance */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + /* Overwrite 90 - 13665 with NULL; index 95 must then walk to NULL. */ mas_set_range(&mas, 90, 13665); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 95); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* Test spanning store that requires a left cousin rebalance */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + mas_set_range(&mas, 46805, 49995); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 46815); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* + * Test spanning store that requires a left cousin rebalance all the way + * to root + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + mas_set_range(&mas, 32395, 49995); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 46815); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* + * Test spanning store that requires a right cousin rebalance all the + * way to root + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + mas_set_range(&mas, 38875, 43190); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 38900); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* Test spanning store ending at full node (depth 2) */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + mtree_lock(mt); + /* Fill the unused indices 47606 - 47609 one at a time. */ mas_set(&mas, 47606); + mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); + mas_set(&mas, 47607); + mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); + mas_set(&mas, 47608); +
mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); + mas_set(&mas, 47609); + mas_store_gfp(&mas, check_spanning_write, GFP_KERNEL); + /* Ensure the parent node is full */ + mas_ascend(&mas); + MT_BUG_ON(mt, (mas_data_end(&mas)) != mt_slot_count(mas.node) - 1); + mas_set_range(&mas, 11516, 48940); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* Test spanning write with many levels of no siblings */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + mas_set_range(&mas, 43200, 49999); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mas_set(&mas, 43200); + MT_BUG_ON(mt, mas_walk(&mas) != NULL); + mtree_unlock(mt); + mtree_destroy(mt); + + /* Smaller tree of 101 ranges: NULL store over 76 - 875, with no assertion after the store. */ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= 100; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + + mtree_lock(mt); + mas_set_range(&mas, 76, 875); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mtree_unlock(mt); +} + +/* Store NULL over ranges that touch existing empty gaps and compare mas_data_end() before and after each store. */ static noinline void check_null_expand(struct maple_tree *mt) +{ + unsigned long i, max = 100; + unsigned char data_end; + MA_STATE(mas, mt, 959, 959); + + for (i = 0; i <= max; i++) + mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + /* Test expanding null at start. */ + mas_walk(&mas); + data_end = mas_data_end(&mas); + mas_set_range(&mas, 959, 963); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, mtree_load(mt, 963) != NULL); + /* The node's data end must be unchanged by this store. */ MT_BUG_ON(mt, data_end != mas_data_end(&mas)); + + /* Test expanding null at end. */ + mas_set(&mas, 880); + mas_walk(&mas); + data_end = mas_data_end(&mas); + mas_set_range(&mas, 884, 887); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, mtree_load(mt, 884) != NULL); + MT_BUG_ON(mt, mtree_load(mt, 889) != NULL); + MT_BUG_ON(mt, data_end != mas_data_end(&mas)); + + /* Test expanding null at start and end.
*/ + mas_set(&mas, 890); + mas_walk(&mas); + data_end = mas_data_end(&mas); + mas_set_range(&mas, 900, 905); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, mtree_load(mt, 899) != NULL); + MT_BUG_ON(mt, mtree_load(mt, 900) != NULL); + MT_BUG_ON(mt, mtree_load(mt, 905) != NULL); + MT_BUG_ON(mt, mtree_load(mt, 906) != NULL); + /* The cleared range joins the empty gaps on both sides, so the data end drops by two. */ MT_BUG_ON(mt, data_end - 2 != mas_data_end(&mas)); + + /* Test expanding null across multiple slots. */ + mas_set(&mas, 800); + mas_walk(&mas); + data_end = mas_data_end(&mas); + mas_set_range(&mas, 810, 825); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, mtree_load(mt, 809) != NULL); + MT_BUG_ON(mt, mtree_load(mt, 810) != NULL); + MT_BUG_ON(mt, mtree_load(mt, 825) != NULL); + MT_BUG_ON(mt, mtree_load(mt, 826) != NULL); + MT_BUG_ON(mt, data_end - 4 != mas_data_end(&mas)); +} + +/* Erase entries to open gaps, then use mas_empty_area_rev() to check that each gap is found at the expected index. The seq arrays hold the indices, search bounds and gap sizes used by the steps below. */ static noinline void check_gap_combining(struct maple_tree *mt) +{ + struct maple_enode *mn1, *mn2; + void *entry; + + unsigned long seq100[] = { + /* 0-5 */ + 74, 75, 76, + 50, 100, 2, + + /* 6-12 */ + 44, 45, 46, 43, + 20, 50, 3, + + /* 13-20 */ + 80, 81, 82, + 76, 2, 79, 85, 4, + }; + unsigned long seq2000[] = { + 1152, 1151, + 1100, 1200, 2, + }; + unsigned long seq400[] = { + 286, 318, + 256, 260, 266, 270, 275, 280, 290, 398, + 286, 310, + }; + + unsigned long index = seq100[0]; + + MA_STATE(mas, mt, index, index); + + MT_BUG_ON(mt, !mtree_empty(mt)); + check_seq(mt, 100, false); /* create 100 singletons. */ + + /* Erase 76 and 75, the two entries directly after index (74). */ mt_set_non_kernel(1); + mtree_test_erase(mt, seq100[2]); + check_load(mt, seq100[2], NULL); + mtree_test_erase(mt, seq100[1]); + check_load(mt, seq100[1], NULL); + + rcu_read_lock(); + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != xa_mk_value(index)); + mn1 = mas.node; + mas_next(&mas, ULONG_MAX); + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != xa_mk_value(index + 4)); + mn2 = mas.node; + MT_BUG_ON(mt, mn1 == mn2); /* test the test.
*/ + + /* + * At this point, there is a gap of 2 at index + 1 between seq100[3] and + * seq100[4]. Search for the gap. + */ + mt_set_non_kernel(1); + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, seq100[3], seq100[4], + seq100[5])); + MT_BUG_ON(mt, mas.index != index + 1); + rcu_read_unlock(); + + /* Erase 44, 45 and 46 to open a gap of three after 43. */ mtree_test_erase(mt, seq100[6]); + check_load(mt, seq100[6], NULL); + mtree_test_erase(mt, seq100[7]); + check_load(mt, seq100[7], NULL); + mtree_test_erase(mt, seq100[8]); + index = seq100[9]; + + rcu_read_lock(); + mas.index = index; + mas.last = index; + mas_reset(&mas); + entry = mas_find(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != xa_mk_value(index)); + mn1 = mas.node; + entry = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, entry != xa_mk_value(index + 4)); + mas_next(&mas, ULONG_MAX); /* go to the next entry. */ + mn2 = mas.node; + MT_BUG_ON(mt, mn1 == mn2); /* test the next entry is in the next node. */ + + /* + * At this point, there is a gap of 3 at seq100[6]. Find it by + * searching 20 - 50 for size 3. + */ + mas_reset(&mas); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, seq100[10], seq100[11], + seq100[12])); + MT_BUG_ON(mt, mas.index != seq100[6]); + rcu_read_unlock(); + + /* Clear 80 and 81 with NULL stores, then search 76 - 82 for a gap of 2; it must be found at 80. */ mt_set_non_kernel(1); + mtree_store(mt, seq100[13], NULL, GFP_KERNEL); + check_load(mt, seq100[13], NULL); + check_load(mt, seq100[14], xa_mk_value(seq100[14])); + mtree_store(mt, seq100[14], NULL, GFP_KERNEL); + check_load(mt, seq100[13], NULL); + check_load(mt, seq100[14], NULL); + + mas_reset(&mas); + rcu_read_lock(); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, seq100[16], seq100[15], + seq100[17])); + MT_BUG_ON(mt, mas.index != seq100[13]); + mt_validate(mt); + rcu_read_unlock(); + + /* + * *DEPRECATED: no retries anymore* Test retry entry in the start of a + * gap.
+ */ + /* Widen the gap to 79 - 82, then search 76 - 85 for a gap of 4; it must be found at 79. */ mt_set_non_kernel(2); + mtree_test_store_range(mt, seq100[18], seq100[14], NULL); + mtree_test_erase(mt, seq100[15]); + mas_reset(&mas); + rcu_read_lock(); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, seq100[16], seq100[19], + seq100[20])); + rcu_read_unlock(); + MT_BUG_ON(mt, mas.index != seq100[18]); + mt_validate(mt); + mtree_destroy(mt); + + /* seq 2000 tests are for multi-level tree gaps */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_seq(mt, 2000, false); + mt_set_non_kernel(1); + mtree_test_erase(mt, seq2000[0]); + mtree_test_erase(mt, seq2000[1]); + + mt_set_non_kernel(2); + mas_reset(&mas); + rcu_read_lock(); + MT_BUG_ON(mt, mas_empty_area_rev(&mas, seq2000[2], seq2000[3], + seq2000[4])); + MT_BUG_ON(mt, mas.index != seq2000[1]); + rcu_read_unlock(); + mt_validate(mt); + mtree_destroy(mt); + + /* seq 400 tests rebalancing over two levels. */ + mt_set_non_kernel(99); + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_seq(mt, 400, false); + mtree_test_store_range(mt, seq400[0], seq400[1], NULL); + mt_set_non_kernel(0); + mtree_destroy(mt); + + /* Fresh tree: overlapping range stores with different start indices, all but the last ending at seq400[9] (398). */ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + check_seq(mt, 400, false); + mt_set_non_kernel(50); + mtree_test_store_range(mt, seq400[2], seq400[9], + xa_mk_value(seq400[2])); + mtree_test_store_range(mt, seq400[3], seq400[9], + xa_mk_value(seq400[3])); + mtree_test_store_range(mt, seq400[4], seq400[9], + xa_mk_value(seq400[4])); + mtree_test_store_range(mt, seq400[5], seq400[9], + xa_mk_value(seq400[5])); + mtree_test_store_range(mt, seq400[0], seq400[9], + xa_mk_value(seq400[0])); + mtree_test_store_range(mt, seq400[6], seq400[9], + xa_mk_value(seq400[6])); + mtree_test_store_range(mt, seq400[7], seq400[9], + xa_mk_value(seq400[7])); + mtree_test_store_range(mt, seq400[8], seq400[9], + xa_mk_value(seq400[8])); + mtree_test_store_range(mt, seq400[10], seq400[11], + xa_mk_value(seq400[10])); + mt_validate(mt); + mt_set_non_kernel(0); + mtree_destroy(mt); +} +/* Store 4000 ranges, overwrite a wide span of them with one NULL store, then validate the tree. */ static noinline void check_node_overwrite(struct
maple_tree *mt) +{ + int i, max = 4000; + + for (i = 0; i < max; i++) + mtree_test_store_range(mt, i*100, i*100 + 50, xa_mk_value(i*100)); + + mtree_test_store_range(mt, 319951, 367950, NULL); + /*mt_dump(mt); */ + mt_validate(mt); +} + +/* Advance @mas by one step of a depth-first, pre-order walk over the tree's nodes. A state that is still at its start is moved with mas_start(); once no node is left, mas->node is set to MAS_NONE. */ static void mas_dfs_preorder(struct ma_state *mas) +{ + + struct maple_enode *prev; + unsigned char end, slot = 0; + + if (mas_is_start(mas)) { + mas_start(mas); + return; + } + + /* A tree whose root is a leaf has nothing further to visit. */ if (mte_is_leaf(mas->node) && mte_is_root(mas->node)) + goto done; + +walk_up: + end = mas_data_end(mas); + /* Leaves and exhausted nodes go up one level and continue with the slot after the one just left. */ if (mte_is_leaf(mas->node) || + (slot > end)) { + if (mte_is_root(mas->node)) + goto done; + + slot = mte_parent_slot(mas->node) + 1; + mas_ascend(mas); + goto walk_up; + } + + /* Otherwise descend into the child at slot. */ prev = mas->node; + mas->node = mas_get_slot(mas, slot); + if (!mas->node || slot > end) { + if (mte_is_root(prev)) + goto done; + + mas->node = prev; + slot = mte_parent_slot(mas->node) + 1; + mas_ascend(mas); + goto walk_up; + } + + return; +done: + mas->node = MAS_NONE; +} + + +/* Count the steps mas_dfs_preorder() needs over trees built by check_seq() and check_rev_seq() and compare them with fixed expected counts; then store 0..max one index at a time after mas_expected_entries(). */ static void check_dfs_preorder(struct maple_tree *mt) +{ + unsigned long count = 0, max = 1000; + + MA_STATE(mas, mt, 0, 0); + + check_seq(mt, max, false); + do { + count++; + mas_dfs_preorder(&mas); + } while (!mas_is_none(&mas)); + MT_BUG_ON(mt, count != 74); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_reset(&mas); + count = 0; + check_seq(mt, max, false); + do { + count++; + mas_dfs_preorder(&mas); + } while (!mas_is_none(&mas)); + /*printk("count %lu\n", count); */ + MT_BUG_ON(mt, count != 77); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_reset(&mas); + count = 0; + check_rev_seq(mt, max, false); + do { + count++; + mas_dfs_preorder(&mas); + } while (!mas_is_none(&mas)); + /*printk("count %lu\n", count); */ + MT_BUG_ON(mt, count != 77); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_reset(&mas); + mt_zero_nr_tallocated(); + mt_set_non_kernel(200); + mas_expected_entries(&mas, max); + for (count = 0; count <= max;
count++) { + mas.index = mas.last = count; + mas_store(&mas, xa_mk_value(count)); + MT_BUG_ON(mt, mas_is_err(&mas)); + } + mas_destroy(&mas); + rcu_barrier(); + /* + * pr_info(" ->seq test of 0-%lu %luK in %d active (%d total)\n", + * max, mt_get_alloc_size()/1024, mt_nr_allocated(), + * mt_nr_tallocated()); + */ + +} + +#if defined(BENCH_SLOT_STORE) +/* Benchmark: clear index 105, then re-store 100 - 105, for 20000000 iterations. */ static noinline void bench_slot_store(struct maple_tree *mt) +{ + int i, brk = 105, max = 1040, brk_start = 100, count = 20000000; + + for (i = 0; i < max; i += 10) + mtree_store_range(mt, i, i + 5, xa_mk_value(i), GFP_KERNEL); + + for (i = 0; i < count; i++) { + mtree_store_range(mt, brk, brk, NULL, GFP_KERNEL); + mtree_store_range(mt, brk_start, brk, xa_mk_value(brk), + GFP_KERNEL); + } +} +#endif + +#if defined(BENCH_NODE_STORE) +/* Benchmark: store a 16-index range whose start cycles from 76 to 131 in steps of 5. */ static noinline void bench_node_store(struct maple_tree *mt) +{ + int i, overwrite = 76, max = 240, count = 20000000; + + for (i = 0; i < max; i += 10) + mtree_store_range(mt, i, i + 5, xa_mk_value(i), GFP_KERNEL); + + for (i = 0; i < count; i++) { + mtree_store_range(mt, overwrite, overwrite + 15, + xa_mk_value(overwrite), GFP_KERNEL); + + overwrite += 5; + if (overwrite >= 135) + overwrite = 76; + } +} +#endif + +#if defined(BENCH_AWALK) +/* Benchmark: repeated mas_empty_area_rev() searches of 0 - 2000 for a gap of 10, after clearing 1470 - 1475. */ static noinline void bench_awalk(struct maple_tree *mt) +{ + int i, max = 2500, count = 50000000; + MA_STATE(mas, mt, 1470, 1470); + + for (i = 0; i < max; i += 10) + mtree_store_range(mt, i, i + 5, xa_mk_value(i), GFP_KERNEL); + + mtree_store_range(mt, 1470, 1475, NULL, GFP_KERNEL); + + for (i = 0; i < count; i++) { + mas_empty_area_rev(&mas, 0, 2000, 10); + mas_reset(&mas); + } +} +#endif +#if defined(BENCH_WALK) +/* Benchmark: repeated mas_walk() to index 1470, resetting the state each time. */ static noinline void bench_walk(struct maple_tree *mt) +{ + int i, max = 2500, count = 550000000; + MA_STATE(mas, mt, 1470, 1470); + + for (i = 0; i < max; i += 10) + mtree_store_range(mt, i, i + 5, xa_mk_value(i), GFP_KERNEL); + + for (i = 0; i < count; i++) { + mas_walk(&mas); + mas_reset(&mas); + } + +} +#endif + +#if
defined(BENCH_MT_FOR_EACH) +/* Benchmark: iterate the tree with mt_for_each() 1000000 times, checking every entry against xa_mk_value(j). */ static noinline void bench_mt_for_each(struct maple_tree *mt) +{ + int i, count = 1000000; + unsigned long max = 2500, index = 0; + void *entry; + + for (i = 0; i < max; i += 5) + mtree_store_range(mt, i, i + 4, xa_mk_value(i), GFP_KERNEL); + + for (i = 0; i < count; i++) { + unsigned long j = 0; + + mt_for_each(mt, entry, index, max) { + MT_BUG_ON(mt, entry != xa_mk_value(j)); + j += 5; + } + + index = 0; + } + +} +#endif + +/* Copy every entry of @mt into a new tree with mas_store(), after a mas_expected_entries() call on the new tree's state, then validate and destroy the copy. */ static noinline void check_forking(struct maple_tree *mt) +{ + + struct maple_tree newmt; + int i, nr_entries = 134; + void *val; + MA_STATE(mas, mt, 0, 0); + MA_STATE(newmas, mt, 0, 0); + + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 5, + xa_mk_value(i), GFP_KERNEL); + + mt_set_non_kernel(99999); + mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); + /* newmas was declared against mt; retarget it at the copy. */ newmas.tree = &newmt; + mas_reset(&newmas); + mas_reset(&mas); + mas.index = 0; + mas.last = 0; + if (mas_expected_entries(&newmas, nr_entries)) { + pr_err("OOM!"); + BUG_ON(1); + } + mas_for_each(&mas, val, ULONG_MAX) { + newmas.index = mas.index; + newmas.last = mas.last; + mas_store(&newmas, val); + } + mas_destroy(&newmas); + mt_validate(&newmt); + mt_set_non_kernel(0); + mtree_destroy(&newmt); +} + +/* Copy every entry of @mt into a new tree with mas_store_gfp(), without calling mas_expected_entries() first, then validate and destroy the copy. */ static noinline void check_mas_store_gfp(struct maple_tree *mt) +{ + + struct maple_tree newmt; + int i, nr_entries = 135; + void *val; + MA_STATE(mas, mt, 0, 0); + MA_STATE(newmas, mt, 0, 0); + + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 5, + xa_mk_value(i), GFP_KERNEL); + + mt_set_non_kernel(99999); + mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); + newmas.tree = &newmt; + mas_reset(&newmas); + mas_set(&mas, 0); + mas_for_each(&mas, val, ULONG_MAX) { + newmas.index = mas.index; + newmas.last = mas.last; + mas_store_gfp(&newmas, val, GFP_KERNEL); + } + + mt_validate(&newmt); + mt_set_non_kernel(0); + mtree_destroy(&newmt); +} + +#if defined(BENCH_FORK) +/* Benchmark: build and destroy a copy of @mt nr_fork times. */ static noinline void bench_forking(struct maple_tree *mt) +{ + +
struct maple_tree newmt; + int i, nr_entries = 134, nr_fork = 80000; + void *val; + MA_STATE(mas, mt, 0, 0); + MA_STATE(newmas, mt, 0, 0); + + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 5, + xa_mk_value(i), GFP_KERNEL); + + /* Each iteration builds, validates and destroys one copy. */ for (i = 0; i < nr_fork; i++) { + mt_set_non_kernel(99999); + mt_init_flags(&newmt, MT_FLAGS_ALLOC_RANGE); + newmas.tree = &newmt; + mas_reset(&newmas); + mas_reset(&mas); + mas.index = 0; + mas.last = 0; + if (mas_expected_entries(&newmas, nr_entries)) { + printk("OOM!"); + BUG_ON(1); + } + mas_for_each(&mas, val, ULONG_MAX) { + newmas.index = mas.index; + newmas.last = mas.last; + mas_store(&newmas, val); + } + mas_destroy(&newmas); + mt_validate(&newmt); + mt_set_non_kernel(0); + mtree_destroy(&newmt); + } +} +#endif + +/* Check the values and the index and last fields reported by mas_next() and mas_prev(): next to a gap, across node boundaries, and when stepping off either end of the tree. */ static noinline void next_prev_test(struct maple_tree *mt) +{ + int i, nr_entries = 200; + void *val; + MA_STATE(mas, mt, 0, 0); + struct maple_enode *mn; + + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 5, + xa_mk_value(i), GFP_KERNEL); + + /* Run the iterators up to index 1000; the counts gathered in i are not checked. */ for (i = 0; i <= nr_entries / 2; i++) { + mas_next(&mas, 1000); + if (mas_is_none(&mas)) + break; + + } + mas_reset(&mas); + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 1000) { + i++; + } + + mas_reset(&mas); + mas_set(&mas, 0); + i = 0; + mas_for_each(&mas, val, 1000) { + mas_pause(&mas); + i++; + } + + /* + * 680 - 685 = 0x61a00001930c + * 686 - 689 = NULL; + * 690 - 695 = 0x61a00001930c + * Check simple next/prev + */ + mas_set(&mas, 686); + val = mas_walk(&mas); + MT_BUG_ON(mt, val != NULL); + + val = mas_next(&mas, 1000); + MT_BUG_ON(mt, val != xa_mk_value(690 / 10)); + MT_BUG_ON(mt, mas.index != 690); + MT_BUG_ON(mt, mas.last != 695); + + val = mas_prev(&mas, 0); + MT_BUG_ON(mt, val != xa_mk_value(680 / 10)); + MT_BUG_ON(mt, mas.index != 680); + MT_BUG_ON(mt, mas.last != 685); + + val = mas_next(&mas, 1000); + MT_BUG_ON(mt, val != xa_mk_value(690 / 10)); + MT_BUG_ON(mt, mas.index != 690); + MT_BUG_ON(mt, mas.last != 695);
+ + val = mas_next(&mas, 1000); + MT_BUG_ON(mt, val != xa_mk_value(700 / 10)); + MT_BUG_ON(mt, mas.index != 700); + MT_BUG_ON(mt, mas.last != 705); + + /* Check across node boundaries of the tree */ + mas_set(&mas, 70); + val = mas_walk(&mas); + MT_BUG_ON(mt, val != xa_mk_value(70 / 10)); + MT_BUG_ON(mt, mas.index != 70); + MT_BUG_ON(mt, mas.last != 75); + + val = mas_next(&mas, 1000); + MT_BUG_ON(mt, val != xa_mk_value(80 / 10)); + MT_BUG_ON(mt, mas.index != 80); + MT_BUG_ON(mt, mas.last != 85); + + /* A lower limit equal to the previous range's start must still return that range. */ val = mas_prev(&mas, 70); + MT_BUG_ON(mt, val != xa_mk_value(70 / 10)); + MT_BUG_ON(mt, mas.index != 70); + MT_BUG_ON(mt, mas.last != 75); + + /* Check across two levels of the tree */ + mas_reset(&mas); + mas_set(&mas, 707); + val = mas_walk(&mas); + MT_BUG_ON(mt, val != NULL); + val = mas_next(&mas, 1000); + MT_BUG_ON(mt, val != xa_mk_value(710 / 10)); + MT_BUG_ON(mt, mas.index != 710); + MT_BUG_ON(mt, mas.last != 715); + mn = mas.node; + + val = mas_next(&mas, 1000); + MT_BUG_ON(mt, val != xa_mk_value(720 / 10)); + MT_BUG_ON(mt, mas.index != 720); + MT_BUG_ON(mt, mas.last != 725); + /* The two ranges must sit in different nodes. */ MT_BUG_ON(mt, mn == mas.node); + + val = mas_prev(&mas, 0); + MT_BUG_ON(mt, val != xa_mk_value(710 / 10)); + MT_BUG_ON(mt, mas.index != 710); + MT_BUG_ON(mt, mas.last != 715); + + /* Check running off the end and back on */ + mas_reset(&mas); + mas_set(&mas, 2000); + val = mas_walk(&mas); + MT_BUG_ON(mt, val != xa_mk_value(2000 / 10)); + MT_BUG_ON(mt, mas.index != 2000); + MT_BUG_ON(mt, mas.last != 2005); + + /* Past the last entry mas_next() must return NULL with index and last both ULONG_MAX. */ val = mas_next(&mas, ULONG_MAX); + MT_BUG_ON(mt, val != NULL); + MT_BUG_ON(mt, mas.index != ULONG_MAX); + MT_BUG_ON(mt, mas.last != ULONG_MAX); + + val = mas_prev(&mas, 0); + MT_BUG_ON(mt, val != xa_mk_value(2000 / 10)); + MT_BUG_ON(mt, mas.index != 2000); + MT_BUG_ON(mt, mas.last != 2005); + + /* Check running off the start and back on */ + mas_reset(&mas); + mas_set(&mas, 10); + val = mas_walk(&mas); + MT_BUG_ON(mt, val != xa_mk_value(1)); + MT_BUG_ON(mt, mas.index != 10); +
MT_BUG_ON(mt, mas.last != 15); + + val = mas_prev(&mas, 0); + MT_BUG_ON(mt, val != xa_mk_value(0)); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 5); + + /* Before the first entry mas_prev() must return NULL with index and last both 0. */ + val = mas_prev(&mas, 0); + MT_BUG_ON(mt, val != NULL); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + + /* Clear 0 - 5 and repeat from index 10: nothing precedes it any more. */ + mas.index = 0; + mas.last = 5; + mas_store(&mas, NULL); + mas_reset(&mas); + mas_set(&mas, 10); + mas_walk(&mas); + + val = mas_prev(&mas, 0); + MT_BUG_ON(mt, val != NULL); + MT_BUG_ON(mt, mas.index != 0); + MT_BUG_ON(mt, mas.last != 0); + + mtree_destroy(mt); + + /* Entries at 0 and 5 only: mas_prev() from 5 with a lower limit of 4 must return NULL. */ + mt_init(mt); + mtree_store_range(mt, 0, 0, xa_mk_value(0), GFP_KERNEL); + mtree_store_range(mt, 5, 5, xa_mk_value(5), GFP_KERNEL); + mas_set(&mas, 5); + val = mas_prev(&mas, 4); + MT_BUG_ON(mt, val != NULL); +} + +#define RCU_RANGE_COUNT 1000 +/* Like MT_BUG_ON(), but a true condition also sets test->stop so that the other threads wind down. Wrapped in do { } while (0) so the macro behaves as one statement; note that y is still evaluated twice. */ +#define RCU_MT_BUG_ON(test, y) do { if (y) (test)->stop = true; MT_BUG_ON((test)->mt, y); } while (0) +/* Shared state of one RCU stress run: the tree under test, the start and stop flags, counters bumped by the readers, and the index and last bounds of every test range. */ +struct rcu_test_struct2 { + struct maple_tree *mt; + + bool start; + bool stop; + unsigned int thread_count; + + unsigned int seen_toggle; + unsigned int seen_added; + unsigned int seen_modified; + unsigned int seen_deleted; + int pause; + + unsigned long index[RCU_RANGE_COUNT]; + unsigned long last[RCU_RANGE_COUNT]; +}; + +/* Per-reader data: its id and the offsets (each 0 - 9, set up by rcu_reader_setup()) of the ranges that get modified, deleted, flipped and added, plus the one after the added range. */ +struct rcu_reader_struct { + unsigned int id; + int mod; + int del; + int flip; + int add; + int next; + struct rcu_test_struct2 *test; +}; + +/* RCU reader helper function: register the thread, count it in test->thread_count, then sleep until test->start is set. */ +static void rcu_reader_register(struct rcu_test_struct2 *test) +{ + rcu_register_thread(); + uatomic_inc(&test->thread_count); + + while (!test->start) + usleep(test->pause * 100); +} + +/* Record @id and @test in @reader and derive its five range offsets from id % 10. */ +static void rcu_reader_setup(struct rcu_reader_struct *reader, + unsigned int id, struct rcu_test_struct2 *test) +{ + reader->id = id; + reader->test = test; + reader->mod = reader->id % 10; + reader->del = (reader->mod + 1) % 10; + reader->flip = (reader->mod + 2) % 10; + reader->add = (reader->mod + 3) % 10; + reader->next = (reader->mod + 4) % 10; +} + +/* RCU reader in increasing index */ +static void
*rcu_reader_fwd(void *ptr) +{ + struct rcu_reader_struct *reader = (struct rcu_reader_struct *)ptr; + struct rcu_test_struct2 *test = reader->test; + unsigned long index = reader->id; + bool toggled, modified, deleted, added; + int i; + void *entry, *prev = NULL; + MA_STATE(mas, test->mt, 0, 0); + + /* Returns only once the writer has set test->start. */ rcu_reader_register(test); + toggled = modified = deleted = added = false; + + while (!test->stop) { + i = 0; + /* mas_for_each ?*/ + rcu_read_lock(); + mas_set(&mas, test->index[index]); + /* Walk this reader's window, from test->index[index] up to test->last[index + 9]. */ mas_for_each(&mas, entry, test->last[index + 9]) { + unsigned long r_start, r_end, alt_start; + void *expected, *alt; + + r_start = test->index[index + i]; + r_end = test->last[index + i]; + expected = xa_mk_value(r_start); + + /* Where the del range is expected: finding the flip range there instead means the delete has happened, and from then on the flip range is expected at this position. */ if (i == reader->del) { + if (!deleted) { + alt_start = test->index[index + reader->flip]; + /* delete occurred. */ + if (mas.index == alt_start) { + uatomic_inc(&test->seen_deleted); + deleted = true; + } + } + if (deleted) { + i = reader->flip; + r_start = test->index[index + i]; + r_end = test->last[index + i]; + expected = xa_mk_value(r_start); + } + } + + /* Until the add range has been seen, accept either it or the range after it at this position. */ if (!added && (i == reader->add)) { + alt_start = test->index[index + reader->next]; + if (mas.index == r_start) { + uatomic_inc(&test->seen_added); + added = true; + } else if (mas.index == alt_start) { + i = reader->next; + r_start = test->index[index + i]; + r_end = test->last[index + i]; + expected = xa_mk_value(r_start); + } + } + + RCU_MT_BUG_ON(test, mas.index != r_start); + RCU_MT_BUG_ON(test, mas.last != r_end); + + /* The flip range holds one of two values; count each observed change between them. */ if (i == reader->flip) { + alt = xa_mk_value(index + i + RCU_RANGE_COUNT); + if (prev) { + if (toggled && entry == expected) + uatomic_inc(&test->seen_toggle); + else if (!toggled && entry == alt) + uatomic_inc(&test->seen_toggle); + } + + if (entry == expected) + toggled = false; + else if (entry == alt) + toggled = true; + else { + printk("!!%lu-%lu -> %p not %p or %p\n", mas.index, mas.last, entry, expected, alt); + RCU_MT_BUG_ON(test, 1); + } + + prev = entry; + } else if (i ==
reader->mod) { + alt = xa_mk_value(index + i * 2 + 1 + + RCU_RANGE_COUNT); + /* Count every observed transition between the original and the modified value. */ if (entry != expected) { + if (!modified) + uatomic_inc(&test->seen_modified); + modified = true; + } else { + if (modified) + uatomic_inc(&test->seen_modified); + modified = false; + } + + if (modified) + RCU_MT_BUG_ON(test, entry != alt); + + } else { + if (entry != expected) + printk("!!%lu-%lu -> %p not %p\n", mas.index, mas.last, entry, expected); + RCU_MT_BUG_ON(test, entry != expected); + } + + i++; + } + rcu_read_unlock(); + usleep(test->pause); + } + + rcu_unregister_thread(); + return NULL; +} + +/* RCU reader in decreasing index: starts at the last range of its window and steps backwards with mas_prev() */ +static void *rcu_reader_rev(void *ptr) +{ + struct rcu_reader_struct *reader = (struct rcu_reader_struct *)ptr; + struct rcu_test_struct2 *test = reader->test; + unsigned long index = reader->id; + bool toggled, modified, deleted, added; + int i; + void *prev = NULL; + MA_STATE(mas, test->mt, 0, 0); + + rcu_reader_register(test); + toggled = modified = deleted = added = false; + + + while (!test->stop) { + void *entry; + + i = 9; + mas_set(&mas, test->index[index + i]); + + rcu_read_lock(); + while (i--) { + unsigned long r_start, r_end, alt_start; + void *expected, *alt; + /* line records which branch last changed the expectation; it is printed in the error report. */ int line = __LINE__; + + entry = mas_prev(&mas, test->index[index]); + r_start = test->index[index + i]; + r_end = test->last[index + i]; + expected = xa_mk_value(r_start); + + /* Walking backwards, a deleted range shows up as the mod range appearing in its place. */ if (i == reader->del) { + alt_start = test->index[index + reader->mod]; + if (mas.index == alt_start) { + line = __LINE__; + if (!deleted) + uatomic_inc(&test->seen_deleted); + deleted = true; + } + if (deleted) { + line = __LINE__; + i = reader->mod; + r_start = test->index[index + i]; + r_end = test->last[index + i]; + expected = xa_mk_value(r_start); + } + } + if (!added && (i == reader->add)) { + alt_start = test->index[index + reader->flip]; + if (mas.index == r_start) { + line = __LINE__; + uatomic_inc(&test->seen_added); + added = true; + } else if (mas.index == alt_start) { + line = __LINE__; + i =
reader->flip; + r_start = test->index[index + i]; + r_end = test->last[index + i]; + expected = xa_mk_value(r_start); + } + } + + if (i == reader->mod) + line = __LINE__; + else if (i == reader->flip) + line = __LINE__; + + if (mas.index != r_start) { + alt = xa_mk_value(index + i * 2 + 1 + + RCU_RANGE_COUNT); + mt_dump(test->mt); + printk("Error: %lu-%lu %p != %lu-%lu %p %p line %d i %d\n", + mas.index, mas.last, entry, + r_start, r_end, expected, alt, + line, i); + } + RCU_MT_BUG_ON(test, mas.index != r_start); + RCU_MT_BUG_ON(test, mas.last != r_end); + + if (i == reader->mod) { + alt = xa_mk_value(index + i * 2 + 1 + + RCU_RANGE_COUNT); + + if (entry != expected) { + if (!modified) + uatomic_inc(&test->seen_modified); + modified = true; + } else { + if (modified) + uatomic_inc(&test->seen_modified); + modified = false; + } + if (modified) + RCU_MT_BUG_ON(test, entry != alt); + + + } else if (i == reader->flip) { + alt = xa_mk_value(index + i + + RCU_RANGE_COUNT); + if (prev) { + if (toggled && entry == expected) + uatomic_inc(&test->seen_toggle); + else if (!toggled && entry == alt) + uatomic_inc(&test->seen_toggle); + } + + if (entry == expected) + toggled = false; + else if (entry == alt) + toggled = true; + else { + printk("%lu-%lu %p != %p or %p\n", + mas.index, mas.last, entry, + expected, alt); + RCU_MT_BUG_ON(test, 1); + } + + prev = entry; + } else { + if (entry != expected) + printk("%lu-%lu %p != %p\n", mas.index, + mas.last, entry, expected); + RCU_MT_BUG_ON(test, entry != expected); + } + } + rcu_read_unlock(); + usleep(test->pause); + } + + rcu_unregister_thread(); + return NULL; +} + +static void rcu_stress_rev(struct maple_tree *mt, struct rcu_test_struct2 *test, + int count, struct rcu_reader_struct *test_reader) +{ + int i, j = 10000; + bool toggle = true; + + test->start = true; /* Release the hounds! 
*/ + usleep(5); + + while (j--) { + toggle = !toggle; + i = count; + while (i--) { + unsigned long start, end; + struct rcu_reader_struct *this = &test_reader[i]; + + /* Mod offset */ + if (j == 600) { + start = test->index[this->id + this->mod]; + end = test->last[this->id + this->mod]; + mtree_store_range(mt, start, end, + xa_mk_value(this->id + this->mod * 2 + + 1 + RCU_RANGE_COUNT), + GFP_KERNEL); + } + + /* Toggle */ + if (!(j % 5)) { + start = test->index[this->id + this->flip]; + end = test->last[this->id + this->flip]; + mtree_store_range(mt, start, end, + xa_mk_value((toggle ? start : + this->id + this->flip + + RCU_RANGE_COUNT)), + GFP_KERNEL); + } + + /* delete */ + if (j == 400) { + start = test->index[this->id + this->del]; + end = test->last[this->id + this->del]; + mtree_store_range(mt, start, end, NULL, GFP_KERNEL); + } + + /* add */ + if (j == 500) { + start = test->index[this->id + this->add]; + end = test->last[this->id + this->add]; + mtree_store_range(mt, start, end, + xa_mk_value(start), GFP_KERNEL); + } + } + usleep(test->pause); + /* If a test fails, don't flood the console */ + if (test->stop) + break; + } +} + +static void rcu_stress_fwd(struct maple_tree *mt, struct rcu_test_struct2 *test, + int count, struct rcu_reader_struct *test_reader) +{ + int j, i; + bool toggle = true; + + test->start = true; /* Release the hounds! */ + usleep(5); + for (j = 0; j < 10000; j++) { + toggle = !toggle; + for (i = 0; i < count; i++) { + unsigned long start, end; + struct rcu_reader_struct *this = &test_reader[i]; + + /* Mod offset */ + if (j == 600) { + start = test->index[this->id + this->mod]; + end = test->last[this->id + this->mod]; + mtree_store_range(mt, start, end, + xa_mk_value(this->id + this->mod * 2 + + 1 + RCU_RANGE_COUNT), + GFP_KERNEL); + } + + /* Toggle */ + if (!(j % 5)) { + start = test->index[this->id + this->flip]; + end = test->last[this->id + this->flip]; + mtree_store_range(mt, start, end, + xa_mk_value((toggle ? 
start : + this->id + this->flip + + RCU_RANGE_COUNT)), + GFP_KERNEL); + } + + /* delete */ + if (j == 400) { + start = test->index[this->id + this->del]; + end = test->last[this->id + this->del]; + mtree_store_range(mt, start, end, NULL, GFP_KERNEL); + } + + /* add */ + if (j == 500) { + start = test->index[this->id + this->add]; + end = test->last[this->id + this->add]; + mtree_store_range(mt, start, end, + xa_mk_value(start), GFP_KERNEL); + } + } + usleep(test->pause); + /* If a test fails, don't flood the console */ + if (test->stop) + break; + } +} + +/* + * This is to check: + * 1. Range that is not ever present + * 2. Range that is always present + * 3. Things being added but not removed. + * 4. Things being removed but not added. + * 5. Things are being added and removed, searches may succeed or fail + * + * This sets up two readers for every 10 entries; one forward and one reverse + * reading. + * + * @mt: The maple tree to stress + * @forward: Use rcu_stress_fwd() as the writer if true, else rcu_stress_rev() + */ +static void rcu_stress(struct maple_tree *mt, bool forward) +{ + unsigned int count, i; + unsigned long r, seed; + pthread_t readers[RCU_RANGE_COUNT / 5]; + struct rcu_test_struct2 test; + struct rcu_reader_struct test_reader[RCU_RANGE_COUNT / 5]; + void *(*function)(void *); + + /* Test setup */ + test.mt = mt; + test.pause = 5; + test.seen_toggle = 0; + test.seen_deleted = 0; + test.seen_added = 0; + test.seen_modified = 0; + test.thread_count = 0; + test.start = test.stop = false; + seed = time(NULL); + srand(seed); + for (i = 0; i < RCU_RANGE_COUNT; i++) { + r = seed + rand(); + mtree_store_range(mt, seed, r, + xa_mk_value(seed), GFP_KERNEL); + + /* Record start and end of entry */ + test.index[i] = seed; + test.last[i] = r; + seed = 1 + r + rand() % 10; + } + + i = count = ARRAY_SIZE(readers); + while (i--) { + unsigned long id; + + id = i / 2 * 10; + if (i % 2) + function = rcu_reader_fwd; + else + function = rcu_reader_rev; + + rcu_reader_setup(&test_reader[i], id, &test); + if (pthread_create(&readers[i], NULL, *function, + &test_reader[i])) { + 
perror("creating reader thread"); + exit(1); + } + } + + for (i = 0; i < ARRAY_SIZE(readers); i++) { + struct rcu_reader_struct *this = &test_reader[i]; + int add = this->id + this->add; + + /* Remove add entries from the tree for later addition */ + mtree_store_range(mt, test.index[add], test.last[add], + NULL, GFP_KERNEL); + } + + mt_set_in_rcu(mt); + /* + * Wait until every reader has checked in. thread_count starts at 0 and + * only ARRAY_SIZE(readers) readers exist, so the old '>' test could never + * hold the writer back. NOTE(review): assumes rcu_reader_register() bumps + * thread_count once per reader - confirm. + */ + do { + usleep(5); + } while (test.thread_count < ARRAY_SIZE(readers)); + + if (forward) + rcu_stress_fwd(mt, &test, count, test_reader); + else + rcu_stress_rev(mt, &test, count, test_reader); + + test.stop = true; + while (count--) + pthread_join(readers[count], NULL); + + mt_validate(mt); +} + + +struct rcu_test_struct { + struct maple_tree *mt; /* the maple tree */ + int count; /* Number of times to check value(s) */ + unsigned long index; /* The first index to check */ + void *entry1; /* The first entry value */ + void *entry2; /* The second entry value */ + void *entry3; /* The third entry value */ + + bool update_2; + bool update_3; + unsigned long range_start; + unsigned long range_end; + unsigned int loop_sleep; + unsigned int val_sleep; + + unsigned int failed; /* failed detection for other threads */ + unsigned int seen_entry2; /* Number of threads that have seen the new value */ + unsigned int seen_entry3; /* Number of threads that have seen the new value */ + unsigned int seen_both; /* Number of threads that have seen both new values */ + unsigned int seen_toggle; + unsigned int seen_added; + unsigned int seen_removed; + unsigned long last; /* The end of the range to write. 
*/ + + unsigned long removed; /* The index of the removed entry */ + unsigned long added; /* The index of the added entry */ + unsigned long toggle; /* The index of the toggled entry */ +}; + +/* + * eval_rcu_entry() - Classify an entry read by a reader and update the shared + * "seen" counters + * + * @test: The rcu_test_struct shared by the readers + * @entry: The entry that was read from the tree + * @update_2: Per-reader flag, set once this reader has seen test->entry2 + * @update_3: Per-reader flag, set once this reader has seen test->entry3 + * + * Return: 0 if @entry is entry1, entry2 or entry3, 1 otherwise. + */ +static inline +int eval_rcu_entry(struct rcu_test_struct *test, void *entry, bool *update_2, + bool *update_3) +{ + if (entry == test->entry1) + return 0; + + if (entry == test->entry2) { + if (!(*update_2)) { + uatomic_inc(&test->seen_entry2); + *update_2 = true; + /* Check the flag, not the pointer: entry3 must already be seen */ + if (*update_3) + uatomic_inc(&test->seen_both); + } + return 0; + } + + if (entry == test->entry3) { + if (!(*update_3)) { + uatomic_inc(&test->seen_entry3); + *update_3 = true; + /* Check the flag, not the pointer: entry2 must already be seen */ + if (*update_2) + uatomic_inc(&test->seen_both); + } + return 0; + } + + return 1; +} + +/* + * rcu_val() - Read a given value in the tree test->count times using the + * regular API + * + * @ptr: The pointer to the rcu_test_struct + */ +static void *rcu_val(void *ptr) +{ + struct rcu_test_struct *test = (struct rcu_test_struct *)ptr; + unsigned long count = test->count; + bool update_2 = false; + bool update_3 = false; + void *entry; + + rcu_register_thread(); + while (count--) { + usleep(test->val_sleep); + /* + * No locking required, regular API locking is handled in the + * maple tree code + */ + entry = mtree_load(test->mt, test->index); + MT_BUG_ON(test->mt, eval_rcu_entry(test, entry, &update_2, + &update_3)); + } + rcu_unregister_thread(); + return NULL; +} + +/* + * rcu_loop() - Loop over a section of the maple tree, checking for an expected + * value using the advanced API + * + * @ptr - The pointer to the rcu_test_struct + */ +static void *rcu_loop(void *ptr) +{ + struct rcu_test_struct *test = (struct rcu_test_struct *)ptr; + unsigned long count = test->count; + void *entry, *expected; + bool update_2 = false; + bool update_3 = false; + MA_STATE(mas, test->mt, test->range_start, test->range_start); + + rcu_register_thread(); + + /* + * Loop through the test->range_start - test->range_end test->count + * times + */ + while 
(count--) { + usleep(test->loop_sleep); + rcu_read_lock(); + mas_for_each(&mas, entry, test->range_end) { + /* The expected value is based on the start range. */ + expected = xa_mk_value(mas.index ? mas.index / 10 : 0); + + /* Out of the interesting range */ + if (mas.index < test->index || mas.index > test->last) { + if (entry != expected) { + printk("%lx - %lx = %p not %p\n", + mas.index, mas.last, entry, expected); + } + MT_BUG_ON(test->mt, entry != expected); + continue; + } + + if (entry == expected) + continue; /* Not seen. */ + + /* In the interesting range */ + MT_BUG_ON(test->mt, eval_rcu_entry(test, entry, + &update_2, + &update_3)); + } + rcu_read_unlock(); + mas_set(&mas, test->range_start); + } + + rcu_unregister_thread(); + return NULL; +} + +static noinline +void run_check_rcu(struct maple_tree *mt, struct rcu_test_struct *vals) +{ + + int i; + void *(*function)(void *); + pthread_t readers[20]; + + mt_set_in_rcu(mt); + MT_BUG_ON(mt, !mt_in_rcu(mt)); + + for (i = 0; i < ARRAY_SIZE(readers); i++) { + if (i % 2) + function = rcu_loop; + else + function = rcu_val; + + if (pthread_create(&readers[i], NULL, *function, vals)) { + perror("creating reader thread"); + exit(1); + } + } + + usleep(5); /* small yield to ensure all threads are at least started. */ + mtree_store_range(mt, vals->index, vals->last, vals->entry2, + GFP_KERNEL); + while (i--) + pthread_join(readers[i], NULL); + + /* Make sure the test caught at least one update. 
*/ + MT_BUG_ON(mt, !vals->seen_entry2); +} + +static noinline +void run_check_rcu_slowread(struct maple_tree *mt, struct rcu_test_struct *vals) +{ + + int i; + void *(*function)(void *); + pthread_t readers[20]; + unsigned int index = vals->index; + + mt_set_in_rcu(mt); + MT_BUG_ON(mt, !mt_in_rcu(mt)); + + for (i = 0; i < ARRAY_SIZE(readers); i++) { + if (i % 2) + function = rcu_loop; + else + function = rcu_val; + + if (pthread_create(&readers[i], NULL, *function, vals)) { + perror("creating reader thread"); + exit(1); + } + } + + usleep(5); /* small yield to ensure all threads are at least started. */ + + while (index <= vals->last) { + mtree_store(mt, index, + (index % 2 ? vals->entry2 : vals->entry3), + GFP_KERNEL); + index++; + usleep(5); + } + + while (i--) + pthread_join(readers[i], NULL); + + /* Make sure the test caught at least one update. */ + MT_BUG_ON(mt, !vals->seen_entry2); + MT_BUG_ON(mt, !vals->seen_entry3); + MT_BUG_ON(mt, !vals->seen_both); +} +static noinline void check_rcu_simulated(struct maple_tree *mt) +{ + unsigned long i, nr_entries = 1000; + unsigned long target = 4320; + unsigned long val = 0xDEAD; + + MA_STATE(mas_writer, mt, 0, 0); + MA_STATE(mas_reader, mt, target, target); + + rcu_register_thread(); + + mt_set_in_rcu(mt); + mas_lock(&mas_writer); + for (i = 0; i <= nr_entries; i++) { + mas_writer.index = i * 10; + mas_writer.last = i * 10 + 5; + mas_store_gfp(&mas_writer, xa_mk_value(i), GFP_KERNEL); + } + mas_unlock(&mas_writer); + + /* Overwrite one entry with a new value. */ + mas_set_range(&mas_writer, target, target + 5); + rcu_read_lock(); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(target/10)); + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(val), GFP_KERNEL); + mas_unlock(&mas_writer); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(val)); + rcu_read_unlock(); + + /* Restore value. 
*/ + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(target/10), GFP_KERNEL); + mas_unlock(&mas_writer); + mas_reset(&mas_reader); + + + /* Overwrite 1/2 the entry */ + mas_set_range(&mas_writer, target, target + 2); + rcu_read_lock(); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(target/10)); + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(val), GFP_KERNEL); + mas_unlock(&mas_writer); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(val)); + rcu_read_unlock(); + + + /* Restore value. */ + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(target/10), GFP_KERNEL); + mas_unlock(&mas_writer); + mas_reset(&mas_reader); + + /* Overwrite last 1/2 the entry */ + mas_set_range(&mas_writer, target + 2, target + 5); + rcu_read_lock(); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(target/10)); + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(val), GFP_KERNEL); + mas_unlock(&mas_writer); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(target/10)); + rcu_read_unlock(); + + + /* Restore value. */ + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(target/10), GFP_KERNEL); + mas_unlock(&mas_writer); + mas_reset(&mas_reader); + + /* Overwrite more than the entry */ + mas_set_range(&mas_writer, target - 5, target + 15); + rcu_read_lock(); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(target/10)); + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(val), GFP_KERNEL); + mas_unlock(&mas_writer); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(val)); + rcu_read_unlock(); + + /* Restore value. */ + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(target/10), GFP_KERNEL); + mas_unlock(&mas_writer); + mas_reset(&mas_reader); + + /* Overwrite more than the node. 
*/ + mas_set_range(&mas_writer, target - 400, target + 400); + rcu_read_lock(); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(target/10)); + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(val), GFP_KERNEL); + mas_unlock(&mas_writer); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(val)); + rcu_read_unlock(); + + /* Restore value. */ + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(target/10), GFP_KERNEL); + mas_unlock(&mas_writer); + mas_reset(&mas_reader); + + /* Overwrite the tree */ + mas_set_range(&mas_writer, 0, ULONG_MAX); + rcu_read_lock(); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(target/10)); + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(val), GFP_KERNEL); + mas_unlock(&mas_writer); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(val)); + rcu_read_unlock(); + + /* Clear out tree & recreate it */ + mas_lock(&mas_writer); + mas_set_range(&mas_writer, 0, ULONG_MAX); + mas_store_gfp(&mas_writer, NULL, GFP_KERNEL); + mas_set_range(&mas_writer, 0, 0); + for (i = 0; i <= nr_entries; i++) { + mas_writer.index = i * 10; + mas_writer.last = i * 10 + 5; + mas_store_gfp(&mas_writer, xa_mk_value(i), GFP_KERNEL); + } + mas_unlock(&mas_writer); + + /* next check */ + /* Overwrite one entry with a new value. */ + mas_reset(&mas_reader); + mas_set_range(&mas_writer, target, target + 5); + mas_set_range(&mas_reader, target, target); + rcu_read_lock(); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(target/10)); + mas_prev(&mas_reader, 0); + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(val), GFP_KERNEL); + mas_unlock(&mas_writer); + MT_BUG_ON(mt, mas_next(&mas_reader, ULONG_MAX) != xa_mk_value(val)); + rcu_read_unlock(); + + /* Restore value. */ + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(target/10), GFP_KERNEL); + mas_unlock(&mas_writer); + + /* prev check */ + /* Overwrite one entry with a new value. 
*/ + mas_reset(&mas_reader); + mas_set_range(&mas_writer, target, target + 5); + mas_set_range(&mas_reader, target, target); + rcu_read_lock(); + MT_BUG_ON(mt, mas_walk(&mas_reader) != xa_mk_value(target/10)); + mas_next(&mas_reader, ULONG_MAX); + mas_lock(&mas_writer); + mas_store_gfp(&mas_writer, xa_mk_value(val), GFP_KERNEL); + mas_unlock(&mas_writer); + MT_BUG_ON(mt, mas_prev(&mas_reader, 0) != xa_mk_value(val)); + rcu_read_unlock(); + + rcu_unregister_thread(); +} + +static noinline void check_rcu_threaded(struct maple_tree *mt) +{ + unsigned long i, nr_entries = 1000; + struct rcu_test_struct vals; + + vals.val_sleep = 200; + vals.loop_sleep = 110; + + rcu_register_thread(); + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 5, + xa_mk_value(i), GFP_KERNEL); + /* Store across several slots. */ + vals.count = 1000; + vals.mt = mt; + vals.index = 8650; + vals.last = 8666; + vals.entry1 = xa_mk_value(865); + vals.entry2 = xa_mk_value(8650); + vals.entry3 = xa_mk_value(8650); + vals.range_start = 0; + vals.range_end = ULONG_MAX; + vals.seen_entry2 = 0; + vals.seen_entry3 = 0; + + run_check_rcu(mt, &vals); + mtree_destroy(mt); + + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 5, + xa_mk_value(i), GFP_KERNEL); + + /* 4390-4395: value 439 (0x1b7) [0x36f] */ + /* Store across several slots. */ + /* Spanning store. 
*/ + vals.count = 10000; + vals.mt = mt; + vals.index = 4390; + vals.last = 4398; + vals.entry1 = xa_mk_value(4390); + vals.entry2 = xa_mk_value(439); + vals.entry3 = xa_mk_value(439); + vals.seen_entry2 = 0; + vals.range_start = 4316; + vals.range_end = 5035; + run_check_rcu(mt, &vals); + mtree_destroy(mt); + + + /* Forward writer for rcu stress */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + rcu_stress(mt, true); + mtree_destroy(mt); + + /* Reverse writer for rcu stress */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + rcu_stress(mt, false); + mtree_destroy(mt); + + /* Slow reader test with spanning store. */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 5, + xa_mk_value(i), GFP_KERNEL); + + /* 4390-4395: value 439 (0x1b7) [0x36f] */ + /* Store across several slots. */ + /* Spanning store. */ + vals.count = 15000; + vals.mt = mt; + vals.index = 4390; + vals.last = 4398; + vals.entry1 = xa_mk_value(4390); + vals.entry2 = xa_mk_value(439); + vals.entry3 = xa_mk_value(4391); + vals.seen_toggle = 0; + vals.seen_added = 0; + vals.seen_removed = 0; + vals.range_start = 4316; + vals.range_end = 5035; + vals.removed = 4360; + vals.added = 4396; + vals.toggle = 4347; + vals.val_sleep = 400; + vals.loop_sleep = 200; + vals.seen_entry2 = 0; + vals.seen_entry3 = 0; + vals.seen_both = 0; + vals.entry3 = xa_mk_value(438); + + run_check_rcu_slowread(mt, &vals); + rcu_unregister_thread(); +} + +extern void test_kmem_cache_bulk(void); + +/* Test spanning writes that require balancing right sibling or right cousin */ +static noinline void check_spanning_relatives(struct maple_tree *mt) +{ + + unsigned long i, nr_entries = 1000; + + for (i = 0; i <= nr_entries; i++) + mtree_store_range(mt, i*10, i*10 + 5, + xa_mk_value(i), GFP_KERNEL); + + + mtree_store_range(mt, 9365, 9955, NULL, GFP_KERNEL); +} + +static noinline void check_fuzzer(struct maple_tree *mt) +{ + /* + * 1. 
Causes a spanning rebalance of a single root node. + * Fixed by setting the correct limit in mast_cp_to_nodes() when the + * entire right side is consumed. + */ + mtree_test_insert(mt, 88, (void *)0xb1); + mtree_test_insert(mt, 84, (void *)0xa9); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert(mt, 4, (void *)0x9); + mtree_test_insert(mt, 14, (void *)0x1d); + mtree_test_insert(mt, 7, (void *)0xf); + mtree_test_insert(mt, 12, (void *)0x19); + mtree_test_insert(mt, 18, (void *)0x25); + mtree_test_store_range(mt, 8, 18, (void *)0x11); + mtree_destroy(mt); + + + /* + * 2. Cause a spanning rebalance of two nodes in root. + * Fixed by setting mast->r->max correctly. + */ + mt_init_flags(mt, 0); + mtree_test_store(mt, 87, (void *)0xaf); + mtree_test_store(mt, 0, (void *)0x1); + mtree_test_load(mt, 4); + mtree_test_insert(mt, 4, (void *)0x9); + mtree_test_store(mt, 8, (void *)0x11); + mtree_test_store(mt, 44, (void *)0x59); + mtree_test_store(mt, 68, (void *)0x89); + mtree_test_store(mt, 2, (void *)0x5); + mtree_test_insert(mt, 43, (void *)0x57); + mtree_test_insert(mt, 24, (void *)0x31); + mtree_test_insert(mt, 844, (void *)0x699); + mtree_test_store(mt, 84, (void *)0xa9); + mtree_test_store(mt, 4, (void *)0x9); + mtree_test_erase(mt, 4); + mtree_test_load(mt, 5); + mtree_test_erase(mt, 0); + mtree_destroy(mt); + + /* + * 3. Cause a node overflow on copy + * Fixed by using the correct check for node size in mas_wr_modify() + * Also discovered issue with metadata setting. 
+ */ + mt_init_flags(mt, 0); + mtree_test_store_range(mt, 0, 18446744073709551615UL, (void *)0x1); + mtree_test_store(mt, 4, (void *)0x9); + mtree_test_erase(mt, 5); + mtree_test_erase(mt, 0); + mtree_test_erase(mt, 4); + mtree_test_store(mt, 5, (void *)0xb); + mtree_test_erase(mt, 5); + mtree_test_store(mt, 5, (void *)0xb); + mtree_test_erase(mt, 5); + mtree_test_erase(mt, 4); + mtree_test_store(mt, 4, (void *)0x9); + mtree_test_store(mt, 444, (void *)0x379); + mtree_test_store(mt, 0, (void *)0x1); + mtree_test_load(mt, 0); + mtree_test_store(mt, 5, (void *)0xb); + mtree_test_erase(mt, 0); + mtree_destroy(mt); + + /* + * 4. spanning store failure due to writing incorrect pivot value at + * last slot. + * Fixed by setting mast->r->max correctly in mast_cp_to_nodes() + * + */ + mt_init_flags(mt, 0); + mtree_test_insert(mt, 261, (void *)0x20b); + mtree_test_store(mt, 516, (void *)0x409); + mtree_test_store(mt, 6, (void *)0xd); + mtree_test_insert(mt, 5, (void *)0xb); + mtree_test_insert(mt, 1256, (void *)0x9d1); + mtree_test_store(mt, 4, (void *)0x9); + mtree_test_erase(mt, 1); + mtree_test_store(mt, 56, (void *)0x71); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_store(mt, 24, (void *)0x31); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 2263, (void *)0x11af); + mtree_test_insert(mt, 446, (void *)0x37d); + mtree_test_store_range(mt, 6, 45, (void *)0xd); + mtree_test_store_range(mt, 3, 446, (void *)0x7); + mtree_destroy(mt); + + /* + * 5. mas_wr_extend_null() may overflow slots. + * Fix by checking against wr_mas->node_end. 
+ */ + mt_init_flags(mt, 0); + mtree_test_store(mt, 48, (void *)0x61); + mtree_test_store(mt, 3, (void *)0x7); + mtree_test_load(mt, 0); + mtree_test_store(mt, 88, (void *)0xb1); + mtree_test_store(mt, 81, (void *)0xa3); + mtree_test_insert(mt, 0, (void *)0x1); + mtree_test_insert(mt, 8, (void *)0x11); + mtree_test_insert(mt, 4, (void *)0x9); + mtree_test_insert(mt, 2480, (void *)0x1361); + mtree_test_insert(mt, 18446744073709551615UL, + (void *)0xffffffffffffffff); + mtree_test_erase(mt, 18446744073709551615UL); + mtree_destroy(mt); + + /* + * 6. When reusing a node with an implied pivot and the node is + * shrinking, old data would be left in the implied slot + * Fixed by checking the last pivot for the mas->max and clear + * accordingly. This only affected the left-most node as that node is + * the only one allowed to end in NULL. + */ + mt_init_flags(mt, 0); + mtree_test_erase(mt, 3); + mtree_test_insert(mt, 22, (void *)0x2d); + mtree_test_insert(mt, 15, (void *)0x1f); + mtree_test_load(mt, 2); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_insert(mt, 5, (void *)0xb); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_insert(mt, 4, (void *)0x9); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 3); + mtree_test_insert(mt, 22, (void *)0x2d); + mtree_test_insert(mt, 15, (void *)0x1f); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_insert(mt, 8, (void *)0x11); + mtree_test_load(mt, 2); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_store(mt, 1, (void *)0x3); + mtree_test_insert(mt, 5, (void *)0xb); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_insert(mt, 4, (void *)0x9); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 1); 
+ mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 3); + mtree_test_insert(mt, 22, (void *)0x2d); + mtree_test_insert(mt, 15, (void *)0x1f); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_insert(mt, 8, (void *)0x11); + mtree_test_insert(mt, 12, (void *)0x19); + mtree_test_erase(mt, 1); + mtree_test_store_range(mt, 4, 62, (void *)0x9); + mtree_test_erase(mt, 62); + mtree_test_store_range(mt, 1, 0, (void *)0x3); + mtree_test_insert(mt, 11, (void *)0x17); + mtree_test_insert(mt, 3, (void *)0x7); + mtree_test_insert(mt, 3, (void *)0x7); + mtree_test_store(mt, 62, (void *)0x7d); + mtree_test_erase(mt, 62); + mtree_test_store_range(mt, 1, 15, (void *)0x3); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 22, (void *)0x2d); + mtree_test_insert(mt, 12, (void *)0x19); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 3, (void *)0x7); + mtree_test_store(mt, 62, (void *)0x7d); + mtree_test_erase(mt, 62); + mtree_test_insert(mt, 122, (void *)0xf5); + mtree_test_store(mt, 3, (void *)0x7); + mtree_test_insert(mt, 0, (void *)0x1); + mtree_test_store_range(mt, 0, 1, (void *)0x1); + mtree_test_insert(mt, 85, (void *)0xab); + mtree_test_insert(mt, 72, (void *)0x91); + mtree_test_insert(mt, 81, (void *)0xa3); + mtree_test_insert(mt, 726, (void *)0x5ad); + mtree_test_insert(mt, 0, (void *)0x1); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_store(mt, 51, (void *)0x67); + mtree_test_insert(mt, 611, (void *)0x4c7); + mtree_test_insert(mt, 485, (void *)0x3cb); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 0, (void *)0x1); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_insert_range(mt, 26, 1, (void *)0x35); + mtree_test_load(mt, 1); + mtree_test_store_range(mt, 1, 22, (void *)0x3); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 1); + mtree_test_load(mt, 53); + mtree_test_load(mt, 1); + 
mtree_test_store_range(mt, 1, 1, (void *)0x3); + mtree_test_insert(mt, 222, (void *)0x1bd); + mtree_test_insert(mt, 485, (void *)0x3cb); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 1); + mtree_test_load(mt, 0); + mtree_test_insert(mt, 21, (void *)0x2b); + mtree_test_insert(mt, 3, (void *)0x7); + mtree_test_store(mt, 621, (void *)0x4db); + mtree_test_insert(mt, 0, (void *)0x1); + mtree_test_erase(mt, 5); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_store(mt, 62, (void *)0x7d); + mtree_test_erase(mt, 62); + mtree_test_store_range(mt, 1, 0, (void *)0x3); + mtree_test_insert(mt, 22, (void *)0x2d); + mtree_test_insert(mt, 12, (void *)0x19); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_store_range(mt, 4, 62, (void *)0x9); + mtree_test_erase(mt, 62); + mtree_test_erase(mt, 1); + mtree_test_load(mt, 1); + mtree_test_store_range(mt, 1, 22, (void *)0x3); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 1); + mtree_test_load(mt, 53); + mtree_test_load(mt, 1); + mtree_test_store_range(mt, 1, 1, (void *)0x3); + mtree_test_insert(mt, 222, (void *)0x1bd); + mtree_test_insert(mt, 485, (void *)0x3cb); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_load(mt, 0); + mtree_test_load(mt, 0); + mtree_destroy(mt); + + /* + * 7. Previous fix was incomplete, fix mas_resuse_node() clearing of old + * data by overwriting it first - that way metadata is of no concern. 
+ */ + mt_init_flags(mt, 0); + mtree_test_load(mt, 1); + mtree_test_insert(mt, 102, (void *)0xcd); + mtree_test_erase(mt, 2); + mtree_test_erase(mt, 0); + mtree_test_load(mt, 0); + mtree_test_insert(mt, 4, (void *)0x9); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert(mt, 110, (void *)0xdd); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_insert_range(mt, 5, 0, (void *)0xb); + mtree_test_erase(mt, 2); + mtree_test_store(mt, 0, (void *)0x1); + mtree_test_store(mt, 112, (void *)0xe1); + mtree_test_insert(mt, 21, (void *)0x2b); + mtree_test_store(mt, 1, (void *)0x3); + mtree_test_insert_range(mt, 110, 2, (void *)0xdd); + mtree_test_store(mt, 2, (void *)0x5); + mtree_test_load(mt, 22); + mtree_test_erase(mt, 2); + mtree_test_store(mt, 210, (void *)0x1a5); + mtree_test_store_range(mt, 0, 2, (void *)0x1); + mtree_test_store(mt, 2, (void *)0x5); + mtree_test_erase(mt, 2); + mtree_test_erase(mt, 22); + mtree_test_erase(mt, 1); + mtree_test_erase(mt, 2); + mtree_test_store(mt, 0, (void *)0x1); + mtree_test_load(mt, 112); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_erase(mt, 2); + mtree_test_store(mt, 1, (void *)0x3); + mtree_test_insert_range(mt, 1, 2, (void *)0x3); + mtree_test_erase(mt, 0); + mtree_test_erase(mt, 2); + mtree_test_store(mt, 2, (void *)0x5); + mtree_test_erase(mt, 0); + mtree_test_erase(mt, 2); + mtree_test_store(mt, 0, (void *)0x1); + mtree_test_store(mt, 0, (void *)0x1); + mtree_test_erase(mt, 2); + mtree_test_store(mt, 2, (void *)0x5); + mtree_test_erase(mt, 2); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert_range(mt, 1, 2, (void *)0x3); + mtree_test_erase(mt, 0); + mtree_test_erase(mt, 2); + mtree_test_store(mt, 0, (void *)0x1); + mtree_test_load(mt, 112); + mtree_test_store_range(mt, 110, 12, (void *)0xdd); + mtree_test_store(mt, 2, (void *)0x5); + mtree_test_load(mt, 110); + mtree_test_insert_range(mt, 4, 71, (void *)0x9); + mtree_test_load(mt, 2); + mtree_test_store(mt, 2, (void *)0x5); + 
mtree_test_insert_range(mt, 11, 22, (void *)0x17); + mtree_test_erase(mt, 12); + mtree_test_store(mt, 2, (void *)0x5); + mtree_test_load(mt, 22); + mtree_destroy(mt); + + + /* + * 8. When rebalancing or spanning_rebalance(), the max of the new node + * may be set incorrectly to the final pivot and not the right max. + * Fix by setting the left max to orig right max if the entire node is + * consumed. + */ + mt_init_flags(mt, 0); + mtree_test_store(mt, 6, (void *)0xd); + mtree_test_store(mt, 67, (void *)0x87); + mtree_test_insert(mt, 15, (void *)0x1f); + mtree_test_insert(mt, 6716, (void *)0x3479); + mtree_test_store(mt, 61, (void *)0x7b); + mtree_test_insert(mt, 13, (void *)0x1b); + mtree_test_store(mt, 8, (void *)0x11); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_load(mt, 0); + mtree_test_erase(mt, 67167); + mtree_test_insert_range(mt, 6, 7167, (void *)0xd); + mtree_test_insert(mt, 6, (void *)0xd); + mtree_test_erase(mt, 67); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 667167); + mtree_test_insert(mt, 6, (void *)0xd); + mtree_test_store(mt, 67, (void *)0x87); + mtree_test_insert(mt, 5, (void *)0xb); + mtree_test_erase(mt, 1); + mtree_test_insert(mt, 6, (void *)0xd); + mtree_test_erase(mt, 67); + mtree_test_insert(mt, 15, (void *)0x1f); + mtree_test_insert(mt, 67167, (void *)0x20cbf); + mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_load(mt, 7); + mtree_test_insert(mt, 16, (void *)0x21); + mtree_test_insert(mt, 36, (void *)0x49); + mtree_test_store(mt, 67, (void *)0x87); + mtree_test_store(mt, 6, (void *)0xd); + mtree_test_insert(mt, 367, (void *)0x2df); + mtree_test_insert(mt, 115, (void *)0xe7); + mtree_test_store(mt, 0, (void *)0x1); + mtree_test_store_range(mt, 1, 3, (void *)0x3); + mtree_test_store(mt, 1, (void *)0x3); + mtree_test_erase(mt, 67167); + mtree_test_insert_range(mt, 6, 47, (void *)0xd); + mtree_test_store(mt, 1, (void *)0x3); + mtree_test_insert_range(mt, 1, 67, (void *)0x3); + mtree_test_load(mt, 67); + 
mtree_test_insert(mt, 1, (void *)0x3); + mtree_test_erase(mt, 67167); + mtree_destroy(mt); + + /* + * 9. spanning store to the end of data caused an invalid metadata + * length which resulted in a crash eventually. + * Fix by checking if there is a value in pivot before incrementing the + * metadata end in mab_mas_cp(). To ensure this doesn't happen again, + * abstract the two locations this happens into a function called + * mas_leaf_set_meta(). + */ + mt_init_flags(mt, 0); + mtree_test_insert(mt, 21, (void *)0x2b); + mtree_test_insert(mt, 12, (void *)0x19); + mtree_test_insert(mt, 6, (void *)0xd); + mtree_test_insert(mt, 8, (void *)0x11); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert(mt, 91, (void *)0xb7); + mtree_test_insert(mt, 18, (void *)0x25); + mtree_test_insert(mt, 81, (void *)0xa3); + mtree_test_store_range(mt, 0, 128, (void *)0x1); + mtree_test_store(mt, 1, (void *)0x3); + mtree_test_erase(mt, 8); + mtree_test_insert(mt, 11, (void *)0x17); + mtree_test_insert(mt, 8, (void *)0x11); + mtree_test_insert(mt, 21, (void *)0x2b); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert(mt, 18446744073709551605UL, (void *)0xffffffffffffffeb); + mtree_test_erase(mt, 18446744073709551605UL); + mtree_test_store_range(mt, 0, 281, (void *)0x1); + mtree_test_erase(mt, 2); + mtree_test_insert(mt, 1211, (void *)0x977); + mtree_test_insert(mt, 111, (void *)0xdf); + mtree_test_insert(mt, 13, (void *)0x1b); + mtree_test_insert(mt, 211, (void *)0x1a7); + mtree_test_insert(mt, 11, (void *)0x17); + mtree_test_insert(mt, 5, (void *)0xb); + mtree_test_insert(mt, 1218, (void *)0x985); + mtree_test_insert(mt, 61, (void *)0x7b); + mtree_test_store(mt, 1, (void *)0x3); + mtree_test_insert(mt, 121, (void *)0xf3); + mtree_test_insert(mt, 8, (void *)0x11); + mtree_test_insert(mt, 21, (void *)0x2b); + mtree_test_insert(mt, 2, (void *)0x5); + mtree_test_insert(mt, 18446744073709551605UL, (void *)0xffffffffffffffeb); + mtree_test_erase(mt, 18446744073709551605UL); +} + 
+static DEFINE_MTREE(tree); +static int maple_tree_seed(void) +{ + unsigned long set[] = {5015, 5014, 5017, 25, 1000, + 1001, 1002, 1003, 1005, 0, + 5003, 5002}; + void *ptr = &set; + + pr_info("\nTEST STARTING\n\n"); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_root_expand(&tree); + mtree_destroy(&tree); + +#if defined(BENCH_SLOT_STORE) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_slot_store(&tree); + mtree_destroy(&tree); + goto skip; +#endif +#if defined(BENCH_NODE_STORE) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_node_store(&tree); + mtree_destroy(&tree); + goto skip; +#endif +#if defined(BENCH_AWALK) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_awalk(&tree); + mtree_destroy(&tree); + goto skip; +#endif +#if defined(BENCH_WALK) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_walk(&tree); + mtree_destroy(&tree); + goto skip; +#endif +#if defined(BENCH_FORK) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_forking(&tree); + mtree_destroy(&tree); + goto skip; +#endif +#if defined(BENCH_MT_FOR_EACH) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_mt_for_each(&tree); + mtree_destroy(&tree); + goto skip; +#endif + + test_kmem_cache_bulk(); + + mt_init_flags(&tree, 0); + check_new_node(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_prealloc(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_spanning_write(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_null_expand(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, 0); + check_dfs_preorder(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_forking(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_mas_store_gfp(&tree); + mtree_destroy(&tree); + + /* Test ranges (store and insert) 
*/ + mt_init_flags(&tree, 0); + check_ranges(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_alloc_range(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_alloc_rev_range(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, 0); + + check_load(&tree, set[0], NULL); /* See if 5015 -> NULL */ + + check_insert(&tree, set[9], &tree); /* Insert 0 */ + check_load(&tree, set[9], &tree); /* See if 0 -> &tree */ + check_load(&tree, set[0], NULL); /* See if 5015 -> NULL */ + + check_insert(&tree, set[10], ptr); /* Insert 5003 */ + check_load(&tree, set[9], &tree); /* See if 0 -> &tree */ + check_load(&tree, set[11], NULL); /* See if 5002 -> NULL */ + check_load(&tree, set[10], ptr); /* See if 5003 -> ptr */ + + /* Clear out the tree */ + mtree_destroy(&tree); + + /* Try to insert, insert a dup, and load back what was inserted. */ + mt_init_flags(&tree, 0); + check_insert(&tree, set[0], &tree); /* Insert 5015 */ + check_dup_insert(&tree, set[0], &tree); /* Insert 5015 again */ + check_load(&tree, set[0], &tree); /* See if 5015 -> &tree */ + + /* + * Second set of tests try to load a value that doesn't exist, inserts + * a second value, then loads the value again + */ + check_load(&tree, set[1], NULL); /* See if 5014 -> NULL */ + check_insert(&tree, set[1], ptr); /* insert 5014 -> ptr */ + check_load(&tree, set[1], ptr); /* See if 5014 -> ptr */ + check_load(&tree, set[0], &tree); /* See if 5015 -> &tree */ + /* + * Tree currently contains: + * p[0]: 14 -> (nil) p[1]: 15 -> ptr p[2]: 16 -> &tree p[3]: 0 -> (nil) + */ + check_insert(&tree, set[6], ptr); /* insert 1002 -> ptr */ + check_insert(&tree, set[7], &tree); /* insert 1003 -> &tree */ + + check_load(&tree, set[0], &tree); /* See if 5015 -> &tree */ + check_load(&tree, set[1], ptr); /* See if 5014 -> ptr */ + check_load(&tree, set[6], ptr); /* See if 1002 -> ptr */ + check_load(&tree, set[7], &tree); /* 1003 = &tree ? 
*/ + + /* Clear out tree */ + mtree_destroy(&tree); + + mt_init_flags(&tree, 0); + /* Test inserting into a NULL hole. */ + check_insert(&tree, set[5], ptr); /* insert 1001 -> ptr */ + check_insert(&tree, set[7], &tree); /* insert 1003 -> &tree */ + check_insert(&tree, set[6], ptr); /* insert 1002 -> ptr */ + check_load(&tree, set[5], ptr); /* See if 1001 -> ptr */ + check_load(&tree, set[6], ptr); /* See if 1002 -> ptr */ + check_load(&tree, set[7], &tree); /* See if 1003 -> &tree */ + + /* Clear out the tree */ + mtree_destroy(&tree); + + mt_init_flags(&tree, 0); + check_erase_testset(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, 0); + /* + * set[] = {5015, 5014, 5017, 25, 1000, + * 1001, 1002, 1003, 1005, 0, + * 5003, 5002}; + */ + + check_insert(&tree, set[0], ptr); /* 5015 */ + check_insert(&tree, set[1], &tree); /* 5014 */ + check_insert(&tree, set[2], ptr); /* 5017 */ + check_insert(&tree, set[3], &tree); /* 25 */ + check_load(&tree, set[0], ptr); + check_load(&tree, set[1], &tree); + check_load(&tree, set[2], ptr); + check_load(&tree, set[3], &tree); + check_insert(&tree, set[4], ptr); /* 1000 < Should split. 
*/ + check_load(&tree, set[0], ptr); + check_load(&tree, set[1], &tree); + check_load(&tree, set[2], ptr); + check_load(&tree, set[3], &tree); /*25 */ + check_load(&tree, set[4], ptr); + check_insert(&tree, set[5], &tree); /* 1001 */ + check_load(&tree, set[0], ptr); + check_load(&tree, set[1], &tree); + check_load(&tree, set[2], ptr); + check_load(&tree, set[3], &tree); + check_load(&tree, set[4], ptr); + check_load(&tree, set[5], &tree); + check_insert(&tree, set[6], ptr); + check_load(&tree, set[0], ptr); + check_load(&tree, set[1], &tree); + check_load(&tree, set[2], ptr); + check_load(&tree, set[3], &tree); + check_load(&tree, set[4], ptr); + check_load(&tree, set[5], &tree); + check_load(&tree, set[6], ptr); + check_insert(&tree, set[7], &tree); + check_load(&tree, set[0], ptr); + check_insert(&tree, set[8], ptr); + + check_insert(&tree, set[9], &tree); + + check_load(&tree, set[0], ptr); + check_load(&tree, set[1], &tree); + check_load(&tree, set[2], ptr); + check_load(&tree, set[3], &tree); + check_load(&tree, set[4], ptr); + check_load(&tree, set[5], &tree); + check_load(&tree, set[6], ptr); + check_load(&tree, set[9], &tree); + mtree_destroy(&tree); + + check_nomem(&tree); + mt_init_flags(&tree, 0); + check_seq(&tree, 16, false); + mtree_destroy(&tree); + + mt_init_flags(&tree, 0); + check_seq(&tree, 1000, true); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_rev_seq(&tree, 1000, true); + mtree_destroy(&tree); + + check_lower_bound_split(&tree); + check_upper_bound_split(&tree); + check_mid_split(&tree); + + mt_init_flags(&tree, 0); + check_next_entry(&tree); + check_find(&tree); + check_find_2(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_prev_entry(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, 0); + check_erase2_sets(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_gap_combining(&tree); + mtree_destroy(&tree); + + 
mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_node_overwrite(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + next_prev_test(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_rcu_simulated(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_rcu_threaded(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_spanning_relatives(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_rev_find(&tree); + mtree_destroy(&tree); + + mt_init_flags(&tree, 0); + check_fuzzer(&tree); + mtree_destroy(&tree); + + +#if defined(BENCH) +skip: +#endif + rcu_barrier(); + pr_info("maple_tree: %u of %u tests passed\n", + atomic_read(&maple_tree_tests_passed), + atomic_read(&maple_tree_tests_run)); + if (atomic_read(&maple_tree_tests_run) == + atomic_read(&maple_tree_tests_passed)) + return 0; + + return -EINVAL; +} + +static void maple_tree_harvest(void) +{ + +} + +module_init(maple_tree_seed); +module_exit(maple_tree_harvest); +MODULE_AUTHOR("Liam R. Howlett "); +MODULE_LICENSE("GPL"); diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index c4ea4fbb0bfcd1..89d613e0505b30 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -4,9 +4,9 @@ CFLAGS += -I. 
-I../../include -g -Og -Wall -D_LGPL_SOURCE -fsanitize=address \ -fsanitize=undefined LDFLAGS += -fsanitize=address -fsanitize=undefined LDLIBS+= -lpthread -lurcu -TARGETS = main idr-test multiorder xarray +TARGETS = main idr-test multiorder xarray maple CORE_OFILES := xarray.o radix-tree.o idr.o linux.o test.o find_bit.o bitmap.o \ - slab.o + slab.o maple.o OFILES = main.o $(CORE_OFILES) regression1.o regression2.o regression3.o \ regression4.o tag_check.o multiorder.o idr-test.o iteration_check.o \ iteration_check_2.o benchmark.o @@ -29,6 +29,8 @@ idr-test: idr-test.o $(CORE_OFILES) xarray: $(CORE_OFILES) +maple: $(CORE_OFILES) + multiorder: multiorder.o $(CORE_OFILES) clean: @@ -40,6 +42,7 @@ $(OFILES): Makefile *.h */*.h generated/map-shift.h \ ../../include/linux/*.h \ ../../include/asm/*.h \ ../../../include/linux/xarray.h \ + ../../../include/linux/maple_tree.h \ ../../../include/linux/radix-tree.h \ ../../../include/linux/idr.h @@ -51,6 +54,8 @@ idr.c: ../../../lib/idr.c xarray.o: ../../../lib/xarray.c ../../../lib/test_xarray.c +maple.o: ../../../lib/maple_tree.c ../../../lib/test_maple_tree.c + generated/map-shift.h: @if ! grep -qws $(SHIFT) generated/map-shift.h; then \ echo "#define XA_CHUNK_SHIFT $(SHIFT)" > \ From 03b055c3f563afd807641d1aab24aa4496593fe3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:45 +0000 Subject: [PATCH 0757/1250] mm: start tracking VMAs with maple tree Start tracking the VMAs with the new maple tree structure in parallel with the rb_tree. Add debug and trace events for maple tree operations and duplicate the rb_tree that is created on forks into the maple tree. The maple tree is added to the mm_struct including the mm_init struct, added support in required mm/mmap functions, added tracking in kernel/fork for process forking, and used to find the unmapped_area and checked against what the rbtree finds. This also moves the mmap_lock() in exit_mmap() since the oom reaper call does walk the VMAs. 
Otherwise lockdep will be unhappy if oom happens. When splitting a vma fails due to allocations of the maple tree nodes, the error path in __split_vma() calls new->vm_ops->close(new). The page accounting for hugetlb is actually in the close() operation, so it accounts for the removal of 1/2 of the VMA which was not adjusted. This results in a negative exit value. To avoid the negative charge, set vm_start = vm_end and vm_pgoff = 0. There is also a potential accounting issue in special mappings from insert_vm_struct() failing to allocate, so reverse the charge there in the failure scenario. Link: https://lkml.kernel.org/r/20220504010716.661115-10-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-9-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/x86/kernel/tboot.c | 1 + drivers/firmware/efi/efi.c | 1 + include/linux/mm.h | 5 + include/linux/mm_types.h | 3 + include/trace/events/mmap.h | 73 ++++++++ kernel/fork.c | 20 +- mm/init-mm.c | 2 + mm/mmap.c | 353 ++++++++++++++++++++++++++++++++---- mm/nommu.c | 13 ++ 9 files changed, 435 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 0c1154a1c40327..71c54ad3868a02 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -97,6 +97,7 @@ void __init tboot_probe(void) static pgd_t *tboot_pg_dir; static struct mm_struct tboot_mm = { .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 
860534bcfdac2a..1eddef189d6893 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -58,6 +58,7 @@ static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR; struct mm_struct efi_mm = { .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock), .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), .write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq), diff --git a/include/linux/mm.h b/include/linux/mm.h index 4265bd5728ff15..cd754a3327ce05 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2528,6 +2528,8 @@ extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); +/* mmap.c */ +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, @@ -2591,6 +2593,9 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas); + static inline int check_data_rlimit(unsigned long rlim, unsigned long new, unsigned long start, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cf97f3884fda20..dcc2416f918cf3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -486,6 +487,7 @@ struct kioctx_table; struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ + struct maple_tree mm_mt; struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU @@ -681,6 +683,7 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN) extern struct mm_struct init_mm; /* Pointer magic 
because the dynamic array size confuses some compilers. */ diff --git a/include/trace/events/mmap.h b/include/trace/events/mmap.h index 4661f7ba07c05e..216de5f0362101 100644 --- a/include/trace/events/mmap.h +++ b/include/trace/events/mmap.h @@ -42,6 +42,79 @@ TRACE_EVENT(vm_unmapped_area, __entry->low_limit, __entry->high_limit, __entry->align_mask, __entry->align_offset) ); + +TRACE_EVENT(vma_mas_szero, + TP_PROTO(struct maple_tree *mt, unsigned long start, + unsigned long end), + + TP_ARGS(mt, start, end), + + TP_STRUCT__entry( + __field(struct maple_tree *, mt) + __field(unsigned long, start) + __field(unsigned long, end) + ), + + TP_fast_assign( + __entry->mt = mt; + __entry->start = start; + __entry->end = end; + ), + + TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,", + __entry->mt, + (unsigned long) __entry->start, + (unsigned long) __entry->end + ) +); + +TRACE_EVENT(vma_store, + TP_PROTO(struct maple_tree *mt, struct vm_area_struct *vma), + + TP_ARGS(mt, vma), + + TP_STRUCT__entry( + __field(struct maple_tree *, mt) + __field(struct vm_area_struct *, vma) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + ), + + TP_fast_assign( + __entry->mt = mt; + __entry->vma = vma; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end - 1; + ), + + TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,", + __entry->mt, __entry->vma, + (unsigned long) __entry->vm_start, + (unsigned long) __entry->vm_end + ) +); + + +TRACE_EVENT(exit_mmap, + TP_PROTO(struct mm_struct *mm), + + TP_ARGS(mm), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(struct maple_tree *, mt) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->mt = &mm->mm_mt; + ), + + TP_printk("mt_mod %p, DESTROY\n", + __entry->mt + ) +); + #endif /* This part must be outside protection */ diff --git a/kernel/fork.c b/kernel/fork.c index 9d44f2d46c6964..1840da0732f606 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -585,6 +585,7 @@ static __latent_entropy int 
dup_mmap(struct mm_struct *mm, int retval; unsigned long charge; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, 0, 0); uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { @@ -614,6 +615,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; khugepaged_fork(mm, oldmm); + retval = mas_expected_entries(&mas, oldmm->map_count); + if (retval) + goto out; + prev = NULL; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -629,7 +634,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto out; + goto loop_out; } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); @@ -694,6 +699,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; + /* Link the vma into the MT */ + mas.index = tmp->vm_start; + mas.last = tmp->vm_end - 1; + mas_store(&mas, tmp); + mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); @@ -702,10 +712,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_ops->open(tmp); if (retval) - goto out; + goto loop_out; } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); +loop_out: + mas_destroy(&mas); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -721,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); - goto out; + goto loop_out; } static inline int mm_alloc_pgd(struct mm_struct *mm) @@ -1111,6 +1123,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, { mm->mmap = NULL; mm->mm_rb = RB_ROOT; + mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); + mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); diff --git a/mm/init-mm.c b/mm/init-mm.c index fbe7844d0912f5..b912b0f2ecedac 100644 --- a/mm/init-mm.c 
+++ b/mm/init-mm.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -29,6 +30,7 @@ */ struct mm_struct init_mm = { .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/mm/mmap.c b/mm/mmap.c index d529837bc8c3b9..efa9e9f1d6bb03 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -334,7 +334,71 @@ static int browse_rb(struct mm_struct *mm) } return bug ? -1 : i; } +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) +extern void mt_validate(struct maple_tree *mt); +extern void mt_dump(const struct maple_tree *mt); +/* Validate the maple tree */ +static void validate_mm_mt(struct mm_struct *mm) +{ + struct maple_tree *mt = &mm->mm_mt; + struct vm_area_struct *vma_mt, *vma = mm->mmap; + + MA_STATE(mas, mt, 0, 0); + + mt_validate(&mm->mm_mt); + mas_for_each(&mas, vma_mt, ULONG_MAX) { + if (xa_is_zero(vma_mt)) + continue; + + if (!vma) + break; + + if ((vma != vma_mt) || + (vma->vm_start != vma_mt->vm_start) || + (vma->vm_end != vma_mt->vm_end) || + (vma->vm_start != mas.index) || + (vma->vm_end - 1 != mas.last)) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); +#ifdef CONFIG_DEBUG_VM + dump_vma(vma_mt); + pr_emerg("and next in rb\n"); + dump_vma(vma->vm_next); +#endif + pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, + mas.index, mas.last); + pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, + vma_mt->vm_start, vma_mt->vm_end); + pr_emerg("rb vma: %p %lu - %lu\n", vma, + vma->vm_start, vma->vm_end); + pr_emerg("rb->next = %p %lu - %lu\n", vma->vm_next, + vma->vm_next->vm_start, vma->vm_next->vm_end); + + mt_dump(mas.tree); + if (vma_mt->vm_end != mas.last + 1) { + pr_err("vma: %p vma_mt %lu-%lu\tmt %lu-%lu\n", + mm, vma_mt->vm_start, vma_mt->vm_end, + mas.index, mas.last); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); + if (vma_mt->vm_start != mas.index) { + pr_err("vma: %p 
vma_mt %p %lu - %lu doesn't match\n", + mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); + } + VM_BUG_ON(vma != vma_mt); + vma = vma->vm_next; + + } + VM_BUG_ON(vma); +} +#else +#define validate_mm_mt(root) do { } while (0) +#endif static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) { struct rb_node *nd; @@ -389,6 +453,7 @@ static void validate_mm(struct mm_struct *mm) } #else #define validate_mm_rb(root, ignore) do { } while (0) +#define validate_mm_mt(root) do { } while (0) #define validate_mm(mm) do { } while (0) #endif @@ -633,6 +698,56 @@ static void __vma_link_file(struct vm_area_struct *vma) } } +/* + * vma_mas_store() - Store a VMA in the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state + * + * Efficient way to store a VMA in the maple tree when the @mas has already + * walked to the correct location. + * + * Note: the end address is inclusive in the maple tree. + */ +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) +{ + trace_vma_store(mas->tree, vma); + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); +} + +/* + * vma_mas_remove() - Remove a VMA from the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state + * + * Efficient way to remove a VMA from the maple tree when the @mas has already + * been established and points to the correct location. + * Note: the end address is inclusive in the maple tree. + */ +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +{ + trace_vma_mas_szero(mas->tree, vma->vm_start, vma->vm_end - 1); + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); +} + +/* + * vma_mas_szero() - Set a given range to zero. Used when modifying a + * vm_area_struct start or end. + * + * @mas: The maple state + * @start: The start address to zero + * @end: The end address to zero. 
+ */ +static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, + unsigned long end) +{ + trace_vma_mas_szero(mas->tree, start, end - 1); + mas_set_range(mas, start, end - 1); + mas_store_prealloc(mas, NULL); +} + static void __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node **rb_link, @@ -642,17 +757,22 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, __vma_link_rb(mm, vma, rb_link, rb_parent); } -static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node **rb_link, struct rb_node *rb_parent) { + MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); } + vma_mas_store(vma, &mas); __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); @@ -661,13 +781,15 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, mm->map_count++; validate_mm(mm); + return 0; } /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the * mm's list and rbtree. It has already been inserted into the interval tree. 
*/ -static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) +static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, + struct vm_area_struct *vma) { struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; @@ -675,7 +797,10 @@ static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) if (find_vma_links(mm, vma->vm_start, vma->vm_end, &prev, &rb_link, &rb_parent)) BUG(); - __vma_link(mm, vma, prev, rb_link, rb_parent); + + vma_mas_store(vma, mas); + __vma_link_list(mm, vma, prev); + __vma_link_rb(mm, vma, rb_link, rb_parent); mm->map_count++; } @@ -702,6 +827,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; + struct vm_area_struct *next_next; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; @@ -709,10 +835,13 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, bool start_changed = false, end_changed = false; long adjust_next = 0; int remove_next = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + struct vm_area_struct *exporter = NULL, *importer = NULL; - if (next && !insert) { - struct vm_area_struct *exporter = NULL, *importer = NULL; + validate_mm(mm); + validate_mm_mt(mm); + if (next && !insert) { if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -741,10 +870,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * remove_next == 1 is case 1 or 7. 
*/ remove_next = 1 + (end > next->vm_end); + if (remove_next == 2) + next_next = find_vma(mm, next->vm_end); + VM_WARN_ON(remove_next == 2 && end != next->vm_next->vm_end); - /* trim end to next, for case 6 first pass */ - end = next->vm_end; } exporter = next; @@ -792,9 +922,11 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, return error; } } -again: - vma_adjust_trans_huge(orig_vma, start, end, adjust_next); + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + + vma_adjust_trans_huge(orig_vma, start, end, adjust_next); if (file) { mapping = file->f_mapping; root = &mapping->i_mmap; @@ -835,17 +967,28 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (start != vma->vm_start) { + unsigned long old_start = vma->vm_start; vma->vm_start = start; + if (old_start < start) + vma_mas_szero(&mas, old_start, start); start_changed = true; } if (end != vma->vm_end) { + unsigned long old_end = vma->vm_end; vma->vm_end = end; + if (old_end > end) + vma_mas_szero(&mas, end, old_end); end_changed = true; } + + if (end_changed || start_changed) + vma_mas_store(vma, &mas); + vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; + vma_mas_store(next, &mas); } if (file) { @@ -859,10 +1002,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, /* * vma_merge has merged next into vma, and needs * us to remove next before dropping the locks. + * Since we have expanded over this vma, the maple tree will + * have overwritten it by storing the value */ - if (remove_next != 3) + if (remove_next != 3) { __vma_unlink(mm, next, next); - else + if (remove_next == 2) + __vma_unlink(mm, next_next, next_next); + } else { /* * vma is not before next if they've been * swapped. @@ -873,15 +1020,19 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * "vma"). 
*/ __vma_unlink(mm, next, vma); - if (file) + } + if (file) { __remove_shared_vm_struct(next, file, mapping); + if (remove_next == 2) + __remove_shared_vm_struct(next_next, file, mapping); + } } else if (insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, insert); + __insert_vm_struct(mm, &mas, insert); } else { if (start_changed) vma_gap_update(vma); @@ -909,6 +1060,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (remove_next) { +again: if (file) { uprobe_munmap(next, next->vm_start, next->vm_end); fput(file); @@ -930,7 +1082,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * "next->vm_prev->vm_end" changed and the * "vma->vm_next" gap must be updated. */ - next = vma->vm_next; + next = next_next; } else { /* * For the scope of the comment "next" and @@ -946,7 +1098,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (remove_next == 2) { remove_next = 1; - end = next->vm_end; goto again; } else if (next) @@ -978,6 +1129,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, uprobe_mmap(insert); validate_mm(mm); + validate_mm_mt(mm); return 0; } @@ -1131,6 +1283,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *area, *next; int err; + validate_mm_mt(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1206,6 +1359,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, khugepaged_enter_vma(area, vm_flags); return area; } + validate_mm_mt(mm); return NULL; } @@ -1685,6 +1839,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; + validate_mm_mt(mm); /* Check against address space limit. 
*/ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -1799,7 +1954,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - vma_link(mm, vma, prev, rb_link, rb_parent); + if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + error = -ENOMEM; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } /* * vma_merge() calls khugepaged_enter_vma() either, the below @@ -1839,6 +2000,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_set_page_prot(vma); + validate_mm_mt(mm); return addr; unmap_and_free_vma: @@ -1855,6 +2017,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); + validate_mm_mt(mm); return error; } @@ -1871,12 +2034,19 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long length, low_limit, high_limit, gap_start, gap_end; + unsigned long gap; + MA_STATE(mas, &mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; + mas_empty_area(&mas, info->low_limit, info->high_limit - 1, + length); + gap = mas.index; + gap += (info->align_offset - gap) & info->align_mask; + /* Adjust search limits by the desired length */ if (info->high_limit < length) return -ENOMEM; @@ -1958,20 +2128,31 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) VM_BUG_ON(gap_start + info->length > info->high_limit); VM_BUG_ON(gap_start + info->length > gap_end); + + VM_BUG_ON(gap != gap_start); return gap_start; } static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; unsigned long length, low_limit, high_limit, gap_start, gap_end; + unsigned long gap; + + 
MA_STATE(mas, &mm->mm_mt, 0, 0); + validate_mm_mt(mm); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; + mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + length); + gap = mas.last + 1 - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + /* * Adjust search limits by the desired length. * See implementation comment at top of unmapped_area(). @@ -2057,6 +2238,32 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) VM_BUG_ON(gap_end < info->low_limit); VM_BUG_ON(gap_end < gap_start); + + if (gap != gap_end) { + pr_err("%s: %p Gap was found: mt %lu gap_end %lu\n", __func__, + mm, gap, gap_end); + pr_err("window was %lu - %lu size %lu\n", info->high_limit, + info->low_limit, length); + pr_err("mas.min %lu max %lu mas.last %lu\n", mas.min, mas.max, + mas.last); + pr_err("mas.index %lu align mask %lu offset %lu\n", mas.index, + info->align_mask, info->align_offset); + pr_err("rb_find_vma find on %lu => %p (%p)\n", mas.index, + find_vma(mm, mas.index), vma); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + mt_dump(&mm->mm_mt); +#endif + { + struct vm_area_struct *dv = mm->mmap; + + while (dv) { + pr_err("vma %p %lu-%lu\n", dv, dv->vm_start, dv->vm_end); + dv = dv->vm_next; + } + } + VM_BUG_ON(gap != gap_end); + } + return gap_end; } @@ -2279,7 +2486,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) vmacache_update(addr, vma); return vma; } - EXPORT_SYMBOL(find_vma); /* @@ -2352,7 +2558,9 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) struct vm_area_struct *next; unsigned long gap_addr; int error = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + validate_mm_mt(mm); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -2376,9 +2584,14 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same 
anon_vma? */ } + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); return -ENOMEM; + } /* * vma->vm_start/vm_end cannot change under us because the caller @@ -2415,6 +2628,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; + /* Overwrite old entry in mtree. */ + vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); if (vma->vm_next) vma_gap_update(vma->vm_next); @@ -2429,6 +2644,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); validate_mm(mm); + validate_mm_mt(mm); + mas_destroy(&mas); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2442,7 +2659,9 @@ int expand_downwards(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + validate_mm(mm); address &= PAGE_MASK; if (address < mmap_min_addr) return -EPERM; @@ -2456,9 +2675,14 @@ int expand_downwards(struct vm_area_struct *vma, return -ENOMEM; } + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; + /* We must make sure the anon_vma is allocated. */ - if (unlikely(anon_vma_prepare(vma))) + if (unlikely(anon_vma_prepare(vma))) { + mas_destroy(&mas); return -ENOMEM; + } /* * vma->vm_start/vm_end cannot change under us because the caller @@ -2496,6 +2720,8 @@ int expand_downwards(struct vm_area_struct *vma, anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. 
*/ + vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); spin_unlock(&mm->page_table_lock); @@ -2507,6 +2733,7 @@ int expand_downwards(struct vm_area_struct *vma, anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); validate_mm(mm); + mas_destroy(&mas); return error; } @@ -2628,14 +2855,17 @@ static void unmap_region(struct mm_struct *mm, * vma list as we go.. */ static bool -detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, unsigned long end) +detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, + struct vm_area_struct *vma, struct vm_area_struct *prev, + unsigned long end) { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; + mas_set_range(mas, vma->vm_start, end - 1); + mas_store_prealloc(mas, NULL); do { vma_rb_erase(vma, &mm->mm_rb); if (vma->vm_flags & VM_LOCKED) @@ -2676,6 +2906,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct *new; int err; + validate_mm_mt(mm); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); @@ -2718,6 +2949,9 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (!err) return 0; + /* Avoid vm accounting in close() operation */ + new->vm_start = new->vm_end; + new->vm_pgoff = 0; /* Clean everything up if vma_adjust failed. 
*/ if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); @@ -2728,6 +2962,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: vm_area_free(new); + validate_mm_mt(mm); return err; } @@ -2754,6 +2989,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, { unsigned long end; struct vm_area_struct *vma, *prev, *last; + int error = -ENOMEM; + MA_STATE(mas, &mm->mm_mt, 0, 0); if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; @@ -2774,6 +3011,9 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, vma = find_vma_intersection(mm, start, end); if (!vma) return 0; + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + return -ENOMEM; prev = vma->vm_prev; /* @@ -2784,7 +3024,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * places tmp vma above, and higher split_vma places tmp vma below. */ if (start > vma->vm_start) { - int error; /* * Make sure that map_count on return from munmap() will @@ -2792,20 +3031,20 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * its limit temporarily, to help free resources as expected. */ if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) - return -ENOMEM; + goto map_count_exceeded; error = __split_vma(mm, vma, start, 0); if (error) - return error; + goto split_failed; prev = vma; } /* Does it split the last one? */ last = find_vma(mm, end); if (last && end > last->vm_start) { - int error = __split_vma(mm, last, end, 1); + error = __split_vma(mm, last, end, 1); if (error) - return error; + goto split_failed; } vma = vma_next(mm, prev); @@ -2819,13 +3058,13 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. 
*/ - int error = userfaultfd_unmap_prep(vma, start, end, uf); + error = userfaultfd_unmap_prep(vma, start, end, uf); if (error) - return error; + goto userfaultfd_error; } /* Detach vmas from rbtree */ - if (!detach_vmas_to_be_unmapped(mm, vma, prev, end)) + if (!detach_vmas_to_be_unmapped(mm, &mas, vma, prev, end)) downgrade = false; if (downgrade) @@ -2837,6 +3076,12 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, remove_vma_list(mm, vma); return downgrade ? 1 : 0; + +map_count_exceeded: +split_failed: +userfaultfd_error: + mas_destroy(&mas); + return error; } int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, @@ -2976,6 +3221,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla pgoff_t pgoff = addr >> PAGE_SHIFT; int error; unsigned long mapped_addr; + validate_mm_mt(mm); /* Until we need other flags, refuse anything except VM_EXEC. */ if ((flags & (~VM_EXEC)) != 0) @@ -3025,7 +3271,9 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla vma->vm_pgoff = pgoff; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - vma_link(mm, vma, prev, rb_link, rb_parent); + if (vma_link(mm, vma, prev, rb_link, rb_parent)) + goto no_vma_link; + out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -3033,7 +3281,12 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; + validate_mm_mt(mm); return 0; + +no_vma_link: + vm_area_free(vma); + return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) @@ -3122,6 +3375,9 @@ void exit_mmap(struct mm_struct *mm) vma = remove_vma(vma); cond_resched(); } + + trace_exit_mmap(mm); + __mt_destroy(&mm->mm_mt); mm->mmap = NULL; mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); @@ -3135,12 +3391,30 @@ int insert_vm_struct(struct mm_struct *mm, struct 
vm_area_struct *vma) { struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; + unsigned long start = vma->vm_start; + struct vm_area_struct *overlap = NULL; + unsigned long charged = vma_pages(vma); if (find_vma_links(mm, vma->vm_start, vma->vm_end, &prev, &rb_link, &rb_parent)) + + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; + + overlap = mt_find(&mm->mm_mt, &start, vma->vm_end - 1); + if (overlap) { + + pr_err("Found vma ending at %lu\n", start - 1); + pr_err("vma : %lu => %lu-%lu\n", (unsigned long)overlap, + overlap->vm_start, overlap->vm_end - 1); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + mt_dump(&mm->mm_mt); +#endif + BUG(); + } + if ((vma->vm_flags & VM_ACCOUNT) && - security_vm_enough_memory_mm(mm, vma_pages(vma))) + security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; /* @@ -3160,7 +3434,11 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - vma_link(mm, vma, prev, rb_link, rb_parent); + if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + vm_unacct_memory(charged); + return -ENOMEM; + } + return 0; } @@ -3178,7 +3456,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; bool faulted_in_anon_vma = true; + unsigned long index = addr; + validate_mm_mt(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. 
@@ -3190,6 +3470,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ + if (mt_find(&mm->mm_mt, &index, addr+len - 1)) + BUG(); new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -3233,6 +3515,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_link(mm, new_vma, prev, rb_link, rb_parent); *need_rmap_locks = false; } + validate_mm_mt(mm); return new_vma; out_free_mempol: @@ -3240,6 +3523,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_vma: vm_area_free(new_vma); out: + validate_mm_mt(mm); return NULL; } @@ -3376,6 +3660,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; + validate_mm_mt(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3398,10 +3683,12 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); + validate_mm_mt(mm); return vma; out: vm_area_free(vma); + validate_mm_mt(mm); return ERR_PTR(ret); } diff --git a/mm/nommu.c b/mm/nommu.c index e819cbc21b396b..c63793c53a8240 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -545,6 +545,19 @@ static void put_nommu_region(struct vm_region *region) __put_nommu_region(region); } +void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) +{ + mas_set_range(mas, vma->vm_start, vma->vm_end - 1); + mas_store_prealloc(mas, vma); +} + +void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +{ + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + mas_store_prealloc(mas, NULL); +} + /* * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous From 0fbf15cd28a89a89ad34a9c3f248f60cb23b60b5 Mon Sep 17 00:00:00 
2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:46 +0000 Subject: [PATCH 0758/1250] mm: add VMA iterator This thin layer of abstraction over the maple tree state is for iterating over VMAs. You can go forwards, go backwards or ask where the iterator is. Rename the existing vma_next() to __vma_next() -- it will be removed by the end of this series. Link: https://lkml.kernel.org/r/20220504010716.661115-11-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-10-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-10-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 21 +++++++++++++++++++++ mm/mmap.c | 10 +++++----- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index cd754a3327ce05..1027ca66827dff 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -658,6 +658,38 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } +static inline +struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) +{ + return mas_find(&vmi->mas, max); +} + +static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) +{ + /* + * Uses vma_find() to get the first VMA when the iterator starts. + * Calling mas_next() could skip the first entry. 
+ */ + return vma_find(vmi, ULONG_MAX); +} + +static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi) +{ + return mas_prev(&vmi->mas, 0); +} + +static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) +{ + return vmi->mas.index; +} + +#define for_each_vma(__vmi, __vma) \ + while (((__vma) = vma_next(&(__vmi))) != NULL) + +/* The MM code likes to work with exclusive end addresses */ +#define for_each_vma_range(__vmi, __vma, __end) \ + while (((__vma) = vma_find(&(__vmi), (__end) - 1)) != NULL) + #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index dcc2416f918cf3..ae8fc7b36548ad 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -701,6 +701,27 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) return (struct cpumask *)&mm->cpu_bitmap; } +struct vma_iterator { + struct ma_state mas; +}; + +#define VMA_ITERATOR(name, __mm, __addr) \ + struct vma_iterator name = { \ + .mas = { \ + .tree = &(__mm)->mm_mt, \ + .index = __addr, \ + .node = MAS_START, \ + }, \ + } + +static inline void vma_iter_init(struct vma_iterator *vmi, + struct mm_struct *mm, unsigned long addr) +{ + vmi->mas.tree = &mm->mm_mt; + vmi->mas.index = addr; + vmi->mas.node = MAS_START; +} + struct mmu_gather; extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); diff --git a/mm/mmap.c b/mm/mmap.c index efa9e9f1d6bb03..119211401fa685 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -586,7 +586,7 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, } /* - * vma_next() - Get the next VMA. + * __vma_next() - Get the next VMA. * @mm: The mm_struct. * @vma: The current vma. * @@ -594,7 +594,7 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, * * Returns: The next VMA after @vma. 
*/ -static inline struct vm_area_struct *vma_next(struct mm_struct *mm, +static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, struct vm_area_struct *vma) { if (!vma) @@ -1291,7 +1291,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - next = vma_next(mm, prev); + next = __vma_next(mm, prev); area = next; if (area && area->vm_end == end) /* cases 6, 7, 8 */ next = next->vm_next; @@ -2838,7 +2838,7 @@ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end) { - struct vm_area_struct *next = vma_next(mm, prev); + struct vm_area_struct *next = __vma_next(mm, prev); struct mmu_gather tlb; lru_add_drain(); @@ -3046,7 +3046,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (error) goto split_failed; } - vma = vma_next(mm, prev); + vma = __vma_next(mm, prev); if (unlikely(uf)) { /* From 74d08a66d51cee0c53c02b15969d75b574668194 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:46 +0000 Subject: [PATCH 0759/1250] mmap: use the VMA iterator in count_vma_pages_range() This simplifies the implementation and is faster than using the linked list. Link: https://lkml.kernel.org/r/20220504010716.661115-12-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-11-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-11-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 119211401fa685..7e81df77d303cc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -629,29 +629,19 @@ munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, return 0; } + static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { - unsigned long nr_pages = 0; + VMA_ITERATOR(vmi, mm, addr); struct vm_area_struct *vma; + unsigned long nr_pages = 0; - /* Find first overlapping mapping */ - vma = find_vma_intersection(mm, addr, end); - if (!vma) - return 0; - - nr_pages = (min(end, vma->vm_end) - - max(addr, vma->vm_start)) >> PAGE_SHIFT; - - /* Iterate over the rest of the overlaps */ - for (vma = vma->vm_next; vma; vma = vma->vm_next) { - unsigned long overlap_len; - - if (vma->vm_start > end) - break; + for_each_vma_range(vmi, vma, end) { + unsigned long vm_start = max(addr, vma->vm_start); + unsigned long vm_end = min(end, vma->vm_end); - overlap_len = min(end, vma->vm_end) - vma->vm_start; - nr_pages += overlap_len >> PAGE_SHIFT; + nr_pages += PHYS_PFN(vm_end - vm_start); } return nr_pages; From 4a8e8ff67dbaeb6732d1191a9de6621ad4f2b3d8 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:47 +0000 Subject: [PATCH 0760/1250] mm/mmap: use the maple tree in find_vma() instead of the rbtree. Using the maple tree interface mt_find() will handle the RCU locking and will start searching at the address up to the limit, ULONG_MAX in this case. Add kernel documentation to this API. 
Link: https://lkml.kernel.org/r/20220504010716.661115-13-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-12-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-12-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 7e81df77d303cc..d370033f1c9c6b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2444,11 +2444,18 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); -/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ +/** + * find_vma() - Find the VMA for a given address, or the next VMA. + * @mm: The mm_struct to check + * @addr: The address + * + * Returns: The VMA associated with addr, or the next VMA. + * May return %NULL in the case of no VMA at addr or above. + */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct rb_node *rb_node; struct vm_area_struct *vma; + unsigned long index = addr; mmap_assert_locked(mm); /* Check the cache first. 
*/ @@ -2456,22 +2463,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) if (likely(vma)) return vma; - rb_node = mm->mm_rb.rb_node; - - while (rb_node) { - struct vm_area_struct *tmp; - - tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); - - if (tmp->vm_end > addr) { - vma = tmp; - if (tmp->vm_start <= addr) - break; - rb_node = rb_node->rb_left; - } else - rb_node = rb_node->rb_right; - } - + vma = mt_find(&mm->mm_mt, &index, ULONG_MAX); if (vma) vmacache_update(addr, vma); return vma; From f46204bb64f016c2085cdd7f2ba937b8af354ddb Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:47 +0000 Subject: [PATCH 0761/1250] mm/mmap: use the maple tree for find_vma_prev() instead of the rbtree Use the maple tree's advanced API and a maple state to walk the tree for the entry at the address of the next vma, then use the maple state to walk back one entry to find the previous entry. Add kernel documentation comments for this API. Link: https://lkml.kernel.org/r/20220504010716.661115-14-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-13-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-13-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index d370033f1c9c6b..4f5a7be08c96f5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2470,23 +2470,30 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) } EXPORT_SYMBOL(find_vma); -/* - * Same as find_vma, but also return a pointer to the previous VMA in *pprev. 
+/** + * find_vma_prev() - Find the VMA for a given address, or the next vma and + * set %pprev to the previous VMA, if any. + * @mm: The mm_struct to check + * @addr: The address + * @pprev: The pointer to set to the previous VMA + * + * Note that RCU lock is missing here since the external mmap_lock() is used + * instead. + * + * Returns: The VMA associated with @addr, or the next vma. + * May return %NULL in the case of no vma at addr or above. */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, addr, addr); - vma = find_vma(mm, addr); - if (vma) { - *pprev = vma->vm_prev; - } else { - struct rb_node *rb_node = rb_last(&mm->mm_rb); - - *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; - } + vma = mas_walk(&mas); + *pprev = mas_prev(&mas, 0); + if (!vma) + vma = mas_next(&mas, ULONG_MAX); return vma; } From 12f9af36974c5c2e458bd3c6b2198de1f996b360 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:47 +0000 Subject: [PATCH 0762/1250] mm/mmap: use maple tree for unmapped_area{_topdown} The maple tree code was added to find the unmapped area in a previous commit and was checked against what the rbtree returned, but the actual result was never used. Start using the maple tree implementation and remove the rbtree code. Add kernel documentation comment for these functions. Link: https://lkml.kernel.org/r/20220504010716.661115-15-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-14-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-14-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 255 ++++++++---------------------------------------------- 1 file changed, 34 insertions(+), 221 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4f5a7be08c96f5..13cea81f40f3e8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2011,250 +2011,63 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return error; } +/** + * unmapped_area() - Find an area between the low_limit and the high_limit with + * the correct alignment and offset, all from @info. Note: current->mm is used + * for the search. + * + * @info: The unmapped area information including the range (low_limit - + * high_limit), the alignment offset and mask. + * + * Return: A memory address or -ENOMEM. + */ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) { - /* - * We implement the search by looking for an rbtree node that - * immediately follows a suitable gap.
That is, - * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; - * - gap_end = vma->vm_start >= info->low_limit + length; - * - gap_end - gap_start >= length - */ + unsigned long length, gap; - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long length, low_limit, high_limit, gap_start, gap_end; - unsigned long gap; - MA_STATE(mas, &mm->mm_mt, 0, 0); + MA_STATE(mas, &current->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; - mas_empty_area(&mas, info->low_limit, info->high_limit - 1, - length); - gap = mas.index; - gap += (info->align_offset - gap) & info->align_mask; - - /* Adjust search limits by the desired length */ - if (info->high_limit < length) + if (mas_empty_area(&mas, info->low_limit, info->high_limit - 1, + length)) return -ENOMEM; - high_limit = info->high_limit - length; - if (info->low_limit > high_limit) - return -ENOMEM; - low_limit = info->low_limit + length; - - /* Check if rbtree root looks promising */ - if (RB_EMPTY_ROOT(&mm->mm_rb)) - goto check_highest; - vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); - if (vma->rb_subtree_gap < length) - goto check_highest; - - while (true) { - /* Visit left subtree if it looks promising */ - gap_end = vm_start_gap(vma); - if (gap_end >= low_limit && vma->vm_rb.rb_left) { - struct vm_area_struct *left = - rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb); - if (left->rb_subtree_gap >= length) { - vma = left; - continue; - } - } - - gap_start = vma->vm_prev ?
vm_end_gap(vma->vm_prev) : 0; -check_current: - /* Check if current node has a suitable gap */ - if (gap_start > high_limit) - return -ENOMEM; - if (gap_end >= low_limit && - gap_end > gap_start && gap_end - gap_start >= length) - goto found; - - /* Visit right subtree if it looks promising */ - if (vma->vm_rb.rb_right) { - struct vm_area_struct *right = - rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb); - if (right->rb_subtree_gap >= length) { - vma = right; - continue; - } - } - - /* Go back up the rbtree to find next candidate node */ - while (true) { - struct rb_node *prev = &vma->vm_rb; - if (!rb_parent(prev)) - goto check_highest; - vma = rb_entry(rb_parent(prev), - struct vm_area_struct, vm_rb); - if (prev == vma->vm_rb.rb_left) { - gap_start = vm_end_gap(vma->vm_prev); - gap_end = vm_start_gap(vma); - goto check_current; - } - } - } - -check_highest: - /* Check highest gap, which does not precede any rbtree node */ - gap_start = mm->highest_vm_end; - gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ - if (gap_start > high_limit) - return -ENOMEM; - -found: - /* We found a suitable gap. Clip it with the original low_limit. */ - if (gap_start < info->low_limit) - gap_start = info->low_limit; - - /* Adjust gap address to the desired alignment */ - gap_start += (info->align_offset - gap_start) & info->align_mask; - - VM_BUG_ON(gap_start + info->length > info->high_limit); - VM_BUG_ON(gap_start + info->length > gap_end); - - VM_BUG_ON(gap != gap_start); - return gap_start; + gap = mas.index; + gap += (info->align_offset - gap) & info->align_mask; + return gap; } +/** + * unmapped_area_topdown() - Find an area between the low_limit and the + * high_limit with the correct alignment and offset at the highest available + * address, all from @info. Note: current->mm is used for the search. + * + * @info: The unmapped area information including the range (low_limit - + * high_limit), the alignment offset and mask.
+ * + * Return: A memory address or -ENOMEM. + */ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = NULL; - unsigned long length, low_limit, high_limit, gap_start, gap_end; - unsigned long gap; - - MA_STATE(mas, &mm->mm_mt, 0, 0); - validate_mm_mt(mm); + unsigned long length, gap; + MA_STATE(mas, &current->mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; - mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, - length); - gap = mas.last + 1 - info->length; - gap -= (gap - info->align_offset) & info->align_mask; - - /* - * Adjust search limits by the desired length. - * See implementation comment at top of unmapped_area(). - */ - gap_end = info->high_limit; - if (gap_end < length) - return -ENOMEM; - high_limit = gap_end - length; - - if (info->low_limit > high_limit) + if (mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + length)) return -ENOMEM; - low_limit = info->low_limit + length; - /* Check highest gap, which does not precede any rbtree node */ - gap_start = mm->highest_vm_end; - if (gap_start <= high_limit) - goto found_highest; - - /* Check if rbtree root looks promising */ - if (RB_EMPTY_ROOT(&mm->mm_rb)) - return -ENOMEM; - vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); - if (vma->rb_subtree_gap < length) - return -ENOMEM; - - while (true) { - /* Visit right subtree if it looks promising */ - gap_start = vma->vm_prev ?
vm_end_gap(vma->vm_prev) : 0; - if (gap_start <= high_limit && vma->vm_rb.rb_right) { - struct vm_area_struct *right = - rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb); - if (right->rb_subtree_gap >= length) { - vma = right; - continue; - } - } - -check_current: - /* Check if current node has a suitable gap */ - gap_end = vm_start_gap(vma); - if (gap_end < low_limit) - return -ENOMEM; - if (gap_start <= high_limit && - gap_end > gap_start && gap_end - gap_start >= length) - goto found; - - /* Visit left subtree if it looks promising */ - if (vma->vm_rb.rb_left) { - struct vm_area_struct *left = - rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb); - if (left->rb_subtree_gap >= length) { - vma = left; - continue; - } - } - - /* Go back up the rbtree to find next candidate node */ - while (true) { - struct rb_node *prev = &vma->vm_rb; - if (!rb_parent(prev)) - return -ENOMEM; - vma = rb_entry(rb_parent(prev), - struct vm_area_struct, vm_rb); - if (prev == vma->vm_rb.rb_right) { - gap_start = vma->vm_prev ? - vm_end_gap(vma->vm_prev) : 0; - goto check_current; - } - } - } - -found: - /* We found a suitable gap. Clip it with the original high_limit. 
*/ - if (gap_end > info->high_limit) - gap_end = info->high_limit; - -found_highest: - /* Compute highest gap address at the desired alignment */ - gap_end -= info->length; - gap_end -= (gap_end - info->align_offset) & info->align_mask; - - VM_BUG_ON(gap_end < info->low_limit); - VM_BUG_ON(gap_end < gap_start); - - if (gap != gap_end) { - pr_err("%s: %p Gap was found: mt %lu gap_end %lu\n", __func__, - mm, gap, gap_end); - pr_err("window was %lu - %lu size %lu\n", info->high_limit, - info->low_limit, length); - pr_err("mas.min %lu max %lu mas.last %lu\n", mas.min, mas.max, - mas.last); - pr_err("mas.index %lu align mask %lu offset %lu\n", mas.index, - info->align_mask, info->align_offset); - pr_err("rb_find_vma find on %lu => %p (%p)\n", mas.index, - find_vma(mm, mas.index), vma); -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - mt_dump(&mm->mm_mt); -#endif - { - struct vm_area_struct *dv = mm->mmap; - - while (dv) { - pr_err("vma %p %lu-%lu\n", dv, dv->vm_start, dv->vm_end); - dv = dv->vm_next; - } - } - VM_BUG_ON(gap != gap_end); - } - - return gap_end; + gap = mas.last + 1 - info->length; + gap -= (gap - info->align_offset) & info->align_mask; + return gap; } /* From 57579b57de57318fae396d4d8aa6f074bcf512fc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:48 +0000 Subject: [PATCH 0763/1250] kernel/fork: use maple tree for dup_mmap() during forking The maple tree was already tracking VMAs in this function by an earlier commit, but the rbtree iterator was being used to iterate the list. Change the iterator to use a maple tree native iterator and switch to the maple tree advanced API to avoid multiple walks of the tree during insert operations. Unexport the now-unused vma_store() function. For performance reasons we bulk allocate the maple tree nodes. The node calculations are done internally to the tree and use the VMA count and assume the worst-case node requirements. 
The VM_DONT_COPY flag does not allow for the most efficient copy method of the tree and so a bulk loading algorithm is used. Link: https://lkml.kernel.org/r/20220504010716.661115-16-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-15-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- kernel/fork.c | 10 ++++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1027ca66827dff..e156aff3080ad7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2560,8 +2560,6 @@ extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); -/* mmap.c */ -void vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node, diff --git a/kernel/fork.c b/kernel/fork.c index 1840da0732f606..f575a3bead0ee1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -583,8 +583,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct vm_area_struct *mpnt, *tmp, *prev, **pprev; struct rb_node **rb_link, *rb_parent; int retval; - unsigned long charge; + unsigned long charge = 0; LIST_HEAD(uf); + MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); MA_STATE(mas, &mm->mm_mt, 0, 0); uprobe_start_dup_mmap(); @@ -620,7 +621,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, goto out; prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + + retval = mas_expected_entries(&mas, oldmm->map_count); + 
if (retval) + goto out; + + mas_for_each(&old_mas, mpnt, ULONG_MAX) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { From 561cd17cbbe8751bfeb32271f51c76684f7d1b47 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:48 +0000 Subject: [PATCH 0764/1250] damon: convert __damon_va_three_regions to use the VMA iterator This rather specialised walk can use the VMA iterator. If this proves to be too slow, we can write a custom routine to find the two largest gaps, but it will be somewhat complicated, so let's see if we need it first. Update the kunit test case to use the maple tree. This also fixes an issue with the kunit testcase not adding the last VMA to the list. Link: https://lkml.kernel.org/r/20220504011215.661968-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-16-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-16-Liam.Howlett@oracle.com Fixes: 17ccae8bb5c9 (mm/damon: add kunit tests) Signed-off-by: Liam R. 
Howlett Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: SeongJae Park Reviewed-by: David Hildenbrand Cc: Catalin Marinas Cc: David Howells Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/damon/vaddr-test.h | 36 ++++++++++------------------- mm/damon/vaddr.c | 53 ++++++++++++++++++++++--------------------- 2 files changed, 39 insertions(+), 50 deletions(-) diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index d4f55f3491007b..bce37c4875402d 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -14,33 +14,19 @@ #include -static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) +static void __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, + ssize_t nr_vmas) { - int i, j; - unsigned long largest_gap, gap; + int i; + MA_STATE(mas, mt, 0, 0); if (!nr_vmas) return; - for (i = 0; i < nr_vmas - 1; i++) { - vmas[i].vm_next = &vmas[i + 1]; - - vmas[i].vm_rb.rb_left = NULL; - vmas[i].vm_rb.rb_right = &vmas[i + 1].vm_rb; - - largest_gap = 0; - for (j = i; j < nr_vmas; j++) { - if (j == 0) - continue; - gap = vmas[j].vm_start - vmas[j - 1].vm_end; - if (gap > largest_gap) - largest_gap = gap; - } - vmas[i].rb_subtree_gap = largest_gap; - } - vmas[i].vm_next = NULL; - vmas[i].vm_rb.rb_right = NULL; - vmas[i].rb_subtree_gap = 0; + mas_lock(&mas); + for (i = 0; i < nr_vmas; i++) + vma_mas_store(&vmas[i], &mas); + mas_unlock(&mas); } /* @@ -72,6 +58,7 @@ static void __link_vmas(struct vm_area_struct *vmas, ssize_t nr_vmas) */ static void damon_test_three_regions_in_vmas(struct kunit *test) { + static struct mm_struct mm; struct damon_addr_range regions[3] = {0,}; /* 10-20-25, 200-210-220, 300-305, 307-330 */ struct vm_area_struct vmas[] = { @@ -83,9 +70,10 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) (struct vm_area_struct) {.vm_start = 307, .vm_end = 330}, }; - __link_vmas(vmas, 6); + 
mt_init_flags(&mm.mm_mt, MM_MT_FLAGS); + __link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)); - __damon_va_three_regions(&vmas[0], regions); + __damon_va_three_regions(&mm, regions); KUNIT_EXPECT_EQ(test, 10ul, regions[0].start); KUNIT_EXPECT_EQ(test, 25ul, regions[0].end); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3c7b9d6dca95d3..d24148a8149faf 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -113,37 +113,38 @@ static unsigned long sz_range(struct damon_addr_range *r) * * Returns 0 if success, or negative error code otherwise. */ -static int __damon_va_three_regions(struct vm_area_struct *vma, +static int __damon_va_three_regions(struct mm_struct *mm, struct damon_addr_range regions[3]) { - struct damon_addr_range gap = {0}, first_gap = {0}, second_gap = {0}; - struct vm_area_struct *last_vma = NULL; - unsigned long start = 0; - struct rb_root rbroot; - - /* Find two biggest gaps so that first_gap > second_gap > others */ - for (; vma; vma = vma->vm_next) { - if (!last_vma) { - start = vma->vm_start; - goto next; - } + struct damon_addr_range first_gap = {0}, second_gap = {0}; + VMA_ITERATOR(vmi, mm, 0); + struct vm_area_struct *vma, *prev = NULL; + unsigned long start; - if (vma->rb_subtree_gap <= sz_range(&second_gap)) { - rbroot.rb_node = &vma->vm_rb; - vma = rb_entry(rb_last(&rbroot), - struct vm_area_struct, vm_rb); + /* + * Find the two biggest gaps so that first_gap > second_gap > others. + * If this is too slow, it can be optimised to examine the maple + * tree gaps. 
+ */ + for_each_vma(vmi, vma) { + unsigned long gap; + + if (!prev) { + start = vma->vm_start; goto next; } - - gap.start = last_vma->vm_end; - gap.end = vma->vm_start; - if (sz_range(&gap) > sz_range(&second_gap)) { - swap(gap, second_gap); - if (sz_range(&second_gap) > sz_range(&first_gap)) - swap(second_gap, first_gap); + gap = vma->vm_start - prev->vm_end; + + if (gap > sz_range(&first_gap)) { + second_gap = first_gap; + first_gap.start = prev->vm_end; + first_gap.end = vma->vm_start; + } else if (gap > sz_range(&second_gap)) { + second_gap.start = prev->vm_end; + second_gap.end = vma->vm_start; } next: - last_vma = vma; + prev = vma; } if (!sz_range(&second_gap) || !sz_range(&first_gap)) @@ -159,7 +160,7 @@ static int __damon_va_three_regions(struct vm_area_struct *vma, regions[1].start = ALIGN(first_gap.end, DAMON_MIN_REGION); regions[1].end = ALIGN(second_gap.start, DAMON_MIN_REGION); regions[2].start = ALIGN(second_gap.end, DAMON_MIN_REGION); - regions[2].end = ALIGN(last_vma->vm_end, DAMON_MIN_REGION); + regions[2].end = ALIGN(prev->vm_end, DAMON_MIN_REGION); return 0; } @@ -180,7 +181,7 @@ static int damon_va_three_regions(struct damon_target *t, return -EINVAL; mmap_read_lock(mm); - rc = __damon_va_three_regions(mm->mmap, regions); + rc = __damon_va_three_regions(mm, regions); mmap_read_unlock(mm); mmput(mm); From 6a26a3981d03380c4203387ffd56829e788f957e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:48 +0000 Subject: [PATCH 0765/1250] proc: remove VMA rbtree use from nommu These users of the rbtree should probably have been walks of the linked list, but convert them to use walks of the maple tree. Link: https://lkml.kernel.org/r/20220504011345.662299-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-17-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-17-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- fs/proc/task_nommu.c | 45 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a6d21fc0033c64..2fd06f52b6a448 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -20,15 +20,13 @@ */ void task_mem(struct seq_file *m, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long bytes = 0, sbytes = 0, slack = 0, size; - - mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + mmap_read_lock(mm); + for_each_vma(vmi, vma) { bytes += kobjsize(vma); region = vma->vm_region; @@ -82,15 +80,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long task_vsize(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - struct rb_node *p; unsigned long vsize = 0; mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - } mmap_read_unlock(mm); return vsize; } @@ -99,14 +95,13 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; struct vm_region *region; - struct rb_node *p; unsigned long size = kobjsize(mm); mmap_read_lock(mm); - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { - vma = rb_entry(p, struct vm_area_struct, vm_rb); + for_each_vma(vmi, vma) { size += kobjsize(vma); region = vma->vm_region; if (region) { @@ -190,17 +185,19 @@ static int 
nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) */ static int show_map(struct seq_file *m, void *_p) { - struct rb_node *p = _p; - - return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); + return nommu_vma_show(m, _p); } static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; struct mm_struct *mm; - struct rb_node *p; - loff_t n = *pos; + struct vm_area_struct *vma; + unsigned long addr = *pos; + + /* See m_next(). Zero at the start or after lseek. */ + if (addr == -1UL) + return NULL; /* pin the task and mm whilst we play with them */ priv->task = get_proc_task(priv->inode); @@ -216,10 +213,10 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EINTR); } - /* start from the Nth VMA */ - for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) - if (n-- == 0) - return p; + /* start the next element from addr */ + vma = find_vma(mm, addr); + if (vma) + return vma; mmap_read_unlock(mm); mmput(mm); @@ -242,10 +239,10 @@ static void m_stop(struct seq_file *m, void *_vml) static void *m_next(struct seq_file *m, void *_p, loff_t *pos) { - struct rb_node *p = _p; + struct vm_area_struct *vma = _p; - (*pos)++; - return p ? rb_next(p) : NULL; + *pos = vma->vm_end; + return find_vma(vma->vm_mm, vma->vm_end); } static const struct seq_operations proc_pid_maps_ops = { From 3c057956c0cfe6b99f4f017d2389feb15c5eab0a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:49 +0000 Subject: [PATCH 0766/1250] mm: remove rb tree. Remove the RB tree and start using the maple tree for vm_area_struct tracking. Drop validate_mm() calls in expand_upwards() and expand_downwards() as the lock is not held. Link: https://lkml.kernel.org/r/20220504011345.662299-2-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-18-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/x86/kernel/tboot.c | 1 - drivers/firmware/efi/efi.c | 1 - include/linux/mm.h | 2 - include/linux/mm_types.h | 14 - kernel/fork.c | 8 - mm/init-mm.c | 2 - mm/mmap.c | 506 ++++++++----------------------------- mm/nommu.c | 87 ++----- mm/util.c | 10 +- 9 files changed, 144 insertions(+), 487 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 71c54ad3868a02..3b388330a1063c 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -96,7 +96,6 @@ void __init tboot_probe(void) static pgd_t *tboot_pg_dir; static struct mm_struct tboot_mm = { - .mm_rb = RB_ROOT, .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 1eddef189d6893..07677fde00af52 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -57,7 +57,6 @@ static unsigned long __initdata mem_reserve = EFI_INVALID_TABLE_ADDR; static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR; struct mm_struct efi_mm = { - .mm_rb = RB_ROOT, .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock), .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/include/linux/mm.h b/include/linux/mm.h index e156aff3080ad7..215a680ecab354 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2615,8 +2615,6 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *, extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct 
rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ae8fc7b36548ad..f4cccd212b3e3a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -410,19 +410,6 @@ struct vm_area_struct { /* linked list of VM areas per task, sorted by address */ struct vm_area_struct *vm_next, *vm_prev; - - struct rb_node vm_rb; - - /* - * Largest free memory gap in bytes to the left of this VMA. - * Either between this VMA and vma->vm_prev, or between one of the - * VMAs below us in the VMA rbtree and its ->vm_prev. This helps - * get_unmapped_area find a free area of the right size. - */ - unsigned long rb_subtree_gap; - - /* Second cache line starts here. */ - struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -488,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, diff --git a/kernel/fork.c b/kernel/fork.c index f575a3bead0ee1..9f2802eff361b5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -581,7 +581,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge = 0; LIST_HEAD(uf); @@ -608,8 +607,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) @@ -701,10 +698,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_prev = prev; prev = tmp; - __vma_link_rb(mm, tmp, rb_link, 
rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - /* Link the vma into the MT */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; @@ -1128,7 +1121,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { mm->mmap = NULL; - mm->mm_rb = RB_ROOT; mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); mm->vmacache_seqnum = 0; diff --git a/mm/init-mm.c b/mm/init-mm.c index b912b0f2ecedac..c9327abb771c54 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include #include #include #include @@ -29,7 +28,6 @@ * and size this cpu_bitmask to NR_CPUS. */ struct mm_struct init_mm = { - .mm_rb = RB_ROOT, .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), diff --git a/mm/mmap.c b/mm/mmap.c index 13cea81f40f3e8..414280ec76978a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -247,93 +246,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) return origbrk; } -static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) -{ - unsigned long gap, prev_end; - - /* - * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we - * allow two stack_guard_gaps between them here, and when choosing - * an unmapped area; whereas when expanding we only require one. - * That's a little inconsistent, but keeps the code here simpler. 
- */ - gap = vm_start_gap(vma); - if (vma->vm_prev) { - prev_end = vm_end_gap(vma->vm_prev); - if (gap > prev_end) - gap -= prev_end; - else - gap = 0; - } - return gap; -} - -#ifdef CONFIG_DEBUG_VM_RB -static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) -{ - unsigned long max = vma_compute_gap(vma), subtree_gap; - if (vma->vm_rb.rb_left) { - subtree_gap = rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - if (vma->vm_rb.rb_right) { - subtree_gap = rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - return max; -} - -static int browse_rb(struct mm_struct *mm) -{ - struct rb_root *root = &mm->mm_rb; - int i = 0, j, bug = 0; - struct rb_node *nd, *pn = NULL; - unsigned long prev = 0, pend = 0; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) { - pr_emerg("vm_start %lx < prev %lx\n", - vma->vm_start, prev); - bug = 1; - } - if (vma->vm_start < pend) { - pr_emerg("vm_start %lx < pend %lx\n", - vma->vm_start, pend); - bug = 1; - } - if (vma->vm_start > vma->vm_end) { - pr_emerg("vm_start %lx > vm_end %lx\n", - vma->vm_start, vma->vm_end); - bug = 1; - } - spin_lock(&mm->page_table_lock); - if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - pr_emerg("free gap %lx, correct %lx\n", - vma->rb_subtree_gap, - vma_compute_subtree_gap(vma)); - bug = 1; - } - spin_unlock(&mm->page_table_lock); - i++; - pn = nd; - prev = vma->vm_start; - pend = vma->vm_end; - } - j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) - j++; - if (i != j) { - pr_emerg("backwards %d, forwards %d\n", j, i); - bug = 1; - } - return bug ? 
-1 : i; -} #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) extern void mt_validate(struct maple_tree *mt); extern void mt_dump(const struct maple_tree *mt); @@ -361,19 +273,25 @@ static void validate_mm_mt(struct mm_struct *mm) (vma->vm_end - 1 != mas.last)) { pr_emerg("issue in %s\n", current->comm); dump_stack(); -#ifdef CONFIG_DEBUG_VM dump_vma(vma_mt); - pr_emerg("and next in rb\n"); + pr_emerg("and vm_next\n"); dump_vma(vma->vm_next); -#endif pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, mas.index, mas.last); pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, vma_mt->vm_start, vma_mt->vm_end); - pr_emerg("rb vma: %p %lu - %lu\n", vma, + if (vma->vm_prev) { + pr_emerg("ll prev: %p %lu - %lu\n", + vma->vm_prev, vma->vm_prev->vm_start, + vma->vm_prev->vm_end); + } + pr_emerg("ll vma: %p %lu - %lu\n", vma, vma->vm_start, vma->vm_end); - pr_emerg("rb->next = %p %lu - %lu\n", vma->vm_next, - vma->vm_next->vm_start, vma->vm_next->vm_end); + if (vma->vm_next) { + pr_emerg("ll next: %p %lu - %lu\n", + vma->vm_next, vma->vm_next->vm_start, + vma->vm_next->vm_end); + } mt_dump(mas.tree); if (vma_mt->vm_end != mas.last + 1) { @@ -396,21 +314,6 @@ static void validate_mm_mt(struct mm_struct *mm) } VM_BUG_ON(vma); } -#else -#define validate_mm_mt(root) do { } while (0) -#endif -static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) -{ - struct rb_node *nd; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - VM_BUG_ON_VMA(vma != ignore && - vma->rb_subtree_gap != vma_compute_subtree_gap(vma), - vma); - } -} static void validate_mm(struct mm_struct *mm) { @@ -419,7 +322,10 @@ static void validate_mm(struct mm_struct *mm) unsigned long highest_address = 0; struct vm_area_struct *vma = mm->mmap; + validate_mm_mt(mm); + while (vma) { +#ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; @@ -429,6 +335,7 @@ static void validate_mm(struct 
mm_struct *mm) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } +#endif highest_address = vm_end_gap(vma); vma = vma->vm_next; @@ -443,80 +350,13 @@ static void validate_mm(struct mm_struct *mm) mm->highest_vm_end, highest_address); bug = 1; } - i = browse_rb(mm); - if (i != mm->map_count) { - if (i != -1) - pr_emerg("map_count %d rb %d\n", mm->map_count, i); - bug = 1; - } VM_BUG_ON_MM(bug, mm); } -#else -#define validate_mm_rb(root, ignore) do { } while (0) + +#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ #define validate_mm_mt(root) do { } while (0) #define validate_mm(mm) do { } while (0) -#endif - -RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, - struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_gap) - -/* - * Update augmented rbtree rb_subtree_gap values after vma->vm_start or - * vma->vm_prev->vm_end values changed, without modifying the vma's position - * in the rbtree. - */ -static void vma_gap_update(struct vm_area_struct *vma) -{ - /* - * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created - * a callback function that does exactly what we want. - */ - vma_gap_callbacks_propagate(&vma->vm_rb, NULL); -} - -static inline void vma_rb_insert(struct vm_area_struct *vma, - struct rb_root *root) -{ - /* All rb_subtree_gap values must be consistent prior to insertion */ - validate_mm_rb(root, NULL); - - rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) -{ - /* - * Note rb_erase_augmented is a fairly large inline function, - * so make sure we instantiate it only once with our desired - * augmented rbtree callbacks. 
- */ - rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, - struct rb_root *root, - struct vm_area_struct *ignore) -{ - /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of - * - * a. the "next" vma being erased if next->vm_start was reduced in - * __vma_adjust() -> __vma_unlink() - * b. the vma being erased in detach_vmas_to_be_unmapped() -> - * vma_rb_erase() - */ - validate_mm_rb(root, ignore); - - __vma_rb_erase(vma, root); -} - -static __always_inline void vma_rb_erase(struct vm_area_struct *vma, - struct rb_root *root) -{ - vma_rb_erase_ignore(vma, root, vma); -} +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * vma has some anon_vma assigned, and is already inserted on that @@ -550,39 +390,26 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } -static int find_vma_links(struct mm_struct *mm, unsigned long addr, - unsigned long end, struct vm_area_struct **pprev, - struct rb_node ***rb_link, struct rb_node **rb_parent) +/* + * range_has_overlap() - Check the @start - @end range for overlapping VMAs and + * sets up a pointer to the previous VMA + * @mm: the mm struct + * @start: the start address of the range + * @end: the end address of the range + * @pprev: the pointer to the pointer of the previous VMA + * + * Returns: True if there is an overlapping VMA, false otherwise + */ +static inline +bool range_has_overlap(struct mm_struct *mm, unsigned long start, + unsigned long end, struct vm_area_struct **pprev) { - struct rb_node **__rb_link, *__rb_parent, *rb_prev; - - mmap_assert_locked(mm); - __rb_link = &mm->mm_rb.rb_node; - rb_prev = __rb_parent = NULL; - - while (*__rb_link) { - struct vm_area_struct *vma_tmp; - - __rb_parent = *__rb_link; - vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); + struct vm_area_struct *existing; - if 
(vma_tmp->vm_end > addr) { - /* Fail if an existing vma overlaps the area */ - if (vma_tmp->vm_start < end) - return -ENOMEM; - __rb_link = &__rb_parent->rb_left; - } else { - rb_prev = __rb_parent; - __rb_link = &__rb_parent->rb_right; - } - } - - *pprev = NULL; - if (rb_prev) - *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - *rb_link = __rb_link; - *rb_parent = __rb_parent; - return 0; + MA_STATE(mas, &mm->mm_mt, start, start); + existing = mas_find(&mas, end - 1); + *pprev = mas_prev(&mas, 0); + return existing ? true : false; } /* @@ -609,8 +436,6 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, * @start: The start of the range. * @len: The length of the range. * @pprev: pointer to the pointer that will be set to previous vm_area_struct - * @rb_link: the rb_node - * @rb_parent: the parent rb_node * * Find all the vm_area_struct that overlap from @start to * @end and munmap them. Set @pprev to the previous vm_area_struct. @@ -619,14 +444,11 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, */ static inline int munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, - struct vm_area_struct **pprev, struct rb_node ***link, - struct rb_node **parent, struct list_head *uf) + struct vm_area_struct **pprev, struct list_head *uf) { - - while (find_vma_links(mm, start, start + len, pprev, link, parent)) + while (range_has_overlap(mm, start, start + len, pprev)) if (do_munmap(mm, start, len, uf)) return -ENOMEM; - return 0; } @@ -647,30 +469,6 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, return nr_pages; } -void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, - struct rb_node **rb_link, struct rb_node *rb_parent) -{ - /* Update tracking information for the gap following the new vma. 
*/ - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else - mm->highest_vm_end = vm_end_gap(vma); - - /* - * vma->vm_prev wasn't known when we followed the rbtree to find the - * correct insertion point for that vma. As a result, we could not - * update the vma vm_rb parents rb_subtree_gap values on the way down. - * So, we first insert the vma with a zero rb_subtree_gap value - * (to be consistent with what we did on the way down), and then - * immediately update the gap to the correct value. Finally we - * rebalance the rbtree after all augmented values have been set. - */ - rb_link_node(&vma->vm_rb, rb_parent, rb_link); - vma->rb_subtree_gap = 0; - vma_gap_update(vma); - vma_rb_insert(vma, &mm->mm_rb); -} - static void __vma_link_file(struct vm_area_struct *vma) { struct file *file; @@ -738,18 +536,8 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, mas_store_prealloc(mas, NULL); } -static void -__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) -{ - __vma_link_list(mm, vma, prev); - __vma_link_rb(mm, vma, rb_link, rb_parent); -} - static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) + struct vm_area_struct *prev) { MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; @@ -763,7 +551,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, } vma_mas_store(vma, &mas); - __vma_link(mm, vma, prev, rb_link, rb_parent); + __vma_link_list(mm, vma, prev); __vma_link_file(vma); if (mapping) @@ -776,34 +564,20 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the interval tree. + * mm's list and the mm tree. 
It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma) { struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; - - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - BUG(); + mas_set(mas, vma->vm_start); + prev = mas_prev(mas, 0); vma_mas_store(vma, mas); __vma_link_list(mm, vma, prev); - __vma_link_rb(mm, vma, rb_link, rb_parent); mm->map_count++; } -static __always_inline void __vma_unlink(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *ignore) -{ - vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); - __vma_unlink_list(mm, vma); - /* Kill the cache */ - vmacache_invalidate(mm); -} - /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -816,21 +590,18 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; - struct vm_area_struct *next_next; + struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end); + struct vm_area_struct *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; - bool start_changed = false, end_changed = false; + bool vma_changed = false; long adjust_next = 0; int remove_next = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; - validate_mm(mm); - validate_mm_mt(mm); - if (next && !insert) { if (end >= next->vm_end) { /* @@ -957,21 +728,21 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (start != vma->vm_start) { - unsigned long old_start = vma->vm_start; + if (vma->vm_start < start) + vma_mas_szero(&mas, vma->vm_start, start); + vma_changed = true; vma->vm_start = 
start; - if (old_start < start) - vma_mas_szero(&mas, old_start, start); - start_changed = true; } if (end != vma->vm_end) { - unsigned long old_end = vma->vm_end; + if (vma->vm_end > end) + vma_mas_szero(&mas, end, vma->vm_end); + vma_changed = true; vma->vm_end = end; - if (old_end > end) - vma_mas_szero(&mas, end, old_end); - end_changed = true; + if (!next) + mm->highest_vm_end = vm_end_gap(vma); } - if (end_changed || start_changed) + if (vma_changed) vma_mas_store(vma, &mas); vma->vm_pgoff = pgoff; @@ -995,22 +766,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * Since we have expanded over this vma, the maple tree will * have overwritten by storing the value */ - if (remove_next != 3) { - __vma_unlink(mm, next, next); - if (remove_next == 2) - __vma_unlink(mm, next_next, next_next); - } else { - /* - * vma is not before next if they've been - * swapped. - * - * pre-swap() next->vm_start was reduced so - * tell validate_mm_rb to ignore pre-swap() - * "next" (which is stored in post-swap() - * "vma"). - */ - __vma_unlink(mm, next, vma); - } + __vma_unlink_list(mm, next); + if (remove_next == 2) + __vma_unlink_list(mm, next_next); + /* Kill the cache */ + vmacache_invalidate(mm); + if (file) { __remove_shared_vm_struct(next, file, mapping); if (remove_next == 2) @@ -1023,15 +784,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * (it may either follow vma or precede it). 
*/ __insert_vm_struct(mm, &mas, insert); - } else { - if (start_changed) - vma_gap_update(vma); - if (end_changed) { - if (!next) - mm->highest_vm_end = vm_end_gap(vma); - else if (!adjust_next) - vma_gap_update(next); - } } if (anon_vma) { @@ -1059,7 +811,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); + if (remove_next != 2) + BUG_ON(vma->vm_end < next->vm_end); vm_area_free(next); + /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1089,10 +844,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (remove_next == 2) { remove_next = 1; goto again; - } - else if (next) - vma_gap_update(next); - else { + } else if (!next) { /* * If remove_next == 2 we obviously can't * reach this path. @@ -1119,8 +871,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, uprobe_mmap(insert); validate_mm(mm); - validate_mm_mt(mm); - return 0; } @@ -1273,7 +1023,6 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *area, *next; int err; - validate_mm_mt(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1349,7 +1098,6 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, khugepaged_enter_vma(area, vm_flags); return area; } - validate_mm_mt(mm); return NULL; } @@ -1519,6 +1267,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags_t vm_flags; int pkey = 0; + validate_mm(mm); *populate = 0; if (!len) @@ -1826,10 +1575,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev, *merge; int error; - struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; - validate_mm_mt(mm); /* Check against address space limit. 
*/ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -1845,8 +1592,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -ENOMEM; } - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + /* Clear old maps, set up prev and uf */ + if (munmap_vma_range(mm, addr, len, &prev, uf)) return -ENOMEM; /* * Private writable mapping: check memory availability @@ -1944,7 +1691,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + if (vma_link(mm, vma, prev)) { error = -ENOMEM; if (file) goto unmap_and_free_vma; @@ -1990,7 +1737,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_set_page_prot(vma); - validate_mm_mt(mm); return addr; unmap_and_free_vma: @@ -2007,7 +1753,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); - validate_mm_mt(mm); return error; } @@ -2362,7 +2107,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int error = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); - validate_mm_mt(mm); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -2414,15 +2158,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. 
anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2433,9 +2175,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Overwrite old entry in mtree. */ vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else + if (!vma->vm_next) mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); @@ -2445,8 +2185,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); - validate_mm_mt(mm); mas_destroy(&mas); return error; } @@ -2455,15 +2193,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_downwards(struct vm_area_struct *vma, - unsigned long address) +int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); - validate_mm(mm); address &= PAGE_MASK; if (address < mmap_min_addr) return -EPERM; @@ -2505,15 +2241,13 @@ int expand_downwards(struct vm_area_struct *vma, error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. 
+ * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2525,7 +2259,6 @@ int expand_downwards(struct vm_area_struct *vma, /* Overwrite old entry in mtree. */ vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - vma_gap_update(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2534,7 +2267,6 @@ int expand_downwards(struct vm_area_struct *vma, } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); mas_destroy(&mas); return error; } @@ -2666,10 +2398,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; - mas_set_range(mas, vma->vm_start, end - 1); - mas_store_prealloc(mas, NULL); + vma_mas_szero(mas, vma->vm_start, end); do { - vma_rb_erase(vma, &mm->mm_rb); if (vma->vm_flags & VM_LOCKED) mm->locked_vm -= vma_pages(vma); mm->map_count--; @@ -2677,10 +2407,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; - if (vma) { + if (vma) vma->vm_prev = prev; - vma_gap_update(vma); - } else + else mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; @@ -2802,11 +2531,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (len == 0) return -EINVAL; - /* - * arch_unmap() might do unmaps itself. It must be called - * and finish any rbtree manipulation before this code - * runs and also starts to manipulate the rbtree. - */ + /* arch_unmap() might do unmaps itself. 
*/ arch_unmap(mm, start, end); /* Find the first overlapping VMA where start < vma->vm_end */ @@ -2817,6 +2542,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; prev = vma->vm_prev; + /* we have start < vma->vm_end */ + + /* if it doesn't overlap, we have nothing.. */ + if (vma->vm_start >= end) + return 0; /* * If we need to split any vma, do it now to save pain later. @@ -2877,6 +2607,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, /* Fix up all other VM information */ remove_vma_list(mm, vma); + + validate_mm(mm); return downgrade ? 1 : 0; map_count_exceeded: @@ -3015,11 +2747,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf) +static int do_brk_flags(unsigned long addr, unsigned long len, + unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; unsigned long mapped_addr; @@ -3038,8 +2770,8 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (error) return error; - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + /* Clear old maps, set up prev and uf */ + if (munmap_vma_range(mm, addr, len, &prev, uf)) return -ENOMEM; /* Check against address space limits *after* clearing old maps... 
*/ @@ -3073,7 +2805,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla vma->vm_pgoff = pgoff; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - if (vma_link(mm, vma, prev, rb_link, rb_parent)) + if (vma_link(mm, vma, prev)) goto no_vma_link; out: @@ -3192,29 +2924,12 @@ void exit_mmap(struct mm_struct *mm) int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; - unsigned long start = vma->vm_start; - struct vm_area_struct *overlap = NULL; unsigned long charged = vma_pages(vma); - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) + if (range_has_overlap(mm, vma->vm_start, vma->vm_end, &prev)) return -ENOMEM; - overlap = mt_find(&mm->mm_mt, &start, vma->vm_end - 1); - if (overlap) { - - pr_err("Found vma ending at %lu\n", start - 1); - pr_err("vma : %lu => %lu-%lu\n", (unsigned long)overlap, - overlap->vm_start, overlap->vm_end - 1); -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - mt_dump(&mm->mm_mt); -#endif - BUG(); - } - if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; @@ -3236,7 +2951,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + if (vma_link(mm, vma, prev)) { vm_unacct_memory(charged); return -ENOMEM; } @@ -3256,9 +2971,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; - struct rb_node **rb_link, *rb_parent; bool faulted_in_anon_vma = true; - unsigned long index = addr; validate_mm_mt(mm); /* @@ -3270,10 +2983,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - if (find_vma_links(mm, 
addr, addr + len, &prev, &rb_link, &rb_parent)) + if (range_has_overlap(mm, addr, addr + len, &prev)) return NULL; /* should never get here */ - if (mt_find(&mm->mm_mt, &index, addr+len - 1)) - BUG(); + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -3314,12 +3026,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); + if (vma_link(mm, new_vma, prev)) + goto out_vma_link; *need_rmap_locks = false; } validate_mm_mt(mm); return new_vma; +out_vma_link: + if (new_vma->vm_ops && new_vma->vm_ops->close) + new_vma->vm_ops->close(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: diff --git a/mm/nommu.c b/mm/nommu.c index c63793c53a8240..321c7e6718a892 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -566,9 +566,9 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) */ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma, *prev; struct address_space *mapping; - struct rb_node **p, *parent, *rb_prev; + struct vm_area_struct *prev; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); BUG_ON(!vma->vm_region); @@ -586,42 +586,10 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) i_mmap_unlock_write(mapping); } + prev = mas_prev(&mas, 0); + mas_reset(&mas); /* add the VMA to the tree */ - parent = rb_prev = NULL; - p = &mm->mm_rb.rb_node; - while (*p) { - parent = *p; - pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - - /* sort by: start addr, end addr, VMA struct addr in that order - * (the latter is necessary as we may get identical VMAs) */ - if (vma->vm_start < pvma->vm_start) - p = &(*p)->rb_left; - else if (vma->vm_start > pvma->vm_start) { - rb_prev = parent; 
- p = &(*p)->rb_right; - } else if (vma->vm_end < pvma->vm_end) - p = &(*p)->rb_left; - else if (vma->vm_end > pvma->vm_end) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) { - rb_prev = parent; - p = &(*p)->rb_right; - } else - BUG(); - } - - rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); - - /* add VMA to the VMA list also */ - prev = NULL; - if (rb_prev) - prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - + vma_mas_store(vma, &mas); __vma_link_list(mm, vma, prev); } @@ -634,6 +602,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; struct task_struct *curr = current; + MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); mm->map_count--; for (i = 0; i < VMACACHE_SIZE; i++) { @@ -656,8 +625,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) } /* remove from the MM's tree and list */ - rb_erase(&vma->vm_rb, &mm->mm_rb); - + vma_mas_remove(vma, &mas); __vma_unlink_list(mm, vma); } @@ -681,24 +649,19 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, addr, addr); /* check the cache first */ vma = vmacache_find(mm, addr); if (likely(vma)) return vma; - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end > addr) { - vmacache_update(addr, vma); - return vma; - } - } + vma = mas_walk(&mas); - return NULL; + if (vma) + vmacache_update(addr, vma); + + return vma; } EXPORT_SYMBOL(find_vma); @@ -730,26 +693,23 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; + MA_STATE(mas, &mm->mm_mt, addr, addr); /* check the cache 
first */ vma = vmacache_find_exact(mm, addr, end); if (vma) return vma; - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start < addr) - continue; - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end == end) { - vmacache_update(addr, vma); - return vma; - } - } + vma = mas_walk(&mas); + if (!vma) + return NULL; + if (vma->vm_start != addr) + return NULL; + if (vma->vm_end != end) + return NULL; - return NULL; + vmacache_update(addr, vma); + return vma; } /* @@ -1546,6 +1506,7 @@ void exit_mmap(struct mm_struct *mm) delete_vma(mm, vma); cond_resched(); } + __mt_destroy(&mm->mm_mt); } int vm_brk(unsigned long addr, unsigned long len) diff --git a/mm/util.c b/mm/util.c index 5df8f2db7ca956..812365cbdd1978 100644 --- a/mm/util.c +++ b/mm/util.c @@ -288,6 +288,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_next = next; if (next) next->vm_prev = vma; + else + mm->highest_vm_end = vm_end_gap(vma); } void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) @@ -300,8 +302,14 @@ void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) prev->vm_next = next; else mm->mmap = next; - if (next) + if (next) { next->vm_prev = prev; + } else { + if (prev) + mm->highest_vm_end = vm_end_gap(prev); + else + mm->highest_vm_end = 0; + } } /* Check if the vma is being used as a stack by this task */ From 8b5daf8dc3601228752b57247e36832fb2585bf0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:49 +0000 Subject: [PATCH 0767/1250] mmap: change zeroing of maple tree in __vma_adjust() Only write to the maple tree if we are not inserting or the insert isn't going to overwrite the area to clear. This avoids spanning writes and node coalescing when unnecessary. The change requires a custom search for the linked list addition to find the correct VMA for the prev link.
Link: https://lkml.kernel.org/r/20220504011345.662299-3-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-19-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-19-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 414280ec76978a..cfaea573111b9c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -567,11 +567,11 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, * mm's list and the mm tree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma) + struct vm_area_struct *vma, unsigned long location) { struct vm_area_struct *prev; - mas_set(mas, vma->vm_start); + mas_set(mas, location); prev = mas_prev(mas, 0); vma_mas_store(vma, mas); __vma_link_list(mm, vma, prev); @@ -601,6 +601,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, int remove_next = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; + unsigned long ll_prev = vma->vm_start; /* linked list prev. 
*/ if (next && !insert) { if (end >= next->vm_end) { @@ -728,15 +729,27 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (start != vma->vm_start) { - if (vma->vm_start < start) + if ((vma->vm_start < start) && + (!insert || (insert->vm_end != start))) { vma_mas_szero(&mas, vma->vm_start, start); - vma_changed = true; + VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + } else { + vma_changed = true; + } vma->vm_start = start; } if (end != vma->vm_end) { - if (vma->vm_end > end) - vma_mas_szero(&mas, end, vma->vm_end); - vma_changed = true; + if (vma->vm_end > end) { + if (!insert || (insert->vm_start != end)) { + vma_mas_szero(&mas, end, vma->vm_end); + VM_WARN_ON(insert && + insert->vm_end < vma->vm_end); + } else if (insert->vm_start == end) { + ll_prev = vma->vm_end; + } + } else { + vma_changed = true; + } vma->vm_end = end; if (!next) mm->highest_vm_end = vm_end_gap(vma); @@ -783,7 +796,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, &mas, insert); + __insert_vm_struct(mm, &mas, insert, ll_prev); } if (anon_vma) { @@ -870,6 +883,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (insert && file) uprobe_mmap(insert); + mas_destroy(&mas); validate_mm(mm); return 0; } From 2772cffffbdabee33eed8b3229e60b16fd4023de Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:49 +0000 Subject: [PATCH 0768/1250] xen: use vma_lookup() in privcmd_ioctl_mmap() vma_lookup() walks the VMA tree for a specific value, find_vma() will search the tree after walking to a specific value. It is more efficient to only walk to the requested value since privcmd_ioctl_mmap() will exit the loop if vm_start != msg->va. 
Link: https://lkml.kernel.org/r/20220504011345.662299-4-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-20-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-20-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- drivers/xen/privcmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 3369734108af23..ad17166b0ef6b3 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -282,7 +282,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) struct page, lru); struct privcmd_mmap_entry *msg = page_address(page); - vma = find_vma(mm, msg->va); + vma = vma_lookup(mm, msg->va); rc = -EINVAL; if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) From 9b2c3b958b2c32f26bbe3ae11024255eea8bad1c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:50 +0000 Subject: [PATCH 0769/1250] mm: optimize find_exact_vma() to use vma_lookup() Use vma_lookup() to walk the tree to the start value requested. If the vma at the start does not match, then the answer is NULL and there is no need to look at the next vma the way that find_vma() would. Link: https://lkml.kernel.org/r/20220504011345.662299-5-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-21-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 215a680ecab354..c31f5cc2b293e2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2811,7 +2811,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { - struct vm_area_struct *vma = find_vma(mm, vm_start); + struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; From 9bbd369e8796dce26f6b2f8e0f89cf1976882916 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:50 +0000 Subject: [PATCH 0770/1250] mm/khugepaged: optimize collapse_pte_mapped_thp() by using vma_lookup() vma_lookup() will walk the vma tree once and not continue to look for the next vma. Since the exact vma is checked below, this is a more optimal way of searching. Link: https://lkml.kernel.org/r/20220504011345.662299-6-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-22-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-22-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 01f71786d53035..c4f03a2ad60244 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1387,7 +1387,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { unsigned long haddr = addr & HPAGE_PMD_MASK; - struct vm_area_struct *vma = find_vma(mm, haddr); + struct vm_area_struct *vma = vma_lookup(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; pmd_t *pmd; From acc95bfb012365a717a41455038473e99ed7b6d7 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:50 +0000 Subject: [PATCH 0771/1250] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap() Avoid allocating a new VMA when a vma modification can occur. When a brk() can expand or contract a VMA, then the single store operation will only modify one index of the maple tree instead of causing a node to split or coalesce. This avoids unnecessary allocations/frees of maple tree nodes and VMAs. Move some limit & flag verifications out of the do_brk_flags() function to use only relevant checks in the code path of brk() and vm_brk_flags(). Set the vma to check if it can expand in vm_brk_flags() if extra criteria are met. Drop userfaultfd from do_brk_flags() path and only use it in vm_brk_flags() path since that is the only place a munmap will happen. Add a wrapper for munmap for the brk case called do_brk_munmap().
Link: https://lkml.kernel.org/r/20220504011345.662299-7-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-23-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-23-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 239 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 179 insertions(+), 60 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index cfaea573111b9c..8baff80e85f5ea 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -147,17 +147,40 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, - struct list_head *uf); +/* + * check_brk_limits() - Use platform specific check of range & verify mlock + * limits. + * @addr: The address to check + * @len: The size of increase. + * + * Return: 0 on success. 
+ */ +static int check_brk_limits(unsigned long addr, unsigned long len) +{ + unsigned long mapped_addr; + + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; + + return mlock_future_check(current->mm, current->mm->def_flags, len); +} +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf); +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, + unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; - struct vm_area_struct *next; + struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate; bool downgraded = false; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, 0, 0); if (mmap_write_lock_killable(mm)) return -EINTR; @@ -199,35 +222,52 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * __do_munmap() may downgrade mmap_lock to read. + * do_brk_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; + /* Search one past newbrk */ + mas_set(&mas, newbrk); + brkvma = mas_find(&mas, oldbrk); + BUG_ON(brkvma == NULL); + if (brkvma->vm_start >= oldbrk) + goto out; /* mapping intersects with an existing non-brk vma. */ /* - * mm->brk must to be protected by write mmap_lock so update it - * before downgrading mmap_lock. When __do_munmap() fails, - * mm->brk will be restored from origbrk. + * mm->brk must be protected by write mmap_lock. + * do_brk_munmap() may downgrade the lock, so update it + * before calling do_brk_munmap(). 
*/ mm->brk = brk; - ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); - if (ret < 0) { - mm->brk = origbrk; - goto out; - } else if (ret == 1) { + mas.last = oldbrk - 1; + ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + if (ret == 1) { downgraded = true; - } - goto success; + goto success; + } else if (!ret) + goto success; + + mm->brk = origbrk; + goto out; } - /* Check against existing mmap mappings. */ - next = find_vma(mm, oldbrk); + if (check_brk_limits(oldbrk, newbrk - oldbrk)) + goto out; + + /* + * Only check if the next VMA is within the stack_guard_gap of the + * expansion area + */ + mas_set(&mas, oldbrk); + next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; + brkvma = mas_prev(&mas, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) + if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; + mm->brk = brk; success: @@ -2757,38 +2797,55 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, } /* - * this is really a simplified "do_mmap". it only handles - * anonymous maps. eventually we may be able to do some - * brk-specific accounting here. + * brk_munmap() - Unmap a parital vma. + * @mas: The maple tree state. + * @vma: The vma to be modified + * @newbrk: the start of the address to unmap + * @oldbrk: The end of the address to unmap + * @uf: The userfaultfd list_head + * + * Returns: 1 on success. + * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if + * possible. 
*/ -static int do_brk_flags(unsigned long addr, unsigned long len, - unsigned long flags, struct list_head *uf) +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; - pgoff_t pgoff = addr >> PAGE_SHIFT; - int error; - unsigned long mapped_addr; - validate_mm_mt(mm); - - /* Until we need other flags, refuse anything except VM_EXEC. */ - if ((flags & (~VM_EXEC)) != 0) - return -EINVAL; - flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - - mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (IS_ERR_VALUE(mapped_addr)) - return mapped_addr; + struct mm_struct *mm = vma->vm_mm; + int ret; - error = mlock_future_check(mm, mm->def_flags, len); - if (error) - return error; + arch_unmap(mm, newbrk, oldbrk); + ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true); + validate_mm_mt(mm); + return ret; +} - /* Clear old maps, set up prev and uf */ - if (munmap_vma_range(mm, addr, len, &prev, uf)) - return -ENOMEM; +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @mas: The maple tree state. + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + */ +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *prev = NULL; - /* Check against address space limits *after* clearing old maps... */ + validate_mm_mt(mm); + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. 
+ */ + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; @@ -2798,30 +2855,56 @@ static int do_brk_flags(unsigned long addr, unsigned long len, if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - /* Can we just expand an old private anonymous mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; - /* - * create a vma struct for an anonymous mapping + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. */ - vma = vm_area_alloc(mm); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; + if (vma && + (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { + mas->index = vma->vm_start; + mas->last = addr + len - 1; + vma_adjust_trans_huge(vma, addr, addr + len, 0); + if (vma->anon_vma) { + anon_vma_lock_write(vma->anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + vma->vm_end = addr + len; + vma->vm_flags |= VM_SOFTDIRTY; + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_expand_failed; + + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + khugepaged_enter_vma(vma, flags); + goto out; } + prev = vma; + + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto vma_alloc_fail; vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; + vma->vm_pgoff = addr >> PAGE_SHIFT; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - if (vma_link(mm, vma, prev)) - goto no_vma_link; + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_store_fail; + mm->map_count++; + + if (!prev) + prev = mas_prev(mas, 0); + + 
__vma_link_list(mm, vma, prev); + mm->map_count++; out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -2832,18 +2915,29 @@ static int do_brk_flags(unsigned long addr, unsigned long len, validate_mm_mt(mm); return 0; -no_vma_link: +mas_store_fail: vm_area_free(vma); +vma_alloc_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; + +mas_expand_failed: + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, addr, addr); len = PAGE_ALIGN(request); if (len < request) @@ -2854,13 +2948,38 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_brk_flags(addr, len, flags, &uf); + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + + ret = check_brk_limits(addr, len); + if (ret) + goto limits_failed; + + if (find_vma_intersection(mm, addr, addr + len)) + ret = do_munmap(mm, addr, len, &uf); + + if (ret) + goto munmap_failed; + + vma = mas_prev(&mas, 0); + if (!vma || vma->vm_end != addr || vma_policy(vma) || + !can_vma_merge_after(vma, flags, NULL, NULL, + addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) + vma = NULL; + + ret = do_brk_flags(&mas, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; + +munmap_failed: +limits_failed: + mmap_write_unlock(mm); + return ret; } EXPORT_SYMBOL(vm_brk_flags); From 7a937e088009c4b051550d1d8aa1ec7dffe5ae0b Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 20 Jul 2022 02:17:51 +0000 Subject: [PATCH 0772/1250] mm: use maple tree operations for find_vma_intersection() Move find_vma_intersection() to mmap.c and change implementation to maple tree. When searching for a vma within a range, it is easier to use the maple tree interface. Exported find_vma_intersection() for kvm module. Link: https://lkml.kernel.org/r/20220504011345.662299-8-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-24-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-24-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++------------------ mm/mmap.c | 29 +++++++++++++++++++++++++++++ mm/nommu.c | 11 +++++++++++ 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c31f5cc2b293e2..9dda5284fdfdb5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2739,26 +2739,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/** - * find_vma_intersection() - Look up the first VMA which intersects the interval - * @mm: The process address space. - * @start_addr: The inclusive start user address. - * @end_addr: The exclusive end user address. - * - * Returns: The first VMA within the provided range, %NULL otherwise. Assumes - * start_addr < end_addr. +/* + * Look up the first VMA which intersects the interval [start_addr, end_addr) + * NULL if none. Assume start_addr < end_addr. 
*/ -static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, - unsigned long start_addr, - unsigned long end_addr) -{ - struct vm_area_struct *vma = find_vma(mm, start_addr); - - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; -} + unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address diff --git a/mm/mmap.c b/mm/mmap.c index 8baff80e85f5ea..99416cb7844204 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2056,6 +2056,35 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + struct vm_area_struct *vma; + unsigned long index = start_addr; + + mmap_assert_locked(mm); + /* Check the cache first. */ + vma = vmacache_find(mm, start_addr); + if (likely(vma)) + return vma; + + vma = mt_find(&mm->mm_mt, &index, end_addr - 1); + if (vma) + vmacache_update(start_addr, vma); + return vma; +} +EXPORT_SYMBOL(find_vma_intersection); + /** * find_vma() - Find the VMA for a given address, or the next VMA. 
* @mm: The mm_struct to check diff --git a/mm/nommu.c b/mm/nommu.c index 321c7e6718a892..2702790d05d3c1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -642,6 +642,17 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) vm_area_free(vma); } +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} +EXPORT_SYMBOL(find_vma_intersection); + /* * look up the first VMA in which addr resides, NULL if none * - should be called with mm->mmap_lock at least held readlocked From e3076ce18333c1ab07ae72b82c921bef331dea2d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:51 +0000 Subject: [PATCH 0773/1250] mm/mmap: use advanced maple tree API for mmap_region() Changing mmap_region() to use the maple tree state and the advanced maple tree interface allows for a lot less tree walking. This change removes the last caller of munmap_vma_range(), so drop this unused function. Add vma_expand() to expand a VMA if possible by doing the necessary hugepage check, uprobe_munmap of files, dcache flush, modifications then undoing the detaches, etc. Link: https://lkml.kernel.org/r/20220504011345.662299-9-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220519020341.rr3s6b4dr7o36cqb@revolver Link: https://lkml.kernel.org/r/20220621204632.3370049-25-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-25-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 251 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 203 insertions(+), 48 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 99416cb7844204..ecd5c1b813c8d6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -470,28 +470,6 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, return vma->vm_next; } -/* - * munmap_vma_range() - munmap VMAs that overlap a range. - * @mm: The mm struct - * @start: The start of the range. - * @len: The length of the range. - * @pprev: pointer to the pointer that will be set to previous vm_area_struct - * - * Find all the vm_area_struct that overlap from @start to - * @end and munmap them. Set @pprev to the previous vm_area_struct. - * - * Returns: -ENOMEM on munmap failure or 0 on success. - */ -static inline int -munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, - struct vm_area_struct **pprev, struct list_head *uf) -{ - while (range_has_overlap(mm, start, start + len, pprev)) - if (do_munmap(mm, start, len, uf)) - return -ENOMEM; - return 0; -} - static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -618,6 +596,129 @@ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, mm->map_count++; } +/* + * vma_expand - Expand an existing VMA + * + * @mas: The maple state + * @vma: The vma to expand + * @start: The start of the vma + * @end: The exclusive end of the vma + * @pgoff: The page offset of vma + * @next: The current or next vma. + * + * Expand @vma to @start and @end. Can expand off the start and end. Will + * expand over @next if it's different from @vma and @end == @next->vm_end.
+ * Checking if the @vma can expand and merge with @next needs to be handled by + * the caller. + * + * Returns: 0 on success + */ +inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) +{ + struct mm_struct *mm = vma->vm_mm; + struct address_space *mapping = NULL; + struct rb_root_cached *root = NULL; + struct anon_vma *anon_vma = vma->anon_vma; + struct file *file = vma->vm_file; + bool remove_next = false; + + if (next && (vma != next) && (end == next->vm_end)) { + remove_next = true; + if (next->anon_vma && !vma->anon_vma) { + int error; + + anon_vma = next->anon_vma; + vma->anon_vma = anon_vma; + error = anon_vma_clone(vma, next); + if (error) + return error; + } + } + + /* Not merging but overwriting any part of next is not handled. */ + VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); + /* Only handles expanding */ + VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); + + if (mas_preallocate(mas, vma, GFP_KERNEL)) + goto nomem; + + vma_adjust_trans_huge(vma, start, end, 0); + + if (file) { + mapping = file->f_mapping; + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); + i_mmap_lock_write(mapping); + } + + if (anon_vma) { + anon_vma_lock_write(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + + if (file) { + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, root); + } + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + /* Note: mas must be pointing to the expanding VMA */ + vma_mas_store(vma, mas); + + if (file) { + vma_interval_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); + } + + /* Expanding over the next vma */ + if (remove_next) { + /* Remove from mm linked list - also updates highest_vm_end */ + __vma_unlink_list(mm, next); + + /* Kill the cache */ + vmacache_invalidate(mm); + + if (file) + __remove_shared_vm_struct(next, file, 
mapping); + + } else if (!next) { + mm->highest_vm_end = vm_end_gap(vma); + } + + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(anon_vma); + } + + if (file) { + i_mmap_unlock_write(mapping); + uprobe_mmap(vma); + } + + if (remove_next) { + if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); + fput(file); + } + if (next->anon_vma) + anon_vma_merge(vma, next); + mm->map_count--; + mpol_put(vma_policy(next)); + vm_area_free(next); + } + + validate_mm(mm); + return 0; + +nomem: + return -ENOMEM; +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -1627,9 +1728,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct list_head *uf) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev, *merge; - int error; + struct vm_area_struct *vma = NULL; + struct vm_area_struct *next, *prev, *merge; + pgoff_t pglen = len >> PAGE_SHIFT; unsigned long charged = 0; + unsigned long end = addr + len; + unsigned long merge_start = addr, merge_end = end; + pgoff_t vm_pgoff; + int error; + MA_STATE(mas, &mm->mm_mt, addr, end - 1); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -1639,16 +1746,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * MAP_FIXED may remove pages of mappings that intersects with * requested mapping. Account for the pages it would unmap. 
*/ - nr_pages = count_vma_pages_range(mm, addr, addr + len); + nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; } - /* Clear old maps, set up prev and uf */ - if (munmap_vma_range(mm, addr, len, &prev, uf)) + /* Unmap any existing mapping in the area */ + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; + /* * Private writable mapping: check memory availability */ @@ -1659,14 +1767,43 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vm_flags |= VM_ACCOUNT; } - /* - * Can we just expand an old mapping? - */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; + next = mas_next(&mas, ULONG_MAX); + prev = mas_prev(&mas, 0); + if (vm_flags & VM_SPECIAL) + goto cannot_expand; + + /* Attempt to expand an old mapping */ + /* Check next */ + if (next && next->vm_start == end && !vma_policy(next) && + can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, + NULL_VM_UFFD_CTX, NULL)) { + merge_end = next->vm_end; + vma = next; + vm_pgoff = next->vm_pgoff - pglen; + } + + /* Check prev */ + if (prev && prev->vm_end == addr && !vma_policy(prev) && + (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, + pgoff, vma->vm_userfaultfd_ctx, NULL) : + can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, + NULL_VM_UFFD_CTX, NULL))) { + merge_start = prev->vm_start; + vma = prev; + vm_pgoff = prev->vm_pgoff; + } + + + /* Actually expand, if possible */ + if (vma && + !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + khugepaged_enter_vma(vma, vm_flags); + goto expanded; + } + mas.index = addr; + mas.last = end - 1; +cannot_expand: /* * Determine the object being mapped and call the appropriate * specific mapper. 
the address has already been validated, but @@ -1679,7 +1816,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } vma->vm_start = addr; - vma->vm_end = addr + len; + vma->vm_end = end; vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; @@ -1700,28 +1837,32 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * * Answer: Yes, several device drivers can do it in their * f_op->mmap method. -DaveM - * Bug: If addr is changed, prev, rb_link, rb_parent should - * be updated for vma_link() */ WARN_ON_ONCE(addr != vma->vm_start); addr = vma->vm_start; + mas_reset(&mas); - /* If vm_flags changed after call_mmap(), we should try merge vma again - * as we may succeed this time. + /* + * If vm_flags changed after call_mmap(), we should try merge + * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { - /* ->mmap() can change vma->vm_file and fput the original file. So - * fput the vma->vm_file here or we would add an extra fput for file - * and cause general protection fault ultimately. + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. */ fput(vma->vm_file); vm_area_free(vma); vma = merge; /* Update vm_flags to pick up the change. 
*/ + addr = vma->vm_start; vm_flags = vma->vm_flags; goto unmap_writable; } @@ -1745,7 +1886,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - if (vma_link(mm, vma, prev)) { + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { error = -ENOMEM; if (file) goto unmap_and_free_vma; @@ -1753,6 +1894,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + + vma_mas_store(vma, &mas); + __vma_link_list(mm, vma, prev); + mm->map_count++; + if (vma->vm_file) { + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(vma->vm_file->f_mapping); + + flush_dcache_mmap_lock(vma->vm_file->f_mapping); + vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); + flush_dcache_mmap_unlock(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + /* * vma_merge() calls khugepaged_enter_vma() either, the below * call covers the non-merge case. @@ -1764,7 +1921,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, if (file && vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; -out: +expanded: perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -1791,6 +1948,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_set_page_prot(vma); + validate_mm(mm); return addr; unmap_and_free_vma: @@ -1807,6 +1965,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); + validate_mm(mm); return error; } @@ -2627,10 +2786,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, prev = vma->vm_prev; /* we have start < vma->vm_end */ - /* if it doesn't overlap, we have nothing.. */ - if (vma->vm_start >= end) - return 0; - /* * If we need to split any vma, do it now to save pain later. 
* From 4c063f987cac89222f31226135ae8f105e8eb4bd Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:52 +0000 Subject: [PATCH 0774/1250] mm: remove vmacache By using the maple tree and the maple tree state, the vmacache is no longer beneficial and is complicating the VMA code. Remove the vmacache to reduce the work in keeping it up to date and code complexity. Link: https://lkml.kernel.org/r/20220504011345.662299-10-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-26-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-26-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- fs/exec.c | 3 - fs/proc/task_mmu.c | 1 - include/linux/mm_types.h | 1 - include/linux/mm_types_task.h | 12 ---- include/linux/sched.h | 1 - include/linux/vm_event_item.h | 4 -- include/linux/vmacache.h | 28 -------- include/linux/vmstat.h | 6 -- kernel/debug/debug_core.c | 12 ---- kernel/fork.c | 5 -- lib/Kconfig.debug | 8 --- mm/Makefile | 2 +- mm/debug.c | 4 +- mm/mmap.c | 31 +-------- mm/nommu.c | 37 ++--------- mm/vmacache.c | 117 ---------------------------------- mm/vmstat.c | 4 -- 17 files changed, 9 insertions(+), 267 deletions(-) delete mode 100644 include/linux/vmacache.h delete mode 100644 mm/vmacache.c diff --git a/fs/exec.c b/fs/exec.c index 0989fb8472a18f..b97afa682ffe9a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -1027,8 +1026,6 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; - vmacache_flush(tsk); task_unlock(tsk); if 
(old_mm) { mmap_read_unlock(old_mm); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a3398d0f1927f5..4c4559e98632d3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include #include #include #include diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f4cccd212b3e3a..403861b681d38f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -475,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index c1bc6731125cbb..0bb4b6da999394 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -24,18 +24,6 @@ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) -/* - * The per task VMA cache array: - */ -#define VMACACHE_BITS 2 -#define VMACACHE_SIZE (1U << VMACACHE_BITS) -#define VMACACHE_MASK (VMACACHE_SIZE - 1) - -struct vmacache { - u64 seqnum; - struct vm_area_struct *vmas[VMACACHE_SIZE]; -}; - /* * When updating this, please also update struct resident_page_types[] in * kernel/fork.c diff --git a/include/linux/sched.h b/include/linux/sched.h index c46f3a63b758f3..dc131048d46aa5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -859,7 +859,6 @@ struct task_struct { struct mm_struct *active_mm; /* Per-thread vma caching: */ - struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 404024486fa539..1ce8fadb2b1c70 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -122,10 +122,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, 
PSWPOUT, NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - VMACACHE_FIND_CALLS, - VMACACHE_FIND_HITS, -#endif #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h deleted file mode 100644 index 6fce268a4588e5..00000000000000 --- a/include/linux/vmacache.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_VMACACHE_H -#define __LINUX_VMACACHE_H - -#include -#include - -static inline void vmacache_flush(struct task_struct *tsk) -{ - memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); -} - -extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); -extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, - unsigned long addr); - -#ifndef CONFIG_MMU -extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end); -#endif - -static inline void vmacache_invalidate(struct mm_struct *mm) -{ - mm->vmacache_seqnum++; -} - -#endif /* __LINUX_VMACACHE_H */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index bfe38869498d7c..19cf5b6892ceba 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,12 +125,6 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif -#ifdef CONFIG_DEBUG_VM_VMACACHE -#define count_vm_vmacache_event(x) count_vm_event(x) -#else -#define count_vm_vmacache_event(x) do {} while (0) -#endif - #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7beceb447211d1..d5e9ccde3ab8e9 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long 
addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm) { - int i; - - for (i = 0; i < VMACACHE_SIZE; i++) { - if (!current->vmacache.vmas[i]) - continue; - flush_cache_range(current->vmacache.vmas[i], - addr, addr + BREAK_INSTR_SIZE); - } - } - /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/fork.c b/kernel/fork.c index 9f2802eff361b5..4a9fce369f306b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -1123,7 +1122,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->mmap = NULL; mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); - mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); @@ -1578,9 +1576,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; - /* initialize the new vmacache entries */ - vmacache_flush(tsk); - if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a8a36e5897552c..fb328c095d5b2d 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -813,14 +813,6 @@ config DEBUG_VM If unsure, say N. -config DEBUG_VM_VMACACHE - bool "Debug VMA caching" - depends on DEBUG_VM - help - Enable this to turn on VMA caching debug information. Doing so - can cause significant overhead, so only enable it in non-production - environments. 
- config DEBUG_VM_MAPLE_TREE bool "Debug VM maple trees" depends on DEBUG_VM diff --git a/mm/Makefile b/mm/Makefile index 9a564f83640359..8083fa85a34879 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o vmacache.o \ + compaction.o \ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) diff --git a/mm/debug.c b/mm/debug.c index bef329bf28f01a..2d625ca0e32694 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -155,7 +155,7 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" + pr_emerg("mm %px mmap %px task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif @@ -183,7 +183,7 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size, + mm, mm->mmap, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif diff --git a/mm/mmap.c b/mm/mmap.c index ecd5c1b813c8d6..a50e545e744582 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -680,9 +679,6 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Remove from mm linked list - also updates highest_vm_end */ __vma_unlink_list(mm, next); - /* Kill the cache */ - vmacache_invalidate(mm); - if (file) __remove_shared_vm_struct(next, file, mapping); @@ -923,8 +919,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, __vma_unlink_list(mm, next); if (remove_next == 2) __vma_unlink_list(mm, next_next); - /* Kill the cache */ - vmacache_invalidate(mm); if (file) { __remove_shared_vm_struct(next, file, mapping); @@ -2228,19 +2222,10 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long 
start_addr, unsigned long end_addr) { - struct vm_area_struct *vma; unsigned long index = start_addr; mmap_assert_locked(mm); - /* Check the cache first. */ - vma = vmacache_find(mm, start_addr); - if (likely(vma)) - return vma; - - vma = mt_find(&mm->mm_mt, &index, end_addr - 1); - if (vma) - vmacache_update(start_addr, vma); - return vma; + return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); @@ -2254,19 +2239,10 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma; unsigned long index = addr; mmap_assert_locked(mm); - /* Check the cache first. */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - vma = mt_find(&mm->mm_mt, &index, ULONG_MAX); - if (vma) - vmacache_update(addr, vma); - return vma; + return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); @@ -2655,9 +2631,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; - /* Kill the cache */ - vmacache_invalidate(mm); - /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or * VM_GROWSUP VMA. 
Such VMAs can change their size under diff --git a/mm/nommu.c b/mm/nommu.c index 2702790d05d3c1..265a444a2cc274 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -598,23 +597,12 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { - int i; - struct address_space *mapping; - struct mm_struct *mm = vma->vm_mm; - struct task_struct *curr = current; MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); - mm->map_count--; - for (i = 0; i < VMACACHE_SIZE; i++) { - /* if the vma is cached, invalidate the entire cache */ - if (curr->vmacache.vmas[i] == vma) { - vmacache_invalidate(mm); - break; - } - } - + vma->vm_mm->map_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { + struct address_space *mapping; mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); @@ -626,7 +614,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) /* remove from the MM's tree and list */ vma_mas_remove(vma, &mas); - __vma_unlink_list(mm, vma); + __vma_unlink_list(vma->vm_mm, vma); } /* @@ -659,20 +647,9 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma; MA_STATE(mas, &mm->mm_mt, addr, addr); - /* check the cache first */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - vma = mas_walk(&mas); - - if (vma) - vmacache_update(addr, vma); - - return vma; + return mas_walk(&mas); } EXPORT_SYMBOL(find_vma); @@ -706,11 +683,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; MA_STATE(mas, &mm->mm_mt, addr, addr); - /* check the cache first */ - vma = vmacache_find_exact(mm, addr, end); - if (vma) - return vma; - vma = mas_walk(&mas); if (!vma) return NULL; @@ -719,7 +691,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if 
(vma->vm_end != end) return NULL; - vmacache_update(addr, vma); return vma; } diff --git a/mm/vmacache.c b/mm/vmacache.c deleted file mode 100644 index 01a6e6688ec1fb..00000000000000 --- a/mm/vmacache.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2014 Davidlohr Bueso. - */ -#include -#include -#include -#include - -/* - * Hash based on the pmd of addr if configured with MMU, which provides a good - * hit rate for workloads with spatial locality. Otherwise, use pages. - */ -#ifdef CONFIG_MMU -#define VMACACHE_SHIFT PMD_SHIFT -#else -#define VMACACHE_SHIFT PAGE_SHIFT -#endif -#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) - -/* - * This task may be accessing a foreign mm via (for example) - * get_user_pages()->find_vma(). The vmacache is task-local and this - * task's vmacache pertains to a different mm (ie, its own). There is - * nothing we can do here. - * - * Also handle the case where a kernel thread has adopted this mm via - * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm. - */ -static inline bool vmacache_valid_mm(struct mm_struct *mm) -{ - return current->mm == mm && !(current->flags & PF_KTHREAD); -} - -void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) -{ - if (vmacache_valid_mm(newvma->vm_mm)) - current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma; -} - -static bool vmacache_valid(struct mm_struct *mm) -{ - struct task_struct *curr; - - if (!vmacache_valid_mm(mm)) - return false; - - curr = current; - if (mm->vmacache_seqnum != curr->vmacache.seqnum) { - /* - * First attempt will always be invalid, initialize - * the new cache for this task here. 
- */ - curr->vmacache.seqnum = mm->vmacache_seqnum; - vmacache_flush(curr); - return false; - } - return true; -} - -struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) -{ - int idx = VMACACHE_HASH(addr); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma) { -#ifdef CONFIG_DEBUG_VM_VMACACHE - if (WARN_ON_ONCE(vma->vm_mm != mm)) - break; -#endif - if (vma->vm_start <= addr && vma->vm_end > addr) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} - -#ifndef CONFIG_MMU -struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ - int idx = VMACACHE_HASH(start); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma && vma->vm_start == start && vma->vm_end == end) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 373d2730fcf215..da7e389cf33c90 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1382,10 +1382,6 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - "vmacache_find_calls", - "vmacache_find_hits", -#endif #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", From 56c4be7a34a78abdd42a6cad144f00c56035831d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:52 +0000 Subject: [PATCH 0775/1250] mm: convert vma_lookup() to use mtree_load() Unlike the rbtree, the Maple Tree will return a NULL if there's nothing at a particular address. 
Since the previous commit dropped the vmacache, it is now possible to consult the tree directly. Link: https://lkml.kernel.org/r/20220504011345.662299-11-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-27-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-27-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9dda5284fdfdb5..80fd428f50345b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2756,12 +2756,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = find_vma(mm, addr); - - if (vma && addr < vma->vm_start) - vma = NULL; - - return vma; + return mtree_load(&mm->mm_mt, addr); } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) From 487c1795c854c8bf3f57a38f28534433f0fcb911 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:52 +0000 Subject: [PATCH 0776/1250] mm/mmap: move mmap_region() below do_munmap() Relocation of code for the next commit. There should be no changes here. Link: https://lkml.kernel.org/r/20220504011345.662299-12-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-28-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-28-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 492 +++++++++++++++++++++++++++--------------------------- 1 file changed, 246 insertions(+), 246 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index a50e545e744582..4ccc7071873dd5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1717,252 +1717,6 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } -unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = NULL; - struct vm_area_struct *next, *prev, *merge; - pgoff_t pglen = len >> PAGE_SHIFT; - unsigned long charged = 0; - unsigned long end = addr + len; - unsigned long merge_start = addr, merge_end = end; - pgoff_t vm_pgoff; - int error; - MA_STATE(mas, &mm->mm_mt, addr, end - 1); - - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { - unsigned long nr_pages; - - /* - * MAP_FIXED may remove pages of mappings that intersects with - * requested mapping. Account for the pages it would unmap. 
- */ - nr_pages = count_vma_pages_range(mm, addr, end); - - if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) - return -ENOMEM; - } - - /* Unmap any existing mapping in the area */ - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - - /* - * Private writable mapping: check memory availability - */ - if (accountable_mapping(file, vm_flags)) { - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } - - next = mas_next(&mas, ULONG_MAX); - prev = mas_prev(&mas, 0); - if (vm_flags & VM_SPECIAL) - goto cannot_expand; - - /* Attempt to expand an old mapping */ - /* Check next */ - if (next && next->vm_start == end && !vma_policy(next) && - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, - NULL_VM_UFFD_CTX, NULL)) { - merge_end = next->vm_end; - vma = next; - vm_pgoff = next->vm_pgoff - pglen; - } - - /* Check prev */ - if (prev && prev->vm_end == addr && !vma_policy(prev) && - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, - pgoff, vma->vm_userfaultfd_ctx, NULL) : - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, - NULL_VM_UFFD_CTX, NULL))) { - merge_start = prev->vm_start; - vma = prev; - vm_pgoff = prev->vm_pgoff; - } - - - /* Actually expand, if possible */ - if (vma && - !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { - khugepaged_enter_vma(vma, vm_flags); - goto expanded; - } - - mas.index = addr; - mas.last = end - 1; -cannot_expand: - /* - * Determine the object being mapped and call the appropriate - * specific mapper. the address has already been validated, but - * not unmapped, but the maps are removed from the list. 
- */ - vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; - goto unacct_error; - } - - vma->vm_start = addr; - vma->vm_end = end; - vma->vm_flags = vm_flags; - vma->vm_page_prot = vm_get_page_prot(vm_flags); - vma->vm_pgoff = pgoff; - - if (file) { - if (vm_flags & VM_SHARED) { - error = mapping_map_writable(file->f_mapping); - if (error) - goto free_vma; - } - - vma->vm_file = get_file(file); - error = call_mmap(file, vma); - if (error) - goto unmap_and_free_vma; - - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - */ - WARN_ON_ONCE(addr != vma->vm_start); - - addr = vma->vm_start; - mas_reset(&mas); - - /* - * If vm_flags changed after call_mmap(), we should try merge - * vma again as we may succeed this time. - */ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (merge) { - /* - * ->mmap() can change vma->vm_file and fput - * the original file. So fput the vma->vm_file - * here or we would add an extra fput for file - * and cause general protection fault - * ultimately. - */ - fput(vma->vm_file); - vm_area_free(vma); - vma = merge; - /* Update vm_flags to pick up the change. 
*/ - addr = vma->vm_start; - vm_flags = vma->vm_flags; - goto unmap_writable; - } - } - - vm_flags = vma->vm_flags; - } else if (vm_flags & VM_SHARED) { - error = shmem_zero_setup(vma); - if (error) - goto free_vma; - } else { - vma_set_anonymous(vma); - } - - /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; - if (file) - goto unmap_and_free_vma; - else - goto free_vma; - } - - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { - error = -ENOMEM; - if (file) - goto unmap_and_free_vma; - else - goto free_vma; - } - - if (vma->vm_file) - i_mmap_lock_write(vma->vm_file->f_mapping); - - vma_mas_store(vma, &mas); - __vma_link_list(mm, vma, prev); - mm->map_count++; - if (vma->vm_file) { - if (vma->vm_flags & VM_SHARED) - mapping_allow_writable(vma->vm_file->f_mapping); - - flush_dcache_mmap_lock(vma->vm_file->f_mapping); - vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); - flush_dcache_mmap_unlock(vma->vm_file->f_mapping); - i_mmap_unlock_write(vma->vm_file->f_mapping); - } - - /* - * vma_merge() calls khugepaged_enter_vma() either, the below - * call covers the non-merge case. - */ - khugepaged_enter_vma(vma, vma->vm_flags); - - /* Once vma denies write, undo our temporary denial count */ -unmap_writable: - if (file && vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); - file = vma->vm_file; -expanded: - perf_event_mmap(vma); - - vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); - if (vm_flags & VM_LOCKED) { - if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; - else - mm->locked_vm += (len >> PAGE_SHIFT); - } - - if (file) - uprobe_mmap(vma); - - /* - * New (or expanded) vma always get soft dirty status. 
- * Otherwise user-space soft-dirty page tracker won't - * be able to distinguish situation when vma area unmapped, - * then new mapped in-place (which must be aimed as - * a completely new data area). - */ - vma->vm_flags |= VM_SOFTDIRTY; - - vma_set_page_prot(vma); - - validate_mm(mm); - return addr; - -unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; - - /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - charged = 0; - if (vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); -free_vma: - vm_area_free(vma); -unacct_error: - if (charged) - vm_unacct_memory(charged); - validate_mm(mm); - return error; -} - /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used @@ -2835,6 +2589,252 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return __do_munmap(mm, start, len, uf, false); } +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct vm_area_struct *next, *prev, *merge; + pgoff_t pglen = len >> PAGE_SHIFT; + unsigned long charged = 0; + unsigned long end = addr + len; + unsigned long merge_start = addr, merge_end = end; + pgoff_t vm_pgoff; + int error; + MA_STATE(mas, &mm->mm_mt, addr, end - 1); + + /* Check against address space limit. */ + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { + unsigned long nr_pages; + + /* + * MAP_FIXED may remove pages of mappings that intersects with + * requested mapping. Account for the pages it would unmap. 
+ */ + nr_pages = count_vma_pages_range(mm, addr, end); + + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; + } + + /* Unmap any existing mapping in the area */ + if (do_munmap(mm, addr, len, uf)) + return -ENOMEM; + + /* + * Private writable mapping: check memory availability + */ + if (accountable_mapping(file, vm_flags)) { + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; + } + + next = mas_next(&mas, ULONG_MAX); + prev = mas_prev(&mas, 0); + if (vm_flags & VM_SPECIAL) + goto cannot_expand; + + /* Attempt to expand an old mapping */ + /* Check next */ + if (next && next->vm_start == end && !vma_policy(next) && + can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, + NULL_VM_UFFD_CTX, NULL)) { + merge_end = next->vm_end; + vma = next; + vm_pgoff = next->vm_pgoff - pglen; + } + + /* Check prev */ + if (prev && prev->vm_end == addr && !vma_policy(prev) && + (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, + pgoff, vma->vm_userfaultfd_ctx, NULL) : + can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, + NULL_VM_UFFD_CTX, NULL))) { + merge_start = prev->vm_start; + vma = prev; + vm_pgoff = prev->vm_pgoff; + } + + + /* Actually expand, if possible */ + if (vma && + !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + khugepaged_enter_vma(vma, vm_flags); + goto expanded; + } + + mas.index = addr; + mas.last = end - 1; +cannot_expand: + /* + * Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. 
+ */ + vma = vm_area_alloc(mm); + if (!vma) { + error = -ENOMEM; + goto unacct_error; + } + + vma->vm_start = addr; + vma->vm_end = end; + vma->vm_flags = vm_flags; + vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->vm_pgoff = pgoff; + + if (file) { + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto free_vma; + } + + vma->vm_file = get_file(file); + error = call_mmap(file, vma); + if (error) + goto unmap_and_free_vma; + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + WARN_ON_ONCE(addr != vma->vm_start); + + addr = vma->vm_start; + mas_reset(&mas); + + /* + * If vm_flags changed after call_mmap(), we should try merge + * vma again as we may succeed this time. + */ + if (unlikely(vm_flags != vma->vm_flags && prev)) { + merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + if (merge) { + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. + */ + fput(vma->vm_file); + vm_area_free(vma); + vma = merge; + /* Update vm_flags to pick up the change. 
*/ + addr = vma->vm_start; + vm_flags = vma->vm_flags; + goto unmap_writable; + } + } + + vm_flags = vma->vm_flags; + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } else { + vma_set_anonymous(vma); + } + + /* Allow architectures to sanity-check the vm_flags */ + if (!arch_validate_flags(vma->vm_flags)) { + error = -EINVAL; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + error = -ENOMEM; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + + vma_mas_store(vma, &mas); + __vma_link_list(mm, vma, prev); + mm->map_count++; + if (vma->vm_file) { + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(vma->vm_file->f_mapping); + + flush_dcache_mmap_lock(vma->vm_file->f_mapping); + vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); + flush_dcache_mmap_unlock(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + + /* + * vma_merge() calls khugepaged_enter_vma() either, the below + * call covers the non-merge case. + */ + khugepaged_enter_vma(vma, vma->vm_flags); + + /* Once vma denies write, undo our temporary denial count */ +unmap_writable: + if (file && vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + file = vma->vm_file; +expanded: + perf_event_mmap(vma); + + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); + if (vm_flags & VM_LOCKED) { + if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm)) + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + else + mm->locked_vm += (len >> PAGE_SHIFT); + } + + if (file) + uprobe_mmap(vma); + + /* + * New (or expanded) vma always get soft dirty status. 
+ * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). + */ + vma->vm_flags |= VM_SOFTDIRTY; + + vma_set_page_prot(vma); + + validate_mm(mm); + return addr; + +unmap_and_free_vma: + fput(vma->vm_file); + vma->vm_file = NULL; + + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + charged = 0; + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +free_vma: + vm_area_free(vma); +unacct_error: + if (charged) + vm_unacct_memory(charged); + validate_mm(mm); + return error; +} + static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; From a0d6cab9d7f07999e5b97de16b01c2f0a4d6ad9a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:53 +0000 Subject: [PATCH 0777/1250] mm/mmap: reorganize munmap to use maple states Remove __do_munmap() in favour of do_munmap(), do_mas_munmap(), and do_mas_align_munmap(). do_munmap() is a wrapper to create a maple state for any callers that have not been converted to the maple tree. do_mas_munmap() takes a maple state to munmap a range. This is just a small function which checks for error conditions and aligns the end of the range. do_mas_align_munmap() uses the aligned range to munmap a range. do_mas_align_munmap() starts with the first VMA in the range, then finds the last VMA in the range. Both start and end are split if necessary. Then the VMAs are removed from the linked list and the mm mlock count is updated at the same time. This is followed by a single tree operation that overwrites the area with NULL. Finally, the detached list is unmapped and freed. By reorganizing the munmap calls as outlined, it is now possible to avoid extra work of aligning pre-aligned callers which are known to be safe, avoid extra VMA lookups or tree walks for modifications.
detach_vmas_to_be_unmapped() is no longer used, so drop this code. vm_brk_flags() can just call the do_mas_munmap() as it checks for intersecting VMAs directly. Link: https://lkml.kernel.org/r/20220504011345.662299-13-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-29-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-29-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +- mm/mmap.c | 228 ++++++++++++++++++++++++++++----------------- mm/mremap.c | 17 ++-- 3 files changed, 158 insertions(+), 92 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 80fd428f50345b..474c1f8ad1afc8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2671,8 +2671,9 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int __do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf, bool downgrade); +extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); diff --git a/mm/mmap.c b/mm/mmap.c index 4ccc7071873dd5..ccf35141bb760f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2356,47 +2356,6 @@ static void unmap_region(struct mm_struct *mm, tlb_finish_mmu(&tlb); } -/* - * Create a list of vma's touched by the unmap, 
removing them from the mm's - * vma list as we go.. - */ -static bool -detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long end) -{ - struct vm_area_struct **insertion_point; - struct vm_area_struct *tail_vma = NULL; - - insertion_point = (prev ? &prev->vm_next : &mm->mmap); - vma->vm_prev = NULL; - vma_mas_szero(mas, vma->vm_start, end); - do { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= vma_pages(vma); - mm->map_count--; - tail_vma = vma; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); - *insertion_point = vma; - if (vma) - vma->vm_prev = prev; - else - mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; - tail_vma->vm_next = NULL; - - /* - * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or - * VM_GROWSUP VMA. Such VMAs can change their size under - * down_read(mmap_lock) and collide with the VMA we are about to unmap. - */ - if (vma && (vma->vm_flags & VM_GROWSDOWN)) - return false; - if (prev && (prev->vm_flags & VM_GROWSUP)) - return false; - return true; -} - /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. @@ -2479,40 +2438,51 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -/* Munmap is split into 2 main parts -- this part which finds - * what needs doing, and the areas themselves, which do the - * work. This now handles partial unmappings. 
- * Jeremy Fitzhardinge - */ -int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf, bool downgrade) +static inline int +unlock_range(struct vm_area_struct *start, struct vm_area_struct **tail, + unsigned long limit) { - unsigned long end; - struct vm_area_struct *vma, *prev, *last; - int error = -ENOMEM; - MA_STATE(mas, &mm->mm_mt, 0, 0); + struct mm_struct *mm = start->vm_mm; + struct vm_area_struct *tmp = start; + int count = 0; - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; + while (tmp && tmp->vm_start < limit) { + *tail = tmp; + count++; + if (tmp->vm_flags & VM_LOCKED) + mm->locked_vm -= vma_pages(tmp); - len = PAGE_ALIGN(len); - end = start + len; - if (len == 0) - return -EINVAL; + tmp = tmp->vm_next; + } - /* arch_unmap() might do unmaps itself. */ - arch_unmap(mm, start, end); + return count; +} - /* Find the first overlapping VMA where start < vma->vm_end */ - vma = find_vma_intersection(mm, start, end); - if (!vma) - return 0; +/* + * do_mas_align_munmap() - munmap the aligned region from @start to @end. + * @mas: The maple_state, ideally set up to alter the correct tree location. + * @vma: The starting vm_area_struct + * @mm: The mm_struct + * @start: The aligned start address to munmap. + * @end: The aligned end address to munmap. + * @uf: The userfaultfd list_head + * @downgrade: Set to true to attempt a write downgrade of the mmap_sem + * + * If @downgrade is true, check return code for potential release of the lock. 
+ */ +static int +do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool downgrade) +{ + struct vm_area_struct *prev, *last; + int error = -ENOMEM; + /* we have start < vma->vm_end */ - if (mas_preallocate(&mas, vma, GFP_KERNEL)) + if (mas_preallocate(mas, vma, GFP_KERNEL)) return -ENOMEM; - prev = vma->vm_prev; - /* we have start < vma->vm_end */ + mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. * @@ -2533,17 +2503,31 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, error = __split_vma(mm, vma, start, 0); if (error) goto split_failed; + prev = vma; + vma = __vma_next(mm, prev); + mas->index = start; + mas_reset(mas); + } else { + prev = vma->vm_prev; } + if (vma->vm_end >= end) + last = vma; + else + last = find_vma_intersection(mm, end - 1, end); + /* Does it split the last one? */ - last = find_vma(mm, end); - if (last && end > last->vm_start) { + if (last && end < last->vm_end) { error = __split_vma(mm, last, end, 1); + if (error) goto split_failed; + + if (vma == last) + vma = __vma_next(mm, prev); + mas_reset(mas); } - vma = __vma_next(mm, prev); if (unlikely(uf)) { /* @@ -2556,16 +2540,46 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * failure that it's not worth optimizing it for. */ error = userfaultfd_unmap_prep(vma, start, end, uf); + if (error) goto userfaultfd_error; } - /* Detach vmas from rbtree */ - if (!detach_vmas_to_be_unmapped(mm, &mas, vma, prev, end)) - downgrade = false; + /* + * unlock any mlock()ed ranges before detaching vmas, count the number + * of VMAs to be dropped, and return the tail entry of the affected + * area. 
+ */ + mm->map_count -= unlock_range(vma, &last, end); + /* Drop removed area from the tree */ + mas_store_prealloc(mas, NULL); - if (downgrade) - mmap_write_downgrade(mm); + /* Detach vmas from the MM linked list */ + vma->vm_prev = NULL; + if (prev) + prev->vm_next = last->vm_next; + else + mm->mmap = last->vm_next; + + if (last->vm_next) { + last->vm_next->vm_prev = prev; + last->vm_next = NULL; + } else + mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; + + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or + * VM_GROWSUP VMA. Such VMAs can change their size under + * down_read(mmap_lock) and collide with the VMA we are about to unmap. + */ + if (downgrade) { + if (last && (last->vm_flags & VM_GROWSDOWN)) + downgrade = false; + else if (prev && (prev->vm_flags & VM_GROWSUP)) + downgrade = false; + else + mmap_write_downgrade(mm); + } unmap_region(mm, vma, prev, start, end); @@ -2579,14 +2593,63 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, map_count_exceeded: split_failed: userfaultfd_error: - mas_destroy(&mas); + mas_destroy(mas); return error; } +/* + * do_mas_munmap() - munmap a given range. + * @mas: The maple state + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length of the range to munmap + * @uf: The userfaultfd list_head + * @downgrade: set to true if the user wants to attempt to write_downgrade the + * mmap_sem + * + * This function takes a @mas that is either pointing to the previous VMA or set + * to MA_START and sets it up to remove the mapping(s). The @len will be + * aligned and any arch_unmap work will be preformed. + * + * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. 
+ */ +int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade) +{ + unsigned long end; + struct vm_area_struct *vma; + + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + end = start + PAGE_ALIGN(len); + if (end == start) + return -EINVAL; + + /* arch_unmap() might do unmaps itself. */ + arch_unmap(mm, start, end); + + /* Find the first overlapping VMA */ + vma = mas_find(mas, end - 1); + if (!vma) + return 0; + + return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade); +} + +/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length to be munmapped. + * @uf: The userfaultfd list_head + */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - return __do_munmap(mm, start, len, uf, false); + MA_STATE(mas, &mm->mm_mt, start, start); + + return do_mas_munmap(&mas, mm, start, len, uf, false); } unsigned long mmap_region(struct file *file, unsigned long addr, @@ -2620,7 +2683,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_munmap(mm, addr, len, uf)) + if (do_mas_munmap(&mas, mm, addr, len, uf, false)) return -ENOMEM; /* @@ -2840,11 +2903,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, start, start); if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __do_munmap(mm, start, len, &uf, downgrade); + ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade); /* * Returning 1 indicates mmap_lock is downgraded. 
* But 1 is not legal return value of vm_munmap() and munmap(), reset @@ -2973,7 +3037,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true); + ret = do_mas_munmap(mas, mm, newbrk, oldbrk-newbrk, uf, true); validate_mm_mt(mm); return ret; } @@ -3113,9 +3177,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - if (find_vma_intersection(mm, addr, addr + len)) - ret = do_munmap(mm, addr, len, &uf); - + ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; diff --git a/mm/mremap.c b/mm/mremap.c index b522cd0259a0f1..e0fba90042466d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -975,20 +975,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * __do_munmap does all the needed commit accounting, and + * do_mas_munmap does all the needed commit accounting, and * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; + MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); - retval = __do_munmap(mm, addr+new_len, old_len - new_len, - &uf_unmap, true); - if (retval < 0 && old_len != new_len) { - ret = retval; - goto out; + retval = do_mas_munmap(&mas, mm, addr + new_len, + old_len - new_len, &uf_unmap, true); /* Returning 1 indicates mmap_lock is downgraded to read. */ - } else if (retval == 1) + if (retval == 1) { downgraded = true; + } else if (retval < 0 && old_len != new_len) { + ret = retval; + goto out; + } + ret = addr; goto out; } From 488d4aac11c0091f5064c5899b4de90b126941f0 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 20 Jul 2022 02:17:53 +0000 Subject: [PATCH 0778/1250] mm/mmap: change do_brk_munmap() to use do_mas_align_munmap() do_brk_munmap() has already aligned the address and has a maple tree state to be used. Use the new do_mas_align_munmap() to avoid unnecessary alignment and error checks. Link: https://lkml.kernel.org/r/20220504011345.662299-14-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220519150509.1290067-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-30-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-30-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index ccf35141bb760f..259929d9230110 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3037,7 +3037,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_munmap(mas, mm, newbrk, oldbrk-newbrk, uf, true); + ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true); validate_mm_mt(mm); return ret; } From 0116af68060a82cdcbe95896bf3eca31a1ddf0db Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:53 +0000 Subject: [PATCH 0779/1250] arm64: remove mmap linked list from vdso Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220504011345.662299-15-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-31-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-31-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/arm64/kernel/vdso.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index a61fc4f989b37b..a8388af62b99e4 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -136,10 +136,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) From 1fd46f735eeb56f9c378345256a057f499cd5fbc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:54 +0000 Subject: [PATCH 0780/1250] arm64: Change elfcore for_each_mte_vma() to use VMA iterator Rework for_each_mte_vma() to use a VMA iterator instead of an explicit linked-list. Link: https://lkml.kernel.org/r/20220504011345.662299-16-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-32-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-32-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20220218023650.672072-1-Liam.Howlett@oracle.com Signed-off-by: Will Deacon Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/arm64/kernel/elfcore.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 98d67444a5b615..27ef7ad3ffd2e2 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -8,9 +8,9 @@ #include #include -#define for_each_mte_vma(tsk, vma) \ +#define for_each_mte_vma(vmi, vma) \ if (system_supports_mte()) \ - for (vma = tsk->mm->mmap; vma; vma = vma->vm_next) \ + for_each_vma(vmi, vma) \ if (vma->vm_flags & VM_MTE) static unsigned long mte_vma_tag_dump_size(struct vm_area_struct *vma) @@ -81,8 +81,9 @@ Elf_Half elf_core_extra_phdrs(void) { struct vm_area_struct *vma; int vma_count = 0; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) + for_each_mte_vma(vmi, vma) vma_count++; return vma_count; @@ -91,8 +92,9 @@ Elf_Half elf_core_extra_phdrs(void) int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) { + for_each_mte_vma(vmi, vma) { struct elf_phdr phdr; phdr.p_type = PT_AARCH64_MEMTAG_MTE; @@ -116,8 +118,9 @@ size_t elf_core_extra_data_size(void) { struct vm_area_struct *vma; size_t data_size = 0; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) + for_each_mte_vma(vmi, vma) data_size += mte_vma_tag_dump_size(vma); return data_size; @@ -126,8 +129,9 @@ size_t elf_core_extra_data_size(void) int elf_core_write_extra_data(struct coredump_params *cprm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, current->mm, 0); - 
for_each_mte_vma(current, vma) { + for_each_mte_vma(vmi, vma) { if (vma->vm_flags & VM_DONTDUMP) continue; From d37745134ad4936f3a40c1fa8bcf98ea998480e8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:54 +0000 Subject: [PATCH 0781/1250] parisc: remove mmap linked list from cache handling Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220504011345.662299-17-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-33-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-33-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/parisc/kernel/cache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index a9bc578e4c52e5..b54c7cf6fcdeaf 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -660,15 +660,20 @@ static inline unsigned long mm_total_size(struct mm_struct *mm) { struct vm_area_struct *vma; unsigned long usize = 0; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma && usize < parisc_cache_flush_threshold; vma = vma->vm_next) + for_each_vma(vmi, vma) { + if (usize >= parisc_cache_flush_threshold) + break; usize += vma->vm_end - vma->vm_start; + } return usize; } void flush_cache_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); /* * Flushing the whole cache on each cpu takes forever on @@ -688,7 +693,7 @@ void flush_cache_mm(struct mm_struct *mm) } /* Flush mm */ - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) flush_cache_pages(vma, vma->vm_start, vma->vm_end); } From 52c7442bc2cf013f7a5f50bad82ee886353e1122 Mon Sep 17 
00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:54 +0000 Subject: [PATCH 0782/1250] powerpc: remove mmap linked list walks Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220504011345.662299-18-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-34-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-34-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/powerpc/kernel/vdso.c | 6 +++--- arch/powerpc/mm/book3s32/tlb.c | 11 ++++++----- arch/powerpc/mm/book3s64/subpage_prot.c | 13 ++----------- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 0da287544054f5..94a8fa5017c358 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -113,18 +113,18 @@ struct vdso_data *arch_get_vdso_data(void *vvar_page) int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; mmap_read_lock(mm); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, &vvar_spec)) zap_page_range(vma, vma->vm_start, size); } - mmap_read_unlock(mm); + return 0; } diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c index 19f0ef950d7732..9ad6b56bfec96e 100644 --- a/arch/powerpc/mm/book3s32/tlb.c +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range); void hash__flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *mp; + VMA_ITERATOR(vmi, mm, 0); /* - * It is 
safe to go down the mm's list of vmas when called - * from dup_mmap, holding mmap_lock. It would also be safe from - * unmap_region or exit_mmap, but not from vmtruncate on SMP - - * but it seems dup_mmap is the only SMP case which gets here. + * It is safe to iterate the vmas when called from dup_mmap, + * holding mmap_lock. It would also be safe from unmap_region + * or exit_mmap, but not from vmtruncate on SMP - but it seems + * dup_mmap is the only SMP case which gets here. */ - for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + for_each_vma(vmi, mp) hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); } EXPORT_SYMBOL(hash__flush_tlb_mm); diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 60c6ea16a972af..d73b3b4176e81d 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -149,24 +149,15 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); /* * We don't try too hard, we just mark all the vma in that range * VM_NOHUGEPAGE and split them. */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; + for_each_vma_range(vmi, vma, addr + len) { vma->vm_flags |= VM_NOHUGEPAGE; walk_page_vma(vma, &subpage_walk_ops, NULL); - vma = vma->vm_next; } } #else From 4adb45976d90c4ec5dac89a4f28f5ba665b24376 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:55 +0000 Subject: [PATCH 0783/1250] s390: remove vma linked list walks Use the VMA iterator instead. 
Link: https://lkml.kernel.org/r/20220504011345.662299-19-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-35-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-35-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/s390/kernel/vdso.c | 3 ++- arch/s390/mm/gmap.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 5075cde77b2920..535099f2736dab 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -69,10 +69,11 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma) int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (!vma_is_special_mapping(vma, &vvar_mapping)) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index b8ae4a4aa2ba46..6e24f337eac0b5 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2515,8 +2515,9 @@ static const struct mm_walk_ops thp_split_walk_ops = { static inline void thp_split_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for_each_vma(vmi, vma) { vma->vm_flags &= ~VM_HUGEPAGE; vma->vm_flags |= VM_NOHUGEPAGE; walk_page_vma(vma, &thp_split_walk_ops, NULL); @@ -2584,8 +2585,9 @@ int gmap_mark_unmergeable(void) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int ret; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; 
vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_UNMERGEABLE, &vma->vm_flags); if (ret) From 047baae2a0390fd7589f5fe28ce60d8766645abb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:55 +0000 Subject: [PATCH 0784/1250] x86: remove vma linked list walks Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220504011345.662299-20-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-36-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-36-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/x86/entry/vdso/vma.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1000d457c3321e..6292b960037b79 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -127,17 +127,17 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, &vvar_mapping)) zap_page_range(vma, vma->vm_start, size); } - mmap_read_unlock(mm); + return 0; } #else @@ -354,6 +354,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); /* @@ -363,7 +364,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) * We could search vma near 
context.vdso, but it's a slowpath, * so let's explicitly check all VMAs to be completely sure. */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_mapping) || vma_is_special_mapping(vma, &vvar_mapping)) { mmap_write_unlock(mm); From 972c9b52d6b3f34ff731b9c2ebefed42c6e00a33 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:55 +0000 Subject: [PATCH 0785/1250] xtensa: remove vma linked list walks Use the VMA iterator instead. Since the VMA can no longer be NULL inside the loop, handle the out-of-memory case after the loop. This means a slightly longer run time in the failure case (-ENOMEM) - it will run to the end of the VMAs before erroring instead of in the middle of the loop. Link: https://lkml.kernel.org/r/20220504011345.662299-21-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-37-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-37-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R.
Howlett Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/xtensa/kernel/syscall.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index 201356faa7e6e0..b3c2450d6f239e 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -58,6 +58,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct vm_area_struct *vmm; + struct vma_iterator vmi; if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate @@ -79,15 +80,20 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, else addr = PAGE_ALIGN(addr); - for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { - /* At this point: (!vmm || addr < vmm->vm_end). */ - if (TASK_SIZE - len < addr) - return -ENOMEM; - if (!vmm || addr + len <= vm_start_gap(vmm)) - return addr; + vma_iter_init(&vmi, current->mm, addr); + for_each_vma(vmi, vmm) { + /* At this point: (addr < vmm->vm_end). */ + if (addr + len <= vm_start_gap(vmm)) + break; + addr = vmm->vm_end; if (flags & MAP_SHARED) addr = COLOUR_ALIGN(addr, pgoff); } + + if (TASK_SIZE - len < addr) + return -ENOMEM; + + return addr; } #endif From 31023dc33c49e6a0b10bfc5a3b964a7f775d98ad Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:56 +0000 Subject: [PATCH 0786/1250] cxl: remove vma linked list walk Use the VMA iterator instead. This requires a little restructuring of the surrounding code to hoist the mm to the caller. That turns cxl_prefault_one() into a trivial function, so call cxl_fault_segment() directly. 
Link: https://lkml.kernel.org/r/20220504011345.662299-22-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-38-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-38-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- drivers/misc/cxl/fault.c | 45 ++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 60c829113299bd..2c64f55cf01f8d 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -280,22 +280,6 @@ void cxl_handle_fault(struct work_struct *fault_work) mmput(mm); } -static void cxl_prefault_one(struct cxl_context *ctx, u64 ea) -{ - struct mm_struct *mm; - - mm = get_mem_context(ctx); - if (mm == NULL) { - pr_devel("cxl_prefault_one unable to get mm %i\n", - pid_nr(ctx->pid)); - return; - } - - cxl_fault_segment(ctx, mm, ea); - - mmput(mm); -} - static u64 next_segment(u64 ea, u64 vsid) { if (vsid & SLB_VSID_B_1T) @@ -306,23 +290,16 @@ static u64 next_segment(u64 ea, u64 vsid) return ea + 1; } -static void cxl_prefault_vma(struct cxl_context *ctx) +static void cxl_prefault_vma(struct cxl_context *ctx, struct mm_struct *mm) { u64 ea, last_esid = 0; struct copro_slb slb; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int rc; - struct mm_struct *mm; - - mm = get_mem_context(ctx); - if (mm == NULL) { - pr_devel("cxl_prefault_vm unable to get mm %i\n", - pid_nr(ctx->pid)); - return; - } mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { for (ea = vma->vm_start; ea < vma->vm_end; ea = next_segment(ea, slb.vsid)) { rc = copro_calculate_slb(mm, ea, &slb); @@ 
-337,20 +314,28 @@ static void cxl_prefault_vma(struct cxl_context *ctx) } } mmap_read_unlock(mm); - - mmput(mm); } void cxl_prefault(struct cxl_context *ctx, u64 wed) { + struct mm_struct *mm = get_mem_context(ctx); + + if (mm == NULL) { + pr_devel("cxl_prefault unable to get mm %i\n", + pid_nr(ctx->pid)); + return; + } + switch (ctx->afu->prefault_mode) { case CXL_PREFAULT_WED: - cxl_prefault_one(ctx, wed); + cxl_fault_segment(ctx, mm, wed); break; case CXL_PREFAULT_ALL: - cxl_prefault_vma(ctx); + cxl_prefault_vma(ctx, mm); break; default: break; } + + mmput(mm); } From ab481355a34bee787df7078f26200e1c62afec75 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:56 +0000 Subject: [PATCH 0787/1250] optee: remove vma linked list walk Use the VMA iterator instead. Change the calling convention of __check_mem_type() to pass in the mm instead of the first vma in the range. Link: https://lkml.kernel.org/r/20220504011345.662299-23-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-39-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-39-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- drivers/tee/optee/call.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c index 28f87cd8b3ede0..290b1bb0e9cd72 100644 --- a/drivers/tee/optee/call.c +++ b/drivers/tee/optee/call.c @@ -492,15 +492,18 @@ static bool is_normal_memory(pgprot_t p) #endif } -static int __check_mem_type(struct vm_area_struct *vma, unsigned long end) +static int __check_mem_type(struct mm_struct *mm, unsigned long start, + unsigned long end) { - while (vma && is_normal_memory(vma->vm_page_prot)) { - if (vma->vm_end >= end) - return 0; - vma = vma->vm_next; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, start); + + for_each_vma_range(vmi, vma, end) { + if (!is_normal_memory(vma->vm_page_prot)) + return -EINVAL; } - return -EINVAL; + return 0; } int optee_check_mem_type(unsigned long start, size_t num_pages) @@ -516,8 +519,7 @@ int optee_check_mem_type(unsigned long start, size_t num_pages) return 0; mmap_read_lock(mm); - rc = __check_mem_type(find_vma(mm, start), - start + num_pages * PAGE_SIZE); + rc = __check_mem_type(mm, start, start + num_pages * PAGE_SIZE); mmap_read_unlock(mm); return rc; From eb83d617e0c9c5a78c9bc1cc647cddd20007b850 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:56 +0000 Subject: [PATCH 0788/1250] um: remove vma linked list walk Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220504011345.662299-24-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-40-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-40-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/um/kernel/tlb.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index bc38f79ca3a382..ad449173a1a1cd 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -584,21 +584,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, void flush_tlb_mm(struct mm_struct *mm) { - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - while (vma != NULL) { + for_each_vma(vmi, vma) fix_range(mm, vma->vm_start, vma->vm_end, 0); - vma = vma->vm_next; - } } void force_flush_all(void) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - while (vma != NULL) { + for_each_vma(vmi, vma) fix_range(mm, vma->vm_start, vma->vm_end, 1); - vma = vma->vm_next; - } } From ee9ddff9c804ca8a51666e9f077f4f5156eeb281 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:56 +0000 Subject: [PATCH 0789/1250] coredump: remove vma linked list walk Use the Maple Tree iterator instead. This is too complicated for the VMA iterator to handle, so let's open-code it for now. If this turns out to be a common pattern, we can migrate it to common code. Link: https://lkml.kernel.org/r/20220504011345.662299-25-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-41-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-41-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- fs/coredump.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index ebc43f960b6456..3a0022c1ca36e6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1072,30 +1072,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, return vma->vm_end - vma->vm_start; } -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} - /* * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. */ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, +static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, + struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) + if (gate_vma && (vma == gate_vma)) return NULL; + + vma = mas_next(mas, ULONG_MAX); + if (vma) + return vma; return gate_vma; } @@ -1119,9 +1109,10 @@ static void free_vma_snapshot(struct coredump_params *cprm) */ static bool dump_vma_snapshot(struct coredump_params *cprm) { - struct vm_area_struct *vma, *gate_vma; + struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - int i; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int i = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1141,8 +1132,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma), i++) { + 
while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; @@ -1150,10 +1140,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); m->pgoff = vma->vm_pgoff; - m->file = vma->vm_file; if (m->file) get_file(m->file); + i++; } mmap_write_unlock(mm); From 1ef40f14fda75ccf04109b053f8be07d963eacab Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:57 +0000 Subject: [PATCH 0790/1250] exec: use VMA iterator instead of linked list Remove a use of the vm_next list by doing the initial lookup with the VMA iterator and then using it to find the next entry. Link: https://lkml.kernel.org/r/20220504011345.662299-26-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-42-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-42-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- fs/exec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index b97afa682ffe9a..9843cecd031a71 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -686,6 +686,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + struct vm_area_struct *next; struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -694,7 +696,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * ensure there are no vmas between where we want to go * and where we are */ - if (vma != find_vma(mm, new_start)) + if (vma != vma_next(&vmi)) return -EFAULT; /* @@ -713,12 +715,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) lru_add_drain(); tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -727,7 +730,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); From 3fe71ff6281b915b56b131456aa1dd628e9c4e9f Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 20 Jul 2022 02:17:57 +0000 Subject: [PATCH 0791/1250] fs/proc/base: use maple tree iterators in place of linked list Link: https://lkml.kernel.org/r/20220504011345.662299-27-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-43-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-43-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- fs/proc/base.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 8dfa36a99c7421..61781616874807 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2322,6 +2322,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + MA_STATE(mas, NULL, 0, 0); genradix_init(&fa); @@ -2349,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) } nr_files = 0; + mas.tree = &mm->mm_mt; /* * We need two passes here: @@ -2360,7 +2362,8 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) * routine might require mmap_lock taken in might_fault(). 
*/ - for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + pos = 2; + mas_for_each(&mas, vma, ULONG_MAX) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) From 199e47c4fdb018038cd88331c9f5751b9cfd177c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:57 +0000 Subject: [PATCH 0792/1250] fs/proc/task_mmu: stop using linked list and highest_vm_end Remove references to the mm_struct linked list and highest_vm_end in preparation for their removal. Link: https://lkml.kernel.org/r/20220504011345.662299-28-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-44-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-44-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- fs/proc/internal.h | 2 +- fs/proc/task_mmu.c | 73 ++++++++++++++++++++++++++-------------------- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 06a80f78433d8b..f03000764ce52a 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -285,7 +285,7 @@ struct proc_maps_private { struct task_struct *task; struct mm_struct *mm; #ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; + struct vma_iterator iter; #endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4c4559e98632d3..34d292cec79a60 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -123,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + }
else { + *ppos = -2UL; + vma = get_gate_vma(priv->mm); + } + + return vma; +} + static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; /* See m_next(). Zero at the start or after lseek. */ if (last_addr == -1UL) @@ -152,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-EINTR); } + vma_iter_init(&priv->iter, mm, last_addr); hold_task_mempolicy(priv); - priv->tail_vma = get_gate_vma(mm); - - vma = find_vma(mm, last_addr); - if (vma) - return vma; + if (last_addr == -2UL) + return get_gate_vma(mm); - return priv->tail_vma; + return proc_get_vma(priv, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - struct proc_maps_private *priv = m->private; - struct vm_area_struct *next, *vma = v; - - if (vma == priv->tail_vma) - next = NULL; - else if (vma->vm_next) - next = vma->vm_next; - else - next = priv->tail_vma; - - *ppos = next ? 
next->vm_start : -1UL; - - return next; + if (*ppos == -2UL) { + *ppos = -1UL; + return NULL; + } + return proc_get_vma(m->private, ppos); } static void m_stop(struct seq_file *m, void *v) @@ -875,16 +879,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss; - struct mm_struct *mm; + struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; - unsigned long last_vma_end = 0; + unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) return -ESRCH; - mm = priv->mm; if (!mm || !mmget_not_zero(mm)) { ret = -ESRCH; goto out_put_task; @@ -897,8 +901,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); + vma = mas_find(&mas, 0); + + if (unlikely(!vma)) + goto empty_set; - for (vma = priv->mm->mmap; vma;) { + vma_start = vma->vm_start; + do { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; @@ -907,6 +916,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. */ if (mmap_lock_is_contended(mm)) { + mas_pause(&mas); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -950,7 +960,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * contains last_vma_end. * Iterate VMA' from last_vma_end. 
*/ - vma = find_vma(mm, last_vma_end - 1); + vma = mas_find(&mas, ULONG_MAX); /* Case 3 above */ if (!vma) break; @@ -964,11 +974,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v) smap_gather_stats(vma, &mss, last_vma_end); } /* Case 2 above */ - vma = vma->vm_next; - } + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); - show_vma_header_prefix(m, priv->mm->mmap->vm_start, - last_vma_end, 0, 0, 0, 0); +empty_set: + show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); @@ -1261,6 +1270,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + MA_STATE(mas, &mm->mm_mt, 0, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1280,7 +1290,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vma->vm_flags &= ~VM_SOFTDIRTY; @@ -1292,8 +1302,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, - &cp); + walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); From da92f2cf123b520f041b5f7f9e8647490bca888c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:57 +0000 Subject: [PATCH 0793/1250] userfaultfd: use maple tree iterator to iterate VMAs Don't use the mm_struct linked list or the vma->vm_next in prep for removal. 
Link: https://lkml.kernel.org/r/20220504011345.662299-29-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220615164150.652376-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-45-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-45-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 62 ++++++++++++++++++++++++----------- include/linux/userfaultfd_k.h | 7 ++-- mm/mmap.c | 2 +- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index de86f5b2859f94..0dd0136b54d4ce 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -615,14 +615,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, if (release_new_ctx) { struct vm_area_struct *vma; struct mm_struct *mm = release_new_ctx->mm; + VMA_ITERATOR(vmi, mm, 0); /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; vma->vm_flags &= ~__VM_UFFD_FLAGS; } + } mmap_write_unlock(mm); userfaultfd_ctx_put(release_new_ctx); @@ -803,11 +805,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *unmaps) +int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *unmaps) { - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + VMA_ITERATOR(vmi, mm, start); + struct vm_area_struct *vma; + + 
for_each_vma_range(vmi, vma, end) { struct userfaultfd_unmap_ctx *unmap_ctx; struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; @@ -857,6 +861,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; + MA_STATE(mas, &mm->mm_mt, 0, 0); WRITE_ONCE(ctx->released, true); @@ -873,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -887,10 +892,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) + if (prev) { + mas_pause(&mas); vma = prev; - else + } else { prev = vma; + } + vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1272,6 +1280,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; + MA_STATE(mas, &mm->mm_mt, 0, 0); user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1314,7 +1323,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1339,7 +1349,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1399,8 +1409,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + 
mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1430,6 +1442,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto next; } @@ -1437,11 +1451,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = split_vma(mm, vma, start, 1); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end > end) { ret = split_vma(mm, vma, end, 0); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } next: /* @@ -1458,8 +1476,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1503,6 +1521,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; + MA_STATE(mas, &mm->mm_mt, 0, 0); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1521,7 +1540,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1546,7 +1566,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, */ found = false; ret = -EINVAL; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1566,8 +1586,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = 
mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1632,8 +1654,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 732b522bacb7e5..eee374c29c8596 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -173,9 +173,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf); +extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); @@ -256,7 +255,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma, return true; } -static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, +static inline int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf) { diff --git a/mm/mmap.c b/mm/mmap.c index 259929d9230110..11aee96c8f13d3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2539,7 +2539,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ - error = userfaultfd_unmap_prep(vma, start, end, uf); + error = userfaultfd_unmap_prep(mm, start, end, uf); if (error) goto userfaultfd_error; From 92cd9118025e99ccc431120badc3eea961231c65 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 20 Jul 2022 02:17:58 +0000 Subject: [PATCH 0794/1250] ipc/shm: use VMA iterator instead of linked list The VMA iterator is faster than the linked list, and it can be walked even when VMAs are being removed from the address space, so there's no need to keep track of 'next'. Link: https://lkml.kernel.org/r/20220504011345.662299-30-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-46-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-46-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- ipc/shm.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index b3048ebd5c315c..7d86f058fb861b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1721,7 +1721,7 @@ long ksys_shmdt(char __user *shmaddr) #ifdef CONFIG_MMU loff_t size = 0; struct file *file; - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, addr); #endif if (addr & ~PAGE_MASK) @@ -1751,12 +1751,9 @@ long ksys_shmdt(char __user *shmaddr) * match the usual checks anyway. So assume all vma's are * above the starting address given. */ - vma = find_vma(mm, addr); #ifdef CONFIG_MMU - while (vma) { - next = vma->vm_next; - + for_each_vma(vmi, vma) { /* * Check if the starting address would match, i.e.
it's * a fragment created by mprotect() and/or munmap(), or it @@ -1774,6 +1771,7 @@ long ksys_shmdt(char __user *shmaddr) file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + mas_pause(&vmi.mas); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1781,10 +1779,9 @@ long ksys_shmdt(char __user *shmaddr) * searching for matching vma's. */ retval = 0; - vma = next; + vma = vma_next(&vmi); break; } - vma = next; } /* @@ -1794,17 +1791,19 @@ long ksys_shmdt(char __user *shmaddr) */ size = PAGE_ALIGN(size); while (vma && (loff_t)(vma->vm_end - addr) <= size) { - next = vma->vm_next; - /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) + (vma->vm_file == file)) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - vma = next; + mas_pause(&vmi.mas); + } + + vma = vma_next(&vmi); } #else /* CONFIG_MMU */ + vma = vma_lookup(mm, addr); /* under NOMMU conditions, the exact address to be destroyed must be * given */ From b9d92d8ebbb99c2c200fbd314e717a92ac69ce3e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:58 +0000 Subject: [PATCH 0795/1250] acct: use VMA iterator instead of linked list The VMA iterator is faster than the linked list. Link: https://lkml.kernel.org/r/20220504011345.662299-31-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-47-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-47-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- kernel/acct.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 13706356ec54d4..62200d799b9b00 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -555,15 +555,14 @@ void acct_collect(long exitcode, int group_dead) unsigned long vsize = 0; if (group_dead && current->mm) { + struct mm_struct *mm = current->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - mmap_read_lock(current->mm); - vma = current->mm->mmap; - while (vma) { + mmap_read_lock(mm); + for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); } spin_lock_irq(&current->sighand->siglock); From ac5d4e875cb009a12a0c0351dab9c0400e8969bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:58 +0000 Subject: [PATCH 0796/1250] perf: use VMA iterator The VMA iterator is faster than the linked list and removing the linked list will shrink the vm_area_struct. Link: https://lkml.kernel.org/r/20220504011345.662299-32-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-48-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-48-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R.
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- kernel/events/core.c | 3 ++- kernel/events/uprobes.c | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 80782cddb1dabf..61ad10862c2134 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10210,8 +10210,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2eaa327f8158dc..401bc2d24ce062 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -349,9 +349,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe, static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; - for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; @@ -1231,11 +1232,12 @@ int uprobe_apply(struct inode *inode, loff_t offset, static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1983,9 +1985,10 @@ bool uprobe_deny_signal(void) static void mmf_recalc_uprobes(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* From 
13a5c7e94d0f97782f9febf91ccb5c60466d91d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:59 +0000 Subject: [PATCH 0797/1250] sched: use maple tree iterator to walk VMAs The linked list is slower than walking the VMAs using the maple tree. We can't use the VMA iterator here because it doesn't support moving to an earlier position. Link: https://lkml.kernel.org/r/20220504011345.662299-33-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-49-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-49-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 77b2048a932622..e8202b5cd3d5b8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2734,6 +2734,7 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; + MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2790,13 +2791,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = mas_find(&mas, ULONG_MAX)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; 
From efc11b656ea83e9a6eebaadc4cb4325d8f8f4a5b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:17:59 +0000 Subject: [PATCH 0798/1250] fork: use VMA iterator The VMA iterator is faster than the linked list and removing the linked list will shrink the vm_area_struct. Link: https://lkml.kernel.org/r/20220504011345.662299-34-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-50-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-50-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- kernel/fork.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 4a9fce369f306b..4b7b0b7dd44685 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1294,13 +1294,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) /* Forbid mm->exe_file change if old file still mapped. */ old_exe_file = get_mm_exe_file(mm); if (old_exe_file) { + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, - &old_exe_file->f_path)) + &old_exe_file->f_path)) { ret = -EBUSY; + break; + } } mmap_read_unlock(mm); fput(old_exe_file); From 24a0c12b1c8888bf4a17a024c969e7b508a79e46 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 20 Jul 2022 02:17:59 +0000 Subject: [PATCH 0799/1250] bpf: remove VMA linked list Use vma_next() and remove reference to the start of the linked list Link: https://lkml.kernel.org/r/20220504011345.662299-35-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-51-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-51-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- kernel/bpf/task_iter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def492..1c8debd42dc9fc 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -299,8 +299,8 @@ struct bpf_iter_seq_task_vma_info { }; enum bpf_task_vma_iter_find_op { - task_vma_iter_first_vma, /* use mm->mmap */ - task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_first_vma, /* use find_vma() with addr 0 */ + task_vma_iter_next_vma, /* use vma_next() with curr_vma */ task_vma_iter_find_vma, /* use find_vma() to find next vma */ }; @@ -400,10 +400,10 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) switch (op) { case task_vma_iter_first_vma: - curr_vma = curr_task->mm->mmap; + curr_vma = find_vma(curr_task->mm, 0); break; case task_vma_iter_next_vma: - curr_vma = curr_vma->vm_next; + curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); break; case task_vma_iter_find_vma: /* We dropped mmap_lock so it is necessary to use find_vma @@ -417,7 +417,7 @@ task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) if (curr_vma && curr_vma->vm_start == info->prev_vm_start && curr_vma->vm_end == info->prev_vm_end) - curr_vma = curr_vma->vm_next; + curr_vma = 
find_vma(curr_task->mm, curr_vma->vm_end); break; } if (!curr_vma) { From 5553d5047e7e2752abf6f77d27d62a9f8e898276 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:17:59 +0000 Subject: [PATCH 0800/1250] mm/gup: use maple tree navigation instead of linked list Use find_vma_intersection() to locate the VMAs in __mm_populate() instead of using find_vma() and the linked list. Link: https://lkml.kernel.org/r/20220504011345.662299-36-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-52-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-52-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/gup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 79cb9fb1d889f7..32d0ccfb17eac7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1643,10 +1643,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) if (!locked) { locked = 1; mmap_read_lock(mm); - vma = find_vma(mm, nstart); + vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) + vma = find_vma_intersection(mm, vma->vm_end, end); + + if (!vma) break; /* * Set [nstart; nend) to intersection of desired address From b0230354167102ffad74157bfa49b826348a95bd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:18:00 +0000 Subject: [PATCH 0801/1250] mm/khugepaged: stop using vma linked list Use vma iterator & find_vma() instead of vma linked list. 
Link: https://lkml.kernel.org/r/20220504011345.662299-37-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-53-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-53-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 814020689d3e88..9f70582b8f5123 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2311,11 +2311,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, + * If we're also updating the next vma vm_start, * check if we need to split it. 
*/ if (adjust_next > 0) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c4f03a2ad60244..cfe231c5958f75 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2056,10 +2056,12 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { + struct vma_iterator vmi; struct mm_slot *mm_slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; + unsigned long address; VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); @@ -2083,11 +2085,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - if (likely(!khugepaged_test_exit(mm))) - vma = find_vma(mm, khugepaged_scan.address); progress++; - for (; vma; vma = vma->vm_next) { + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + address = khugepaged_scan.address; + vma_iter_init(&vmi, mm, address); + for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); From d842fcaaba39f1d8a2fa0d008e2da3186499efd1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:18:00 +0000 Subject: [PATCH 0802/1250] mm/ksm: use vma iterators instead of vma linked list Remove the use of the linked list for eventual removal. Link: https://lkml.kernel.org/r/20220504011345.662299-38-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-54-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-54-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/ksm.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 55f1d96348690e..075123602bd07b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -981,11 +981,13 @@ static int unmerge_and_remove_all_rmap_items(void) struct mm_slot, mm_list); spin_unlock(&ksm_mmlist_lock); - for (mm_slot = ksm_scan.mm_slot; - mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { + for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; + mm_slot = ksm_scan.mm_slot) { + VMA_ITERATOR(vmi, mm_slot->mm, 0); + mm = mm_slot->mm; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (ksm_test_exit(mm)) break; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) @@ -2232,6 +2234,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) struct mm_slot *slot; struct vm_area_struct *vma; struct rmap_item *rmap_item; + struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.mm_list)) @@ -2290,13 +2293,13 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) } mm = slot->mm; + vma_iter_init(&vmi, mm, ksm_scan.address); + mmap_read_lock(mm); if (ksm_test_exit(mm)) - vma = NULL; - else - vma = find_vma(mm, ksm_scan.address); + goto no_vmas; - for (; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) @@ -2334,6 +2337,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) } if (ksm_test_exit(mm)) { +no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &slot->rmap_list; } From 083383e05ffa9a5e8f50c21608e466ed923fcd0e Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 20 Jul 2022 02:18:00 +0000 Subject: [PATCH 0803/1250] mm/madvise: use find_vma() instead of vma linked list madvise_walk_vmas() no longer uses the linked list. Link: https://lkml.kernel.org/r/20220504011345.662299-39-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-55-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-55-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/madvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 5f0f0948a50e43..851fa4e134bc54 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1238,7 +1238,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, if (start >= end) break; if (prev) - vma = prev->vm_next; + vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } From a86d871021e145be2203373d062bbdc71e3522b3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:01 +0000 Subject: [PATCH 0804/1250] mm/memcontrol: stop using mm->highest_vm_end Pass through ULONG_MAX instead. Link: https://lkml.kernel.org/r/20220504011345.662299-40-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-56-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-56-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c5bfb3eacd0822..956b82ec8f7314 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5857,7 +5857,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; mmap_read_lock(mm); - walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); + walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); mmap_read_unlock(mm); precharge = mc.precharge; @@ -6155,9 +6155,7 @@ static void mem_cgroup_move_charge(void) * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, - NULL); - + walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); mmap_read_unlock(mc.mm); atomic_dec(&mc.from->moving_account); } From 03b7e86e36701ff33906dc16fc616c9cb0ab860b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:01 +0000 Subject: [PATCH 0805/1250] mm/mempolicy: use vma iterator & maple state instead of vma linked list Reworked the way mbind_range() finds the first VMA to reuse the maple state and limit the number of tree walks needed. Note, this drops the VM_BUG_ON(!vma) call, which would catch a start address higher than the last VMA. The code was written in a way that allowed no VMA updates to occur and still return success. There should be no functional change to this scenario with the new code. 
Link: https://lkml.kernel.org/r/20220504011345.662299-41-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-57-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-57-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mempolicy.c | 56 ++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 88a5173c6ff077..dc74239d1ac776 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -381,9 +381,10 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) mpol_rebind_policy(vma->vm_policy, new); mmap_write_unlock(mm); } @@ -656,7 +657,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->vma; + struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; @@ -671,9 +672,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, /* hole at head side of range */ return -EFAULT; } + next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && - (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start))) + (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; @@ -787,26 +789,24 @@ static int 
vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { + MA_STATE(mas, &mm->mm_mt, start - 1, start - 1); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - unsigned long vmstart; - unsigned long vmend; - - vma = find_vma(mm, start); - VM_BUG_ON(!vma); - prev = vma->vm_prev; - if (start > vma->vm_start) - prev = vma; + prev = mas_find_rev(&mas, 0); + if (prev && (start < prev->vm_end)) + vma = prev; + else + vma = mas_next(&mas, end - 1); - for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); + for (; vma; vma = mas_next(&mas, end - 1)) { + unsigned long vmstart = max(start, vma->vm_start); + unsigned long vmend = min(end, vma->vm_end); if (mpol_equal(vma_policy(vma), new_pol)) - continue; + goto next; pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); @@ -815,6 +815,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto replace; } @@ -822,19 +824,25 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, err = split_vma(vma->vm_mm, vma, vmstart, 1); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end != vmend) { err = split_vma(vma->vm_mm, vma, vmend, 0); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } - replace: +replace: err = vma_replace_policy(vma, new_pol); if (err) goto out; +next: + prev = vma; } - out: +out: return err; } @@ -1049,6 +1057,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; + struct vm_area_struct *vma; LIST_HEAD(pagelist); int err = 0; struct migration_target_control mtc = { @@ -1064,8 +1073,9 @@ 
static int migrate_to_node(struct mm_struct *mm, int source, int dest, * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ + vma = find_vma(mm, 0); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { @@ -1195,14 +1205,13 @@ static struct page *new_page(struct page *page, unsigned long start) struct folio *dst, *src = page_folio(page); struct vm_area_struct *vma; unsigned long address; + VMA_ITERATOR(vmi, current->mm, start); gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; - vma = find_vma(current->mm, start); - while (vma) { + for_each_vma(vmi, vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) break; - vma = vma->vm_next; } if (folio_test_hugetlb(src)) @@ -1480,6 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long vmend; unsigned long end; int err = -ENOENT; + VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) @@ -1505,9 +1515,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); - vma = find_vma(mm, start); - for (; vma && vma->vm_start < end; vma = vma->vm_next) { - + for_each_vma_range(vmi, vma, end) { vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); new = mpol_dup(vma_policy(vma)); From 1fef8cc60be627c044f013ea99091661fa95b92a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:18:01 +0000 Subject: [PATCH 0806/1250] mm/mlock: use vma iterator and maple state instead of vma linked list Handle overflow checking in count_mm_mlocked_page_nr() differently. 
Link: https://lkml.kernel.org/r/20220504011345.662299-42-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-58-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-58-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mlock.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b14e929084ccaa..43d19a1f28eb37 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -471,6 +471,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; int error; + MA_STATE(mas, &current->mm->mm_mt, start, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -479,13 +480,14 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return -EINVAL; if (end == start) return 0; - vma = find_vma(current->mm, start); - if (!vma || vma->vm_start > start) + vma = mas_walk(&mas); + if (!vma) return -ENOMEM; - prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; + else + prev = mas_prev(&mas, 0); for (nstart = start ; ; ) { vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -505,7 +507,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(prev->vm_mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; @@ -526,24 +528,23 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long count = 0; + unsigned long end; + VMA_ITERATOR(vmi, mm, start); if (mm == NULL) mm = current->mm; - vma = find_vma(mm, start); - if
(vma == NULL) - return 0; - - for (; vma ; vma = vma->vm_next) { - if (start >= vma->vm_end) - continue; - if (start + len <= vma->vm_start) - break; + /* Don't overflow past ULONG_MAX */ + if (unlikely(ULONG_MAX - len < start)) + end = ULONG_MAX; + else + end = start + len; + for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); - if (start + len < vma->vm_end) { - count += start + len - vma->vm_start; + if (end < vma->vm_end) { + count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; @@ -659,6 +660,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { + MA_STATE(mas, &current->mm->mm_mt, 0, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; @@ -679,7 +681,7 @@ static int apply_mlockall_flags(int flags) to_add |= VM_LOCKONFAULT; } - for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { vm_flags_t newflags; newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -687,6 +689,7 @@ static int apply_mlockall_flags(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + mas_pause(&mas); cond_resched(); } out: From e0ba64bfb51b56f78e14f577ef26c9cebc25bb6b Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:01 +0000 Subject: [PATCH 0807/1250] mm/mprotect: use maple tree navigation instead of vma linked list Link: https://lkml.kernel.org/r/20220504011345.662299-43-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-59-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-59-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mprotect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 8250c1315d9c7b..0420c3ed936c9b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -668,6 +668,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); struct mmu_gather tlb; + MA_STATE(mas, &current->mm->mm_mt, start, start); start = untagged_addr(start); @@ -699,7 +700,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - vma = find_vma(current->mm, start); + vma = mas_find(&mas, ULONG_MAX); error = -ENOMEM; if (!vma) goto out; @@ -725,7 +726,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (start > vma->vm_start) prev = vma; else - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); tlb_gather_mmu(&tlb, current->mm); for (nstart = start ; ; ) { @@ -788,7 +789,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(current->mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; From 55333a1a34563b27d050f5d7271605d44b5cb63f Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:02 +0000 Subject: [PATCH 0808/1250] mm/mremap: use vma_find_intersection() instead of vma linked list Link: https://lkml.kernel.org/r/20220504011345.662299-44-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-60-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-60-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index e0fba90042466d..8644ff278f0298 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -716,7 +716,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (excess) { vma->vm_flags |= VM_ACCOUNT; if (split) - vma->vm_next->vm_flags |= VM_ACCOUNT; + find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; } return new_addr; @@ -866,9 +866,10 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) { unsigned long end = vma->vm_end + delta; + if (end < vma->vm_end) /* overflow */ return 0; - if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ + if (find_vma_intersection(vma->vm_mm, vma->vm_end, end)) return 0; if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, 0, MAP_FIXED) & ~PAGE_MASK) From e2f0f10da4feb99c99ab023feab8468f20696447 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:02 +0000 Subject: [PATCH 0809/1250] mm/msync: use vma_find() instead of vma linked list Link: https://lkml.kernel.org/r/20220504011345.662299-45-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-61-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-61-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/msync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/msync.c b/mm/msync.c index 137d1c104f3e94..ac4c9bfea2e7fa 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) error = 0; goto out_unlock; } - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); } } out_unlock: From 494af92f12c06a57f04b17ef41adf77dcab5458c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:02 +0000 Subject: [PATCH 0810/1250] mm/oom_kill: use maple tree iterators instead of vma linked list Link: https://lkml.kernel.org/r/20220504011345.662299-46-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-62-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-62-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/oom_kill.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3c6cf9e3cd66ea..3996301450e8d3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -513,6 +513,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; + VMA_ITERATOR(vmi, mm, 0); /* * Tell all users of get_user/copy_from_user etc... 
that the content @@ -522,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) */ set_bit(MMF_UNSTABLE, &mm->flags); - for (vma = mm->mmap ; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; From f895dad1e71ffdea2ada5207aa195867649dce19 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:18:02 +0000 Subject: [PATCH 0811/1250] mm/pagewalk: use vma_find() instead of vma linked list walk_page_range() no longer uses the one vma linked list reference. Link: https://lkml.kernel.org/r/20220504011345.662299-47-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-63-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-63-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/pagewalk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 9b3db11a4d1db8..53e5c145fcce5a 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -456,7 +456,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } else { /* inside vma */ walk.vma = vma; next = min(end, vma->vm_end); - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); err = walk_page_test(start, next, &walk); if (err > 0) { From 6140da761354a2a3ded9bd683181b7cbc9636fb0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:03 +0000 Subject: [PATCH 0812/1250] mm/swapfile: use vma iterator instead of vma linked list unuse_mm() no longer needs to reference the linked list. 
Link: https://lkml.kernel.org/r/20220504011345.662299-48-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-64-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-64-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/swapfile.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 1fdccd2f1422eb..5c8681a3f1d9dd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1990,14 +1990,16 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->anon_vma) { ret = unuse_vma(vma, type); if (ret) break; } + cond_resched(); } mmap_read_unlock(mm); From fc2abb41a280f99f00cd5b2d46edcd960866a144 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:18:03 +0000 Subject: [PATCH 0813/1250] i915: use the VMA iterator Replace the linked list in probe_range() with the VMA iterator. Link: https://lkml.kernel.org/r/20220504011345.662299-49-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-65-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-65-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 094f06b4ce3359..a509f7da104be0 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -426,12 +426,11 @@ static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = { static int probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { - const unsigned long end = addr + len; + VMA_ITERATOR(vmi, mm, addr); struct vm_area_struct *vma; - int ret = -EFAULT; mmap_read_lock(mm); - for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { + for_each_vma_range(vmi, vma, addr + len) { /* Check for holes, note that we also update the addr below */ if (vma->vm_start > addr) break; @@ -439,16 +438,13 @@ probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len) if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) break; - if (vma->vm_end >= end) { - ret = 0; - break; - } - addr = vma->vm_end; } mmap_read_unlock(mm); - return ret; + if (vma) + return -EFAULT; + return 0; } /* From 7713c8f00019182a53004d58b1ca526bf4e80e49 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Jul 2022 02:18:03 +0000 Subject: [PATCH 0814/1250] nommu: remove uses of VMA linked list Use the maple tree or VMA iterator instead. This is faster and will allow us to shrink the VMA. 
Link: https://lkml.kernel.org/r/20220504011345.662299-50-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-66-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-66-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/nommu.c | 135 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 101 insertions(+), 34 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 265a444a2cc274..171faa07e577a2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -557,26 +557,14 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) mas_store_prealloc(mas, NULL); } -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_lock held writelocked - */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) { - struct address_space *mapping; - struct vm_area_struct *prev; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); - - BUG_ON(!vma->vm_region); - mm->map_count++; vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); @@ -584,21 +572,52 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} - prev = mas_prev(&mas, 0); - mas_reset(&mas); +/* + * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm(). 
+ * @mas: The maple state with preallocations. + * @mm: The mm_struct + * @vma: The vma to add + * + */ +static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, + struct vm_area_struct *vma) +{ + struct vm_area_struct *prev; + + BUG_ON(!vma->vm_region); + + setup_vma_to_mm(vma, mm); + + prev = mas_prev(mas, 0); + mas_reset(mas); /* add the VMA to the tree */ - vma_mas_store(vma, &mas); + vma_mas_store(vma, mas); __vma_link_list(mm, vma, prev); } /* - * delete a VMA from its owning mm_struct and address space + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_lock held writelocked */ -static void delete_vma_from_mm(struct vm_area_struct *vma) +static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + mas_add_vma_to_mm(&mas, mm, vma); + return 0; +} +static void cleanup_vma_from_mm(struct vm_area_struct *vma) +{ vma->vm_mm->map_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -611,10 +630,25 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} +/* + * delete a VMA from its owning mm_struct and address space + */ +static int delete_vma_from_mm(struct vm_area_struct *vma) +{ + MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + cleanup_vma_from_mm(vma); /* remove from the MM's tree and list */ vma_mas_remove(vma, &mas); __vma_unlink_list(vma->vm_mm, vma); + return 0; } /* @@ -1024,6 +1058,7 @@ unsigned 
long do_mmap(struct file *file, vm_flags_t vm_flags; unsigned long capabilities, result; int ret; + MA_STATE(mas, &current->mm->mm_mt, 0, 0); *populate = 0; @@ -1042,6 +1077,7 @@ unsigned long do_mmap(struct file *file, * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); + /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); if (!region) @@ -1051,6 +1087,9 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + goto error_maple_preallocate; + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1191,7 +1230,7 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - add_vma_to_mm(current->mm, vma); + mas_add_vma_to_mm(&mas, current->mm, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1217,6 +1256,7 @@ unsigned long do_mmap(struct file *file, sharing_violation: up_write(&nommu_region_sem); + mas_destroy(&mas); pr_warn("Attempt to share mismatched mappings\n"); ret = -EINVAL; goto error; @@ -1233,6 +1273,14 @@ unsigned long do_mmap(struct file *file, len, current->pid); show_free_areas(0, NULL); return -ENOMEM; + +error_maple_preallocate: + kmem_cache_free(vm_region_jar, region); + vm_area_free(vma); + pr_warn("Allocation of vma tree for process %d failed\n", current->pid); + show_free_areas(0, NULL); + return -ENOMEM; + } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, @@ -1298,6 +1346,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ @@ -1333,7 +1382,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct
*vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - delete_vma_from_mm(vma); down_write(&nommu_region_sem); delete_nommu_region(vma->vm_region); if (new_below) { @@ -1346,8 +1394,17 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, add_nommu_region(vma->vm_region); add_nommu_region(new->vm_region); up_write(&nommu_region_sem); - add_vma_to_mm(mm, vma); - add_vma_to_mm(mm, new); + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + + setup_vma_to_mm(vma, mm); + setup_vma_to_mm(new, mm); + mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); + mas_store(&mas, vma); + vma_mas_store(new, &mas); return 0; } @@ -1363,12 +1420,14 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + return -ENOMEM; if (from > vma->vm_start) vma->vm_end = from; else vma->vm_start = to; - add_vma_to_mm(mm, vma); + if (add_vma_to_mm(mm, vma)) + return -ENOMEM; /* cut the backing region down to size */ region = vma->vm_region; @@ -1396,9 +1455,10 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { + MA_STATE(mas, &mm->mm_mt, start, start); struct vm_area_struct *vma; unsigned long end; - int ret; + int ret = 0; len = PAGE_ALIGN(len); if (len == 0) @@ -1407,7 +1467,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - vma = find_vma(mm, start); + vma = mas_find(&mas, end - 1); if (!vma) { static int limit; if (limit < 5) { @@ -1426,7 +1486,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = vma->vm_next; + vma = mas_next(&mas, end - 1); } 
while (vma); return -EINVAL; } else { @@ -1448,9 +1508,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list } erase_whole_vma: - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + ret = -ENOMEM; delete_vma(mm, vma); - return 0; + return ret; } int vm_munmap(unsigned long addr, size_t len) @@ -1475,6 +1536,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) */ void exit_mmap(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; if (!mm) @@ -1482,13 +1544,18 @@ void exit_mmap(struct mm_struct *mm) mm->total_vm = 0; - while ((vma = mm->mmap)) { - mm->mmap = vma->vm_next; - delete_vma_from_mm(vma); + /* + * Lock the mm to avoid assert complaining even though this is the only + * user of the mm + */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + cleanup_vma_from_mm(vma); delete_vma(mm, vma); cond_resched(); } __mt_destroy(&mm->mm_mt); + mmap_write_unlock(mm); } int vm_brk(unsigned long addr, unsigned long len) From dc9e9de96cbd6cfbd85066352c7fb7594bac9634 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:04 +0000 Subject: [PATCH 0815/1250] riscv: use vma iterator for vdso Remove the linked list use in favour of the vma iterator. Link: https://lkml.kernel.org/r/20220504011345.662299-51-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-67-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-67-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- arch/riscv/kernel/vdso.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 69b05b6c181b6d..692e7ae3dcb80b 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -114,11 +114,12 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); struct __vdso_info *vdso_info = mm->context.vdso_info; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, vdso_info->dm)) From 2fecc8b97cab4ead5a6b83f7bcc9d2e3a399eb92 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:04 +0000 Subject: [PATCH 0816/1250] mm: remove the vma linked list Replace any vm_next use with vma_find(). Update free_pgtables(), unmap_vmas(), and zap_page_range() to use the maple tree. Use the new free_pgtables() and unmap_vmas() in do_mas_align_munmap(). At the same time, alter the loop to be more compact. Now that free_pgtables() and unmap_vmas() take a maple tree as an argument, rearrange do_mas_align_munmap() to use the new tree to hold the vmas to remove. Remove __vma_link_list() and __vma_unlink_list() as they are exclusively used to update the linked list. Drop linked list update from __insert_vm_struct(). Rework validation of tree as it was depending on the linked list. 
Link: https://lkml.kernel.org/r/20220504011345.662299-52-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220513141548.2019143-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-68-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-68-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: Hulk Robot Cc: Yang Yingliang Cc: David Hildenbrand Cc: Lukas Bulwahn Cc: Sven Schnelle Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +- include/linux/mm_types.h | 4 - kernel/fork.c | 19 +- mm/debug.c | 14 +- mm/internal.h | 8 +- mm/memory.c | 33 ++- mm/mmap.c | 471 ++++++++++++++++----------------------- mm/nommu.c | 6 - mm/util.c | 40 ---- 9 files changed, 224 insertions(+), 376 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 474c1f8ad1afc8..95388863a61a9f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1817,8 +1817,9 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long start, unsigned long end); +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long start, + unsigned long end); struct mmu_notifier_range; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 403861b681d38f..93552d823076d7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -408,8 +408,6 @@ struct vm_area_struct { unsigned long vm_end; /* The first byte after our end address within vm_mm. 
*/ - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next, *vm_prev; struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -473,7 +471,6 @@ struct vm_area_struct { struct kioctx_table; struct mm_struct { struct { - struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, @@ -488,7 +485,6 @@ struct mm_struct { unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ - unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; #ifdef CONFIG_MEMBARRIER diff --git a/kernel/fork.c b/kernel/fork.c index 4b7b0b7dd44685..2d7ce88da5408a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -474,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) */ *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); - new->vm_next = new->vm_prev = NULL; dup_anon_vma_name(orig, new); } return new; @@ -579,7 +578,7 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct vm_area_struct *mpnt, *tmp; int retval; unsigned long charge = 0; LIST_HEAD(uf); @@ -606,18 +605,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); - if (retval) - goto out; - - prev = NULL; - retval = mas_expected_entries(&mas, oldmm->map_count); if (retval) goto out; @@ -689,14 +681,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) reset_vma_resv_huge_pages(tmp); - /* - * Link in the new vma and copy the page table entries. 
- */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - /* Link the vma into the MT */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; @@ -1119,7 +1103,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { - mm->mmap = NULL; mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); diff --git a/mm/debug.c b/mm/debug.c index 2d625ca0e32694..0fd15ba70d1631 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -139,13 +139,11 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { - pr_emerg("vma %px start %px end %px\n" - "next %px prev %px mm %px\n" + pr_emerg("vma %px start %px end %px mm %px\n" "prot %lx anon_vma %px vm_ops %px\n" "pgoff %lx file %px private_data %px\n" "flags: %#lx(%pGv)\n", - vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, - vma->vm_prev, vma->vm_mm, + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data, @@ -155,11 +153,11 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px task_size %lu\n" + pr_emerg("mm %px task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif - "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" + "mmap_base %lu mmap_legacy_base %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" @@ -183,11 +181,11 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, mm->task_size, + mm, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif - mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, + 
mm->mmap_base, mm->mmap_legacy_base, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), mm_pgtables_bytes(mm), diff --git a/mm/internal.h b/mm/internal.h index 9fe642aab0baca..5c7220017c7863 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -84,8 +84,9 @@ void folio_rotate_reclaimable(struct folio *folio); bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, + unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; @@ -479,9 +480,6 @@ static inline bool is_data_mapping(vm_flags_t flags) } /* mm/util.c */ -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev); -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); struct anon_vma *folio_anon_vma(struct folio *folio); #ifdef CONFIG_MMU diff --git a/mm/memory.c b/mm/memory.c index b12c1efa46e5ff..57480ce3dbc1b5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -402,12 +402,21 @@ void free_pgd_range(struct mmu_gather *tlb, } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long floor, unsigned long ceiling) +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling) { - while (vma) { - struct vm_area_struct *next = vma->vm_next; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + + do { unsigned long addr = vma->vm_start; + struct vm_area_struct *next; + + /* + * Note: USER_PGTABLES_CEILING may be passed as ceiling and may + * be 0. This will underflow and is okay. 
+ */ + next = mas_find(&mas, ceiling - 1); /* * Hide vma from rmap and truncate_pagecache before freeing @@ -426,7 +435,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; - next = vma->vm_next; + next = mas_find(&mas, ceiling - 1); unlink_anon_vmas(vma); unlink_file_vma(vma); } @@ -434,7 +443,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, floor, next ? next->vm_start : ceiling); } vma = next; - } + } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) @@ -1713,7 +1722,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -void unmap_vmas(struct mmu_gather *tlb, +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { @@ -1723,12 +1732,14 @@ void unmap_vmas(struct mmu_gather *tlb, /* Careful - we need to zap private pages too! 
*/ .even_cows = true, }; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) + do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1743,8 +1754,11 @@ void unmap_vmas(struct mmu_gather *tlb, void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { + struct maple_tree *mt = &vma->vm_mm->mm_mt; + unsigned long end = start + size; struct mmu_notifier_range range; struct mmu_gather tlb; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, @@ -1752,8 +1766,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) + do { unmap_single_vma(&tlb, vma, start, range.end, NULL); + } while ((vma = mas_find(&mas, end - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } diff --git a/mm/mmap.c b/mm/mmap.c index 11aee96c8f13d3..9f6af7cce0bfa9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -75,9 +75,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long start, unsigned long end); + struct vm_area_struct *next, unsigned long start, + unsigned long end); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -130,12 
+131,10 @@ void unlink_file_vma(struct vm_area_struct *vma) } /* - * Close a vm structure and free it, returning the next. + * Close a vm structure and free it. */ -static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +static void remove_vma(struct vm_area_struct *vma) { - struct vm_area_struct *next = vma->vm_next; - might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); @@ -143,7 +142,6 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); - return next; } /* @@ -168,8 +166,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, unsigned long newbrk, unsigned long oldbrk, struct list_head *uf); static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, - unsigned long flags); + unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; @@ -238,7 +235,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * before calling do_brk_munmap(). 
*/ mm->brk = brk; - mas.last = oldbrk - 1; ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); if (ret == 1) { downgraded = true; @@ -293,44 +289,21 @@ extern void mt_dump(const struct maple_tree *mt); static void validate_mm_mt(struct mm_struct *mm) { struct maple_tree *mt = &mm->mm_mt; - struct vm_area_struct *vma_mt, *vma = mm->mmap; + struct vm_area_struct *vma_mt; MA_STATE(mas, mt, 0, 0); mt_validate(&mm->mm_mt); mas_for_each(&mas, vma_mt, ULONG_MAX) { - if (xa_is_zero(vma_mt)) - continue; - - if (!vma) - break; - - if ((vma != vma_mt) || - (vma->vm_start != vma_mt->vm_start) || - (vma->vm_end != vma_mt->vm_end) || - (vma->vm_start != mas.index) || - (vma->vm_end - 1 != mas.last)) { + if ((vma_mt->vm_start != mas.index) || + (vma_mt->vm_end - 1 != mas.last)) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma_mt); - pr_emerg("and vm_next\n"); - dump_vma(vma->vm_next); pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, mas.index, mas.last); pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, vma_mt->vm_start, vma_mt->vm_end); - if (vma->vm_prev) { - pr_emerg("ll prev: %p %lu - %lu\n", - vma->vm_prev, vma->vm_prev->vm_start, - vma->vm_prev->vm_end); - } - pr_emerg("ll vma: %p %lu - %lu\n", vma, - vma->vm_start, vma->vm_end); - if (vma->vm_next) { - pr_emerg("ll next: %p %lu - %lu\n", - vma->vm_next, vma->vm_next->vm_start, - vma->vm_next->vm_end); - } mt_dump(mas.tree); if (vma_mt->vm_end != mas.last + 1) { @@ -347,23 +320,19 @@ static void validate_mm_mt(struct mm_struct *mm) } VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); } - VM_BUG_ON(vma != vma_mt); - vma = vma->vm_next; - } - VM_BUG_ON(vma); } static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - unsigned long highest_address = 0; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, 0, 0); validate_mm_mt(mm); - while (vma) { + mas_for_each(&mas, vma, ULONG_MAX) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; 
struct anon_vma_chain *avc; @@ -375,18 +344,10 @@ static void validate_mm(struct mm_struct *mm) anon_vma_unlock_read(anon_vma); } #endif - - highest_address = vm_end_gap(vma); - vma = vma->vm_next; i++; } if (i != mm->map_count) { - pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); - bug = 1; - } - if (highest_address != mm->highest_vm_end) { - pr_emerg("mm->highest_vm_end %lx, found %lx\n", - mm->highest_vm_end, highest_address); + pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); @@ -446,29 +407,13 @@ bool range_has_overlap(struct mm_struct *mm, unsigned long start, struct vm_area_struct *existing; MA_STATE(mas, &mm->mm_mt, start, start); + rcu_read_lock(); existing = mas_find(&mas, end - 1); *pprev = mas_prev(&mas, 0); + rcu_read_unlock(); return existing ? true : false; } -/* - * __vma_next() - Get the next VMA. - * @mm: The mm_struct. - * @vma: The current vma. - * - * If @vma is NULL, return the first vma in the mm. - * - * Returns: The next VMA after @vma. 
- */ -static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - if (!vma) - return mm->mmap; - - return vma->vm_next; -} - static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -553,8 +498,7 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, mas_store_prealloc(mas, NULL); } -static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; @@ -568,7 +512,6 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, } vma_mas_store(vma, &mas); - __vma_link_list(mm, vma, prev); __vma_link_file(vma); if (mapping) @@ -579,22 +522,6 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } -/* - * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and the mm tree. It has already been inserted into the interval tree. 
- */ -static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, unsigned long location) -{ - struct vm_area_struct *prev; - - mas_set(mas, location); - prev = mas_prev(mas, 0); - vma_mas_store(vma, mas); - __vma_link_list(mm, vma, prev); - mm->map_count++; -} - /* * vma_expand - Expand an existing VMA * @@ -675,15 +602,8 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, } /* Expanding over the next vma */ - if (remove_next) { - /* Remove from mm linked list - also updates highest_vm_end */ - __vma_unlink_list(mm, next); - - if (file) - __remove_shared_vm_struct(next, file, mapping); - - } else if (!next) { - mm->highest_vm_end = vm_end_gap(vma); + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); } if (anon_vma) { @@ -738,7 +658,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, int remove_next = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; - unsigned long ll_prev = vma->vm_start; /* linked list prev. */ if (next && !insert) { if (end >= next->vm_end) { @@ -773,7 +692,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, next_next = find_vma(mm, next->vm_end); VM_WARN_ON(remove_next == 2 && - end != next->vm_next->vm_end); + end != next_next->vm_end); } exporter = next; @@ -784,7 +703,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * next, if the vma overlaps with it. 
*/ if (remove_next == 2 && !next->anon_vma) - exporter = next->vm_next; + exporter = next_next; } else if (end > next->vm_start) { /* @@ -879,17 +798,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (vma->vm_end > end) { if (!insert || (insert->vm_start != end)) { vma_mas_szero(&mas, end, vma->vm_end); + mas_reset(&mas); VM_WARN_ON(insert && insert->vm_end < vma->vm_end); - } else if (insert->vm_start == end) { - ll_prev = vma->vm_end; } } else { vma_changed = true; } vma->vm_end = end; - if (!next) - mm->highest_vm_end = vm_end_gap(vma); } if (vma_changed) @@ -909,29 +825,19 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, flush_dcache_mmap_unlock(mapping); } - if (remove_next) { - /* - * vma_merge has merged next into vma, and needs - * us to remove next before dropping the locks. - * Since we have expanded over this vma, the maple tree will - * have overwritten by storing the value - */ - __vma_unlink_list(mm, next); + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); if (remove_next == 2) - __vma_unlink_list(mm, next_next); - - if (file) { - __remove_shared_vm_struct(next, file, mapping); - if (remove_next == 2) - __remove_shared_vm_struct(next_next, file, mapping); - } + __remove_shared_vm_struct(next_next, file, mapping); } else if (insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, &mas, insert, ll_prev); + mas_reset(&mas); + vma_mas_store(insert, &mas); + mm->map_count++; } if (anon_vma) { @@ -965,54 +871,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, /* * In mprotect's case 6 (see comments on vma_merge), - * we must remove another next too. It would clutter - * up the code too much to do both in one go. + * we must remove next_next too. 
*/ - if (remove_next != 3) { - /* - * If "next" was removed and vma->vm_end was - * expanded (up) over it, in turn - * "next->vm_prev->vm_end" changed and the - * "vma->vm_next" gap must be updated. - */ - next = next_next; - } else { - /* - * For the scope of the comment "next" and - * "vma" considered pre-swap(): if "vma" was - * removed, next->vm_start was expanded (down) - * over it and the "next" gap must be updated. - * Because of the swap() the post-swap() "vma" - * actually points to pre-swap() "next" - * (post-swap() "next" as opposed is now a - * dangling pointer). - */ - next = vma; - } if (remove_next == 2) { remove_next = 1; + next = next_next; goto again; - } else if (!next) { - /* - * If remove_next == 2 we obviously can't - * reach this path. - * - * If remove_next == 3 we can't reach this - * path because pre-swap() next is always not - * NULL. pre-swap() "next" is not being - * removed and its next->vm_end is not altered - * (and furthermore "end" already matches - * next->vm_end in remove_next == 3). - * - * We reach this only in the remove_next == 1 - * case if the "next" vma that was removed was - * the highest vma of the mm. However in such - * case next->vm_end == "end" and the extended - * "vma" has vma->vm_end == next->vm_end so - * mm->highest_vm_end doesn't need any update - * in remove_next == 1 case. - */ - VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); } } if (insert && file) @@ -1020,6 +884,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, mas_destroy(&mas); validate_mm(mm); + return 0; } @@ -1179,10 +1044,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - next = __vma_next(mm, prev); + next = find_vma(mm, prev ? 
prev->vm_end : 0); area = next; if (area && area->vm_end == end) /* cases 6, 7, 8 */ - next = next->vm_next; + next = find_vma(mm, next->vm_end); /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); @@ -1316,18 +1181,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_ */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { + MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end); struct anon_vma *anon_vma = NULL; + struct vm_area_struct *prev, *next; /* Try next first. */ - if (vma->vm_next) { - anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next); + next = mas_walk(&mas); + if (next) { + anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } + prev = mas_prev(&mas, 0); + VM_BUG_ON_VMA(prev != vma, vma); + prev = mas_prev(&mas, 0); /* Try prev next. */ - if (vma->vm_prev) - anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma); + if (prev) + anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find @@ -2095,8 +1966,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; - next = vma->vm_next; - if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2147,8 +2018,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Overwrite old entry in mtree. 
*/ vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - if (!vma->vm_next) - mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2168,16 +2037,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; int error = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); address &= PAGE_MASK; if (address < mmap_min_addr) return -EPERM; /* Enforce stack_guard_gap */ - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); /* Check that both stack segments have the same anon_vma? */ if (prev && !(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev)) { @@ -2312,25 +2181,26 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL_GPL(find_extend_vma); /* - * Ok - we have the memory areas we should free on the vma list, - * so release them, and do the vma updates. + * Ok - we have the memory areas we should free on a maple tree so release them, + * and do the vma updates. * * Called with the mm semaphore held. */ -static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) +static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) { unsigned long nr_accounted = 0; + struct vm_area_struct *vma; /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); - do { + mas_for_each(mas, vma, ULONG_MAX) { long nrpages = vma_pages(vma); if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; vm_stat_account(mm, vma->vm_flags, -nrpages); - vma = remove_vma(vma); - } while (vma); + remove_vma(vma); + } vm_unacct_memory(nr_accounted); validate_mm(mm); } @@ -2340,18 +2210,18 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) * * Called with the mm semaphore held. 
*/ -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, + struct vm_area_struct *next, unsigned long start, unsigned long end) { - struct vm_area_struct *next = __vma_next(mm, prev); struct mmu_gather tlb; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end); - free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, + unmap_vmas(&tlb, mt, vma, start, end); + free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); } @@ -2438,24 +2308,17 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -static inline int -unlock_range(struct vm_area_struct *start, struct vm_area_struct **tail, - unsigned long limit) +static inline int munmap_sidetree(struct vm_area_struct *vma, + struct ma_state *mas_detach) { - struct mm_struct *mm = start->vm_mm; - struct vm_area_struct *tmp = start; - int count = 0; - - while (tmp && tmp->vm_start < limit) { - *tail = tmp; - count++; - if (tmp->vm_flags & VM_LOCKED) - mm->locked_vm -= vma_pages(tmp); + mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); + if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) + return -ENOMEM; - tmp = tmp->vm_next; - } + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm -= vma_pages(vma); - return count; + return 0; } /* @@ -2475,9 +2338,13 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool downgrade) { - struct vm_area_struct *prev, *last; + struct vm_area_struct *prev, *next = NULL; + struct maple_tree mt_detach; + int count = 0; int error = -ENOMEM; - /* we have start < vma->vm_end */ + MA_STATE(mas_detach, &mt_detach, 0, 0); + mt_init_flags(&mt_detach, 
MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt_detach, &mm->mmap_lock); if (mas_preallocate(mas, vma, GFP_KERNEL)) return -ENOMEM; @@ -2490,6 +2357,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, * unmapped vm_area_struct will remain in use: so lower split_vma * places tmp vma above, and higher split_vma places tmp vma below. */ + + /* Does it split the first one? */ if (start > vma->vm_start) { /* @@ -2500,35 +2369,60 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; + /* + * mas_pause() is not needed since mas->index needs to be set + * differently than vma->vm_end anyways. + */ error = __split_vma(mm, vma, start, 0); if (error) - goto split_failed; + goto start_split_failed; - prev = vma; - vma = __vma_next(mm, prev); - mas->index = start; - mas_reset(mas); - } else { - prev = vma->vm_prev; + mas_set(mas, start); + vma = mas_walk(mas); } - if (vma->vm_end >= end) - last = vma; - else - last = find_vma_intersection(mm, end - 1, end); + prev = mas_prev(mas, 0); + if (unlikely((!prev))) + mas_set(mas, start); + + /* + * Detach a range of VMAs from the mm. Using next as a temp variable as + * it is always overwritten. + */ + mas_for_each(mas, next, end - 1) { + /* Does it split the end? */ + if (next->vm_end > end) { + struct vm_area_struct *split; + + error = __split_vma(mm, next, end, 1); + if (error) + goto end_split_failed; - /* Does it split the last one? 
*/ - if (last && end < last->vm_end) { - error = __split_vma(mm, last, end, 1); + mas_set(mas, end); + split = mas_prev(mas, 0); + error = munmap_sidetree(split, &mas_detach); + if (error) + goto munmap_sidetree_failed; + count++; + if (vma == next) + vma = split; + break; + } + error = munmap_sidetree(next, &mas_detach); if (error) - goto split_failed; + goto munmap_sidetree_failed; - if (vma == last) - vma = __vma_next(mm, prev); - mas_reset(mas); + count++; +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + BUG_ON(next->vm_start < start); + BUG_ON(next->vm_start > end); +#endif } + if (!next) + next = mas_next(mas, ULONG_MAX); + if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2545,35 +2439,36 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, goto userfaultfd_error; } - /* - * unlock any mlock()ed ranges before detaching vmas, count the number - * of VMAs to be dropped, and return the tail entry of the affected - * area. - */ - mm->map_count -= unlock_range(vma, &last, end); - /* Drop removed area from the tree */ + /* Point of no return */ + mas_set_range(mas, start, end - 1); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + /* Make sure no VMAs are about to be lost. */ + { + MA_STATE(test, &mt_detach, start, end - 1); + struct vm_area_struct *vma_mas, *vma_test; + int test_count = 0; + + rcu_read_lock(); + vma_test = mas_find(&test, end - 1); + mas_for_each(mas, vma_mas, end - 1) { + BUG_ON(vma_mas != vma_test); + test_count++; + vma_test = mas_next(&test, end - 1); + } + rcu_read_unlock(); + BUG_ON(count != test_count); + mas_set_range(mas, start, end - 1); + } +#endif mas_store_prealloc(mas, NULL); - - /* Detach vmas from the MM linked list */ - vma->vm_prev = NULL; - if (prev) - prev->vm_next = last->vm_next; - else - mm->mmap = last->vm_next; - - if (last->vm_next) { - last->vm_next->vm_prev = prev; - last->vm_next = NULL; - } else - mm->highest_vm_end = prev ? 
vm_end_gap(prev) : 0; - + mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or * VM_GROWSUP VMA. Such VMAs can change their size under * down_read(mmap_lock) and collide with the VMA we are about to unmap. */ if (downgrade) { - if (last && (last->vm_flags & VM_GROWSDOWN)) + if (next && (next->vm_flags & VM_GROWSDOWN)) downgrade = false; else if (prev && (prev->vm_flags & VM_GROWSUP)) downgrade = false; @@ -2581,18 +2476,22 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mmap_write_downgrade(mm); } - unmap_region(mm, vma, prev, start, end); - - /* Fix up all other VM information */ - remove_vma_list(mm, vma); + unmap_region(mm, &mt_detach, vma, prev, next, start, end); + /* Statistics and freeing VMAs */ + mas_set(&mas_detach, start); + remove_mt(mm, &mas_detach); + __mt_destroy(&mt_detach); validate_mm(mm); return downgrade ? 1 : 0; -map_count_exceeded: -split_failed: userfaultfd_error: +munmap_sidetree_failed: +end_split_failed: + __mt_destroy(&mt_detach); +start_split_failed: +map_count_exceeded: mas_destroy(mas); return error; } @@ -2827,7 +2726,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, i_mmap_lock_write(vma->vm_file->f_mapping); vma_mas_store(vma, &mas); - __vma_link_list(mm, vma, prev); mm->map_count++; if (vma->vm_file) { if (vma->vm_flags & VM_SHARED) @@ -2885,7 +2783,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma->vm_file = NULL; /* Undo any partial mapping done by a device driver. 
*/ - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); charged = 0; if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); @@ -2974,11 +2872,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, goto out; if (start + size > vma->vm_end) { - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, vma->vm_end); + struct vm_area_struct *next, *prev = vma; - for (next = vma->vm_next; next; next = next->vm_next) { + for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? */ - if (next->vm_start != next->vm_prev->vm_end) + if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) @@ -2987,8 +2886,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; - if (start + size <= next->vm_end) - break; + prev = next; } if (!next) @@ -3055,11 +2953,9 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, * do some brk-specific accounting here. 
*/ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, - unsigned long flags) + unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *prev = NULL; validate_mm_mt(mm); /* @@ -3102,7 +2998,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, khugepaged_enter_vma(vma, flags); goto out; } - prev = vma; /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); @@ -3120,12 +3015,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, goto mas_store_fail; mm->map_count++; - - if (!prev) - prev = mas_prev(mas, 0); - - __vma_link_list(mm, vma, prev); - mm->map_count++; out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -3133,7 +3022,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; - validate_mm_mt(mm); + validate_mm(mm); return 0; mas_store_fail: @@ -3214,6 +3103,8 @@ void exit_mmap(struct mm_struct *mm) struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int count = 0; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); @@ -3238,7 +3129,7 @@ void exit_mmap(struct mm_struct *mm) mmap_write_lock(mm); arch_exit_mmap(mm); - vma = mm->mmap; + vma = mas_find(&mas, ULONG_MAX); if (!vma) { /* Can happen if dup_mmap() received an OOM */ mmap_write_unlock(mm); @@ -3249,22 +3140,29 @@ void exit_mmap(struct mm_struct *mm) flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? 
but nobody should be looking */ - /* Use -1 here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, vma, 0, -1); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); + /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, + USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* Walk the list again, actually closing and freeing it. */ - while (vma) { + /* + * Walk the list again, actually closing and freeing it, with preemption + * enabled, without holding any MM locks besides the unreachable + * mmap_write_lock. + */ + do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - vma = remove_vma(vma); + remove_vma(vma); + count++; cond_resched(); - } + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + + BUG_ON(count != mm->map_count); trace_exit_mmap(mm); __mt_destroy(&mm->mm_mt); - mm->mmap = NULL; mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3303,7 +3201,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - if (vma_link(mm, vma, prev)) { + if (vma_link(mm, vma)) { vm_unacct_memory(charged); return -ENOMEM; } @@ -3335,7 +3233,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - if (range_has_overlap(mm, addr, addr + len, &prev)) + new_vma = find_vma_prev(mm, addr, &prev); + if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, @@ -3378,7 +3277,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - if (vma_link(mm, new_vma, prev)) + if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; } @@ -3683,12 +3582,13 @@ int mm_take_all_locks(struct 
mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3696,7 +3596,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3704,7 +3605,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) @@ -3763,11 +3665,12 @@ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); diff --git a/mm/nommu.c b/mm/nommu.c index 171faa07e577a2..6c611a689ec08c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -584,17 +584,12 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *prev; - BUG_ON(!vma->vm_region); setup_vma_to_mm(vma, mm); - prev = mas_prev(mas, 0); - mas_reset(mas); /* add the VMA to the tree */ vma_mas_store(vma, mas); - __vma_link_list(mm, vma, prev); } /* @@ -647,7 +642,6 @@ static int delete_vma_from_mm(struct 
vm_area_struct *vma) /* remove from the MM's tree and list */ vma_mas_remove(vma, &mas); - __vma_unlink_list(vma->vm_mm, vma); return 0; } diff --git a/mm/util.c b/mm/util.c index 812365cbdd1978..c831c9f8766b48 100644 --- a/mm/util.c +++ b/mm/util.c @@ -272,46 +272,6 @@ void *memdup_user_nul(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user_nul); -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - struct vm_area_struct *next; - - vma->vm_prev = prev; - if (prev) { - next = prev->vm_next; - prev->vm_next = vma; - } else { - next = mm->mmap; - mm->mmap = vma; - } - vma->vm_next = next; - if (next) - next->vm_prev = vma; - else - mm->highest_vm_end = vm_end_gap(vma); -} - -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) -{ - struct vm_area_struct *prev, *next; - - next = vma->vm_next; - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - if (next) { - next->vm_prev = prev; - } else { - if (prev) - mm->highest_vm_end = vm_end_gap(prev); - else - mm->highest_vm_end = 0; - } -} - /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { From 77795882c9460f614adcf4ad0976aff25ba1d7a9 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 20 Jul 2022 02:18:04 +0000 Subject: [PATCH 0817/1250] mm/mmap: drop range_has_overlap() function Since there is no longer a linked list, the range_has_overlap() function is identical to the find_vma_intersection() function. Link: https://lkml.kernel.org/r/20220504011345.662299-53-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-69-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-69-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Acked-by: Vlastimil Babka Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 9f6af7cce0bfa9..f00bc374a5df7c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -390,30 +390,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } -/* - * range_has_overlap() - Check the @start - @end range for overlapping VMAs and - * sets up a pointer to the previous VMA - * @mm: the mm struct - * @start: the start address of the range - * @end: the end address of the range - * @pprev: the pointer to the pointer of the previous VMA - * - * Returns: True if there is an overlapping VMA, false otherwise - */ -static inline -bool range_has_overlap(struct mm_struct *mm, unsigned long start, - unsigned long end, struct vm_area_struct **pprev) -{ - struct vm_area_struct *existing; - - MA_STATE(mas, &mm->mm_mt, start, start); - rcu_read_lock(); - existing = mas_find(&mas, end - 1); - *pprev = mas_prev(&mas, 0); - rcu_read_unlock(); - return existing ? true : false; -} - static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -3173,11 +3149,10 @@ void exit_mmap(struct mm_struct *mm) */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *prev; unsigned long charged = vma_pages(vma); - if (range_has_overlap(mm, vma->vm_start, vma->vm_end, &prev)) + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && From ed4f3c9ee66613566072e5d3392396584ba73eca Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 20 Jul 2022 02:18:05 +0000 Subject: [PATCH 0818/1250] mm/mmap.c: pass in mapping to __vma_link_file() __vma_link_file() resolves the mapping from the file, if there is one. Pass through the mapping and check the vm_file externally since most places already have the required information and check of vm_file. Link: https://lkml.kernel.org/r/20220504011345.662299-54-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220621204632.3370049-70-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20220720021727.17018-70-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Catalin Marinas Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Vlastimil Babka Cc: Will Deacon Cc: Davidlohr Bueso Cc: David Hildenbrand Cc: Hulk Robot Cc: Lukas Bulwahn Cc: Sven Schnelle Cc: Yang Yingliang Signed-off-by: Andrew Morton --- mm/mmap.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f00bc374a5df7c..6f7e672fe63dc9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -199,6 +199,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) if (brk < min_brk) goto out; + /* * Check against rlimit here. 
If this check is done later after the test * of oldbrk with newbrk then it can escape the test and let the data @@ -275,7 +276,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) if (populate) mm_populate(oldbrk, newbrk - oldbrk); return brk; - out: mmap_write_unlock(mm); return origbrk; @@ -407,21 +407,15 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, return nr_pages; } -static void __vma_link_file(struct vm_area_struct *vma) +static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) { - struct file *file; - - file = vma->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - if (vma->vm_flags & VM_SHARED) - mapping_allow_writable(mapping); + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(mapping); - flush_dcache_mmap_lock(mapping); - vma_interval_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - } + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); } /* @@ -488,10 +482,11 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) } vma_mas_store(vma, &mas); - __vma_link_file(vma); - if (mapping) + if (mapping) { + __vma_link_file(vma, mapping); i_mmap_unlock_write(mapping); + } mm->map_count++; validate_mm(mm); @@ -730,14 +725,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); - if (insert) { + if (insert && insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. 
*/ - __vma_link_file(insert); + __vma_link_file(insert, insert->vm_file->f_mapping); } } @@ -2934,6 +2929,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, struct mm_struct *mm = current->mm; validate_mm_mt(mm); + /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. @@ -2991,6 +2987,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, goto mas_store_fail; mm->map_count++; + out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; From 0b648589be1f5e35ce7407236a6a3815db7140c2 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:30:59 -0700 Subject: [PATCH 0819/1250] mm: drop oom code from exit_mmap The primary reason to invoke the oom reaper from the exit_mmap path used to be a prevention of an excessive oom killing if the oom victim exit races with the oom reaper (see [1] for more details). The invocation has moved around since then because of the interaction with the munlock logic but the underlying reason has remained the same (see [2]). Munlock code is no longer a problem since [3] and there shouldn't be any blocking operation before the memory is unmapped by exit_mmap so the oom reaper invocation can be dropped. The unmapping part can be done with the non-exclusive mmap_sem and the exclusive one is only required when page tables are freed. Remove the oom_reaper from exit_mmap which will make the code easier to read. This is really unlikely to make any observable difference although some microbenchmarks could benefit from one less branch that needs to be evaluated even though it almost never is true. 
[1] 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") [2] 27ae357fa82b ("mm, oom: fix concurrent munlock and oom reaper unmap, v3") [3] a213e5cf71cb ("mm/munlock: delete munlock_vma_pages_all(), allow oomreap") Link: https://lkml.kernel.org/r/20220531223100.510392-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Jann Horn Cc: Johannes Weiner Cc: John Hubbard Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/oom.h | 2 -- mm/mmap.c | 24 +++++++----------------- mm/oom_kill.c | 2 +- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 02d1e7bbd8cd5b..6cdde62b078b5e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -106,8 +106,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) return 0; } -bool __oom_reap_task_mm(struct mm_struct *mm); - long oom_badness(struct task_struct *p, unsigned long totalpages); diff --git a/mm/mmap.c b/mm/mmap.c index 6f7e672fe63dc9..fb98cc4accc51f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3082,23 +3082,6 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); - if (unlikely(mm_is_oom_victim(mm))) { - /* - * Manually reap the mm to free as much memory as possible. - * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard - * this mm from further consideration. Taking mm->mmap_lock for - * write after setting MMF_OOM_SKIP will guarantee that the oom - * reaper will not run on this mm again after mmap_lock is - * dropped. 
- * - * Nothing can be holding mm->mmap_lock here and the above call - * to mmu_notifier_release(mm) ensures mmu notifier callbacks in - * __oom_reap_task_mm() will not block. - */ - (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); - } - mmap_write_lock(mm); arch_exit_mmap(mm); @@ -3115,6 +3098,13 @@ void exit_mmap(struct mm_struct *mm) /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + + /* + * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper + * because the memory has been already freed. Do not bother checking + * mm_is_oom_victim because setting a bit unconditionally is cheaper. + */ + set_bit(MMF_OOM_SKIP, &mm->flags); free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3996301450e8d3..decb21474c6c5d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -509,7 +509,7 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -bool __oom_reap_task_mm(struct mm_struct *mm) +static bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; From 0b1cd0dfcb8f324526406fc200475ebd9355e8f2 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 1 Jun 2022 15:17:52 -0700 Subject: [PATCH 0820/1250] mm-drop-oom-code-from-exit_mmap-fix-fix restore Suren's mmap_read_lock() optimization Cc: Suren Baghdasaryan Cc: Liam Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index fb98cc4accc51f..950648c46cb612 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3082,13 +3082,13 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); - mmap_write_lock(mm); + 
mmap_read_lock(mm); arch_exit_mmap(mm); vma = mas_find(&mas, ULONG_MAX); if (!vma) { /* Can happen if dup_mmap() received an OOM */ - mmap_write_unlock(mm); + mmap_read_unlock(mm); return; } @@ -3098,6 +3098,7 @@ void exit_mmap(struct mm_struct *mm) /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + mmap_read_unlock(mm); /* * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper @@ -3105,6 +3106,7 @@ void exit_mmap(struct mm_struct *mm) * mm_is_oom_victim because setting a bit unconditionally is cheaper. */ set_bit(MMF_OOM_SKIP, &mm->flags); + mmap_write_lock(mm); free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); From 86d6b8d5f6b193e87869b9461fae3d90abd8e1ca Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:31:00 -0700 Subject: [PATCH 0821/1250] mm: delete unused MMF_OOM_VICTIM flag With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is now unused and can be removed. Link: https://lkml.kernel.org/r/20220531223100.510392-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: Johannes Weiner Cc: Roman Gushchin Cc: Minchan Kim Cc: "Kirill A . 
Shutemov" Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Peter Xu Cc: John Hubbard Cc: Shuah Khan Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/oom.h | 9 --------- include/linux/sched/coredump.h | 7 +++---- mm/oom_kill.c | 4 +--- 3 files changed, 4 insertions(+), 16 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 6cdde62b078b5e..7d0c9c48a0c54e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } -/* - * Use this helper if tsk->mm != mm and the victim mm needs a special - * handling. This is guaranteed to stay true after once set. - */ -static inline bool mm_is_oom_victim(struct mm_struct *mm) -{ - return test_bit(MMF_OOM_VICTIM, &mm->flags); -} - /* * Checks whether a page fault on the given mm is still reliable. * This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 4d0a5be28b70f1..8270ad7ae14c2a 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ -#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ +#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* * MMF_HAS_PINNED: Whether this mm has pinned any pages. 
This can be either * replaced in the future by mm.pinned_vm when it becomes stable, or grow into @@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm) * pinned pages were unpinned later on, we'll still keep this bit set for the * lifecycle of this mm, just for simplicity. */ -#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */ +#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index decb21474c6c5d..35ec75cdfee21e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -765,10 +765,8 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) mmgrab(tsk->signal->oom_mm); - set_bit(MMF_OOM_VICTIM, &mm->flags); - } /* * Make sure that the task is woken up from uninterruptible sleep From bfb21c0fb419de67fda089e02acb6fecfccaefd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Mat=C4=9Bna?= Date: Fri, 3 Jun 2022 16:57:18 +0200 Subject: [PATCH 0822/1250] mm: refactor of vma_merge() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Refactor of vma_merge and new merge call", v4. I am currently working on my master's thesis trying to increase the number of merges of VMAs currently failing because of page offset incompatibility and difference in their anon_vmas. The following refactor and added merge call included in this series are just two smaller upgrades I created along the way. This patch (of 2): Refactor vma_merge() to make it shorter and more understandable. The main change is the elimination of code duplication in the case of the merge next check. This is done by first doing checks and caching the results before executing the merge itself.
The variable 'area' is divided into 'mid' and 'res' as previously it was used for two purposes, as the middle VMA between prev and next and also as the result of the merge itself. Exit paths are also unified. Link: https://lkml.kernel.org/r/20220603145719.1012094-1-matenajakub@gmail.com Link: https://lkml.kernel.org/r/20220603145719.1012094-2-matenajakub@gmail.com Signed-off-by: Jakub Matěna Reviewed-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Matthew Wilcox Cc: Liam Howlett Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Rik van Riel Cc: Steven Rostedt Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton --- mm/mmap.c | 87 +++++++++++++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 950648c46cb612..682abc2841bb64 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1005,8 +1005,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *area, *next; - int err; + struct vm_area_struct *mid, *next, *res; + int err = -1; + bool merge_prev = false; + bool merge_next = false; /* * We later require that vma->vm_flags == vm_flags, @@ -1016,75 +1018,60 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, return NULL; next = find_vma(mm, prev ? prev->vm_end : 0); - area = next; - if (area && area->vm_end == end) /* cases 6, 7, 8 */ + mid = next; + if (next && next->vm_end == end) /* cases 6, 7, 8 */ next = find_vma(mm, next->vm_end); /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(area && end > area->vm_end); + VM_WARN_ON(mid && end > mid->vm_end); VM_WARN_ON(addr >= end); - /* - * Can it merge with the predecessor? - */ + /* Can we merge the predecessor? 
*/ if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, vm_userfaultfd_ctx, anon_name)) { - /* - * OK, it can. Can we now merge in the successor as well? - */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, - pgoff+pglen, - vm_userfaultfd_ctx, anon_name) && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma, NULL)) { - /* cases 1, 6 */ - err = __vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL, - prev); - } else /* cases 2, 5, 7 */ - err = __vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); - if (err) - return NULL; - khugepaged_enter_vma(prev, vm_flags); - return prev; + merge_prev = true; } - - /* - * Can this new request be merged in front of next? - */ + /* Can we merge the successor? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx, anon_name)) { + merge_next = true; + } + /* Can we merge both the predecessor and the successor? */ + if (merge_prev && merge_next && + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma, NULL)) { /* cases 1, 6 */ + err = __vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL, + prev); + res = prev; + } else if (merge_prev) { /* cases 2, 5, 7 */ + err = __vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL, prev); + res = prev; + } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); - else { /* cases 3, 8 */ - err = __vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); - /* - * In case 3 area is already equal to next and - * this is a noop, but in case 8 "area" has - * been removed and next was expanded over it. 
- */ - area = next; - } - if (err) - return NULL; - khugepaged_enter_vma(area, vm_flags); - return area; + addr, prev->vm_pgoff, NULL, next); + else /* cases 3, 8 */ + err = __vma_adjust(mid, addr, next->vm_end, + next->vm_pgoff - pglen, NULL, next); + res = next; } - return NULL; + /* + * Cannot merge with predecessor or successor or error in __vma_adjust? + */ + if (err) + return NULL; + khugepaged_enter_vma(res, vm_flags); + return res; } /* From d5d114ee37aabc43b1e6b204b574c60565132ec2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Mat=C4=9Bna?= Date: Fri, 3 Jun 2022 16:57:19 +0200 Subject: [PATCH 0823/1250] mm: add merging after mremap resize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mremap call results in expansion, it might be possible to merge the VMA with the next VMA which might become adjacent. This patch adds vma_merge call after the expansion is done to try and merge. Link: https://lkml.kernel.org/r/20220603145719.1012094-3-matenajakub@gmail.com Signed-off-by: Jakub Matěna Reviewed-by: Vlastimil Babka Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Steven Rostedt Signed-off-by: Andrew Morton --- mm/mremap.c | 19 +++++++++- tools/testing/selftests/vm/mremap_test.c | 47 +++++++++++++++++++++++- 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 8644ff278f0298..e465ffe279bb03 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -1012,6 +1014,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* can we just expand the current mapping? 
*/ if (vma_expandable(vma, new_len - old_len)) { long pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long extension_start = addr + old_len; + unsigned long extension_end = addr + new_len; + pgoff_t extension_pgoff = vma->vm_pgoff + (old_len >> PAGE_SHIFT); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1020,8 +1025,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } } - if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + /* + * Function vma_merge() is called on the extension we are adding to + * the already existing vma, vma_merge() will merge this extension with + * the already existing vma (expand operation itself) and possibly also + * with the next vma if it becomes adjacent to the expanded vma and + * otherwise compatible. + */ + vma = vma_merge(mm, vma, extension_start, extension_end, + vma->vm_flags, vma->anon_vma, vma->vm_file, + extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; goto out; diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index db0270127aeb04..0865a6cb5bdbae 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -118,6 +118,48 @@ static unsigned long long get_mmap_min_addr(void) return addr; } +/* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. 
+ */ +static void mremap_expand_merge(unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + FILE *fp; + char *line = NULL; + size_t len = 0; + bool success = false; + + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + munmap(start + page_size, page_size); + mremap(start, page_size, 2 * page_size, 0); + + fp = fopen("/proc/self/maps", "r"); + if (fp == NULL) { + ksft_test_result_fail("%s\n", test_name); + return; + } + + while(getline(&line, &len, fp) != -1) { + char *first = strtok(line,"- "); + void *first_val = (void *) strtol(first, NULL, 16); + char *second = strtok(NULL,"- "); + void *second_val = (void *) strtol(second, NULL, 16); + if (first_val == start && second_val == start + 3 * page_size) { + success = true; + break; + } + } + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); + fclose(fp); +} + /* * Returns the start address of the mapping on success, else returns * NULL on failure. @@ -336,6 +378,7 @@ int main(int argc, char **argv) int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; + int num_expand_tests = 1; struct test test_cases[MAX_TEST]; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; @@ -407,12 +450,14 @@ int main(int argc, char **argv) (threshold_mb * _1MB >= _1GB); ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? 
- ARRAY_SIZE(perf_test_cases) : 0)); + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, pattern_seed); + mremap_expand_merge(page_size); + if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); From 201fd24c2a9c71316c5b2d68fb49edab3052ea30 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 3 Jun 2022 10:41:30 -0700 Subject: [PATCH 0824/1250] mm-add-merging-after-mremap-resize-checkpatch-fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit WARNING: line length of 108 exceeds 100 columns #97: FILE: tools/testing/selftests/vm/mremap_test.c:136: + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); WARNING: Missing a blank line after declarations #98: FILE: tools/testing/selftests/vm/mremap_test.c:137: + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + munmap(start + page_size, page_size); ERROR: space required before the open parenthesis '(' #107: FILE: tools/testing/selftests/vm/mremap_test.c:146: + while(getline(&line, &len, fp) != -1) { ERROR: space required after that ',' (ctx:VxV) #108: FILE: tools/testing/selftests/vm/mremap_test.c:147: + char *first = strtok(line,"- "); ^ ERROR: space required after that ',' (ctx:VxV) #110: FILE: tools/testing/selftests/vm/mremap_test.c:149: + char *second = strtok(NULL,"- "); ^ WARNING: Missing a blank line after declarations #112: FILE: tools/testing/selftests/vm/mremap_test.c:151: + void *second_val = (void *) strtol(second, NULL, 16); + if (first_val == start && second_val == start + 3 * page_size) { total: 3 errors, 3 warnings, 113 lines checked NOTE: For some of the reported defects, checkpatch may be able to mechanically convert to the typical style using --fix or --fix-inplace. 
./patches/mm-add-merging-after-mremap-resize.patch has style problems, please review. NOTE: If any of the errors are false positives, please report them to the maintainer, see CHECKPATCH in MAINTAINERS. Please run checkpatch prior to sending patches Cc: Jakub Matěna Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/mremap_test.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index 0865a6cb5bdbae..9496346973d44a 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -132,8 +132,9 @@ static void mremap_expand_merge(unsigned long page_size) char *line = NULL; size_t len = 0; bool success = false; + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); munmap(start + page_size, page_size); mremap(start, page_size, 2 * page_size, 0); @@ -143,11 +144,12 @@ static void mremap_expand_merge(unsigned long page_size) return; } - while(getline(&line, &len, fp) != -1) { - char *first = strtok(line,"- "); - void *first_val = (void *) strtol(first, NULL, 16); - char *second = strtok(NULL,"- "); + while (getline(&line, &len, fp) != -1) { + char *first = strtok(line, "- "); + void *first_val = (void *)strtol(first, NULL, 16); + char *second = strtok(NULL, "- "); void *second_val = (void *) strtol(second, NULL, 16); + if (first_val == start && second_val == start + 3 * page_size) { success = true; break; From a06ac1d05dbf9f76a2eb036bdb8924fa8c01d720 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 11 Jun 2022 10:13:52 +0800 Subject: [PATCH 0825/1250] mm/page_alloc: minor clean up for memmap_init_compound() Since commit 5232c63f46fd ("mm: Make compound_pincount always available"), compound_pincount_ptr is stored at first tail page now. 
So we should call prep_compound_head() after the first tail page is initialized to take advantage of the likelihood of that tail struct page being cached given that we will read them right after in prep_compound_head(). Link: https://lkml.kernel.org/r/20220611021352.13529-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Joao Martins Signed-off-by: Andrew Morton --- mm/page_alloc.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee2500756fbeb..d0d09a9ce36476 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6773,13 +6773,18 @@ static void __ref memmap_init_compound(struct page *head, set_page_count(page, 0); /* - * The first tail page stores compound_mapcount_ptr() and - * compound_order() and the second tail page stores - * compound_pincount_ptr(). Call prep_compound_head() after - * the first and second tail pages have been initialized to - * not have the data overwritten. + * The first tail page stores compound_mapcount_ptr(), + * compound_order() and compound_pincount_ptr(). Call + * prep_compound_head() after the first tail page have + * been initialized to not have the data overwritten. + * + * Note the idea to make this right after we initialize + * the offending tail pages is trying to take advantage + * of the likelihood of those tail struct pages being + * cached given that we will read them right after in + * prep_compound_head(). */ - if (pfn == head_pfn + 2) + if (unlikely(pfn == head_pfn + 1)) prep_compound_head(head, order); } } From 2da508d831bd681ed59f24fbb55e01dcec58d703 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Wed, 15 Jun 2022 17:40:58 +0000 Subject: [PATCH 0826/1250] mm/mlock: drop dead code in count_mm_mlocked_page_nr() The check for mm being null has never been needed since the only caller has always passed in current->mm. Remove the check from count_mm_mlocked_page_nr(). 
Link: https://lkml.kernel.org/r/20220615174050.738523-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Lukas Bulwahn Signed-off-by: Andrew Morton --- mm/mlock.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 43d19a1f28eb37..7032f6dd0ce198 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -531,14 +531,12 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long end; VMA_ITERATOR(vmi, mm, start); - if (mm == NULL) - mm = current->mm; - /* Don't overflow past ULONG_MAX */ if (unlikely(ULONG_MAX - len < start)) end = ULONG_MAX; else end = start + len; + for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) From d44a4c1221a542323ef9f5e69763a4ecbf02df8e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 18 Jun 2022 16:20:27 +0800 Subject: [PATCH 0827/1250] mm/mmap.c: fix missing call to vm_unacct_memory in mmap_region Since the beginning, charged is set to 0 to avoid calling vm_unacct_memory twice because vm_unacct_memory will be called by above unmap_region. But since commit 4f74d2c8e827 ("vm: remove 'nr_accounted' calculations from the unmap_vmas() interfaces"), unmap_region doesn't call vm_unacct_memory anymore. So charged shouldn't be set to 0 now otherwise the calling to paired vm_unacct_memory will be missed and leads to imbalanced account. Link: https://lkml.kernel.org/r/20220618082027.43391-1-linmiaohe@huawei.com Fixes: 4f74d2c8e827 ("vm: remove 'nr_accounted' calculations from the unmap_vmas() interfaces") Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/mmap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 682abc2841bb64..125e8903c93c39 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2742,7 +2742,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Undo any partial mapping done by a device driver. 
*/ unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); - charged = 0; if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); free_vma: From 33084a7766a2068af48487fbd962e9b51bf081ab Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 23 Jun 2022 15:06:06 -0700 Subject: [PATCH 0828/1250] procfs: add 'size' to /proc/<pid>/fdinfo/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "procfs: Add file path and size to /proc/<pid>/fdinfo", v2. Processes can pin shared memory by keeping a handle to it through a file descriptor; for instance dmabufs, memfd, and ashmem (in Android). In the case of a memory leak, to identify the process pinning the memory, userspace needs to: - Iterate the /proc/<pid>/fd/* for each process - Do a readlink on each entry to identify the type of memory from the file path. - stat() each entry to get the size of the memory. The file permissions on /proc/<pid>/fd/* only allow the owner or root to perform the operations above, and so are not suitable for capturing the system-wide state in a production environment. This issue was addressed for dmabufs by making /proc/*/fdinfo/* accessible to a process with PTRACE_MODE_READ_FSCREDS credentials[1] To allow the same kind of tracking for other types of shared memory, add the following fields to /proc/<pid>/fdinfo/: path - This allows identifying the type of memory based on common prefixes: e.g. "/memfd...", "/dmabuf...", "/dev/ashmem..." This was not an issue when dmabuf tracking was introduced because the exp_name field of dmabuf fdinfo could be used to distinguish dmabuf fds from other types. size - To track the amount of memory that is being pinned. dmabufs expose size as an additional field in fdinfo. Remove this and make it a common field for all fds. Access to /proc/<pid>/fdinfo is governed by PTRACE_MODE_READ_FSCREDS -- the same as for /proc/<pid>/maps which also exposes the path and size for mapped memory regions.
This allows for a system process with PTRACE_MODE_READ_FSCREDS to account the pinned per-process memory via fdinfo. This patch (of 2): To be able to account the amount of memory a process is keeping pinned by open file descriptors add a 'size' field to fdinfo output. dmabufs fds already expose a 'size' field for this reason, remove this and make it a common field for all fds. This allows tracking of other types of memory (e.g. memfd and ashmem in Android). Link: https://lkml.kernel.org/r/20220623220613.3014268-1-kaleshsingh@google.com Link: https://lkml.kernel.org/r/20220623220613.3014268-2-kaleshsingh@google.com Signed-off-by: Kalesh Singh Reviewed-by: Christian König Cc: Al Viro Cc: Christoph Hellwig Cc: Stephen Brennan Cc: David Laight Cc: Ioannis Ilkos Cc: T.J. Mercier Cc: Suren Baghdasaryan Cc: Jonathan Corbet Cc: Sumit Semwal Cc: Johannes Weiner Cc: Christoph Anton Mitterer Cc: Colin Cross Cc: Paul Gortmaker Cc: Randy Dunlap Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 12 ++++++++++-- drivers/dma-buf/dma-buf.c | 1 - fs/proc/fd.c | 9 +++++---- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index e7aafc82be9991..640fe47586e3e6 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1891,13 +1891,14 @@ if precise results are needed. 3.8 /proc//fdinfo/ - Information about opened file --------------------------------------------------------------- This file provides information associated with an opened file. The regular -files have at least four fields -- 'pos', 'flags', 'mnt_id' and 'ino'. +files have at least five fields -- 'pos', 'flags', 'mnt_id', 'ino', and 'size'. 
+ The 'pos' represents the current offset of the opened file in decimal form [see lseek(2) for details], 'flags' denotes the octal O_xxx mask the file has been created with [see open(2) for details] and 'mnt_id' represents mount ID of the file system containing the opened file [see 3.5 /proc//mountinfo for details]. 'ino' represents the inode number of -the file. +the file, and 'size' represents the size of the file in bytes. A typical output is:: @@ -1905,6 +1906,7 @@ A typical output is:: flags: 0100002 mnt_id: 19 ino: 63107 + size: 0 All locks associated with a file descriptor are shown in its fdinfo too:: @@ -1922,6 +1924,7 @@ Eventfd files flags: 04002 mnt_id: 9 ino: 63107 + size: 0 eventfd-count: 5a where 'eventfd-count' is hex value of a counter. @@ -1935,6 +1938,7 @@ Signalfd files flags: 04002 mnt_id: 9 ino: 63107 + size: 0 sigmask: 0000000000000200 where 'sigmask' is hex value of the signal mask associated @@ -1949,6 +1953,7 @@ Epoll files flags: 02 mnt_id: 9 ino: 63107 + size: 0 tfd: 5 events: 1d data: ffffffffffffffff pos:0 ino:61af sdev:7 where 'tfd' is a target file descriptor number in decimal form, @@ -1967,6 +1972,7 @@ For inotify files the format is the following:: flags: 02000000 mnt_id: 9 ino: 63107 + size: 0 inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:7e9e0000640d1b6d where 'wd' is a watch descriptor in decimal form, i.e. 
a target file @@ -1990,6 +1996,7 @@ For fanotify files the format is:: flags: 02 mnt_id: 9 ino: 63107 + size: 0 fanotify flags:10 event-flags:0 fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003 fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4 @@ -2015,6 +2022,7 @@ Timerfd files flags: 02 mnt_id: 9 ino: 63107 + size: 0 clockid: 0 ticks: 0 settime flags: 01 diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 32f55640890ce5..5f2ae38c960fd5 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -378,7 +378,6 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file) { struct dma_buf *dmabuf = file->private_data; - seq_printf(m, "size:\t%zu\n", dmabuf->size); /* Don't count the temporary reference taken inside procfs seq_show */ seq_printf(m, "count:\t%ld\n", file_count(dmabuf->file) - 1); seq_printf(m, "exp_name:\t%s\n", dmabuf->exp_name); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 913bef0d2a36c4..464bc3f557596b 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -54,10 +54,11 @@ static int seq_show(struct seq_file *m, void *v) if (ret) return ret; - seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\nino:\t%lu\n", - (long long)file->f_pos, f_flags, - real_mount(file->f_path.mnt)->mnt_id, - file_inode(file)->i_ino); + seq_printf(m, "pos:\t%lli\n", (long long)file->f_pos); + seq_printf(m, "flags:\t0%o\n", f_flags); + seq_printf(m, "mnt_id:\t%i\n", real_mount(file->f_path.mnt)->mnt_id); + seq_printf(m, "ino:\t%lu\n", file_inode(file)->i_ino); + seq_printf(m, "size:\t%lli\n", (long long)file_inode(file)->i_size); /* show_fd_locks() never deferences files so a stale value is safe */ show_fd_locks(m, file, files); From 33afe91a4daa3922230264f7dc02654cdecdfab7 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Thu, 23 Jun 2022 15:06:07 -0700 Subject: [PATCH 0829/1250] procfs: add 'path' to /proc//fdinfo/ MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to identify the type of memory a process has pinned through its open fds, add the file path to fdinfo output. This allows identifying memory types based on common prefixes: e.g. "/memfd...", "/dmabuf...", "/dev/ashmem...". To be cautious, only expose the paths for anonymous inodes, and this also avoids printing path names with strange characters. Access to /proc//fdinfo is governed by PTRACE_MODE_READ_FSCREDS the same as /proc//maps which also exposes the file path of mappings; so the security permissions for accessing path is consistent with that of /proc//maps. Link: https://lkml.kernel.org/r/20220623220613.3014268-3-kaleshsingh@google.com Signed-off-by: Kalesh Singh Cc: Alexey Dobriyan Cc: Al Viro Cc: Christian König Cc: Christoph Anton Mitterer Cc: Christoph Hellwig Cc: Colin Cross Cc: David Laight Cc: Ioannis Ilkos Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Paul Gortmaker Cc: Randy Dunlap Cc: Stephen Brennan Cc: Sumit Semwal Cc: Suren Baghdasaryan Cc: T.J. Mercier Signed-off-by: Andrew Morton --- Documentation/filesystems/proc.rst | 10 ++++++++++ fs/libfs.c | 9 +++++++++ fs/proc/fd.c | 13 +++++++++++-- include/linux/fs.h | 1 + 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 640fe47586e3e6..47e95dbc820d56 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1912,6 +1912,9 @@ All locks associated with a file descriptor are shown in its fdinfo too:: lock: 1: FLOCK ADVISORY WRITE 359 00:13:11691 0 EOF +Files with anonymous inodes have an additional 'path' field which represents +the anonymous file path. + The files such as eventfd, fsnotify, signalfd, epoll among the regular pos/flags pair provide additional information particular to the objects they represent. 
@@ -1925,6 +1928,7 @@ Eventfd files mnt_id: 9 ino: 63107 size: 0 + path: anon_inode:[eventfd] eventfd-count: 5a where 'eventfd-count' is hex value of a counter. @@ -1939,6 +1943,7 @@ Signalfd files mnt_id: 9 ino: 63107 size: 0 + path: anon_inode:[signalfd] sigmask: 0000000000000200 where 'sigmask' is hex value of the signal mask associated @@ -1954,6 +1959,7 @@ Epoll files mnt_id: 9 ino: 63107 size: 0 + path: anon_inode:[eventpoll] tfd: 5 events: 1d data: ffffffffffffffff pos:0 ino:61af sdev:7 where 'tfd' is a target file descriptor number in decimal form, @@ -1973,6 +1979,7 @@ For inotify files the format is the following:: mnt_id: 9 ino: 63107 size: 0 + path: anon_inode:inotify inotify wd:3 ino:9e7e sdev:800013 mask:800afce ignored_mask:0 fhandle-bytes:8 fhandle-type:1 f_handle:7e9e0000640d1b6d where 'wd' is a watch descriptor in decimal form, i.e. a target file @@ -1997,6 +2004,7 @@ For fanotify files the format is:: mnt_id: 9 ino: 63107 size: 0 + path: anon_inode:[fanotify] fanotify flags:10 event-flags:0 fanotify mnt_id:12 mflags:40 mask:38 ignored_mask:40000003 fanotify ino:4f969 sdev:800013 mflags:0 mask:3b ignored_mask:40000000 fhandle-bytes:8 fhandle-type:1 f_handle:69f90400c275b5b4 @@ -2023,6 +2031,7 @@ Timerfd files mnt_id: 9 ino: 63107 size: 0 + path: anon_inode:[timerfd] clockid: 0 ticks: 0 settime flags: 01 @@ -2047,6 +2056,7 @@ DMA Buffer files mnt_id: 9 ino: 63107 size: 32768 + path: /dmabuf: count: 2 exp_name: system-heap diff --git a/fs/libfs.c b/fs/libfs.c index 31b0ddf01c31da..6911749b4da79d 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1217,6 +1217,15 @@ void kfree_link(void *p) } EXPORT_SYMBOL(kfree_link); +static const struct address_space_operations anon_aops = { + .dirty_folio = noop_dirty_folio, +}; + +bool is_anon_inode(struct inode *inode) +{ + return inode->i_mapping->a_ops == &anon_aops; +} + struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { diff --git a/fs/proc/fd.c 
b/fs/proc/fd.c index 464bc3f557596b..5bac79a2fa515b 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -23,6 +23,7 @@ static int seq_show(struct seq_file *m, void *v) struct files_struct *files = NULL; int f_flags = 0, ret = -ENOENT; struct file *file = NULL; + struct inode *inode = NULL; struct task_struct *task; task = get_proc_task(m->private); @@ -54,11 +55,19 @@ static int seq_show(struct seq_file *m, void *v) if (ret) return ret; + inode = file_inode(file); + seq_printf(m, "pos:\t%lli\n", (long long)file->f_pos); seq_printf(m, "flags:\t0%o\n", f_flags); seq_printf(m, "mnt_id:\t%i\n", real_mount(file->f_path.mnt)->mnt_id); - seq_printf(m, "ino:\t%lu\n", file_inode(file)->i_ino); - seq_printf(m, "size:\t%lli\n", (long long)file_inode(file)->i_size); + seq_printf(m, "ino:\t%lu\n", inode->i_ino); + seq_printf(m, "size:\t%lli\n", (long long)inode->i_size); + + if (is_anon_inode(inode)) { + seq_puts(m, "path:\t"); + seq_file_path(m, file, "\n"); + seq_putc(m, '\n'); + } /* show_fd_locks() never deferences files so a stale value is safe */ show_fd_locks(m, file, files); diff --git a/include/linux/fs.h b/include/linux/fs.h index 134e9d7ad5d68e..7132c6f955c195 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3115,6 +3115,7 @@ extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); +extern bool is_anon_inode(struct inode *inode); void generic_fillattr(struct user_namespace *, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); From 66799b771b614cfb828fea309dfde9e490e4b907 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Mon, 27 Jun 2022 05:11:26 +0300 Subject: [PATCH 0830/1250] memcg: notify about global mem_cgroup_id space depletion MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the host owner is not informed about the exhaustion of the global mem_cgroup_id space. When this happens, systemd cannot start a new service and receives a unique -ENOSPC error code. However, this can happen inside this container, persist in the log file of the local container, and may not be noticed by the host owner if he did not try to start any new services. Link: https://lkml.kernel.org/r/97bed1fd-f230-c2ea-1cb6-8230825a9a64@openvz.org Signed-off-by: Vasily Averin Cc: Shakeel Butt Cc: Roman Gushchin Cc: Michal Koutný Cc: Michal Hocko Cc: Vlastimil Babka Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 956b82ec8f7314..0fb880dd25d180 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5175,6 +5175,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) 1, MEM_CGROUP_ID_MAX + 1, GFP_KERNEL); if (memcg->id.id < 0) { error = memcg->id.id; + if (error == -ENOSPC) + pr_notice_ratelimited("mem_cgroup_id space is exhausted\n"); goto fail; } From 8341f2f3306641379da751e0d4670830b0a2928e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 27 Jun 2022 21:23:51 +0800 Subject: [PATCH 0831/1250] filemap: minor cleanup for filemap_write_and_wait_range Restructure the logic in filemap_write_and_wait_range to simplify the code and make it more consistent with file_write_and_wait_range. No functional change intended. 
Link: https://lkml.kernel.org/r/20220627132351.55680-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/filemap.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ffdfbc8b0e3cab..cd59f055e29d50 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -667,7 +667,7 @@ EXPORT_SYMBOL_GPL(filemap_range_has_writeback); int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend) { - int err = 0; + int err = 0, err2; if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, @@ -678,18 +678,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, * But the -EIO is special case, it may indicate the worst * thing (e.g. bug) happened, so we avoid waiting for it. */ - if (err != -EIO) { - int err2 = filemap_fdatawait_range(mapping, - lstart, lend); - if (!err) - err = err2; - } else { - /* Clear any previously stored errors */ - filemap_check_errors(mapping); - } - } else { - err = filemap_check_errors(mapping); + if (err != -EIO) + __filemap_fdatawait_range(mapping, lstart, lend); } + err2 = filemap_check_errors(mapping); + if (!err) + err = err2; return err; } EXPORT_SYMBOL(filemap_write_and_wait_range); From e00801e13304e5df5ab1d4fced355c1b4c75a678 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 1 Jul 2022 20:35:21 -0700 Subject: [PATCH 0832/1250] mm: memcontrol: do not miss MEMCG_MAX events for enforced allocations Yafang Shao reported an issue related to the accounting of bpf memory: if a bpf map is charged indirectly for memory consumed from an interrupt context and allocations are enforced, MEMCG_MAX events are not raised. It's not/less of an issue in a generic case because consequent allocations from a process context will trigger the direct reclaim and MEMCG_MAX events will be raised. 
However a bpf map can belong to a dying/abandoned memory cgroup, so there will be no allocations from a process context and no MEMCG_MAX events will be triggered. Link: https://lkml.kernel.org/r/20220702033521.64630-1-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin Reported-by: Yafang Shao Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0fb880dd25d180..7e463660209aee 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2577,6 +2577,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, bool passed_oom = false; bool may_swap = true; bool drained = false; + bool raised_max_event = false; unsigned long pflags; retry: @@ -2616,6 +2617,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, goto nomem; memcg_memory_event(mem_over_limit, MEMCG_MAX); + raised_max_event = true; psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, @@ -2682,6 +2684,13 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH))) return -ENOMEM; force: + /* + * If the allocation has to be enforced, don't forget to raise + * a MEMCG_MAX event. + */ + if (!raised_max_event) + memcg_memory_event(mem_over_limit, MEMCG_MAX); + /* * The allocation either can't fail or will lead to more memory * being freed very soon. Allow memory usage go over the limit From 1e41897331212129e2527e74932b4ba975608a8b Mon Sep 17 00:00:00 2001 From: Adam Sindelar Date: Mon, 4 Jul 2022 19:33:51 +0200 Subject: [PATCH 0833/1250] selftests/vm: fix errno handling in mrelease_test mrelease_test should return KSFT_SKIP when process_mrelease is not defined, but due to a perror call consuming the errno, it returns KSFT_FAIL. This patch decides the exit code before calling perror. 
Link: https://lkml.kernel.org/r/20220704173351.19595-1-adam@wowsignal.io Fixes: 33776141b812 ("selftests: vm: add process_mrelease tests") Signed-off-by: Adam Sindelar Reviewed-by: David Vernet Reviewed-by: Suren Baghdasaryan Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/mrelease_test.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c index 96671c2f7d4859..e8b17258579ba7 100644 --- a/tools/testing/selftests/vm/mrelease_test.c +++ b/tools/testing/selftests/vm/mrelease_test.c @@ -100,8 +100,10 @@ int main(void) /* Test a wrong pidfd */ if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { + /* perror overwrites errno, so this line must be first */ + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease with wrong pidfd"); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } /* Start the test with 1MB child memory allocation */ @@ -156,8 +158,9 @@ int main(void) run_negative_tests(pidfd); if (kill(pid, SIGKILL)) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("kill"); - exit(errno == ENOSYS ? 
KSFT_SKIP : KSFT_FAIL); + exit(res); } success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); From 4091777a6786af6159aba6a06ebc30ce45e73edf Mon Sep 17 00:00:00 2001 From: Adam Sindelar Date: Wed, 6 Jul 2022 16:16:02 +0200 Subject: [PATCH 0834/1250] selftests-vm-fix-errno-handling-in-mrelease_test-v4 fix remaining instances of errno mishandling Link: https://lkml.kernel.org/r/20220706141602.10159-1-adam@wowsignal.io Fixes: 33776141b812 ("selftests: vm: add process_mrelease tests") Signed-off-by: Adam Sindelar Reviewed-by: David Vernet Reviewed-by: Suren Baghdasaryan Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/mrelease_test.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vm/mrelease_test.c b/tools/testing/selftests/vm/mrelease_test.c index e8b17258579ba7..6c62966ab5dbcb 100644 --- a/tools/testing/selftests/vm/mrelease_test.c +++ b/tools/testing/selftests/vm/mrelease_test.c @@ -62,19 +62,22 @@ static int alloc_noexit(unsigned long nr_pages, int pipefd) /* The process_mrelease calls in this test are expected to fail */ static void run_negative_tests(int pidfd) { + int res; /* Test invalid flags. Expect to fail with EINVAL error code. */ if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease with wrong flags"); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } /* * Test reaping while process is alive with no pending SIGKILL. * Expect to fail with EINVAL error code. */ if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease on a live process"); - exit(errno == ENOSYS ? 
KSFT_SKIP : KSFT_FAIL); + exit(res); } } @@ -100,7 +103,6 @@ int main(void) /* Test a wrong pidfd */ if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { - /* perror overwrites errno, so this line must be first */ res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease with wrong pidfd"); exit(res); @@ -175,9 +177,10 @@ int main(void) if (errno == ESRCH) { retry = (size <= MAX_SIZE_MB); } else { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); perror("process_mrelease"); waitpid(pid, NULL, 0); - exit(errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + exit(res); } } From 3f00e76a1d648579898c2308e4904dbc04d4ec23 Mon Sep 17 00:00:00 2001 From: Adam Sindelar Date: Mon, 4 Jul 2022 14:38:13 +0200 Subject: [PATCH 0835/1250] selftests/vm: skip 128TBswitch on unsupported arch The test va_128TBswitch.c exercises a feature only supported on PPC and x86_64, but it's run on other 64-bit archs as well. Before this patch, the test did nothing and returned 0 for KSFT_PASS. This patch makes it return the KSFT codes from kselftest.h, including KSFT_SKIP when appropriate. Verified on arm64 and x86_64. 
Link: https://lkml.kernel.org/r/20220704123813.427625-1-adam@wowsignal.io Signed-off-by: Adam Sindelar Cc: David Vernet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/va_128TBswitch.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c index da6ec3b53ea8d4..1d206898988396 100644 --- a/tools/testing/selftests/vm/va_128TBswitch.c +++ b/tools/testing/selftests/vm/va_128TBswitch.c @@ -231,7 +231,7 @@ static struct testcase hugetlb_testcases[] = { static int run_test(struct testcase *test, int count) { void *p; - int i, ret = 0; + int i, ret = KSFT_PASS; for (i = 0; i < count; i++) { struct testcase *t = test + i; @@ -242,13 +242,13 @@ static int run_test(struct testcase *test, int count) if (p == MAP_FAILED) { printf("FAILED\n"); - ret = 1; + ret = KSFT_FAIL; continue; } if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { printf("FAILED\n"); - ret = 1; + ret = KSFT_FAIL; } else { /* * Do a dereference of the address returned so that we catch @@ -280,7 +280,7 @@ int main(int argc, char **argv) int ret; if (!supported_arch()) - return 0; + return KSFT_SKIP; ret = run_test(testcases, ARRAY_SIZE(testcases)); if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) From 93a2838042ff88ae303550fc0b8c79a57f967639 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 6 Jul 2022 16:59:20 -0700 Subject: [PATCH 0836/1250] mm: khugepaged: don't carry huge page to the next loop for !CONFIG_NUMA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: userspace hugepage collapse", v7. Introduction -------------------------------- This series provides a mechanism for userspace to induce a collapse of eligible ranges of memory into transparent hugepages in process context, thus permitting users to more tightly control their own hugepage utilization policy at their own expense. 
This idea was introduced by David Rientjes[5]. Interface -------------------------------- The proposed interface adds a new madvise(2) mode, MADV_COLLAPSE, and leverages the new process_madvise(2) call. process_madvise(2) Performs a synchronous collapse of the native pages mapped by the list of iovecs into transparent hugepages. This operation is independent of the system THP sysfs settings, but attempts to collapse VMAs marked VM_NOHUGEPAGE will still fail. THP allocation may enter direct reclaim and/or compaction. When a range spans multiple VMAs, the semantics of the collapse over of each VMA is independent from the others. Caller must have CAP_SYS_ADMIN if not acting on self. Return value follows existing process_madvise(2) conventions. A “success” indicates that all hugepage-sized/aligned regions covered by the provided range were either successfully collapsed, or were already pmd-mapped THPs. madvise(2) Equivalent to process_madvise(2) on self, with 0 returned on “success”. Current Use-Cases -------------------------------- (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. With MADV_COLLAPSE, we get the best of both worlds: Peak upfront performance and lower RAM footprints. Note that subsequent support for file-backed memory is required here. (2) malloc() implementations that manage memory in hugepage-sized chunks, but sometimes subrelease memory back to the system in native-sized chunks via MADV_DONTNEED; zapping the pmd. Later, when the memory is hot, the implementation could madvise(MADV_COLLAPSE) to re-back the memory by THPs to regain hugepage coverage and dTLB performance. 
TCMalloc is such an implementation that could benefit from this[6]. A prior study of Google internal workloads during evaluation of Temeraire, a hugepage-aware enhancement to TCMalloc, showed that nearly 20% of all cpu cycles were spent in dTLB stalls, and that increasing hugepage coverage by even a small amount can help with that[7]. (3) userfaultfd-based live migration of virtual machines satisfies UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. Note that subsequent support for file/shmem-backed memory is required here. (4) HugeTLB high-granularity mapping allows a HugeTLB page to be mapped at different levels in the page tables[8]. As it's not "transparent" like THP, HugeTLB high-granularity mappings require an explicit user API. It is intended that MADV_COLLAPSE be co-opted for this use case[9]. Note that subsequent support for HugeTLB memory is required here. Future work -------------------------------- Only private anonymous memory is supported by this series. File and shmem memory support will be added later. One possible user of this functionality is a userspace agent that attempts to optimize THP utilization system-wide by allocating THPs based on, for example, task priority, task performance requirements, or heatmaps. For the latter, one idea that has already surfaced is using DAMON to identify hot regions, and driving THP collapse through a new DAMOS_COLLAPSE scheme[10]. This patch (of 17): khugepaged has an optimization to reduce huge page allocation calls for !CONFIG_NUMA by carrying the allocated but failed to collapse huge page to the next loop. CONFIG_NUMA doesn't do so since the next loop may try to collapse huge page from a different node, so it doesn't make too much sense to carry it.
But when NUMA=n, the huge page is allocated by khugepaged_prealloc_page() before scanning the address space, so it means huge page may be allocated even though there is no suitable range for collapsing. Then the page would be just freed if khugepaged already made enough progress. This could make NUMA=n run have 5 times as much thp_collapse_alloc as NUMA=y run. This problem actually makes things worse due to the way more pointless THP allocations and makes the optimization pointless. This could be fixed by carrying the huge page across scans, but it will complicate the code further and the huge page may be carried indefinitely. But if we take one step back, the optimization itself seems not worth keeping nowadays since: * Not too many users build NUMA=n kernel nowadays even though the kernel is actually running on a non-NUMA machine. Some small devices may run NUMA=n kernel, but I don't think they actually use THP. * Since commit 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists"), THP could be cached by pcp. This actually somehow does the job done by the optimization. Link: https://lkml.kernel.org/r/20220706235936.2197195-1-zokeefe@google.com Link: https://lkml.kernel.org/r/20220706235936.2197195-3-zokeefe@google.com Signed-off-by: Yang Shi Signed-off-by: Zach O'Keefe Co-developed-by: Peter Xu Signed-off-by: Peter Xu Cc: Hugh Dickins Cc: "Kirill A. 
Shutemov" Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- mm/khugepaged.c | 120 +++++++++++------------------------------------- 1 file changed, 26 insertions(+), 94 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index cfe231c5958f75..2d763a5bf66531 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -796,29 +796,16 @@ static int khugepaged_find_target_node(void) last_khugepaged_target_node = target_node; return target_node; } - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +#else +static int khugepaged_find_target_node(void) { - if (IS_ERR(*hpage)) { - if (!*wait) - return false; - - *wait = false; - *hpage = NULL; - khugepaged_alloc_sleep(); - } else if (*hpage) { - put_page(*hpage); - *hpage = NULL; - } - - return true; + return 0; } +#endif static struct page * khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) { - VM_BUG_ON_PAGE(*hpage, *hpage); - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); @@ -830,74 +817,6 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) count_vm_event(THP_COLLAPSE_ALLOC); return *hpage; } -#else -static int khugepaged_find_target_node(void) -{ - return 0; -} - -static inline struct page *alloc_khugepaged_hugepage(void) -{ - struct page *page; - - page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), - HPAGE_PMD_ORDER); - if (page) - prep_transhuge_page(page); - return page; -} - -static struct page 
*khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_khugepaged_hugepage(); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(hugepage_flags_enabled())); - - return hpage; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - /* - * If the hpage allocated earlier was briefly exposed in page cache - * before collapse_file() failed, it is possible that racing lookups - * have not yet completed, and would then be unpleasantly surprised by - * finding the hpage reused for the same mapping at a different offset. - * Just release the previous allocation if there is any danger of that. - */ - if (*hpage && page_count(*hpage) > 1) { - put_page(*hpage); - *hpage = NULL; - } - - if (!*hpage) - *hpage = khugepaged_alloc_hugepage(wait); - - if (unlikely(!*hpage)) - return false; - - return true; -} - -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) -{ - VM_BUG_ON(!*hpage); - - return *hpage; -} -#endif /* * If mmap_lock temporarily dropped, revalidate vma @@ -1150,8 +1069,10 @@ static void collapse_huge_page(struct mm_struct *mm, out_up_write: mmap_write_unlock(mm); out_nolock: - if (!IS_ERR_OR_NULL(*hpage)) + if (!IS_ERR_OR_NULL(*hpage)) { mem_cgroup_uncharge(page_folio(*hpage)); + put_page(*hpage); + } trace_mm_collapse_huge_page(mm, isolated, result); return; } @@ -1953,8 +1874,10 @@ static void collapse_file(struct mm_struct *mm, unlock_page(new_page); out: VM_BUG_ON(!list_empty(&pagelist)); - if (!IS_ERR_OR_NULL(*hpage)) + if (!IS_ERR_OR_NULL(*hpage)) { mem_cgroup_uncharge(page_folio(*hpage)); + put_page(*hpage); + } /* TODO: tracepoints */ } @@ -2199,10 +2122,7 @@ static void khugepaged_do_scan(void) lru_add_drain_all(); - while (progress < pages) { - if (!khugepaged_prealloc_page(&hpage, &wait)) - 
break; - + while (true) { cond_resched(); if (unlikely(kthread_should_stop() || try_to_freeze())) @@ -2218,10 +2138,22 @@ static void khugepaged_do_scan(void) else progress = pages; spin_unlock(&khugepaged_mm_lock); - } - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); + if (progress >= pages) + break; + + if (IS_ERR(hpage)) { + /* + * If fail to allocate the first time, try to sleep for + * a while. When hit again, cancel the scan. + */ + if (!wait) + break; + wait = false; + hpage = NULL; + khugepaged_alloc_sleep(); + } + } } static bool khugepaged_should_wakeup(void) From aeeaef128b05232853ad087c9c63295ae87a28ae Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:21 -0700 Subject: [PATCH 0837/1250] mm/khugepaged: add struct collapse_control Modularize hugepage collapse by introducing struct collapse_control. This structure serves to describe the properties of the requested collapse, as well as to serve as a local scratch pad to use during the collapse itself. Start by moving global per-node khugepaged statistics into this new structure. Note that this structure is still statically allocated since CONFIG_NODES_SHIFT might be arbitrarily large, and stack-allocating a MAX_NUMNODES-sized array could cause -Wframe-larger-than= errors. Link: https://lkml.kernel.org/r/20220706235936.2197195-4-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- mm/khugepaged.c | 87 ++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 37 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2d763a5bf66531..c21eb83eb7fbd6 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -85,6 +85,14 @@ static struct kmem_cache *mm_slot_cache __read_mostly; #define MAX_PTE_MAPPED_THP 8 +struct collapse_control { + /* Num pages scanned per node */ + int node_load[MAX_NUMNODES]; + + /* Last target selected in khugepaged_find_target_node() */ + int last_target_node; +}; + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -735,9 +743,12 @@ static void khugepaged_alloc_sleep(void) remove_wait_queue(&khugepaged_wait, &wait); } -static int khugepaged_node_load[MAX_NUMNODES]; -static bool khugepaged_scan_abort(int nid) +struct collapse_control khugepaged_collapse_control = { + .last_target_node = NUMA_NO_NODE, +}; + +static bool khugepaged_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -749,11 +760,11 @@ static bool khugepaged_scan_abort(int nid) return false; /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) + if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) + if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; @@ -772,32 +783,31 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) +static int khugepaged_find_target_node(struct collapse_control *cc) { - static int 
last_khugepaged_target_node = NUMA_NO_NODE; int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; + if (cc->node_load[nid] > max_value) { + max_value = cc->node_load[nid]; target_node = nid; } /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { + if (target_node <= cc->last_target_node) + for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == cc->node_load[nid]) { target_node = nid; break; } - last_khugepaged_target_node = target_node; + cc->last_target_node = target_node; return target_node; } #else -static int khugepaged_find_target_node(void) +static int khugepaged_find_target_node(struct collapse_control *cc) { return 0; } @@ -1077,10 +1087,9 @@ static void collapse_huge_page(struct mm_struct *mm, return; } -static int khugepaged_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - struct page **hpage) +static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, struct page **hpage, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; @@ -1100,7 +1109,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out; } - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { @@ -1166,16 +1175,16 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Record which node the original page is from and save this - * information to khugepaged_node_load[]. + * information to cc->node_load[]. 
* Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (khugepaged_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; goto out_unmap; @@ -1226,7 +1235,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { - node = khugepaged_find_target_node(); + node = khugepaged_find_target_node(cc); /* collapse_huge_page will return with the mmap_lock released */ collapse_huge_page(mm, address, hpage, node, referenced, unmapped); @@ -1881,8 +1890,9 @@ static void collapse_file(struct mm_struct *mm, /* TODO: tracepoints */ } -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct page **hpage, + struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -1893,7 +1903,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, present = 0; swap = 0; - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) @@ -1918,11 +1928,11 @@ static void khugepaged_scan_file(struct mm_struct *mm, } node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (khugepaged_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; @@ -1955,7 +1965,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - node = khugepaged_find_target_node(); + node = khugepaged_find_target_node(cc); 
collapse_file(mm, file, start, hpage, node); } } @@ -1963,8 +1973,9 @@ static void khugepaged_scan_file(struct mm_struct *mm, /* TODO: tracepoints */ } #else -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct page **hpage, + struct collapse_control *cc) { BUILD_BUG(); } @@ -1975,7 +1986,8 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) + struct page **hpage, + struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { @@ -2052,12 +2064,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, mmap_read_unlock(mm); ret = 1; - khugepaged_scan_file(mm, file, pgoff, hpage); + khugepaged_scan_file(mm, file, pgoff, hpage, + cc); fput(file); } else { ret = khugepaged_scan_pmd(mm, vma, khugepaged_scan.address, - hpage); + hpage, cc); } /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; @@ -2113,7 +2126,7 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(void) +static void khugepaged_do_scan(struct collapse_control *cc) { struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; @@ -2134,7 +2147,7 @@ static void khugepaged_do_scan(void) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); + &hpage, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); @@ -2190,7 +2203,7 @@ static int khugepaged(void *none) set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { - khugepaged_do_scan(); + khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } From adcc4e193b6ba05629b4d9ba175bd19ff1864d87 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 20 Jul 2022 07:06:00 
-0700 Subject: [PATCH 0838/1250] mm/khugepaged: use minimal bits to store num page < HPAGE_PMD_NR Minimally, node_load[] entries just need to be able to hold the maximum value of HPAGE_PMD_NR, which is compile-time defined per-arch based on PMD_SHIFT and PAGE_SHIFT. node_load[] is only written either via memset() or via post-increment. struct collapse_control may be allocated via kmalloc() in other collapse contexts, and MAX_NUMNODES may be arbitrarily large. #define the underlying type of node_load[] based off HPAGE_PMD_NR to avoid excessive memory allocated for this struct. Link: https://lkml.kernel.org/r/20220720140603.1958773-2-zokeefe@google.com Link: https://lore.kernel.org/linux-mm/Ys2CeIm%2FQmQwWh9a@google.com/ Fixes: 3b07f3bb225a ("mm/khugepaged: add struct collapse_control") Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: Dan Carpenter Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: "Souptick Joarder (HPE)" Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c21eb83eb7fbd6..5fce308b6472c0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -87,8 +87,11 @@ static struct kmem_cache *mm_slot_cache __read_mostly; struct collapse_control { /* Num pages scanned per node */ - int node_load[MAX_NUMNODES]; - +#if HPAGE_PMD_ORDER < 16 + u16 node_load[MAX_NUMNODES]; +#else + u32 node_load[MAX_NUMNODES]; +#endif /* Last target selected in khugepaged_find_target_node() */ int last_target_node; }; From cea6d67c2b9733e69fce55d0f6ad29564ce36696 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:22 -0700 Subject: [PATCH 0839/1250] mm/khugepaged: dedup and simplify hugepage alloc and charging The following code is duplicated in collapse_huge_page() and collapse_file(): gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { result = SCAN_ALLOC_HUGE_PAGE_FAIL; goto out; } if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); Also, "node" is passed as an argument to both collapse_huge_page() and collapse_file() and obtained the same way, via khugepaged_find_target_node(). Move all this into a new helper, alloc_charge_hpage(), and remove the duplicate code from collapse_huge_page() and collapse_file(). Also, simplify khugepaged_alloc_page() by returning a bool indicating allocation success instead of a copy of the allocated struct page *. 
Link: https://lkml.kernel.org/r/20220706235936.2197195-5-zokeefe@google.com Signed-off-by: Zach O'Keefe Suggested-by: Peter Xu Acked-by: David Rientjes Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- mm/khugepaged.c | 78 ++++++++++++++++++++++--------------------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5fce308b6472c0..7134b60c36a336 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -816,19 +816,18 @@ static int khugepaged_find_target_node(struct collapse_control *cc) } #endif -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) { *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); *hpage = ERR_PTR(-ENOMEM); - return NULL; + return false; } prep_transhuge_page(*hpage); count_vm_event(THP_COLLAPSE_ALLOC); - return *hpage; + return true; } /* @@ -926,10 +925,24 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return true; } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - int node, int referenced, int unmapped) +static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, + struct collapse_control *cc) +{ + /* Only allocate from the target node */ + gfp_t gfp = alloc_hugepage_khugepaged_gfpmask() | 
__GFP_THISNODE; + int node = khugepaged_find_target_node(cc); + + if (!khugepaged_alloc_page(hpage, gfp, node)) + return SCAN_ALLOC_HUGE_PAGE_FAIL; + if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) + return SCAN_CGROUP_CHARGE_FAIL; + count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + return SCAN_SUCCEED; +} + +static void collapse_huge_page(struct mm_struct *mm, unsigned long address, + struct page **hpage, int referenced, + int unmapped, struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; @@ -940,13 +953,9 @@ static void collapse_huge_page(struct mm_struct *mm, int isolated = 0, result = 0; struct vm_area_struct *vma; struct mmu_notifier_range range; - gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves @@ -954,17 +963,12 @@ static void collapse_huge_page(struct mm_struct *mm, * that. We will recheck the vma after taking it again in write mode. 
*/ mmap_read_unlock(mm); - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out_nolock; - } - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_hpage(hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out_nolock; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); + + new_page = *hpage; mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, &vma); @@ -1238,10 +1242,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); if (ret) { - node = khugepaged_find_target_node(cc); /* collapse_huge_page will return with the mmap_lock released */ - collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + collapse_huge_page(mm, address, hpage, referenced, unmapped, + cc); } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, @@ -1509,7 +1512,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * @file: file that collapse on * @start: collapse start address * @hpage: new allocated huge page for collapse - * @node: appointed node the new huge page allocate from + * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; @@ -1526,12 +1529,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_file(struct mm_struct *mm, - struct file *file, pgoff_t start, - struct page **hpage, int node) +static void collapse_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct page **hpage, + struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - gfp_t gfp; struct page *new_page; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); @@ -1543,20 +1545,11 @@ static void 
collapse_file(struct mm_struct *mm, VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; + result = alloc_charge_hpage(hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out; - } - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; - goto out; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); + new_page = *hpage; /* * Ensure we have slots for all the pages in the range. This is @@ -1968,8 +1961,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - node = khugepaged_find_target_node(cc); - collapse_file(mm, file, start, hpage, node); + collapse_file(mm, file, start, hpage, cc); } } From 4f0aed71d4194da1042e0caf8724f1a9af7bd9ed Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:23 -0700 Subject: [PATCH 0840/1250] mm/khugepaged: propagate enum scan_result codes back to callers Propagate enum scan_result codes back through return values of functions downstream of khugepaged_scan_file() and khugepaged_scan_pmd() to inform callers if the operation was successful, and if not, why. Since khugepaged_scan_pmd()'s return value already has a specific meaning (whether mmap_lock was unlocked or not), add a bool* argument to khugepaged_scan_pmd() to retrieve this information. Change khugepaged to take action based on the return values of khugepaged_scan_file() and khugepaged_scan_pmd() instead of acting deep within the collapsing functions themselves. hugepage_vma_revalidate() now returns SCAN_SUCCEED on success to be more consistent with enum scan_result propagation. 
Remove dependency on error pointers to communicate to khugepaged that allocation failed and it should sleep; instead just use the result of the scan (SCAN_ALLOC_HUGE_PAGE_FAIL if allocation fails). Link: https://lkml.kernel.org/r/20220706235936.2197195-6-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- mm/khugepaged.c | 233 ++++++++++++++++++++++++------------------------ 1 file changed, 117 insertions(+), 116 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7134b60c36a336..ca5d013a2a8d08 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -561,7 +561,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, shared = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; @@ -675,13 +675,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 1; + return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 0; + return result; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, @@ -821,7 +821,6 
@@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); return false; } @@ -833,8 +832,7 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. - * Return 0 if succeeds, otherwise return none-zero - * value (scan code). + * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, @@ -862,7 +860,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, */ if (!vma->anon_vma || !vma_is_anonymous(vma)) return SCAN_VMA_CHECK; - return 0; + return SCAN_SUCCEED; } /* @@ -873,10 +871,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * Note that if false is returned, mmap_lock will be released. */ -static bool __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - int referenced) +static int __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + int referenced) { int swapped_in = 0; vm_fault_t ret = 0; @@ -907,12 +905,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, */ if (ret & VM_FAULT_RETRY) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + /* Likely, but not guaranteed, that page lock failed */ + return SCAN_PAGE_LOCK; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + return SCAN_FAIL; } swapped_in++; } @@ -922,7 +921,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, lru_add_drain(); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); - return true; + return SCAN_SUCCEED; } static 
int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, @@ -940,17 +939,17 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, return SCAN_SUCCEED; } -static void collapse_huge_page(struct mm_struct *mm, unsigned long address, - struct page **hpage, int referenced, - int unmapped, struct collapse_control *cc) +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, + struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; - struct page *new_page; + struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; + int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; @@ -964,15 +963,13 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, */ mmap_read_unlock(mm); - result = alloc_charge_hpage(hpage, mm, cc); + result = alloc_charge_hpage(&hpage, mm, cc); if (result != SCAN_SUCCEED) goto out_nolock; - new_page = *hpage; - mmap_read_lock(mm); result = hugepage_vma_revalidate(mm, address, &vma); - if (result) { + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } @@ -984,14 +981,16 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, goto out_nolock; } - /* - * __collapse_huge_page_swapin will return with mmap_lock released - * when it fails. So we jump out_nolock directly in that case. - * Continuing to collapse causes inconsistency. - */ - if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, - pmd, referenced)) { - goto out_nolock; + if (unmapped) { + /* + * __collapse_huge_page_swapin will return with mmap_lock + * released when it fails. So we jump out_nolock directly in + * that case. Continuing to collapse causes inconsistency. 
+ */ + result = __collapse_huge_page_swapin(mm, vma, address, pmd, + referenced); + if (result != SCAN_SUCCEED) + goto out_nolock; } mmap_read_unlock(mm); @@ -1002,7 +1001,7 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, */ mmap_write_lock(mm); result = hugepage_vma_revalidate(mm, address, &vma); - if (result) + if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ if (mm_find_pmd(mm, address) != pmd) @@ -1029,11 +1028,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte, - &compound_pagelist); + result = __collapse_huge_page_isolate(vma, address, pte, + &compound_pagelist); spin_unlock(pte_ptl); - if (unlikely(!isolated)) { + if (unlikely(result != SCAN_SUCCEED)) { pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); @@ -1045,7 +1044,6 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; goto out_up_write; } @@ -1055,8 +1053,8 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, - &compound_pagelist); + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); /* * spin_lock() below is not the equivalent of smp_wmb(), but @@ -1064,43 +1062,42 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long address, * avoid the copy_huge_page writes to become visible after * the set_pmd_at() write. 
*/ - __SetPageUptodate(new_page); + __SetPageUptodate(hpage); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + page_add_new_anon_rmap(hpage, vma, address); + lru_cache_add_inactive_or_unevictable(hpage, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); - *hpage = NULL; + hpage = NULL; - khugepaged_pages_collapsed++; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: - if (!IS_ERR_OR_NULL(*hpage)) { - mem_cgroup_uncharge(page_folio(*hpage)); - put_page(*hpage); + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); } - trace_mm_collapse_huge_page(mm, isolated, result); - return; + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + return result; } static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, struct page **hpage, + unsigned long address, bool *mmap_locked, struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, result = 0, referenced = 0; + int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; @@ -1237,19 +1234,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { + if (result == SCAN_SUCCEED) { + result = collapse_huge_page(mm, address, referenced, + unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ - collapse_huge_page(mm, address, hpage, referenced, unmapped, - cc); + *mmap_locked = false; 
} out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, none_or_zero, result, unmapped); - return ret; + return result; } static void collect_mm_slot(struct mm_slot *mm_slot) @@ -1511,7 +1508,6 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * @mm: process address space where collapse happens * @file: file that collapse on * @start: collapse start address - * @hpage: new allocated huge page for collapse * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: @@ -1529,12 +1525,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct page **hpage, - struct collapse_control *cc) +static int collapse_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - struct page *new_page; + struct page *hpage; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1545,12 +1540,10 @@ static void collapse_file(struct mm_struct *mm, struct file *file, VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - result = alloc_charge_hpage(hpage, mm, cc); + result = alloc_charge_hpage(&hpage, mm, cc); if (result != SCAN_SUCCEED) goto out; - new_page = *hpage; - /* * Ensure we have slots for all the pages in the range. 
This is * almost certainly a no-op because most of the pages must be present @@ -1567,14 +1560,14 @@ static void collapse_file(struct mm_struct *mm, struct file *file, } } while (1); - __SetPageLocked(new_page); + __SetPageLocked(hpage); if (is_shmem) - __SetPageSwapBacked(new_page); - new_page->index = start; - new_page->mapping = mapping; + __SetPageSwapBacked(hpage); + hpage->index = start; + hpage->mapping = mapping; /* - * At this point the new_page is locked and not up-to-date. + * At this point the hpage is locked and not up-to-date. * It's safe to insert it into the page cache, because nobody would * be able to map it or use it in another way until we unlock it. */ @@ -1602,7 +1595,7 @@ static void collapse_file(struct mm_struct *mm, struct file *file, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page); + xas_store(&xas, hpage); nr_none++; continue; } @@ -1744,19 +1737,19 @@ static void collapse_file(struct mm_struct *mm, struct file *file, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. 
*/ - xas_store(&xas, new_page); + xas_store(&xas, hpage); continue; out_unlock: unlock_page(page); put_page(page); goto xa_unlocked; } - nr = thp_nr_pages(new_page); + nr = thp_nr_pages(hpage); if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); else { - __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -1767,21 +1760,21 @@ static void collapse_file(struct mm_struct *mm, struct file *file, smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); goto xa_locked; } } if (nr_none) { - __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. 
*/ - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); } /* Join all the small entries into a single multi-index entry */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, new_page); + xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -1803,11 +1796,11 @@ static void collapse_file(struct mm_struct *mm, struct file *file, index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { while (index < page->index) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), - page); + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), + page); list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -1818,23 +1811,22 @@ static void collapse_file(struct mm_struct *mm, struct file *file, index++; } while (index < end) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - SetPageUptodate(new_page); - page_ref_add(new_page, HPAGE_PMD_NR - 1); + SetPageUptodate(hpage); + page_ref_add(hpage, HPAGE_PMD_NR - 1); if (is_shmem) - set_page_dirty(new_page); - lru_cache_add(new_page); + set_page_dirty(hpage); + lru_cache_add(hpage); /* * Remove pte page tables, so we can re-fault the page as huge. 
*/ retract_page_tables(mapping, start); - *hpage = NULL; - - khugepaged_pages_collapsed++; + unlock_page(hpage); + hpage = NULL; } else { struct page *page; @@ -1873,22 +1865,23 @@ static void collapse_file(struct mm_struct *mm, struct file *file, VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - new_page->mapping = NULL; + hpage->mapping = NULL; } - unlock_page(new_page); + if (hpage) + unlock_page(hpage); out: VM_BUG_ON(!list_empty(&pagelist)); - if (!IS_ERR_OR_NULL(*hpage)) { - mem_cgroup_uncharge(page_folio(*hpage)); - put_page(*hpage); + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); } /* TODO: tracepoints */ + return result; } -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct page **hpage, - struct collapse_control *cc) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -1961,16 +1954,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - collapse_file(mm, file, start, hpage, cc); + result = collapse_file(mm, file, start, cc); } } /* TODO: tracepoints */ + return result; } #else -static void khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct page **hpage, - struct collapse_control *cc) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { BUILD_BUG(); } @@ -1980,8 +1973,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) } #endif -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage, +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) @@ -1995,6 +1987,7 @@ 
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); + *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; @@ -2044,7 +2037,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { - int ret; + bool mmap_locked = true; + cond_resched(); if (unlikely(khugepaged_test_exit(mm))) goto breakouterloop; @@ -2058,20 +2052,28 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, khugepaged_scan.address); mmap_read_unlock(mm); - ret = 1; - khugepaged_scan_file(mm, file, pgoff, hpage, - cc); + *result = khugepaged_scan_file(mm, file, pgoff, + cc); + mmap_locked = false; fput(file); } else { - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage, cc); + *result = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + &mmap_locked, cc); } + if (*result == SCAN_SUCCEED) + ++khugepaged_pages_collapsed; /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_lock so break loop */ + if (!mmap_locked) + /* + * We released mmap_lock so break loop. Note + * that we drop mmap_lock before all hugepage + * allocations, so if allocation fails, we are + * guaranteed to break here and report the + * correct result back to caller. 
+ */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; @@ -2123,10 +2125,10 @@ static int khugepaged_wait_event(void) static void khugepaged_do_scan(struct collapse_control *cc) { - struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; + int result = SCAN_SUCCEED; lru_add_drain_all(); @@ -2142,7 +2144,7 @@ static void khugepaged_do_scan(struct collapse_control *cc) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage, cc); + &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); @@ -2150,7 +2152,7 @@ static void khugepaged_do_scan(struct collapse_control *cc) if (progress >= pages) break; - if (IS_ERR(hpage)) { + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { /* * If fail to allocate the first time, try to sleep for * a while. When hit again, cancel the scan. @@ -2158,7 +2160,6 @@ static void khugepaged_do_scan(struct collapse_control *cc) if (!wait) break; wait = false; - hpage = NULL; khugepaged_alloc_sleep(); } } From 36f5851226ba0a3e9c2e59560f315ccef7073e93 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:24 -0700 Subject: [PATCH 0841/1250] mm/khugepaged: add flag to predicate khugepaged-only behavior Add .is_khugepaged flag to struct collapse_control so khugepaged-specific behavior can be elided by MADV_COLLAPSE context. Start by protecting khugepaged-specific heuristics by this flag. 
In MADV_COLLAPSE, the user presumably has reason to believe the collapse will be beneficial and khugepaged heuristics shouldn't prevent the user from doing so: 1) sysfs-controlled knobs khugepaged_max_ptes_[none|swap|shared] 2) requirement that some pages in region being collapsed be young or referenced Link: https://lkml.kernel.org/r/20220706235936.2197195-7-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- mm/khugepaged.c | 62 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ca5d013a2a8d08..3128c7fe974257 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -73,6 +73,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. + * + * Note that these are only respected if collapse was initiated by khugepaged. 
*/ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; @@ -86,6 +88,8 @@ static struct kmem_cache *mm_slot_cache __read_mostly; #define MAX_PTE_MAPPED_THP 8 struct collapse_control { + bool is_khugepaged; + /* Num pages scanned per node */ #if HPAGE_PMD_ORDER < 16 u16 node_load[MAX_NUMNODES]; @@ -557,6 +561,7 @@ static bool is_refcount_suitable(struct page *page) static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, + struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; @@ -570,7 +575,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (++none_or_zero <= khugepaged_max_ptes_none || + !cc->is_khugepaged)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -590,8 +596,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + if (cc->is_khugepaged && page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; @@ -657,10 +663,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (PageCompound(page)) list_add_tail(&page->lru, compound_pagelist); next: - /* There should be enough young pte to collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || 
mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; if (pte_write(pteval)) @@ -669,7 +679,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (unlikely(!writable)) { result = SCAN_PAGE_RO; - } else if (unlikely(!referenced)) { + } else if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; @@ -748,6 +758,7 @@ static void khugepaged_alloc_sleep(void) struct collapse_control khugepaged_collapse_control = { + .is_khugepaged = true, .last_target_node = NUMA_NO_NODE, }; @@ -1028,7 +1039,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - result = __collapse_huge_page_isolate(vma, address, pte, + result = __collapse_huge_page_isolate(vma, address, pte, cc, &compound_pagelist); spin_unlock(pte_ptl); @@ -1119,7 +1130,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { + if (++unmapped <= khugepaged_max_ptes_swap || + !cc->is_khugepaged) { /* * Always be strict with uffd-wp * enabled swap entries. 
Please see @@ -1138,7 +1150,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (++none_or_zero <= khugepaged_max_ptes_none || + !cc->is_khugepaged)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1168,8 +1181,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unmap; } - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + if (cc->is_khugepaged && + page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; @@ -1223,14 +1237,22 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, result = SCAN_PAGE_COUNT; goto out_unmap; } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; - } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + } else if (cc->is_khugepaged && + (!referenced || + (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; @@ -1899,7 +1921,8 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { + if (cc->is_khugepaged && + ++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; @@ -1950,7 +1973,8 @@ static int khugepaged_scan_file(struct 
mm_struct *mm, struct file *file, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none && + cc->is_khugepaged) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { From dccd751c7dde08a9b1c3e00c0bac814eff00e3cc Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:25 -0700 Subject: [PATCH 0842/1250] mm/thp: add flag to enforce sysfs THP in hugepage_vma_check() MADV_COLLAPSE is not coupled to the kernel-oriented sysfs THP settings[1]. hugepage_vma_check() is the authority on determining if a VMA is eligible for THP allocation/collapse, and currently enforces the sysfs THP settings. Add a flag to disable these checks. For now, only apply this arg to anon and file, which use /sys/kernel/transparent_hugepage/enabled. We can expand this to shmem, which uses /sys/kernel/transparent_hugepage/shmem_enabled, later. Use this flag in collapse_pte_mapped_thp() where previously the VMA flags passed to hugepage_vma_check() were OR'd with VM_HUGEPAGE to elide the VM_HUGEPAGE check in "madvise" THP mode. Prior to "mm: khugepaged: check THP flag in hugepage_vma_check()", this check also didn't check "never" THP mode. As such, this restores the previous behavior of collapse_pte_mapped_thp() where sysfs THP settings are ignored. See comment in code for justification why this is OK. [1] https://lore.kernel.org/linux-mm/CAAa6QmQxay1_=Pmt8oCX2-Va18t44FV-Vs-WsQt_6+qBks4nZA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220706235936.2197195-8-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- include/linux/huge_mm.h | 9 ++++----- mm/huge_memory.c | 14 ++++++-------- mm/khugepaged.c | 25 ++++++++++++++----------- mm/memory.c | 4 ++-- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 34d292cec79a60..f8cd58846a28b4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -866,7 +866,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - hugepage_vma_check(vma, vma->vm_flags, true, false)); + hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 37f2f11a6d7ee5..00312fc251c1be 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf); +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -321,8 +320,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, } static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs) { return false; } diff --git a/mm/huge_memory.c 
b/mm/huge_memory.c index 9f70582b8f5123..0dfc67dec10873 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -69,9 +69,8 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs) { if (!vma->vm_mm) /* vdso */ return false; @@ -120,11 +119,10 @@ bool hugepage_vma_check(struct vm_area_struct *vma, if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (!hugepage_flags_enabled()) - return false; - - /* THP settings require madvise. */ - if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) + /* Enforce sysfs THP requirements as necessary */ + if (enforce_sysfs && + (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_flags_always()))) return false; /* Only regular file is valid */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3128c7fe974257..c2188745982c07 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -481,7 +481,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false, false)) + if (hugepage_vma_check(vma, vm_flags, false, false, true)) __khugepaged_enter(vma->vm_mm); } } @@ -847,7 +847,8 @@ static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - struct vm_area_struct **vmap) + struct vm_area_struct **vmap, + struct collapse_control *cc) { struct vm_area_struct *vma; @@ -860,7 +861,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, 
vma->vm_flags, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, + cc->is_khugepaged)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -979,7 +981,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, goto out_nolock; mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); + result = hugepage_vma_revalidate(mm, address, &vma, cc); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; @@ -1011,7 +1013,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); + result = hugepage_vma_revalidate(mm, address, &vma, cc); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ @@ -1355,12 +1357,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) return; /* - * This vm_flags may not have VM_HUGEPAGE if the page was not - * collapsed by this mm. But we can still collapse if the page is - * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() - * will not fail the vma for missing VM_HUGEPAGE + * If we are here, we've succeeded in replacing all the native pages + * in the page cache with a single hugepage. If a mm were to fault-in + * this memory (mapped by a suitably aligned VMA), we'd get the hugepage + * and map it by a PMD, regardless of sysfs THP settings. As such, let's + * analogously elide sysfs THP settings here. 
*/ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -2047,7 +2050,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { skip: progress++; continue; diff --git a/mm/memory.c b/mm/memory.c index 57480ce3dbc1b5..aa7c7e15abe4ef 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5004,7 +5004,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5038,7 +5038,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, goto retry_pud; if (pmd_none(*vmf.pmd) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; From 301e44c2d20f7fa0f05e410e32781fc919410336 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 20 Jul 2022 07:06:01 -0700 Subject: [PATCH 0843/1250] mm/khugepaged: consistently order cc->is_khugepaged and pte_* checks cc->is_khugepaged is used to predicate the khugepaged-only behavior of enforcing khugepaged heuristics limited by the sysfs knobs khugepaged_max_ptes_[none|swap|shared]. In branches where khugepaged_max_ptes_* is checked, consistently check cc->is_khugepaged first. Also, local counters (for comparison vs khugepaged_max_ptes_* limits) were previously incremented in the comparison expression. 
Some of these counters (unmapped) are additionally used outside of khugepaged_max_ptes_* enforcement, and all counters are communicated in tracepoints. Move the correct accounting of these counters before branching statements to avoid future errors due to C's short-circuiting evaluation. Link: https://lkml.kernel.org/r/20220720140603.1958773-3-zokeefe@google.com Link: https://lore.kernel.org/linux-mm/Ys2qJm6FaOQcxkha@google.com/ Fixes: 9fab4752a181 ("mm/khugepaged: add flag to predicate khugepaged-only behavior") Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: Dan Carpenter Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: "Souptick Joarder (HPE)" Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c2188745982c07..ae30794b1ddb2c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -574,9 +574,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - (++none_or_zero <= khugepaged_max_ptes_none || - !cc->is_khugepaged)) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -596,11 +597,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); - if 
(cc->is_khugepaged && page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out; + } } if (PageCompound(page)) { @@ -1132,8 +1136,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap || - !cc->is_khugepaged) { + ++unmapped; + if (!cc->is_khugepaged || + unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see @@ -1151,9 +1156,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, } } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - (++none_or_zero <= khugepaged_max_ptes_none || - !cc->is_khugepaged)) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1183,12 +1189,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unmap; } - if (cc->is_khugepaged && - page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out_unmap; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out_unmap; + } } page = compound_head(page); @@ -1924,8 +1932,9 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, continue; if (xa_is_value(page)) { + ++swap; if (cc->is_khugepaged && - ++swap > 
khugepaged_max_ptes_swap) { + swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; @@ -1976,8 +1985,8 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none && - cc->is_khugepaged) { + if (cc->is_khugepaged && + present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { From 1281e25a51c34060fc5a46b7223b1235b2121f26 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:26 -0700 Subject: [PATCH 0844/1250] mm/khugepaged: record SCAN_PMD_MAPPED when scan_pmd() finds hugepage When scanning an anon pmd to see if it's eligible for collapse, return SCAN_PMD_MAPPED if the pmd already maps a hugepage. Note that SCAN_PMD_MAPPED is different from SCAN_PAGE_COMPOUND used in the file-collapse path, since the latter might identify pte-mapped compound pages. This is required by MADV_COLLAPSE which necessarily needs to know what hugepage-aligned/sized regions are already pmd-mapped. In order to determine if a pmd already maps a hugepage, refactor mm_find_pmd(): Return mm_find_pmd() to it's pre-commit f72e7dcdd252 ("mm: let mm_find_pmd fix buggy race with THP fault") behavior. ksm was the only caller that explicitly wanted a pte-mapping pmd, so open code the pte-mapping logic there (pmd_present() and pmd_trans_huge() checks). Undo revert change in commit f72e7dcdd252 ("mm: let mm_find_pmd fix buggy race with THP fault") that open-coded split_huge_pmd_address() pmd lookup and use mm_find_pmd() instead. 
Link: https://lkml.kernel.org/r/20220706235936.2197195-9-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 1 + mm/huge_memory.c | 18 +-------- mm/internal.h | 2 +- mm/khugepaged.c | 60 ++++++++++++++++++++++++------ mm/ksm.c | 10 +++++ mm/rmap.c | 15 +++----- 6 files changed, 67 insertions(+), 39 deletions(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index d651f3437367d0..55392bf30a034f 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -11,6 +11,7 @@ EM( SCAN_FAIL, "failed") \ EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_PMD_NULL, "pmd_null") \ + EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0dfc67dec10873..b81a81086f0206 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2263,25 +2263,11 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; + pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) + if (!pmd) 
return; - p4d = p4d_offset(pgd, address); - if (!p4d_present(*p4d)) - return; - - pud = pud_offset(p4d, address); - if (!pud_present(*pud)) - return; - - pmd = pmd_offset(pud, address); - __split_huge_pmd(vma, pmd, address, freeze, folio); } diff --git a/mm/internal.h b/mm/internal.h index 5c7220017c7863..15e8cb1188320e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -188,7 +188,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason /* * in mm/rmap.c: */ -extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ae30794b1ddb2c..ffa695cf5a009c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -28,6 +28,7 @@ enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, + SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, @@ -880,6 +881,45 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, return SCAN_SUCCEED; } +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + pmd_t pmde; + + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + pmde = pmd_read_atomic(*pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ + barrier(); +#endif + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; + if (pmd_trans_huge(pmde)) + return SCAN_PMD_MAPPED; + if (pmd_bad(pmde)) + return SCAN_PMD_NULL; + return SCAN_SUCCEED; +} + +static int check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, + pmd_t *pmd) +{ + pmd_t *new_pmd; + int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + + if (result != SCAN_SUCCEED) + return result; + if (new_pmd != pmd) + return SCAN_FAIL; + return SCAN_SUCCEED; +} + /* * Bring missing pages in from swap, to complete THP collapse. 
* Only done if khugepaged_scan_pmd believes it is worthwhile. @@ -991,9 +1031,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, goto out_nolock; } - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } @@ -1021,7 +1060,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) + result = check_pmd_still_valid(mm, address, pmd); + if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1124,11 +1164,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) goto out; - } memset(cc->node_load, 0, sizeof(cc->node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -1386,8 +1424,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!PageHead(hpage)) goto drop_hpage; - pmd = mm_find_pmd(mm, haddr); - if (!pmd) + if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) goto drop_hpage; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); @@ -1505,8 +1542,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (vma->vm_end < addr + HPAGE_PMD_SIZE) continue; mm = vma->vm_mm; - pmd = mm_find_pmd(mm, addr); - if (!pmd) + if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; /* * We need exclusive mmap_lock to retract page table. 
diff --git a/mm/ksm.c b/mm/ksm.c index 075123602bd07b..3e0a0a42fa1ff2 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1136,6 +1136,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, { struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; + pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; @@ -1150,6 +1151,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; + /* + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. + */ + pmde = *pmd; + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); diff --git a/mm/rmap.c b/mm/rmap.c index edc06c52bc82e7..af775855e58f04 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -767,13 +767,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return vma_address(page, vma); } +/* + * Returns the actual pmd_t* where we expect 'address' to be mapped from, or + * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* + * represents. + */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; - pmd_t pmde; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -788,15 +792,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pmd = pmd_offset(pud, address); - /* - * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() - * without holding anon_vma lock for write. So when looking for a - * genuine pmde (in which to find pte), test present and !THP together. 
- */ - pmde = *pmd; - barrier(); - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) - pmd = NULL; out: return pmd; } From 1c0e1f10dc137539897a7535944c55582cb872b1 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:27 -0700 Subject: [PATCH 0845/1250] mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse This idea was introduced by David Rientjes[1]. Introduce a new madvise mode, MADV_COLLAPSE, that allows users to request a synchronous collapse of memory at their own expense. The benefits of this approach are: * CPU is charged to the process that wants to spend the cycles for the THP * Avoid unpredictable timing of khugepaged collapse Semantics This call is independent of the system-wide THP sysfs settings, but will fail for memory marked VM_NOHUGEPAGE. If the ranges provided span multiple VMAs, the semantics of the collapse over each VMA is independent from the others. This implies a hugepage cannot cross a VMA boundary. If collapse of a given hugepage-aligned/sized region fails, the operation may continue to attempt collapsing the remainder of memory specified. The memory ranges provided must be page-aligned, but are not required to be hugepage-aligned. If the memory ranges are not hugepage-aligned, the start/end of the range will be clamped to the first/last hugepage-aligned address covered by said range. The memory ranges must span at least one hugepage-sized region. All non-resident pages covered by the range will first be swapped/faulted-in, before being internally copied onto a freshly allocated hugepage. Unmapped pages will have their data directly initialized to 0 in the new hugepage. However, for every eligible hugepage aligned/sized region to-be collapsed, at least one page must currently be backed by memory (a PMD covering the address range must already exist). Allocation for the new hugepage may enter direct reclaim and/or compaction, regardless of VMA flags. 
When the system has multiple NUMA nodes, the hugepage will be allocated from the node providing the most native pages. This operation operates on the current state of the specified process and makes no persistent changes or guarantees on how pages will be mapped, constructed, or faulted in the future. Return Value If all hugepage-sized/aligned regions covered by the provided range were either successfully collapsed, or were already PMD-mapped THPs, this operation will be deemed successful. On success, process_madvise(2) returns the number of bytes advised, and madvise(2) returns 0. Else, -1 is returned and errno is set to indicate the error for the most-recently attempted hugepage collapse. Note that many failures might have occurred, since the operation may continue to collapse in the event a single hugepage-sized/aligned region fails. ENOMEM Memory allocation failed or VMA not found EBUSY Memcg charging failed EAGAIN Required resource temporarily unavailable. Trying again might succeed. EINVAL Other error: No PMD found, subpage doesn't have Present bit set, "Special" page not backed by struct page, VMA incorrectly sized, address not page-aligned, ... Most notable here are ENOMEM and EBUSY (new to madvise) which are intended to provide the caller with actionable feedback so they may take an appropriate fallback measure. Use Cases An immediate user of this new functionality are malloc() implementations that manage memory in hugepage-sized chunks, but sometimes subrelease memory back to the system in native-sized chunks via MADV_DONTNEED; zapping the pmd. Later, when the memory is hot, the implementation could madvise(MADV_COLLAPSE) to re-back the memory by THPs to regain hugepage coverage and dTLB performance. TCMalloc is such an implementation that could benefit from this[2]. Only privately-mapped anon memory is supported for now, but additional support for file, shmem, and HugeTLB high-granularity mappings[2] is expected. 
File and tmpfs/shmem support would permit: * Backing executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. With MADV_COLLAPSE, we get the best of both worlds: Peak upfront performance and lower RAM footprints. * Backing guest memory by hugepages after the memory contents have been migrated in native-page-sized chunks to a new host, in a userfaultfd-based live-migration stack. [1] https://lore.kernel.org/linux-mm/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com/ [2] https://github.com/google/tcmalloc/tree/master/tcmalloc Link: https://lkml.kernel.org/r/20220706235936.2197195-10-zokeefe@google.com Signed-off-by: Zach O'Keefe Suggested-by: David Rientjes Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- arch/alpha/include/uapi/asm/mman.h | 2 + arch/mips/include/uapi/asm/mman.h | 2 + arch/parisc/include/uapi/asm/mman.h | 2 + arch/xtensa/include/uapi/asm/mman.h | 2 + include/linux/huge_mm.h | 14 ++- include/uapi/asm-generic/mman-common.h | 2 + mm/khugepaged.c | 118 ++++++++++++++++++- mm/madvise.c | 5 + tools/include/uapi/asm-generic/mman-common.h | 2 + 9 files changed, 146 insertions(+), 3 deletions(-) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 4aa996423b0d16..763929e814e9a3 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -76,6 +76,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 1be428663c1027..c6e1fc77c99688 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -103,6 +103,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index a7ea3204a5faa0..22133a6a506efb 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -70,6 +70,8 @@ #define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */ +#define MADV_COLLAPSE 73 /* Synchronous hugepage 
collapse */ + #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 7966a58af472a1..1ff0c858544fa8 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -111,6 +111,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 00312fc251c1be..39193623442efe 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -218,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); +int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); @@ -361,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma, static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - BUG(); - return 0; + return -EINVAL; } + +static inline int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + return -EINVAL; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6c1aa92a92e441..6ce1f1ceb432c6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -77,6 +77,8 @@ #define MADV_DONTNEED_LOCKED 24 /* 
like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ffa695cf5a009c..76e450cf2465e8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -985,7 +985,8 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, struct collapse_control *cc) { /* Only allocate from the target node */ - gfp_t gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; + gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : + GFP_TRANSHUGE) | __GFP_THISNODE; int node = khugepaged_find_target_node(cc); if (!khugepaged_alloc_page(hpage, gfp, node)) @@ -2370,3 +2371,118 @@ void khugepaged_min_free_kbytes_update(void) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } + +static int madvise_collapse_errno(enum scan_result r) +{ + /* + * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide + * actionable feedback to caller, so they may take an appropriate + * fallback measure depending on the nature of the failure. + */ + switch (r) { + case SCAN_ALLOC_HUGE_PAGE_FAIL: + return -ENOMEM; + case SCAN_CGROUP_CHARGE_FAIL: + return -EBUSY; + /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_LOCK: + case SCAN_PAGE_LRU: + return -EAGAIN; + /* + * Other: Trying again likely not to succeed / error intrinsic to + * specified memory range. khugepaged likely won't be able to collapse + * either. 
+ */ + default: + return -EINVAL; + } +} + +int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct collapse_control *cc; + struct mm_struct *mm = vma->vm_mm; + unsigned long hstart, hend, addr; + int thps = 0, last_fail = SCAN_FAIL; + bool mmap_locked = true; + + BUG_ON(vma->vm_start > start); + BUG_ON(vma->vm_end < end); + + cc = kmalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + cc->is_khugepaged = false; + cc->last_target_node = NUMA_NO_NODE; + + *prev = vma; + + /* TODO: Support file/shmem */ + if (!vma->anon_vma || !vma_is_anonymous(vma)) + return -EINVAL; + + hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = end & HPAGE_PMD_MASK; + + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return -EINVAL; + + mmgrab(mm); + lru_add_drain_all(); + + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { + int result = SCAN_FAIL; + + if (!mmap_locked) { + cond_resched(); + mmap_read_lock(mm); + mmap_locked = true; + result = hugepage_vma_revalidate(mm, addr, &vma, cc); + if (result != SCAN_SUCCEED) { + last_fail = result; + goto out_nolock; + } + } + mmap_assert_locked(mm); + memset(cc->node_load, 0, sizeof(cc->node_load)); + result = khugepaged_scan_pmd(mm, vma, addr, &mmap_locked, cc); + if (!mmap_locked) + *prev = NULL; /* Tell caller we dropped mmap_lock */ + + switch (result) { + case SCAN_SUCCEED: + case SCAN_PMD_MAPPED: + ++thps; + break; + /* Whitelisted set of results where continuing OK */ + case SCAN_PMD_NULL: + case SCAN_PTE_NON_PRESENT: + case SCAN_PTE_UFFD_WP: + case SCAN_PAGE_RO: + case SCAN_LACK_REFERENCED_PAGE: + case SCAN_PAGE_NULL: + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_COMPOUND: + case SCAN_PAGE_LRU: + last_fail = result; + break; + default: + last_fail = result; + /* Other error, exit */ + goto out_maybelock; + } + } + +out_maybelock: + /* Caller expects us to hold mmap_lock on return */ + if 
(!mmap_locked) + mmap_read_lock(mm); +out_nolock: + mmap_assert_locked(mm); + mmdrop(mm); + + return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 + : madvise_collapse_errno(last_fail); +} diff --git a/mm/madvise.c b/mm/madvise.c index 851fa4e134bc54..9f08e958ea8618 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -1057,6 +1058,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; + case MADV_COLLAPSE: + return madvise_collapse(vma, prev, start, end); } anon_name = anon_vma_name(vma); @@ -1150,6 +1153,7 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: + case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: @@ -1339,6 +1343,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. + * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. 
diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 6c1aa92a92e441..6ce1f1ceb432c6 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -77,6 +77,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 From f24147764fbdae021e206a417fb75017b058e00b Mon Sep 17 00:00:00 2001 From: "Souptick Joarder (HPE)" Date: Wed, 13 Jul 2022 08:11:09 +0530 Subject: [PATCH 0846/1250] mm/khugepaged: Avoid possible memory leak in failure path smatch warnings: mm/khugepaged.c:2409 madvise_collapse() warn: possible memory leak of 'cc' Avoiding possible memory leak. Link: https://lkml.kernel.org/r/20220713024109.62810-1-jrdr.linux@gmail.com Signed-off-by: Souptick Joarder (HPE) Reported-by: kernel test robot Reported-by: Dan Carpenter Cc: Zach O'Keefe Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 76e450cf2465e8..3c8aa2db22be8a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2410,12 +2410,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, BUG_ON(vma->vm_start > start); BUG_ON(vma->vm_end < end); - cc = kmalloc(sizeof(*cc), GFP_KERNEL); - if (!cc) - return -ENOMEM; - cc->is_khugepaged = false; - cc->last_target_node = NUMA_NO_NODE; - *prev = vma; /* TODO: Support file/shmem */ @@ -2428,6 +2422,12 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return -EINVAL; + cc = kmalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + cc->is_khugepaged = false; + cc->last_target_node = NUMA_NO_NODE; + mmgrab(mm); lru_add_drain_all(); From f3907356e31ee04779bc5b402d0b64aa31ededa7 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 13 Jul 2022 09:18:51 -0700 Subject: [PATCH 0847/1250] mm/khugepaged: add missing kfree() to madvise_collapse() smatch warnings: mm/khugepaged.c:2409 madvise_collapse() warn: possible memory leak of 'cc' Link: https://lore.kernel.org/linux-mm/202207100715.TBIYQ4fc-lkp@intel.com/ Link: https://lore.kernel.org/linux-mm/20220713024109.62810-1-jrdr.linux@gmail.com/ Link: https://lkml.kernel.org/r/20220713161851.1879439-1-zokeefe@google.com Fixes: 3f7416127072 ("mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse") Signed-off-by: Zach O'Keefe Reported-by: kernel test robot Reported-by: Dan Carpenter Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel 
Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3c8aa2db22be8a..66314506e3ee9e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2482,6 +2482,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, out_nolock: mmap_assert_locked(mm); mmdrop(mm); + kfree(cc); return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 : madvise_collapse_errno(last_fail); From e6f776d7b69d8d1bc3341c6c2f3c497c9a44f3c0 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 20 Jul 2022 07:06:02 -0700 Subject: [PATCH 0848/1250] mm/khugepaged: delay computation of hpage boundaries until use Only compute hstart/hend once we've passed all checks that would cause early return in madvise_collapse(). Link: https://lkml.kernel.org/r/20220720140603.1958773-4-zokeefe@google.com Fixes: c9d968ffd9ba ("mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse") Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: Dan Carpenter Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: "Souptick Joarder (HPE)" Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 66314506e3ee9e..6eec731bb925f3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2416,9 +2416,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, if (!vma->anon_vma || !vma_is_anonymous(vma)) return -EINVAL; - hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; - hend = end & HPAGE_PMD_MASK; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return -EINVAL; @@ -2431,6 +2428,9 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmgrab(mm); lru_add_drain_all(); + hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = end & HPAGE_PMD_MASK; + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { int result = SCAN_FAIL; From 0c92854e460e6104a2526253a72c8c50ab5a5380 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:28 -0700 Subject: [PATCH 0849/1250] mm/khugepaged: rename prefix of shared collapse functions The following functions are shared between khugepaged and madvise collapse contexts. Replace the "khugepaged_" prefix with generic "hpage_collapse_" prefix in such cases: khugepaged_test_exit() -> hpage_collapse_test_exit() khugepaged_scan_abort() -> hpage_collapse_scan_abort() khugepaged_scan_pmd() -> hpage_collapse_scan_pmd() khugepaged_find_target_node() -> hpage_collapse_find_target_node() khugepaged_alloc_page() -> hpage_collapse_alloc_page() The kernel ABI (e.g. huge_memory:mm_khugepaged_scan_pmd tracepoint) is unaltered. 
Link: https://lkml.kernel.org/r/20220706235936.2197195-11-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- mm/khugepaged.c | 68 +++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6eec731bb925f3..28cb8429dad4d7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -97,7 +97,7 @@ struct collapse_control { #else u32 node_load[MAX_NUMNODES]; #endif - /* Last target selected in khugepaged_find_target_node() */ + /* Last target selected in hpage_collapse_find_target_node() */ int last_target_node; }; @@ -441,7 +441,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); } -static inline int khugepaged_test_exit(struct mm_struct *mm) +static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } @@ -456,7 +456,7 @@ void __khugepaged_enter(struct mm_struct *mm) return; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return; @@ -508,11 +508,10 @@ void __khugepaged_exit(struct mm_struct *mm) } else if (mm_slot) { /* * This is required to serialize against - * khugepaged_test_exit() 
(which is guaranteed to run - * under mmap sem read mode). Stop here (after we - * return all pagetables will be destroyed) until - * khugepaged has finished working on the pagetables - * under the mmap_lock. + * hpage_collapse_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we return all + * pagetables will be destroyed) until khugepaged has finished + * working on the pagetables under the mmap_lock. */ mmap_write_lock(mm); mmap_write_unlock(mm); @@ -761,13 +760,12 @@ static void khugepaged_alloc_sleep(void) remove_wait_queue(&khugepaged_wait, &wait); } - struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, .last_target_node = NUMA_NO_NODE, }; -static bool khugepaged_scan_abort(int nid, struct collapse_control *cc) +static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -802,7 +800,7 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(struct collapse_control *cc) +static int hpage_collapse_find_target_node(struct collapse_control *cc) { int nid, target_node = 0, max_value = 0; @@ -826,13 +824,13 @@ static int khugepaged_find_target_node(struct collapse_control *cc) return target_node; } #else -static int khugepaged_find_target_node(struct collapse_control *cc) +static int hpage_collapse_find_target_node(struct collapse_control *cc) { return 0; } #endif -static bool khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node) { *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { @@ -857,7 +855,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, { struct vm_area_struct *vma; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -922,7 +920,7 @@ static int 
check_pmd_still_valid(struct mm_struct *mm, /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if khugepaged_scan_pmd believes it is worthwhile. + * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Note that if false is returned, mmap_lock will be released. @@ -987,9 +985,9 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, /* Only allocate from the target node */ gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : GFP_TRANSHUGE) | __GFP_THISNODE; - int node = khugepaged_find_target_node(cc); + int node = hpage_collapse_find_target_node(cc); - if (!khugepaged_alloc_page(hpage, gfp, node)) + if (!hpage_collapse_alloc_page(hpage, gfp, node)) return SCAN_ALLOC_HUGE_PAGE_FAIL; if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) return SCAN_CGROUP_CHARGE_FAIL; @@ -1149,9 +1147,10 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, return result; } -static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, bool *mmap_locked, - struct collapse_control *cc) +static int hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, bool *mmap_locked, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; @@ -1247,7 +1246,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, * hit record. 
*/ node = page_to_nid(page); - if (khugepaged_scan_abort(node, cc)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } @@ -1326,7 +1325,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (khugepaged_test_exit(mm)) { + if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); @@ -1499,7 +1498,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) if (!mmap_write_trylock(mm)) return; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto out; for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) @@ -1561,7 +1560,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * it'll always mapped in small page size for uffd-wp * registered ranges. */ - if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma)) + if (!hpage_collapse_test_exit(mm) && + !userfaultfd_wp(vma)) collapse_and_free_pmd(mm, vma, addr, pmd); mmap_write_unlock(mm); } else { @@ -1989,7 +1989,7 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, } node = page_to_nid(page); - if (khugepaged_scan_abort(node, cc)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } @@ -2083,7 +2083,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, goto breakouterloop_mmap_lock; progress++; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto breakouterloop; address = khugepaged_scan.address; @@ -2092,7 +2092,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, unsigned long hstart, hend; cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit(mm))) { progress++; break; } @@ -2113,7 +2113,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, bool mmap_locked = true; cond_resched(); 
- if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2130,9 +2130,10 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, mmap_locked = false; fput(file); } else { - *result = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - &mmap_locked, cc); + *result = hpage_collapse_scan_pmd(mm, vma, + khugepaged_scan.address, + &mmap_locked, + cc); } if (*result == SCAN_SUCCEED) ++khugepaged_pages_collapsed; @@ -2162,7 +2163,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (khugepaged_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find @@ -2446,7 +2447,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, } mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); - result = khugepaged_scan_pmd(mm, vma, addr, &mmap_locked, cc); + result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, + cc); if (!mmap_locked) *prev = NULL; /* Tell caller we dropped mmap_lock */ From febc5f7b36ec286f47ea8abdd534d52f3e94921d Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:30 -0700 Subject: [PATCH 0850/1250] mm/madvise: add MADV_COLLAPSE to process_madvise() Allow MADV_COLLAPSE behavior for process_madvise(2) if caller has CAP_SYS_ADMIN or is requesting collapse of its own memory. This is useful for the development of userspace agents that seek to optimize THP utilization system-wide by using userspace signals to prioritize what memory is most deserving of being THP-backed. 
Link: https://lkml.kernel.org/r/20220706235936.2197195-13-zokeefe@google.com Signed-off-by: Zach O'Keefe Acked-by: David Rientjes Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- mm/madvise.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 9f08e958ea8618..6fb6b7160bdadb 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1171,13 +1171,15 @@ madvise_behavior_valid(int behavior) } static bool -process_madvise_behavior_valid(int behavior) +process_madvise_behavior_valid(int behavior, struct task_struct *task) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: return true; + case MADV_COLLAPSE: + return task == current || capable(CAP_SYS_ADMIN); default: return false; } @@ -1455,7 +1457,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto free_iov; } - if (!process_madvise_behavior_valid(behavior)) { + if (!process_madvise_behavior_valid(behavior, task)) { ret = -EINVAL; goto release_task; } From 9eebbc3eebc6cdb9d919d1ebc4f36471c3e31cc5 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:32 -0700 Subject: [PATCH 0851/1250] selftests/vm: modularize collapse selftests Modularize the collapse action of khugepaged collapse selftests by introducing a struct collapse_context which specifies how to collapse a given memory range and the expected semantics of the collapse. 
This can be reused later to test other collapse contexts. Additionally, all tests have logic that checks if a collapse occurred via reading /proc/self/smaps, and report if this is different than expected. Move this logic into the per-context ->collapse() hook instead of repeating it in every test. Link: https://lkml.kernel.org/r/20220706235936.2197195-15-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 251 +++++++++++------------- 1 file changed, 110 insertions(+), 141 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 155120b67a165d..0f1bee0eff24e1 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -23,6 +23,11 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +struct collapse_context { + void (*collapse)(const char *msg, char *p, bool expect); + bool enforce_pte_scan_limits; +}; + enum thp_enabled { THP_ALWAYS, THP_MADVISE, @@ -501,6 +506,21 @@ static bool wait_for_scan(const char *msg, char *p) return timeout == -1; } +static void khugepaged_collapse(const char *msg, char *p, bool expect) +{ + if (wait_for_scan(msg, p)) { + if (expect) + fail("Timeout"); + else + success("OK"); + return; + } else if (check_huge(p) == expect) { + success("OK"); 
+ } else { + fail("Fail"); + } +} + static void alloc_at_fault(void) { struct settings settings = default_settings; @@ -528,53 +548,39 @@ static void alloc_at_fault(void) munmap(p, hpage_pmd_size); } -static void collapse_full(void) +static void collapse_full(struct collapse_context *c) { void *p; p = alloc_mapping(); fill_memory(p, 0, hpage_pmd_size); - if (wait_for_scan("Collapse fully populated PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse fully populated PTE table", p, true); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); } -static void collapse_empty(void) +static void collapse_empty(struct collapse_context *c) { void *p; p = alloc_mapping(); - if (wait_for_scan("Do not collapse empty PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + c->collapse("Do not collapse empty PTE table", p, false); munmap(p, hpage_pmd_size); } -static void collapse_single_pte_entry(void) +static void collapse_single_pte_entry(struct collapse_context *c) { void *p; p = alloc_mapping(); fill_memory(p, 0, page_size); - if (wait_for_scan("Collapse PTE table with single PTE entry present", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single PTE entry present", p, + true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); } -static void collapse_max_ptes_none(void) +static void collapse_max_ptes_none(struct collapse_context *c) { int max_ptes_none = hpage_pmd_nr / 2; struct settings settings = default_settings; @@ -586,28 +592,22 @@ static void collapse_max_ptes_none(void) p = alloc_mapping(); fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + c->collapse("Maybe collapse with 
max_ptes_none exceeded", p, + !c->enforce_pte_scan_limits); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + if (c->enforce_pte_scan_limits) { + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, true); + validate_memory(p, 0, + (hpage_pmd_nr - max_ptes_none) * page_size); + } munmap(p, hpage_pmd_size); write_settings(&default_settings); } -static void collapse_swapin_single_pte(void) +static void collapse_swapin_single_pte(struct collapse_context *c) { void *p; p = alloc_mapping(); @@ -625,18 +625,13 @@ static void collapse_swapin_single_pte(void) goto out; } - if (wait_for_scan("Collapse with swapping in single PTE entry", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse with swapping in single PTE entry", p, true); validate_memory(p, 0, hpage_pmd_size); out: munmap(p, hpage_pmd_size); } -static void collapse_max_ptes_swap(void) +static void collapse_max_ptes_swap(struct collapse_context *c) { int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); void *p; @@ -656,39 +651,34 @@ static void collapse_max_ptes_swap(void) goto out; } - if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, + !c->enforce_pte_scan_limits); validate_memory(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); - if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - 
exit(EXIT_FAILURE); - } - if (check_swap(p, max_ptes_swap * page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } + if (c->enforce_pte_scan_limits) { + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, + hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } - if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); + c->collapse("Collapse with max_ptes_swap pages swapped out", p, + true); + validate_memory(p, 0, hpage_pmd_size); + } out: munmap(p, hpage_pmd_size); } -static void collapse_single_pte_entry_compound(void) +static void collapse_single_pte_entry_compound(struct collapse_context *c) { void *p; @@ -710,17 +700,13 @@ static void collapse_single_pte_entry_compound(void) else fail("Fail"); - if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single PTE mapping compound page", + p, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); } -static void collapse_full_of_compound(void) +static void collapse_full_of_compound(struct collapse_context *c) { void *p; @@ -742,17 +728,12 @@ static void collapse_full_of_compound(void) else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of compound pages", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages", p, true); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); } -static void collapse_compound_extreme(void) +static void 
collapse_compound_extreme(struct collapse_context *c) { void *p; int i; @@ -798,18 +779,14 @@ static void collapse_compound_extreme(void) else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of different compound pages", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of different compound pages", p, + true); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); } -static void collapse_fork(void) +static void collapse_fork(struct collapse_context *c) { int wstatus; void *p; @@ -835,13 +812,8 @@ static void collapse_fork(void) fail("Fail"); fill_memory(p, page_size, 2 * page_size); - - if (wait_for_scan("Collapse PTE table with single page shared with parent process", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single page shared with parent process", + p, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); @@ -860,7 +832,7 @@ static void collapse_fork(void) munmap(p, hpage_pmd_size); } -static void collapse_fork_compound(void) +static void collapse_fork_compound(struct collapse_context *c) { int wstatus; void *p; @@ -896,14 +868,10 @@ static void collapse_fork_compound(void) fill_memory(p, 0, page_size); write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); - if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages in child", + p, true); write_num("khugepaged/max_ptes_shared", - default_settings.khugepaged.max_ptes_shared); + default_settings.khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); @@ -922,7 +890,7 @@ static void collapse_fork_compound(void) munmap(p, hpage_pmd_size); } -static void collapse_max_ptes_shared() +static void 
collapse_max_ptes_shared(struct collapse_context *c) { int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); int wstatus; @@ -957,28 +925,22 @@ static void collapse_max_ptes_shared() else fail("Fail"); - if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) - fail("Timeout"); - else if (!check_huge(p)) - success("OK"); - else - fail("Fail"); - - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); - if (!check_huge(p)) - success("OK"); - else - fail("Fail"); - - - if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Maybe collapse with max_ptes_shared exceeded", p, + !c->enforce_pte_scan_limits); + + if (c->enforce_pte_scan_limits) { + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * + page_size); + if (!check_huge(p)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse with max_ptes_shared PTEs shared", + p, true); + } validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); @@ -999,6 +961,8 @@ static void collapse_max_ptes_shared() int main(void) { + struct collapse_context c; + setbuf(stdout, NULL); page_size = getpagesize(); @@ -1014,18 +978,23 @@ int main(void) adjust_settings(); alloc_at_fault(); - collapse_full(); - collapse_empty(); - collapse_single_pte_entry(); - collapse_max_ptes_none(); - collapse_swapin_single_pte(); - collapse_max_ptes_swap(); - collapse_single_pte_entry_compound(); - collapse_full_of_compound(); - collapse_compound_extreme(); - collapse_fork(); - collapse_fork_compound(); - collapse_max_ptes_shared(); + + printf("\n*** Testing context: khugepaged ***\n"); + c.collapse = &khugepaged_collapse; + c.enforce_pte_scan_limits = true; + + collapse_full(&c); + collapse_empty(&c); + 
collapse_single_pte_entry(&c); + collapse_max_ptes_none(&c); + collapse_swapin_single_pte(&c); + collapse_max_ptes_swap(&c); + collapse_single_pte_entry_compound(&c); + collapse_full_of_compound(&c); + collapse_compound_extreme(&c); + collapse_fork(&c); + collapse_fork_compound(&c); + collapse_max_ptes_shared(&c); restore_settings(0); } From fd9ec36f95576a920516be5eea9295c6773e688e Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:33 -0700 Subject: [PATCH 0852/1250] selftests/vm: dedup hugepage allocation logic The code p = alloc_mapping(); printf("Allocate huge page..."); madvise(p, hpage_pmd_size, MADV_HUGEPAGE); fill_memory(p, 0, hpage_pmd_size); if (check_huge(p)) success("OK"); else fail("Fail"); Is repeated many times in different tests. Add a helper, alloc_hpage() to handle this. Link: https://lkml.kernel.org/r/20220706235936.2197195-16-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 62 +++++++++---------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 0f1bee0eff24e1..eb6f5bbacff136 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -461,6 +461,25 @@ static void fill_memory(int *p, unsigned long start, unsigned long end) p[i * page_size / sizeof(*p)] = i + 0xdead0000; } +/* + * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with + * validate_memory()'able contents. + */ +static void *alloc_hpage(void) +{ + void *p; + + p = alloc_mapping(); + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p)) + success("OK"); + else + fail("Fail"); + return p; +} + static void validate_memory(int *p, unsigned long start, unsigned long end) { int i; @@ -682,15 +701,7 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c) { void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); + p = alloc_hpage(); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); @@ -710,16 +721,7 @@ static void collapse_full_of_compound(struct collapse_context *c) { void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, 
hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(); printf("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); @@ -837,16 +839,7 @@ static void collapse_fork_compound(struct collapse_context *c) int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ @@ -896,16 +889,7 @@ static void collapse_max_ptes_shared(struct collapse_context *c) int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ From d253a2e6b1482ff56774154e5d07e74d760973e7 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:34 -0700 Subject: [PATCH 0853/1250] selftests/vm: add MADV_COLLAPSE collapse context to selftests Add madvise collapse context to hugepage collapse selftests. This context is tested with /sys/kernel/mm/transparent_hugepage/enabled set to "never" in order to avoid unwanted interaction with khugepaged during testing. Also, refactor updates to sysfs THP settings using a stack so that the THP settings from nested callers can be restored. 
Link: https://lkml.kernel.org/r/20220706235936.2197195-17-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 171 +++++++++++++++++------- 1 file changed, 125 insertions(+), 46 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index eb6f5bbacff136..780f04440e154e 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -14,6 +14,9 @@ #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif #define BASE_ADDR ((void *)(1UL << 30)) static unsigned long hpage_pmd_size; @@ -95,18 +98,6 @@ struct settings { struct khugepaged_settings khugepaged; }; -static struct settings default_settings = { - .thp_enabled = THP_MADVISE, - .thp_defrag = THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_NEVER, - .use_zero_page = 0, - .khugepaged = { - .defrag = 1, - .alloc_sleep_millisecs = 10, - .scan_sleep_millisecs = 10, - }, -}; - static struct settings saved_settings; static bool skip_settings_restore; @@ -284,6 +275,39 @@ static void write_settings(struct settings *settings) write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); } +#define MAX_SETTINGS_DEPTH 4 +static struct settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; + +static struct settings 
*current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +static void push_settings(struct settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + write_settings(current_settings()); +} + +static void pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + write_settings(current_settings()); +} + static void restore_settings(int sig) { if (skip_settings_restore) @@ -327,14 +351,6 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -static void adjust_settings(void) -{ - - printf("Adjust settings..."); - write_settings(&default_settings); - success("OK"); -} - #define MAX_LINE_LENGTH 500 static bool check_for_pattern(FILE *fp, char *pattern, char *buf) @@ -493,6 +509,38 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) } } +static void madvise_collapse(const char *msg, char *p, bool expect) +{ + int ret; + struct settings settings = *current_settings(); + + printf("%s...", msg); + /* Sanity check */ + if (check_huge(p)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + /* + * Prevent khugepaged interference and tests that MADV_COLLAPSE + * ignores /sys/kernel/mm/transparent_hugepage/enabled + */ + settings.thp_enabled = THP_NEVER; + push_settings(&settings); + + /* Clear VM_NOHUGEPAGE */ + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise(p, hpage_pmd_size, MADV_COLLAPSE); + if (((bool)ret) == expect) + fail("Fail: Bad return value"); + else if (check_huge(p) != expect) + fail("Fail: check_huge()"); + else + success("OK"); + + pop_settings(); +} + #define TICK 500000 static bool wait_for_scan(const char *msg, char *p) { @@ -542,11 +590,11 @@ static void 
khugepaged_collapse(const char *msg, char *p, bool expect) static void alloc_at_fault(void) { - struct settings settings = default_settings; + struct settings settings = *current_settings(); char *p; settings.thp_enabled = THP_ALWAYS; - write_settings(&settings); + push_settings(&settings); p = alloc_mapping(); *p = 1; @@ -556,7 +604,7 @@ static void alloc_at_fault(void) else fail("Fail"); - write_settings(&default_settings); + pop_settings(); madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); @@ -602,11 +650,11 @@ static void collapse_single_pte_entry(struct collapse_context *c) static void collapse_max_ptes_none(struct collapse_context *c) { int max_ptes_none = hpage_pmd_nr / 2; - struct settings settings = default_settings; + struct settings settings = *current_settings(); void *p; settings.khugepaged.max_ptes_none = max_ptes_none; - write_settings(&settings); + push_settings(&settings); p = alloc_mapping(); @@ -623,7 +671,7 @@ static void collapse_max_ptes_none(struct collapse_context *c) } munmap(p, hpage_pmd_size); - write_settings(&default_settings); + pop_settings(); } static void collapse_swapin_single_pte(struct collapse_context *c) @@ -703,7 +751,6 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c) p = alloc_hpage(); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); if (!check_huge(p)) @@ -864,7 +911,7 @@ static void collapse_fork_compound(struct collapse_context *c) c->collapse("Collapse PTE table full of compound pages in child", p, true); write_num("khugepaged/max_ptes_shared", - default_settings.khugepaged.max_ptes_shared); + current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); @@ -943,9 +990,21 @@ static void collapse_max_ptes_shared(struct collapse_context *c) munmap(p, hpage_pmd_size); } -int 
main(void) +int main(int argc, const char **argv) { struct collapse_context c; + struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_NEVER, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, + }; + const char *tests = argc == 1 ? "all" : argv[1]; setbuf(stdout, NULL); @@ -959,26 +1018,46 @@ int main(void) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; save_settings(); - adjust_settings(); + push_settings(&default_settings); alloc_at_fault(); - printf("\n*** Testing context: khugepaged ***\n"); - c.collapse = &khugepaged_collapse; - c.enforce_pte_scan_limits = true; - - collapse_full(&c); - collapse_empty(&c); - collapse_single_pte_entry(&c); - collapse_max_ptes_none(&c); - collapse_swapin_single_pte(&c); - collapse_max_ptes_swap(&c); - collapse_single_pte_entry_compound(&c); - collapse_full_of_compound(&c); - collapse_compound_extreme(&c); - collapse_fork(&c); - collapse_fork_compound(&c); - collapse_max_ptes_shared(&c); + if (!strcmp(tests, "khugepaged") || !strcmp(tests, "all")) { + printf("\n*** Testing context: khugepaged ***\n"); + c.collapse = &khugepaged_collapse; + c.enforce_pte_scan_limits = true; + + collapse_full(&c); + collapse_empty(&c); + collapse_single_pte_entry(&c); + collapse_max_ptes_none(&c); + collapse_swapin_single_pte(&c); + collapse_max_ptes_swap(&c); + collapse_single_pte_entry_compound(&c); + collapse_full_of_compound(&c); + collapse_compound_extreme(&c); + collapse_fork(&c); + collapse_fork_compound(&c); + collapse_max_ptes_shared(&c); + } + if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) { + printf("\n*** Testing context: madvise ***\n"); + c.collapse = &madvise_collapse; + c.enforce_pte_scan_limits = false; + + collapse_full(&c); + collapse_empty(&c); + collapse_single_pte_entry(&c); + collapse_max_ptes_none(&c); + collapse_swapin_single_pte(&c); + 
collapse_max_ptes_swap(&c); + collapse_single_pte_entry_compound(&c); + collapse_full_of_compound(&c); + collapse_compound_extreme(&c); + collapse_fork(&c); + collapse_fork_compound(&c); + collapse_max_ptes_shared(&c); + } restore_settings(0); } From 1444590a71c51906ddade3770c3c0e7fee901199 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:35 -0700 Subject: [PATCH 0854/1250] selftests/vm: add selftest to verify recollapse of THPs Add selftest specific to madvise collapse context that tests MADV_COLLAPSE is "successful" if a hugepage-aligned/sized region is already pmd-mapped. This test also verifies that MADV_COLLAPSE can collapse memory into THPs even in "madvise" THP mode and the memory isn't marked VM_HUGEPAGE. Link: https://lkml.kernel.org/r/20220706235936.2197195-18-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 31 +++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 780f04440e154e..87cd0b99477f05 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -990,6 +990,36 @@ static void collapse_max_ptes_shared(struct collapse_context *c) munmap(p, hpage_pmd_size); } +static void madvise_collapse_existing_thps(void) +{ + void *p; + int err; + + p = alloc_mapping(); + fill_memory(p, 0, hpage_pmd_size); + + printf("Collapse fully populated PTE table..."); + /* + * Note that we don't set MADV_HUGEPAGE here, which + * also tests that VM_HUGEPAGE isn't required for + * MADV_COLLAPSE in "madvise" mode. 
+ */ + err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); + if (err == 0 && check_huge(p)) { + success("OK"); + printf("Re-collapse PMD-mapped hugepage"); + err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); + if (err == 0 && check_huge(p)) + success("OK"); + else + fail("Fail"); + } else { + fail("Fail"); + } + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + int main(int argc, const char **argv) { struct collapse_context c; @@ -1057,6 +1087,7 @@ int main(int argc, const char **argv) collapse_fork(&c); collapse_fork_compound(&c); collapse_max_ptes_shared(&c); + madvise_collapse_existing_thps(); } restore_settings(0); From d6b6fbe084da940bfdfcc1f0883f6273ad975acc Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:36 -0700 Subject: [PATCH 0855/1250] selftests/vm: add selftest to verify multi THP collapse Add support to allocate and verify collapse of multiple hugepage-sized regions into multiple THPs. Add "nr" argument to check_huge() that instructs check_huge() to check for exactly "nr_hpages" THPs. This has the added benefit of now being able to check for exactly 0 THPs, and so callsites that previously checked the negation of exactly 1 THP are now more correct. ->collapse struct collapse_context hook has been expanded with a "nr_hpages" argument to collapse "nr_hpages" hugepages. The collapse_full() test has been repurposed to collapse 4 THPs at once. It is expected more tests will want to test multi THP collapse (e.g. file/shmem). This is of particular benefit to madvise collapse context given that it may do many THP collapses during a single syscall. Link: https://lkml.kernel.org/r/20220706235936.2197195-19-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. 
Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Yang Shi Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 140 ++++++++++++------------ 1 file changed, 73 insertions(+), 67 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 87cd0b99477f05..b77b1e28cdb388 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -27,7 +27,7 @@ static int hpage_pmd_nr; #define PID_SMAPS "/proc/self/smaps" struct collapse_context { - void (*collapse)(const char *msg, char *p, bool expect); + void (*collapse)(const char *msg, char *p, int nr_hpages, bool expect); bool enforce_pte_scan_limits; }; @@ -362,7 +362,7 @@ static bool check_for_pattern(FILE *fp, char *pattern, char *buf) return false; } -static bool check_huge(void *addr) +static bool check_huge(void *addr, int nr_hpages) { bool thp = false; int ret; @@ -387,7 +387,7 @@ static bool check_huge(void *addr) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", - hpage_pmd_size >> 10); + nr_hpages * (hpage_pmd_size >> 10)); if (ret >= MAX_LINE_LENGTH) { printf("%s: Pattern is too long\n", __func__); exit(EXIT_FAILURE); @@ -455,12 +455,12 @@ static bool check_swap(void *addr, unsigned long size) return swap; } -static void *alloc_mapping(void) +static void *alloc_mapping(int nr) { void *p; - p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (p != BASE_ADDR) { printf("Failed to allocate VMA at %p\n", BASE_ADDR); exit(EXIT_FAILURE); @@ -485,11 +485,11 @@ 
static void *alloc_hpage(void) { void *p; - p = alloc_mapping(); + p = alloc_mapping(1); printf("Allocate huge page..."); madvise(p, hpage_pmd_size, MADV_HUGEPAGE); fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -509,14 +509,15 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) } } -static void madvise_collapse(const char *msg, char *p, bool expect) +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + bool expect) { int ret; struct settings settings = *current_settings(); printf("%s...", msg); /* Sanity check */ - if (check_huge(p)) { + if (!check_huge(p, 0)) { printf("Unexpected huge page\n"); exit(EXIT_FAILURE); } @@ -529,11 +530,11 @@ static void madvise_collapse(const char *msg, char *p, bool expect) push_settings(&settings); /* Clear VM_NOHUGEPAGE */ - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - ret = madvise(p, hpage_pmd_size, MADV_COLLAPSE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise(p, nr_hpages * hpage_pmd_size, MADV_COLLAPSE); if (((bool)ret) == expect) fail("Fail: Bad return value"); - else if (check_huge(p) != expect) + else if (check_huge(p, nr_hpages) != expect) fail("Fail: check_huge()"); else success("OK"); @@ -542,25 +543,25 @@ static void madvise_collapse(const char *msg, char *p, bool expect) } #define TICK 500000 -static bool wait_for_scan(const char *msg, char *p) +static bool wait_for_scan(const char *msg, char *p, int nr_hpages) { int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (check_huge(p)) { + if (!check_huge(p, 0)) { printf("Unexpected huge page\n"); exit(EXIT_FAILURE); } - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); /* Wait until the second full_scan completed */ full_scans = read_num("khugepaged/full_scans") + 2; printf("%s...", msg); while (timeout--) { - if (check_huge(p)) + if (check_huge(p, nr_hpages)) 
break; if (read_num("khugepaged/full_scans") >= full_scans) break; @@ -568,20 +569,21 @@ static bool wait_for_scan(const char *msg, char *p) usleep(TICK); } - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); return timeout == -1; } -static void khugepaged_collapse(const char *msg, char *p, bool expect) +static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, + bool expect) { - if (wait_for_scan(msg, p)) { + if (wait_for_scan(msg, p, nr_hpages)) { if (expect) fail("Timeout"); else success("OK"); return; - } else if (check_huge(p) == expect) { + } else if (check_huge(p, nr_hpages) == expect) { success("OK"); } else { fail("Fail"); @@ -596,10 +598,10 @@ static void alloc_at_fault(void) settings.thp_enabled = THP_ALWAYS; push_settings(&settings); - p = alloc_mapping(); + p = alloc_mapping(1); *p = 1; printf("Allocate huge page on fault..."); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -608,7 +610,7 @@ static void alloc_at_fault(void) madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -618,20 +620,23 @@ static void alloc_at_fault(void) static void collapse_full(struct collapse_context *c) { void *p; + int nr_hpages = 4; + unsigned long size = nr_hpages * hpage_pmd_size; - p = alloc_mapping(); - fill_memory(p, 0, hpage_pmd_size); - c->collapse("Collapse fully populated PTE table", p, true); - validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, + true); + validate_memory(p, 0, size); + munmap(p, size); } static void collapse_empty(struct collapse_context *c) { void *p; - p = alloc_mapping(); - c->collapse("Do not collapse empty PTE table", p, false); + p = alloc_mapping(1); + c->collapse("Do not 
collapse empty PTE table", p, 1, false); munmap(p, hpage_pmd_size); } @@ -639,10 +644,10 @@ static void collapse_single_pte_entry(struct collapse_context *c) { void *p; - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, page_size); c->collapse("Collapse PTE table with single PTE entry present", p, - true); + 1, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); } @@ -656,16 +661,17 @@ static void collapse_max_ptes_none(struct collapse_context *c) settings.khugepaged.max_ptes_none = max_ptes_none; push_settings(&settings); - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - c->collapse("Maybe collapse with max_ptes_none exceeded", p, + c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, !c->enforce_pte_scan_limits); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); if (c->enforce_pte_scan_limits) { fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - c->collapse("Collapse with max_ptes_none PTEs empty", p, true); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, + true); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); } @@ -677,7 +683,7 @@ static void collapse_max_ptes_none(struct collapse_context *c) static void collapse_swapin_single_pte(struct collapse_context *c) { void *p; - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, hpage_pmd_size); printf("Swapout one page..."); @@ -692,7 +698,7 @@ static void collapse_swapin_single_pte(struct collapse_context *c) goto out; } - c->collapse("Collapse with swapping in single PTE entry", p, true); + c->collapse("Collapse with swapping in single PTE entry", p, 1, true); validate_memory(p, 0, hpage_pmd_size); out: munmap(p, hpage_pmd_size); @@ -703,7 +709,7 @@ static void collapse_max_ptes_swap(struct collapse_context *c) int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); void *p; - p = alloc_mapping(); + p = alloc_mapping(1); 
fill_memory(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); @@ -718,7 +724,7 @@ static void collapse_max_ptes_swap(struct collapse_context *c) goto out; } - c->collapse("Maybe collapse with max_ptes_swap exceeded", p, + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, !c->enforce_pte_scan_limits); validate_memory(p, 0, hpage_pmd_size); @@ -738,7 +744,7 @@ static void collapse_max_ptes_swap(struct collapse_context *c) } c->collapse("Collapse with max_ptes_swap pages swapped out", p, - true); + 1, true); validate_memory(p, 0, hpage_pmd_size); } out: @@ -753,13 +759,13 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c) madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Collapse PTE table with single PTE mapping compound page", - p, true); + p, 1, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); } @@ -772,12 +778,12 @@ static void collapse_full_of_compound(struct collapse_context *c) printf("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - c->collapse("Collapse PTE table full of compound pages", p, true); + c->collapse("Collapse PTE table full of compound pages", p, 1, true); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); } @@ -787,14 +793,14 @@ static void collapse_compound_extreme(struct collapse_context *c) void *p; int i; - p = alloc_mapping(); + p = alloc_mapping(1); for (i = 0; i < hpage_pmd_nr; i++) { printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", i + 1, hpage_pmd_nr); 
madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); fill_memory(BASE_ADDR, 0, hpage_pmd_size); - if (!check_huge(BASE_ADDR)) { + if (!check_huge(BASE_ADDR, 1)) { printf("Failed to allocate huge page\n"); exit(EXIT_FAILURE); } @@ -823,12 +829,12 @@ static void collapse_compound_extreme(struct collapse_context *c) munmap(BASE_ADDR, hpage_pmd_size); fill_memory(p, 0, hpage_pmd_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - c->collapse("Collapse PTE table full of different compound pages", p, + c->collapse("Collapse PTE table full of different compound pages", p, 1, true); validate_memory(p, 0, hpage_pmd_size); @@ -840,11 +846,11 @@ static void collapse_fork(struct collapse_context *c) int wstatus; void *p; - p = alloc_mapping(); + p = alloc_mapping(1); printf("Allocate small page..."); fill_memory(p, 0, page_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -855,14 +861,14 @@ static void collapse_fork(struct collapse_context *c) skip_settings_restore = true; exit_status = 0; - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); fill_memory(p, page_size, 2 * page_size); c->collapse("Collapse PTE table with single page shared with parent process", - p, true); + p, 1, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); @@ -873,7 +879,7 @@ static void collapse_fork(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has small page..."); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -893,7 +899,7 @@ static void collapse_fork_compound(struct collapse_context *c) skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -901,7 +907,7 @@ static void collapse_fork_compound(struct collapse_context *c) printf("Split huge page PMD in child process..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, 
hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -909,7 +915,7 @@ static void collapse_fork_compound(struct collapse_context *c) write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); c->collapse("Collapse PTE table full of compound pages in child", - p, true); + p, 1, true); write_num("khugepaged/max_ptes_shared", current_settings()->khugepaged.max_ptes_shared); @@ -922,7 +928,7 @@ static void collapse_fork_compound(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -943,7 +949,7 @@ static void collapse_max_ptes_shared(struct collapse_context *c) skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -951,26 +957,26 @@ static void collapse_max_ptes_shared(struct collapse_context *c) printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Maybe collapse with max_ptes_shared exceeded", p, - !c->enforce_pte_scan_limits); + 1, !c->enforce_pte_scan_limits); if (c->enforce_pte_scan_limits) { printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Collapse with max_ptes_shared PTEs shared", - p, true); + p, 1, true); } validate_memory(p, 0, hpage_pmd_size); @@ -982,7 +988,7 @@ static void collapse_max_ptes_shared(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else 
fail("Fail"); @@ -995,7 +1001,7 @@ static void madvise_collapse_existing_thps(void) void *p; int err; - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, hpage_pmd_size); printf("Collapse fully populated PTE table..."); @@ -1005,11 +1011,11 @@ static void madvise_collapse_existing_thps(void) * MADV_COLLAPSE in "madvise" mode. */ err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); - if (err == 0 && check_huge(p)) { + if (err == 0 && check_huge(p, 1)) { success("OK"); printf("Re-collapse PMD-mapped hugepage"); err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); - if (err == 0 && check_huge(p)) + if (err == 0 && check_huge(p, 1)) success("OK"); else fail("Fail"); From f41c329749ace90a35d00ef4c82b133cb1845579 Mon Sep 17 00:00:00 2001 From: William Lam Date: Mon, 11 Jul 2022 21:28:06 +0100 Subject: [PATCH 0856/1250] mm: compaction: include compound page count for scanning in pageblock isolation The number of scanned pages can be lower than the number of isolated pages when isolating migratable or free pageblock. The metric is being reported in trace event and also used in vmstat. Some example output from trace where it shows nr_taken can be greater than nr_scanned: Produced by kernel v5.19-rc6 kcompactd0-42 [001] ..... 1210.268022: mm_compaction_isolate_migratepages: range=(0x107ae4 ~ 0x107c00) nr_scanned=265 nr_taken=255 [...] kcompactd0-42 [001] ..... 1210.268382: mm_compaction_isolate_freepages: range=(0x215800 ~ 0x215a00) nr_scanned=13 nr_taken=128 kcompactd0-42 [001] ..... 1210.268383: mm_compaction_isolate_freepages: range=(0x215600 ~ 0x215680) nr_scanned=1 nr_taken=128 mm_compaction_isolate_migratepages does not seem to have this behaviour, but for the reason of consistency, nr_scanned should also be taken care of in that side. This behaviour is confusing since currently the count for isolated pages takes account of compound page but not for the case of scanned pages.
And given that the number of isolated pages(nr_taken) reported in mm_compaction_isolate_template trace event is on a single-page basis, the ambiguity when reporting the number of scanned pages can be removed by also including compound page count. Link: https://lkml.kernel.org/r/20220711202806.22296-1-william.lam@bytedance.com Signed-off-by: William Lam Reviewed-by: Punit Agrawal Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index cd029ab03d0e57..d024d18e0b5ca1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -616,6 +616,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, break; set_page_private(page, order); + nr_scanned += isolated - 1; total_isolated += isolated; cc->nr_freepages += isolated; list_add_tail(&page->lru, freelist); @@ -1101,6 +1102,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, isolate_success_no_list: cc->nr_migratepages += compound_nr(page); nr_isolated += compound_nr(page); + nr_scanned += compound_nr(page) - 1; /* * Avoid isolating too much unless this block is being @@ -1504,6 +1506,7 @@ fast_isolate_freepages(struct compact_control *cc) if (__isolate_free_page(page, order)) { set_page_private(page, order); nr_isolated = 1 << order; + nr_scanned += nr_isolated - 1; cc->nr_freepages += nr_isolated; list_add_tail(&page->lru, &cc->freepages); count_compact_events(COMPACTISOLATED, nr_isolated); From b67c9c2c38d5a2ee82f001256700d072e7d055d9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 16 Jul 2022 16:03:59 +0800 Subject: [PATCH 0857/1250] mm: remove obsolete comment in do_fault_around() Since commit 7267ec008b5c ("mm: postpone page table allocation until we have page to map"), do_fault_around is not called with page table lock held. Cleanup the corresponding comments. 
Link: https://lkml.kernel.org/r/20220716080359.38791-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index aa7c7e15abe4ef..051f0815396547 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4457,10 +4457,6 @@ late_initcall(fault_around_debugfs); * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. * - * This function is called with the page table lock taken. In the split ptlock - * case the page table lock only protects only those entries which belong to - * the page table corresponding to the fault address. - * * This function doesn't cross the VMA boundaries, in order to call map_pages() * only once. * From 4293014384a9de64482de231b7a17322335f4f36 Mon Sep 17 00:00:00 2001 From: Zhou Guanghui Date: Wed, 15 Jun 2022 10:27:42 +0000 Subject: [PATCH 0858/1250] memblock,arm64: expand the static memblock memory table In a system(Huawei Ascend ARM64 SoC) using HBM, a multi-bit ECC error occurs, and the BIOS will mark the corresponding area (for example, 2 MB) as unusable. When the system restarts next time, these areas are not reported or reported as EFI_UNUSABLE_MEMORY. Both cases lead to an increase in the number of memblocks, whereas EFI_UNUSABLE_MEMORY leads to a larger number of memblocks. For example, if the EFI_UNUSABLE_MEMORY type is reported: ... 
memory[0x92] [0x0000200834a00000-0x0000200835bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0 memory[0x93] [0x0000200835c00000-0x0000200835dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4 memory[0x94] [0x0000200835e00000-0x00002008367fffff], 0x0000000000a00000 bytes on node 7 flags: 0x0 memory[0x95] [0x0000200836800000-0x00002008369fffff], 0x0000000000200000 bytes on node 7 flags: 0x4 memory[0x96] [0x0000200836a00000-0x0000200837bfffff], 0x0000000001200000 bytes on node 7 flags: 0x0 memory[0x97] [0x0000200837c00000-0x0000200837dfffff], 0x0000000000200000 bytes on node 7 flags: 0x4 memory[0x98] [0x0000200837e00000-0x000020087fffffff], 0x0000000048200000 bytes on node 7 flags: 0x0 memory[0x99] [0x0000200880000000-0x0000200bcfffffff], 0x0000000350000000 bytes on node 6 flags: 0x0 memory[0x9a] [0x0000200bd0000000-0x0000200bd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4 memory[0x9b] [0x0000200bd0200000-0x0000200bd07fffff], 0x0000000000600000 bytes on node 6 flags: 0x0 memory[0x9c] [0x0000200bd0800000-0x0000200bd09fffff], 0x0000000000200000 bytes on node 6 flags: 0x4 memory[0x9d] [0x0000200bd0a00000-0x0000200fcfffffff], 0x00000003ff600000 bytes on node 6 flags: 0x0 memory[0x9e] [0x0000200fd0000000-0x0000200fd01fffff], 0x0000000000200000 bytes on node 6 flags: 0x4 memory[0x9f] [0x0000200fd0200000-0x0000200fffffffff], 0x000000002fe00000 bytes on node 6 flags: 0x0 ... The EFI memory map is parsed to construct the memblock arrays before the memblock arrays can be resized. As a result, memory regions beyond INIT_MEMBLOCK_REGIONS are lost. Add a new macro INIT_MEMBLOCK_MEMORY_REGIONS to replace INIT_MEMBLOCK_REGIONS to define the size of the static memblock.memory array. Allow overriding memblock.memory array size with architecture defined INIT_MEMBLOCK_MEMORY_REGIONS and make arm64 set INIT_MEMBLOCK_MEMORY_REGIONS to 1024 when CONFIG_EFI is enabled.
Link: https://lkml.kernel.org/r/20220615102742.96450-1-zhouguanghui1@huawei.com Signed-off-by: Zhou Guanghui Acked-by: Mike Rapoport Tested-by: Darren Hart Acked-by: Will Deacon [arm64] Reviewed-by: Anshuman Khandual Cc: Xu Qiang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/memory.h | 9 +++++++++ mm/memblock.c | 14 +++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 0af70d9abede3d..ce8614fa376a53 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -364,6 +364,15 @@ void dump_mem_limit(void); # define INIT_MEMBLOCK_RESERVED_REGIONS (INIT_MEMBLOCK_REGIONS + NR_CPUS + 1) #endif +/* + * memory regions which marked with flag MEMBLOCK_NOMAP(for example, the memory + * of the EFI_UNUSABLE_MEMORY type) may divide a continuous memory block into + * multiple parts. As a result, the number of memory regions is large. + */ +#ifdef CONFIG_EFI +#define INIT_MEMBLOCK_MEMORY_REGIONS (INIT_MEMBLOCK_REGIONS * 8) +#endif + #include #endif /* __ASM_MEMORY_H */ diff --git a/mm/memblock.c b/mm/memblock.c index 749abd2685c4e5..b7ebf4b7e9d916 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -29,6 +29,10 @@ # define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS #endif +#ifndef INIT_MEMBLOCK_MEMORY_REGIONS +#define INIT_MEMBLOCK_MEMORY_REGIONS INIT_MEMBLOCK_REGIONS +#endif + /** * DOC: memblock overview * @@ -55,9 +59,9 @@ * the allocator metadata. The "memory" and "reserved" types are nicely * wrapped with struct memblock. This structure is statically * initialized at build time. The region arrays are initially sized to - * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS - * for "reserved". The region array for "physmem" is initially sized to - * %INIT_PHYSMEM_REGIONS. + * %INIT_MEMBLOCK_MEMORY_REGIONS for "memory" and + * %INIT_MEMBLOCK_RESERVED_REGIONS for "reserved". 
The region array + * for "physmem" is initially sized to %INIT_PHYSMEM_REGIONS. * The memblock_allow_resize() enables automatic resizing of the region * arrays during addition of new regions. This feature should be used * with care so that memory allocated for the region array will not @@ -102,7 +106,7 @@ unsigned long min_low_pfn; unsigned long max_pfn; unsigned long long max_possible_pfn; -static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock; #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS]; @@ -111,7 +115,7 @@ static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS struct memblock memblock __initdata_memblock = { .memory.regions = memblock_memory_init_regions, .memory.cnt = 1, /* empty dummy entry */ - .memory.max = INIT_MEMBLOCK_REGIONS, + .memory.max = INIT_MEMBLOCK_MEMORY_REGIONS, .memory.name = "memory", .reserved.regions = memblock_reserved_init_regions, From 49ad534f3d4e42468ba30e5a30b0d0f4ceba1ea9 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 14 Jul 2022 16:41:47 +0800 Subject: [PATCH 0859/1250] writeback: remove inode_to_wb_is_valid() inode_to_wb_is_valid() is no longer used since commit fe55d563d417 ("remove inode_congested()"), remove it. 
Link: https://lkml.kernel.org/r/20220714084147.140324-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Johannes Thumshirn Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e84b745a68119d..439815cc1ab965 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -229,18 +229,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** - * inode_to_wb_is_valid - test whether an inode has a wb associated - * @inode: inode of interest - * - * Returns %true if @inode has a wb associated. May be called without any - * locking. - */ -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return inode->i_wb; -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -339,11 +327,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline bool inode_to_wb_is_valid(struct inode *inode) -{ - return true; -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; From 3998b6847b6baf16d2f08b14de53bbf7c2776515 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Thu, 14 Jul 2022 16:07:57 +0800 Subject: [PATCH 0860/1250] zsmalloc: zs_malloc: return ERR_PTR on failure zs_malloc returns 0 if it fails. zs_zpool_malloc will return -1 when zs_malloc returns 0. But -1 makes the return value unclear. For example, when zswap_frontswap_store calls zs_malloc through zs_zpool_malloc, it will return -1 to its caller. The other return value is -EINVAL, -ENODEV or something else. This commit changes zs_malloc to return ERR_PTR on failure. It didn't just let zs_zpool_malloc return -ENOMEM because zs_malloc has two types of failure: - size is not OK return -EINVAL - memory alloc fail return -ENOMEM.
Link: https://lkml.kernel.org/r/20220714080757.12161-1-teawater@gmail.com Signed-off-by: Hui Zhu Cc: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jens Axboe Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 4 ++-- mm/zsmalloc.c | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3e281a193feb3a..9d3b06d5dc5616 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1389,9 +1389,9 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, __GFP_HIGHMEM | __GFP_MOVABLE); - if (unlikely(!handle)) { + if (IS_ERR((void *)handle)) { zcomp_stream_put(zram->comp); - return -ENOMEM; + return PTR_ERR((void *)handle); } alloced_pages = zs_get_total_pages(zram->mem_pool); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f24b71568e8302..9e13fd7ee635c6 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -399,7 +399,10 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { *handle = zs_malloc(pool, size, gfp); - return *handle ? 0 : -1; + + if (IS_ERR((void *)(*handle))) + return PTR_ERR((void *)*handle); + return 0; } static void zs_zpool_free(void *pool, unsigned long handle) { @@ -1400,7 +1403,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, * @gfp: gfp flags when allocating object * * On success, handle to the allocated object is returned, - * otherwise 0. + * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
*/ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) @@ -1411,11 +1414,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) struct zspage *zspage; if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) - return 0; + return (unsigned long)ERR_PTR(-EINVAL); handle = cache_alloc_handle(pool, gfp); if (!handle) - return 0; + return (unsigned long)ERR_PTR(-ENOMEM); /* extra space in chunk to keep the handle */ size += ZS_HANDLE_SIZE; @@ -1440,7 +1443,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) zspage = alloc_zspage(pool, class, gfp); if (!zspage) { cache_free_handle(pool, handle); - return 0; + return (unsigned long)ERR_PTR(-ENOMEM); } spin_lock(&class->lock); From f34a35c4288381485bf22e002a2621bb402e4d22 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Thu, 14 Jul 2022 06:49:18 +0000 Subject: [PATCH 0861/1250] mm: vmpressure: don't count proactive reclaim in vmpressure vmpressure is used in cgroup v1 to notify userspace of reclaim efficiency events, and is also used in both cgroup v1 and v2 as a signal for memory pressure for networking, see mem_cgroup_under_socket_pressure(). Proactive reclaim intends to probe memcgs for cold memory, without affecting their performance. Hence, reclaim caused by writing to memory.reclaim should not trigger vmpressure. 
Link: https://lkml.kernel.org/r/20220714064918.2576464-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Miaohe Lin Cc: NeilBrown Cc: Alistair Popple Cc: Suren Baghdasaryan Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 ++++- mm/memcontrol.c | 24 ++++++++++++++---------- mm/vmscan.c | 27 +++++++++++++++++---------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6d11c51b2b6275..ea895b40e6ff12 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -411,10 +411,13 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page, extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); + +#define MEMCG_RECLAIM_MAY_SWAP (1 << 1) +#define MEMCG_RECLAIM_PROACTIVE (1 << 2) extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - bool may_swap); + unsigned int reclaim_options); extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e463660209aee..56b25521bf8b6d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2330,7 +2330,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, - gfp_mask, true); + gfp_mask, + MEMCG_RECLAIM_MAY_SWAP); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2575,7 +2576,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, struct page_counter *counter; unsigned long nr_reclaimed; bool passed_oom = false; - bool 
may_swap = true; + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP; bool drained = false; bool raised_max_event = false; unsigned long pflags; @@ -2593,7 +2594,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, mem_over_limit = mem_cgroup_from_counter(counter, memory); } else { mem_over_limit = mem_cgroup_from_counter(counter, memsw); - may_swap = false; + reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP; } if (batch > nr_pages) { @@ -2621,7 +2622,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, may_swap); + gfp_mask, reclaim_options); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -3439,8 +3440,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, continue; } - if (!try_to_free_mem_cgroup_pages(memcg, 1, - GFP_KERNEL, !memsw)) { + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP)) { ret = -EBUSY; break; } @@ -3550,7 +3551,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) if (signal_pending(current)) return -EINTR; - if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true)) + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, + MEMCG_RECLAIM_MAY_SWAP)) nr_retries--; } @@ -6302,7 +6304,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, true); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); if (!reclaimed && !nr_retries--) break; @@ -6351,7 +6353,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, true)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) nr_reclaims--; continue; } @@ -6480,6 +6482,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; + unsigned int reclaim_options; int err; buf = strstrip(buf); @@ -6487,6 +6490,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, if (err) return err; + reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6503,7 +6507,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, true); + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; diff --git a/mm/vmscan.c b/mm/vmscan.c index fbb4108250ee4e..9e7d8db4291879 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -101,6 +101,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? 
*/ unsigned int may_swap:1; + /* Proactive reclaim invoked by userspace through memory.reclaim */ + unsigned int proactive:1; + /* * Cgroup memory below memory.low is protected as long as we * don't threaten to OOM. If any cgroup is reclaimed at @@ -3180,9 +3183,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) sc->priority); /* Record the group's reclaim efficiency */ - vmpressure(sc->gfp_mask, memcg, false, - sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); + if (!sc->proactive) + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); } @@ -3305,9 +3309,10 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) } /* Record the subtree's reclaim efficiency */ - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, - sc->nr_scanned - nr_scanned, - sc->nr_reclaimed - nr_reclaimed); + if (!sc->proactive) + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; @@ -3589,8 +3594,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1); do { - vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, - sc->priority); + if (!sc->proactive) + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); sc->nr_scanned = 0; shrink_zones(zonelist, sc); @@ -3880,7 +3886,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - bool may_swap) + unsigned int reclaim_options) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -3893,7 +3899,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, .may_unmap = 1, - .may_swap 
= may_swap, + .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), + .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put From f44d6d3c2d9e3e6718e9eca3703388f5187ea746 Mon Sep 17 00:00:00 2001 From: Jianglei Nie Date: Thu, 14 Jul 2022 14:37:46 +0800 Subject: [PATCH 0862/1250] mm/damon/reclaim: fix potential memory leak in damon_reclaim_init() damon_reclaim_init() allocates a memory chunk for ctx with damon_new_ctx(). When damon_select_ops() fails, ctx is not released, which will lead to a memory leak. We should release the ctx with damon_destroy_ctx() when damon_select_ops() fails to fix the memory leak. Link: https://lkml.kernel.org/r/20220714063746.2343549-1-niejianglei2021@163.com Fixes: 4d69c3457821 ("mm/damon/reclaim: use damon_select_ops() instead of damon_{v,p}a_set_operations()") Signed-off-by: Jianglei Nie Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index e69b807fefe438..a7faf51b4bd4ad 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -435,8 +435,10 @@ static int __init damon_reclaim_init(void) if (!ctx) return -ENOMEM; - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); return -EINVAL; + } ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; From 7ada89abdd21ad757122c0971554f6d739eb3864 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2022 13:24:13 +0900 Subject: [PATCH 0863/1250] mm/hugetlb: check gigantic_page_runtime_supported() in return_unused_surplus_pages() Patch series "mm, hwpoison: enable 1GB hugepage support", v7. 
This patch (of 8): I found a weird state of the 1GB hugepage pool, caused by the following procedure: - run a process reserving all free 1GB hugepages, - shrink the free 1GB hugepage pool to zero (i.e. writing 0 to /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages), then - kill the reserving process. After that, all the hugepages are free *and* surplus at the same time. $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages 3 $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/free_hugepages 3 $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/resv_hugepages 0 $ cat /sys/kernel/mm/hugepages/hugepages-1048576kB/surplus_hugepages 3 This state is resolved by reserving and allocating the pages then freeing them again, so this does not seem to result in a serious problem. But it's a little surprising (shrinking the pool suddenly fails). This behavior is caused by the hstate_is_gigantic() check in return_unused_surplus_pages(). This was introduced long ago, in 2008, by commit aa888a74977a ("hugetlb: support larger than MAX_ORDER"), and at that time the gigantic pages were not supposed to be allocated/freed at run-time. Now the kernel can support runtime allocation/free, so let's check gigantic_page_runtime_supported() as well.
Link: https://lkml.kernel.org/r/20220714042420.1847125-1-naoya.horiguchi@linux.dev Link: https://lkml.kernel.org/r/20220714042420.1847125-2-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Mike Kravetz Cc: Liu Shixin Cc: Yang Shi Cc: Oscar Salvador Cc: Muchun Song Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d6aa88d744c0db..35e47ee1860c85 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2432,8 +2432,7 @@ static void return_unused_surplus_pages(struct hstate *h, /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; - /* Cannot return gigantic pages currently */ - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) goto out; /* From c7bb399afb7b0cedb0da7700241ecdf07bf7c658 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2022 13:24:14 +0900 Subject: [PATCH 0864/1250] mm/hugetlb: make pud_huge() and follow_huge_pud() aware of non-present pud entry follow_pud_mask() does not support non-present pud entry now. As long as I tested on x86_64 server, follow_pud_mask() still simply returns no_page_table() for non-present_pud_entry() due to pud_bad(), so no severe user-visible effect should happen. But generally we should call follow_huge_pud() for non-present pud entry for 1GB hugetlb page. Update pud_huge() and follow_huge_pud() to handle non-present pud entries. The changes are similar to previous works for pud entries commit e66f17ff7177 ("mm/hugetlb: take page table lock in follow_huge_pmd()") and commit cbef8478bee5 ("mm/hugetlb: pmd_huge() returns true for non-present hugepage"). 
Link: https://lkml.kernel.org/r/20220714042420.1847125-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: kernel test robot Cc: Liu Shixin Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- arch/x86/mm/hugetlbpage.c | 8 +++++++- mm/hugetlb.c | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 509408da0da1e2..6b3033845c6d32 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -30,9 +30,15 @@ int pmd_huge(pmd_t pmd) (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } +/* + * pud_huge() returns 1 if @pud is hugetlb related entry, that is normal + * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. + * Otherwise, returns 0. + */ int pud_huge(pud_t pud) { - return !!(pud_val(pud) & _PAGE_PSE); + return !pud_none(pud) && + (pud_val(pud) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 35e47ee1860c85..ab61b9ee9d1b9d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6979,10 +6979,38 @@ struct page * __weak follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int flags) { - if (flags & (FOLL_GET | FOLL_PIN)) + struct page *page = NULL; + spinlock_t *ptl; + pte_t pte; + + if (WARN_ON_ONCE(flags & FOLL_PIN)) return NULL; - return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); +retry: + ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud); + if (!pud_huge(*pud)) + goto out; + pte = huge_ptep_get((pte_t *)pud); + if (pte_present(pte)) { + page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(!try_grab_page(page, flags))) { + page = NULL; + goto out; + } + } else { + if (is_hugetlb_entry_migration(pte)) { + spin_unlock(ptl); + __migration_entry_wait(mm, (pte_t 
*)pud, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). + */ + } +out: + spin_unlock(ptl); + return page; } struct page * __weak From 73a8e289fb459227d0dd34605404d3d9a87e2b9b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2022 13:24:15 +0900 Subject: [PATCH 0865/1250] mm, hwpoison, hugetlb: support saving mechanism of raw error pages When handling a memory error on a hugetlb page, the error handler tries to dissolve and turn it into 4kB pages. If it's successfully dissolved, the PageHWPoison flag is moved to the raw error page, so that's all right. However, dissolving sometimes fails, and then the error page is left as a hwpoisoned hugepage. It's useful if we can retry to dissolve it to save healthy pages, but that's not possible now because the information about where the raw error pages are is lost. Use the private field of a few tail pages to keep that information. The code path of shrinking hugepage pool uses this info to try delayed dissolve. In order to remember multiple errors in a hugepage, a singly-linked list originated from SUBPAGE_INDEX_HWPOISON-th tail page is constructed. Only simple operations (adding an entry or clearing all) are required and the list is assumed not to be very long, so this simple data structure should be enough. If we fail to save raw error info, the hwpoison hugepage has errors on unknown subpages and this new saving mechanism does not work any more, so disable saving new raw error info and freeing hwpoison hugepages.
Link: https://lkml.kernel.org/r/20220714042420.1847125-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: kernel test robot Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Liu Shixin Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 17 +++++++- mm/hugetlb.c | 23 ++++++----- mm/memory-failure.c | 89 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 116 insertions(+), 13 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d0620edf0a60c..3ec981a0d8b3a5 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -42,6 +42,9 @@ enum { SUBPAGE_INDEX_CGROUP, /* reuse page->private */ SUBPAGE_INDEX_CGROUP_RSVD, /* reuse page->private */ __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD, +#endif +#ifdef CONFIG_MEMORY_FAILURE + SUBPAGE_INDEX_HWPOISON, #endif __NR_USED_SUBPAGE, }; @@ -551,7 +554,7 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * Synchronization: Initially set after new page allocation with no * locking. When examined and modified during migration processing * (isolate, migrate, putback) the hugetlb_lock is held. - * HPG_temporary - - Set on a page that is temporarily allocated from the buddy + * HPG_temporary - Set on a page that is temporarily allocated from the buddy * allocator. Typically used for migration target pages when no pages * are available in the pool. The hugetlb free page path will * immediately free pages with this flag set to the buddy allocator. @@ -561,6 +564,8 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, * HPG_freed - Set when page is on the free lists. * Synchronization: hugetlb_lock held for examination and modification. * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed. + * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page + * that is not tracked by raw_hwp_page list. 
*/ enum hugetlb_page_flags { HPG_restore_reserve = 0, @@ -568,6 +573,7 @@ enum hugetlb_page_flags { HPG_temporary, HPG_freed, HPG_vmemmap_optimized, + HPG_raw_hwp_unreliable, __NR_HPAGEFLAGS, }; @@ -614,6 +620,7 @@ HPAGEFLAG(Migratable, migratable) HPAGEFLAG(Temporary, temporary) HPAGEFLAG(Freed, freed) HPAGEFLAG(VmemmapOptimized, vmemmap_optimized) +HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable) #ifdef CONFIG_HUGETLB_PAGE @@ -796,6 +803,14 @@ extern int dissolve_free_huge_page(struct page *page); extern int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); +#ifdef CONFIG_MEMORY_FAILURE +extern void hugetlb_clear_page_hwpoison(struct page *hpage); +#else +static inline void hugetlb_clear_page_hwpoison(struct page *hpage) +{ +} +#endif + #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION #ifndef arch_hugetlb_migration_supported static inline bool arch_hugetlb_migration_supported(struct hstate *h) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab61b9ee9d1b9d..14be38822cf8ab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1535,6 +1535,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; + /* + * If we don't know which subpages are hwpoisoned, we can't free + * the hugepage, so it's leaked intentionally. + */ + if (HPageRawHwpUnreliable(page)) + return; + if (hugetlb_vmemmap_restore(h, page)) { spin_lock_irq(&hugetlb_lock); /* @@ -1547,6 +1554,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page) return; } + /* + * Move PageHWPoison flag from head page to the raw error pages, + * which makes any healthy subpages reusable. 
+ */ + if (unlikely(PageHWPoison(page))) + hugetlb_clear_page_hwpoison(page); + for (i = 0; i < pages_per_huge_page(h); i++, subpage = mem_map_next(subpage, page, i)) { subpage->flags &= ~(1 << PG_locked | 1 << PG_error | @@ -2109,15 +2123,6 @@ int dissolve_free_huge_page(struct page *page) */ rc = hugetlb_vmemmap_restore(h, head); if (!rc) { - /* - * Move PageHWPoison flag from head page to the raw - * error page, which makes any subpages rather than - * the error page reusable. - */ - if (PageHWPoison(head) && page != head) { - SetPageHWPoison(page); - ClearPageHWPoison(head); - } update_and_free_page(h, head, false); } else { spin_lock_irq(&hugetlb_lock); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c9931c67633561..fa29849769edfa 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1664,6 +1664,90 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, EXPORT_SYMBOL_GPL(mf_dax_kill_procs); #endif /* CONFIG_FS_DAX */ +#ifdef CONFIG_HUGETLB_PAGE +/* + * Struct raw_hwp_page represents information about "raw error page", + * constructing singly linked list originated from ->private field of + * SUBPAGE_INDEX_HWPOISON-th tail page. + */ +struct raw_hwp_page { + struct llist_node node; + struct page *page; +}; + +static inline struct llist_head *raw_hwp_list_head(struct page *hpage) +{ + return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON); +} + +static void __free_raw_hwp_pages(struct page *hpage) +{ + struct llist_head *head; + struct llist_node *t, *tnode; + + head = raw_hwp_list_head(hpage); + llist_for_each_safe(tnode, t, head->first) { + struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); + + SetPageHWPoison(p->page); + kfree(p); + } + llist_del_all(head); +} + +static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) +{ + struct llist_head *head; + struct raw_hwp_page *raw_hwp; + struct llist_node *t, *tnode; + int ret = TestSetPageHWPoison(hpage) ? 
-EHWPOISON : 0; + + /* + * Once the hwpoison hugepage has lost reliable raw error info, + * there is little meaning to keep additional error info precisely, + * so skip to add additional raw error info. + */ + if (HPageRawHwpUnreliable(hpage)) + return -EHWPOISON; + head = raw_hwp_list_head(hpage); + llist_for_each_safe(tnode, t, head->first) { + struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); + + if (p->page == page) + return -EHWPOISON; + } + + raw_hwp = kmalloc(sizeof(struct raw_hwp_page), GFP_ATOMIC); + if (raw_hwp) { + raw_hwp->page = page; + llist_add(&raw_hwp->node, head); + /* the first error event will be counted in action_result(). */ + if (ret) + num_poisoned_pages_inc(); + } else { + /* + * Failed to save raw error info. We no longer trace all + * hwpoisoned subpages, and we need refuse to free/dissolve + * this hwpoisoned hugepage. + */ + SetHPageRawHwpUnreliable(hpage); + /* + * Once HPageRawHwpUnreliable is set, raw_hwp_page is not + * used any more, so free it. + */ + __free_raw_hwp_pages(hpage); + } + return ret; +} + +void hugetlb_clear_page_hwpoison(struct page *hpage) +{ + if (HPageRawHwpUnreliable(hpage)) + return; + ClearPageHWPoison(hpage); + __free_raw_hwp_pages(hpage); +} + /* * Called from hugetlb code with hugetlb_lock held. * @@ -1698,7 +1782,7 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) goto out; } - if (TestSetPageHWPoison(head)) { + if (hugetlb_set_page_hwpoison(head, page)) { ret = -EHWPOISON; goto out; } @@ -1710,7 +1794,6 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) return ret; } -#ifdef CONFIG_HUGETLB_PAGE /* * Taking refcount of hugetlb pages needs extra care about race conditions * with basic operations like hugepage allocation/free/demotion. 
@@ -1751,7 +1834,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb lock_page(head); if (hwpoison_filter(p)) { - ClearPageHWPoison(head); + hugetlb_clear_page_hwpoison(head); res = -EOPNOTSUPP; goto out; } From debb6b9c3fdd451c512f2613bd268278f3e5e05b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2022 13:24:16 +0900 Subject: [PATCH 0866/1250] mm, hwpoison: make unpoison aware of raw error info in hwpoisoned hugepage The raw error info list needs to be removed when a hwpoisoned hugetlb page is unpoisoned, and the unpoison handler needs to know how many errors there are in the target hugepage. So add them. HPageVmemmapOptimized(hpage) and HPageRawHwpUnreliable(hpage) hugepages sometimes can't be unpoisoned, so skip them. Link: https://lkml.kernel.org/r/20220714042420.1847125-5-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reported-by: kernel test robot Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: Liu Shixin Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/swapops.h | 9 +++++++ mm/memory-failure.c | 52 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bb7afd03a324f6..a3d435bf9f9729 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -490,6 +490,11 @@ static inline void num_poisoned_pages_dec(void) atomic_long_dec(&num_poisoned_pages); } +static inline void num_poisoned_pages_sub(long i) +{ + atomic_long_sub(i, &num_poisoned_pages); +} + #else static inline swp_entry_t make_hwpoison_entry(struct page *page) @@ -505,6 +510,10 @@ static inline int is_hwpoison_entry(swp_entry_t swp) static inline void num_poisoned_pages_inc(void) { } + +static inline void num_poisoned_pages_sub(long i) +{ +} #endif static inline int non_swap_entry(swp_entry_t entry) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index
fa29849769edfa..8b9c0d2285494c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1680,19 +1680,23 @@ static inline struct llist_head *raw_hwp_list_head(struct page *hpage) return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON); } -static void __free_raw_hwp_pages(struct page *hpage) +static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) { struct llist_head *head; struct llist_node *t, *tnode; + unsigned long count = 0; head = raw_hwp_list_head(hpage); llist_for_each_safe(tnode, t, head->first) { struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); - SetPageHWPoison(p->page); + if (move_flag) + SetPageHWPoison(p->page); kfree(p); + count++; } llist_del_all(head); + return count; } static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) @@ -1735,17 +1739,36 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) * Once HPageRawHwpUnreliable is set, raw_hwp_page is not * used any more, so free it. */ - __free_raw_hwp_pages(hpage); + __free_raw_hwp_pages(hpage, false); } return ret; } +static unsigned long free_raw_hwp_pages(struct page *hpage, bool move_flag) +{ + /* + * HPageVmemmapOptimized hugepages can't be freed because struct + * pages for tail pages are required but they don't exist. + */ + if (move_flag && HPageVmemmapOptimized(hpage)) + return 0; + + /* + * HPageRawHwpUnreliable hugepages shouldn't be unpoisoned by + * definition. 
+ */ + if (HPageRawHwpUnreliable(hpage)) + return 0; + + return __free_raw_hwp_pages(hpage, move_flag); +} + void hugetlb_clear_page_hwpoison(struct page *hpage) { if (HPageRawHwpUnreliable(hpage)) return; ClearPageHWPoison(hpage); - __free_raw_hwp_pages(hpage); + free_raw_hwp_pages(hpage, true); } /* @@ -1889,6 +1912,10 @@ static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int * return 0; } +static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag) +{ + return 0; +} #endif /* CONFIG_HUGETLB_PAGE */ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, @@ -2294,6 +2321,7 @@ int unpoison_memory(unsigned long pfn) struct page *p; int ret = -EBUSY; int freeit = 0; + unsigned long count = 1; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2341,6 +2369,13 @@ int unpoison_memory(unsigned long pfn) ret = get_hwpoison_page(p, MF_UNPOISON); if (!ret) { + if (PageHuge(p)) { + count = free_raw_hwp_pages(page, false); + if (count == 0) { + ret = -EBUSY; + goto unlock_mutex; + } + } ret = TestClearPageHWPoison(page) ? 
0 : -EBUSY; } else if (ret < 0) { if (ret == -EHWPOISON) { @@ -2349,6 +2384,13 @@ int unpoison_memory(unsigned long pfn) unpoison_pr_info("Unpoison: failed to grab page %#lx\n", pfn, &unpoison_rs); } else { + if (PageHuge(p)) { + count = free_raw_hwp_pages(page, false); + if (count == 0) { + ret = -EBUSY; + goto unlock_mutex; + } + } freeit = !!TestClearPageHWPoison(p); put_page(page); @@ -2361,7 +2403,7 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); if (!ret || freeit) { - num_poisoned_pages_dec(); + num_poisoned_pages_sub(count); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); } From b5100d2796a3e241f852307135567eafb03f2e18 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2022 13:24:17 +0900 Subject: [PATCH 0867/1250] mm, hwpoison: set PG_hwpoison for busy hugetlb pages If memory_failure() fails to grab page refcount on a hugetlb page because it's busy, it returns without setting PG_hwpoison on it. This not only loses a chance of error containment, but also breaks the rule that action_result() should be called only when memory_failure() does any handling work (even if that's just setting PG_hwpoison). This inconsistency could harm code maintainability. So set PG_hwpoison and call hugetlb_set_page_hwpoison() for such a case.
Link: https://lkml.kernel.org/r/20220714042420.1847125-6-naoya.horiguchi@linux.dev Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()") Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: kernel test robot Cc: Liu Shixin Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/memory-failure.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 95388863a61a9f..b4b2fcf547b0d6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3189,6 +3189,7 @@ enum mf_flags { MF_SOFT_OFFLINE = 1 << 3, MF_UNPOISON = 1 << 4, MF_SW_SIMULATED = 1 << 5, + MF_NO_RETRY = 1 << 6, }; int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, unsigned long count, int mf_flags); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8b9c0d2285494c..f15d521c3f1f32 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1802,7 +1802,8 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) count_increased = true; } else { ret = -EBUSY; - goto out; + if (!(flags & MF_NO_RETRY)) + goto out; } if (hugetlb_set_page_hwpoison(head, page)) { @@ -1829,7 +1830,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb struct page *p = pfn_to_page(pfn); struct page *head; unsigned long page_flags; - bool retry = true; *hugetlb = 1; retry: @@ -1845,8 +1845,8 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb } return res; } else if (res == -EBUSY) { - if (retry) { - retry = false; + if (!(flags & MF_NO_RETRY)) { + flags |= MF_NO_RETRY; goto retry; } action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); From dcbdf0e437c26665817186c7c95994d2daaf7fe6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2022 13:24:18 +0900 Subject: [PATCH 0868/1250] mm, hwpoison: make __page_handle_poison 
returns int __page_handle_poison() returns bool that shows whether take_page_off_buddy() has passed or not now. But we will want to distinguish another case of "dissolve has passed but taking off failed" by its return value. So change the type of the return value. No functional change. Link: https://lkml.kernel.org/r/20220714042420.1847125-7-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: kernel test robot Cc: Liu Shixin Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f15d521c3f1f32..c8fa3643791c0d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -71,7 +71,13 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; -static bool __page_handle_poison(struct page *page) +/* + * Return values: + * 1: the page is dissolved (if needed) and taken off from buddy, + * 0: the page is dissolved (if needed) and not taken off from buddy, + * < 0: failed to dissolve. + */ +static int __page_handle_poison(struct page *page) { int ret; @@ -81,7 +87,7 @@ static bool __page_handle_poison(struct page *page) ret = take_page_off_buddy(page); zone_pcp_enable(page_zone(page)); - return ret > 0; + return ret; } static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, bool release) @@ -91,7 +97,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo * Doing this check for free pages is also fine since dissolve_free_huge_page * returns 0 for non-hugetlb pages as well. 
*/ - if (!__page_handle_poison(page)) + if (__page_handle_poison(page) <= 0) /* * We could fail to take off the target page from buddy * for example due to racy page allocation, but that's @@ -1086,7 +1092,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) * subpages. */ put_page(hpage); - if (__page_handle_poison(p)) { + if (__page_handle_poison(p) > 0) { page_ref_inc(p); res = MF_RECOVERED; } @@ -1869,7 +1875,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb if (res == 0) { unlock_page(head); res = MF_FAILED; - if (__page_handle_poison(p)) { + if (__page_handle_poison(p) > 0) { page_ref_inc(p); res = MF_RECOVERED; } From 47f619669bffb4ba332d7c8bd338fd5a43b424d7 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2022 13:24:19 +0900 Subject: [PATCH 0869/1250] mm, hwpoison: skip raw hwpoison page in freeing 1GB hugepage Currently if memory_failure() (modified to remove blocking code with subsequent patch) is called on a page in some 1GB hugepage, memory error handling fails and the raw error page gets into leaked state. The impact is small in production systems (just leaked single 4kB page), but this limits the testability because unpoison doesn't work for it. We can no longer create 1GB hugepage on the 1GB physical address range with such leaked pages, that's not useful when testing on small systems. When a hwpoison page in a 1GB hugepage is handled, it's caught by the PageHWPoison check in free_pages_prepare() because the 1GB hugepage is broken down into raw error pages before coming to this point: if (unlikely(PageHWPoison(page)) && !order) { ... return false; } Then, the page is not sent to buddy and the page refcount is left 0. 
Originally this check is supposed to work when the error page is freed from page_handle_poison() (that is called from soft-offline), but now we are opening another path to call it, so the callers of __page_handle_poison() need to handle the case by considering the return value 0 as success. Then page refcount for hwpoison is properly incremented so unpoison works. Link: https://lkml.kernel.org/r/20220714042420.1847125-8-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: kernel test robot Cc: Liu Shixin Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c8fa3643791c0d..3721de624b9865 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1084,7 +1084,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) res = truncate_error_page(hpage, page_to_pfn(p), mapping); unlock_page(hpage); } else { - res = MF_FAILED; unlock_page(hpage); /* * migration entry prevents later access on error hugepage, @@ -1092,9 +1091,11 @@ static int me_huge_page(struct page_state *ps, struct page *p) * subpages. */ put_page(hpage); - if (__page_handle_poison(p) > 0) { + if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; + } else { + res = MF_FAILED; } } @@ -1874,10 +1875,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb */ if (res == 0) { unlock_page(head); - res = MF_FAILED; - if (__page_handle_poison(p) > 0) { + if (__page_handle_poison(p) >= 0) { page_ref_inc(p); res = MF_RECOVERED; + } else { + res = MF_FAILED; } action_result(pfn, MF_MSG_FREE_HUGE, res); return res == MF_RECOVERED ? 
0 : -EBUSY; From 97931adffb0dea2c0e2f4f9cf49059395e682f43 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 14 Jul 2022 13:24:20 +0900 Subject: [PATCH 0870/1250] mm, hwpoison: enable memory error handling on 1GB hugepage Now error handling code is prepared, so remove the blocking code and enable memory error handling on 1GB hugepage. Link: https://lkml.kernel.org/r/20220714042420.1847125-9-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: David Hildenbrand Cc: kernel test robot Cc: Liu Shixin Cc: Mike Kravetz Cc: Muchun Song Cc: Oscar Salvador Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - include/ras/ras_event.h | 1 - mm/memory-failure.c | 16 ---------------- 3 files changed, 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b4b2fcf547b0d6..6dee7cd93f53ff 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3242,7 +3242,6 @@ enum mf_action_page_type { MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, - MF_MSG_NON_PMD_HUGE, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index d0337a41141c84..cbd3ddd7c33d4d 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -360,7 +360,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ - EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \ EM ( MF_MSG_UNMAP_FAILED, "unmapping failed page" ) \ EM ( MF_MSG_DIRTY_SWAPCACHE, "dirty swapcache page" ) \ EM ( MF_MSG_CLEAN_SWAPCACHE, "clean swapcache page" ) \ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3721de624b9865..d86b5acd5754cf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -765,7 +765,6 @@ static const char * const action_page_types[] = { [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", 
[MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", - [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", [MF_MSG_UNMAP_FAILED] = "unmapping failed page", [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", @@ -1887,21 +1886,6 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb page_flags = head->flags; - /* - * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so - * simply disable it. In order to make it work properly, we need - * make sure that: - * - conversion of a pud that maps an error hugetlb into hwpoison - * entry properly works, and - * - other mm code walking over page table is aware of pud-aligned - * hwpoison entries. - */ - if (huge_page_size(page_hstate(head)) > PMD_SIZE) { - action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED); - res = -EBUSY; - goto out; - } - if (!hwpoison_user_mappings(p, pfn, flags, head)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; From 5064811182b7dbbcf80e2441de627f261f2e1aa7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 Jul 2022 21:59:12 -0400 Subject: [PATCH 0871/1250] mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs This allows userspace to set flags like FS_APPEND_FL, FS_IMMUTABLE_FL, FS_NODUMP_FL, etc., like all other standard Linux file systems. 
Link: https://lkml.kernel.org/r/20220715015912.2560575-1-tytso@mit.edu Signed-off-by: Theodore Ts'o Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 11 +++++++ mm/shmem.c | 63 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a68f982f22d169..1b6c4013f691b0 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -25,9 +25,20 @@ struct shmem_inode_info { struct simple_xattrs xattrs; /* list of xattrs */ atomic_t stop_eviction; /* hold when working on inode */ struct timespec64 i_crtime; /* file creation time */ + unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ struct inode vfs_inode; }; +#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE +#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE + +/* Flags that are appropriate for regular files (all but dir-specific ones). */ +#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) + +/* Flags that are appropriate for non-directories/regular files. 
*/ +#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) + struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ struct percpu_counter used_blocks; /* How many are allocated */ diff --git a/mm/shmem.c b/mm/shmem.c index 46bfa9b107b851..b3de94b266fe29 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -1058,6 +1059,15 @@ static int shmem_getattr(struct user_namespace *mnt_userns, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); } + if (info->fsflags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (info->fsflags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); if (shmem_is_huge(NULL, inode, 0)) @@ -2272,7 +2282,18 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, +/* Mask out flags that are inappropriate for the given type of inode. */ +static unsigned shmem_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & SHMEM_REG_FLMASK; + else + return flags & SHMEM_OTHER_FLMASK; +} + +static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode; @@ -2297,6 +2318,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; info->i_crtime = inode->i_mtime; + info->fsflags = (dir == NULL) ? 
0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + info->fsflags = shmem_mask_flags(mode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -2813,6 +2837,39 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, return error; } +static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + + fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); + + return 0; +} + +static int shmem_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + + info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + (fa->flags & SHMEM_FL_USER_MODIFIABLE); + + inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME); + if (info->fsflags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (info->fsflags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + + inode->i_ctime = current_time(inode); + return 0; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -3825,6 +3882,8 @@ static const struct inode_operations shmem_inode_operations = { #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif }; @@ -3844,6 +3903,8 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, From e73bb04eaf10da6ebb2e7b52a4a29f49c559c475 Mon Sep 17 00:00:00 2001 From: Andrew 
Morton Date: Mon, 18 Jul 2022 15:51:33 -0700 Subject: [PATCH 0872/1250] mm-shmem-support-fs_ioc_etflags-in-tmpfs-fix fix CONFIG_TMPFS_XATTR=n warnings Reported-by: Stephen Rothwell Cc: Hugh Dickins Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton --- mm/shmem.c | 67 +++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b3de94b266fe29..966de37a79e53a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2837,39 +2837,6 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, return error; } -static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) -{ - struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); - - fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); - - return 0; -} - -static int shmem_fileattr_set(struct user_namespace *mnt_userns, - struct dentry *dentry, struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - struct shmem_inode_info *info = SHMEM_I(inode); - - if (fileattr_has_fsx(fa)) - return -EOPNOTSUPP; - - info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | - (fa->flags & SHMEM_FL_USER_MODIFIABLE); - - inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME); - if (info->fsflags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; - if (info->fsflags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - if (info->fsflags & FS_NOATIME_FL) - inode->i_flags |= S_NOATIME; - - inode->i_ctime = current_time(inode); - return 0; -} - static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -3195,6 +3162,40 @@ static const char *shmem_get_link(struct dentry *dentry, } #ifdef CONFIG_TMPFS_XATTR + +static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + + fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); + + return 0; +} + +static int 
shmem_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + + info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + (fa->flags & SHMEM_FL_USER_MODIFIABLE); + + inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME); + if (info->fsflags & FS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (info->fsflags & FS_NOATIME_FL) + inode->i_flags |= S_NOATIME; + + inode->i_ctime = current_time(inode); + return 0; +} + /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs From 5e36c0319775c391f9a088793e6e39aad98e5589 Mon Sep 17 00:00:00 2001 From: Yixuan Cao Date: Mon, 18 Jul 2022 03:55:06 +0800 Subject: [PATCH 0873/1250] tools/vm/page_owner_sort.c: adjust the indent in is_need() I noticed one more indentation than necessary in is_need(). 
Link: https://lkml.kernel.org/r/20220717195506.7602-1-caoyixuan2019@email.szu.edu.cn Signed-off-by: Yixuan Cao Signed-off-by: Andrew Morton --- tools/vm/page_owner_sort.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 74c3dcecf64d99..ec2e67c85b8499 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -470,23 +470,23 @@ static bool match_str_list(const char *str, char **list, int list_size) static bool is_need(char *buf) { - if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0) - return false; - if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) - return false; - if ((filter & FILTER_TGID) && - !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) - return false; - - char *comm = get_comm(buf); - - if ((filter & FILTER_COMM) && - !match_str_list(comm, fc.comms, fc.comms_size)) { - free(comm); - return false; - } + if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0) + return false; + if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) + return false; + if ((filter & FILTER_TGID) && + !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size)) + return false; + + char *comm = get_comm(buf); + + if ((filter & FILTER_COMM) && + !match_str_list(comm, fc.comms, fc.comms_size)) { free(comm); - return true; + return false; + } + free(comm); + return true; } static void add_list(char *buf, int len, char *ext_buf) From 598a6bbb7031a53c62d6a7c9f5f2eb3d0252c5a6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 16 Jul 2022 16:18:16 +0800 Subject: [PATCH 0874/1250] mm: remove unneeded PageAnon check in restore_exclusive_pte() When code reaches here, the page must be !PageAnon. There's no need to check PageAnon again. Remove it. 
Link: https://lkml.kernel.org/r/20220716081816.10752-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 051f0815396547..fcbe7143fdc21d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -753,7 +753,7 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, * Currently device exclusive access only supports anonymous * memory so the entry shouldn't point to a filebacked page. */ - WARN_ON_ONCE(!PageAnon(page)); + WARN_ON_ONCE(1); set_pte_at(vma->vm_mm, address, ptep, pte); From 3e387b49abd90f2b300882f1199b07b99ffec69f Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Mon, 18 Jul 2022 20:03:35 +0800 Subject: [PATCH 0875/1250] mm/page_alloc: correct the wrong cpuset file path in comment cpuset.c was moved to kernel/cgroup/ in below commit 201af4c0fab0 ("cgroup: move cgroup files under kernel/cgroup/") Correct the wrong path in comment. Link: https://lkml.kernel.org/r/20220718120336.5145-1-mark-pk.tsai@mediatek.com Signed-off-by: Mark-PK Tsai Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d0d09a9ce36476..9d9241772cc88c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4170,7 +4170,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, retry: /* * Scan zonelist, looking for a zone with enough free. - * See also __cpuset_node_allowed() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c. 
*/ no_fallback = alloc_flags & ALLOC_NOFRAGMENT; z = ac->preferred_zoneref; From 435e21775cc81e33ce60a83fcde37be8f83a512b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 19 Jul 2022 19:52:33 +0800 Subject: [PATCH 0876/1250] mm/mempolicy: remove unneeded out label We can use unlock label to unlock ptl and return ret directly to remove the unneeded out label and reduce the size of mempolicy.o. No functional change intended. [Before] text data bss dec hex filename 26702 3972 6168 36842 8fea mm/mempolicy.o [After] text data bss dec hex filename 26662 3972 6168 36802 8fc2 mm/mempolicy.o Link: https://lkml.kernel.org/r/20220719115233.6706-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dc74239d1ac776..6c27acb6cd63dc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -466,9 +466,8 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, } page = pmd_page(*pmd); if (is_huge_zero_page(page)) { - spin_unlock(ptl); walk->action = ACTION_CONTINUE; - goto out; + goto unlock; } if (!queue_pages_required(page, qp)) goto unlock; @@ -485,7 +484,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = -EIO; unlock: spin_unlock(ptl); -out: return ret; } From 93791c29b9a8ae6af336305ba0b9a42c24f21956 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Jul 2022 12:42:48 +0300 Subject: [PATCH 0877/1250] tools/testing/selftests/vm/hugetlb-madvise.c: silence uninitialized variable warning This code just reads from memory without caring about the data itself. However static checkers complain that "tmp" is never properly initialized. Initialize it to zero and change the name to "dummy" to show that we don't care about the value stored in it. 
Link: https://lkml.kernel.org/r/YtZ8mKJmktA2GaHB@kili Fixes: c4b6cb884011 ("selftests/vm: add hugetlb madvise MADV_DONTNEED MADV_REMOVE test") Signed-off-by: Dan Carpenter Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hugetlb-madvise.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/hugetlb-madvise.c b/tools/testing/selftests/vm/hugetlb-madvise.c index 6c6af40f57478f..3c9943131881eb 100644 --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ b/tools/testing/selftests/vm/hugetlb-madvise.c @@ -89,10 +89,11 @@ void write_fault_pages(void *addr, unsigned long nr_pages) void read_fault_pages(void *addr, unsigned long nr_pages) { - unsigned long i, tmp; + unsigned long dummy = 0; + unsigned long i; for (i = 0; i < nr_pages; i++) - tmp += *((unsigned long *)(addr + (i * huge_page_size))); + dummy += *((unsigned long *)(addr + (i * huge_page_size))); } int main(int argc, char **argv) From e121415d6227f900e0f376a4bfa7e3d781bfa5db Mon Sep 17 00:00:00 2001 From: Kassey Li Date: Tue, 19 Jul 2022 17:15:54 +0800 Subject: [PATCH 0878/1250] mm/cma_debug.c: align the name buffer length as struct cma Avoids truncating the debugfs output to 16 chars. Potentially alters the userspace output, but this is a debugfs interface and there are no stability guarantees. 
Link: https://lkml.kernel.org/r/20220719091554.27864-1-quic_yingangl@quicinc.com Signed-off-by: Kassey Li Cc: Sasha Levin Cc: Joonsoo Kim Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/cma_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 2e7704955f4f37..c3ffe253e05527 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -163,7 +163,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { struct dentry *tmp; - char name[16]; + char name[CMA_MAX_NAME]; scnprintf(name, sizeof(name), "cma-%s", cma->name); From 1f9c68ff5b5cae354798296141aa7c96affb261a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Jul 2022 12:04:14 +0300 Subject: [PATCH 0879/1250] selftest/vm: uninitialized variable in main() Initialize "length" to zero by default. Link: https://lkml.kernel.org/r/YtZzjvHXVXMXxpXO@kili Fixes: ff712a627f72 ("selftests/vm: cleanup hugetlb file after mremap test") Signed-off-by: Dan Carpenter Reviewed-by: Mina Almasry Reviewed-by: Muchun Song Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/hugepage-mremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 585978f181ed14..e63a0214f63997 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -107,7 +107,7 @@ static void register_region_with_uffd(char *addr, size_t len) int main(int argc, char *argv[]) { - size_t length; + size_t length = 0; if (argc != 2 && argc != 3) { printf("Usage: %s [length_in_MB] \n", argv[0]); From 4cef4afa3b812136c2e2be1491f8e9129f4ac024 Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 20 Jul 2022 14:41:58 -0700 Subject: [PATCH 0880/1250] ocfs2: reflink deadlock when clone file to the same directory 
simultaneously Running reflink from multiple nodes simultaneously to clone a file to the same directory probably triggers a deadlock issue. For example, there is a three node ocfs2 cluster, each node mounts the ocfs2 file system to /mnt/shared, and run the reflink command from each node repeatedly, like reflink "/mnt/shared/test" \ "/mnt/shared/.snapshots/test.`date +%m%d%H%M%S`.`hostname`" then, reflink command process will be hung on each node, and you can't list this file system directory. The problematic reflink command process is blocked at one node, task:reflink state:D stack: 0 pid: 1283 ppid: 4154 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0e3e000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_init_security_and_acl+0xbe/0x1d0 [ocfs2] ocfs2_reflink+0x436/0x4c0 [ocfs2] ? ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 The other reflink command processes are blocked at other nodes, task:reflink state:D stack: 0 pid:29759 ppid: 4088 Call Trace: __schedule+0x2fd/0x750 schedule+0x2f/0xa0 schedule_timeout+0x1cc/0x310 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0b19000 wait_for_completion+0xba/0x140 ? wake_up_q+0xa0/0xa0 __ocfs2_cluster_lock.isra.41+0x3b5/0x820 [ocfs2] ? ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_inode_lock_full_nested+0x1fc/0x960 [ocfs2] ocfs2_mv_orphaned_inode_to_new+0x87/0x7e0 [ocfs2] ocfs2_reflink+0x335/0x4c0 [ocfs2] ? 
ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_reflink_ioctl+0x2ca/0x360 [ocfs2] ocfs2_ioctl+0x25e/0x670 [ocfs2] do_vfs_ioctl+0xa0/0x680 ksys_ioctl+0x70/0x80 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x5b/0x1e0 or task:reflink state:D stack: 0 pid:18465 ppid: 4156 Call Trace: __schedule+0x302/0x940 ? usleep_range+0x80/0x80 schedule+0x46/0xb0 schedule_timeout+0xff/0x140 ? ocfs2_control_cfu+0x50/0x50 [ocfs2_stack_user] ? 0xffffffffc0c3b000 __wait_for_common+0xb9/0x170 __ocfs2_cluster_lock.constprop.0+0x1d6/0x860 [ocfs2] ? ocfs2_wait_for_recovery+0x49/0xd0 [ocfs2] ? ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_full_nested+0x30f/0xa50 [ocfs2] ocfs2_inode_lock_tracker+0xf2/0x2b0 [ocfs2] ? dput+0x32/0x2f0 ocfs2_permission+0x45/0xe0 [ocfs2] inode_permission+0xcc/0x170 link_path_walk.part.0.constprop.0+0x2a2/0x380 ? path_init+0x2c1/0x3f0 path_parentat+0x3c/0x90 filename_parentat+0xc1/0x1d0 ? filename_lookup+0x138/0x1c0 filename_create+0x43/0x160 ocfs2_reflink_ioctl+0xe6/0x380 [ocfs2] ocfs2_ioctl+0x1ea/0x2c0 [ocfs2] ? do_sys_openat2+0x81/0x150 __x64_sys_ioctl+0x82/0xb0 do_syscall_64+0x61/0xb0 The deadlock is caused by multiple acquiring the destination directory inode dlm lock in ocfs2_reflink function, we should acquire this directory inode dlm lock at the beginning, and hold this dlm lock until end of the function. 
Link: https://lkml.kernel.org/r/20210729110230.18983-1-ghe@suse.com Signed-off-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 32 +++++++++++++------------------- fs/ocfs2/namei.h | 2 ++ fs/ocfs2/refcounttree.c | 15 +++++++++++---- fs/ocfs2/xattr.c | 12 +----------- fs/ocfs2/xattr.h | 1 + 5 files changed, 28 insertions(+), 34 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index c75fd54b91854b..e3dd30dd3547ff 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2489,6 +2489,7 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir, } int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode) { @@ -2597,13 +2598,16 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, brelse(new_di_bh); - if (!status) - *new_inode = inode; - ocfs2_free_dir_lookup_result(&orphan_insert); - ocfs2_inode_unlock(dir, 1); - brelse(parent_di_bh); + if (!status) { + *new_inode = inode; + *dir_bh = parent_di_bh; + } else { + ocfs2_inode_unlock(dir, 1); + brelse(parent_di_bh); + } + return status; } @@ -2760,11 +2764,11 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, } int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, struct dentry *dentry) { int status = 0; - struct buffer_head *parent_di_bh = NULL; handle_t *handle = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_dinode *dir_di, *di; @@ -2778,14 +2782,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, (unsigned long long)OCFS2_I(dir)->ip_blkno, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_inode_lock(dir, &parent_di_bh, 1); - if (status < 0) { - if (status != -ENOENT) - mlog_errno(status); - return status; - } - - dir_di = (struct ocfs2_dinode *) parent_di_bh->b_data; + dir_di = (struct ocfs2_dinode *) dir_bh->b_data; if 
(!dir_di->i_links_count) { /* can't make a file in a deleted directory. */ status = -ENOENT; @@ -2798,7 +2795,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; /* get a spot inside the dir. */ - status = ocfs2_prepare_dir_for_insert(osb, dir, parent_di_bh, + status = ocfs2_prepare_dir_for_insert(osb, dir, dir_bh, dentry->d_name.name, dentry->d_name.len, &lookup); if (status < 0) { @@ -2862,7 +2859,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_di_bh, + OCFS2_I(inode)->ip_blkno, dir_bh, &lookup); if (status < 0) { mlog_errno(status); @@ -2886,10 +2883,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, iput(orphan_dir_inode); leave: - ocfs2_inode_unlock(dir, 1); - brelse(di_bh); - brelse(parent_di_bh); brelse(orphan_dir_bh); ocfs2_free_dir_lookup_result(&lookup); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 9cc891eb874e04..03a2c526e2c1b8 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -24,6 +24,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio); int ocfs2_create_inode_in_orphan(struct inode *dir, + struct buffer_head **dir_bh, int mode, struct inode **new_inode); int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, @@ -32,6 +33,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *di_bh, int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, + struct buffer_head *dir_bh, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index e04358a46b6805..00ce8fe7e32370 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4252,7 +4252,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, { int error, had_lock; struct inode *inode = d_inode(old_dentry); - struct buffer_head 
*old_bh = NULL; + struct buffer_head *old_bh = NULL, *dir_bh = NULL; struct inode *new_orphan_inode = NULL; struct ocfs2_lock_holder oh; @@ -4260,7 +4260,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, return -EOPNOTSUPP; - error = ocfs2_create_inode_in_orphan(dir, inode->i_mode, + error = ocfs2_create_inode_in_orphan(dir, &dir_bh, inode->i_mode, &new_orphan_inode); if (error) { mlog_errno(error); @@ -4306,13 +4306,15 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, /* If the security isn't preserved, we need to re-initialize them. */ if (!preserve) { - error = ocfs2_init_security_and_acl(dir, new_orphan_inode, + error = ocfs2_init_security_and_acl(dir, dir_bh, + new_orphan_inode, &new_dentry->d_name); if (error) mlog_errno(error); } if (!error) { - error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode, + error = ocfs2_mv_orphaned_inode_to_new(dir, dir_bh, + new_orphan_inode, new_dentry); if (error) mlog_errno(error); @@ -4330,6 +4332,11 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, iput(new_orphan_inode); } + if (dir_bh) { + ocfs2_inode_unlock(dir, 1); + brelse(dir_bh); + } + return error; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 95d0611c5fc7d1..3f23e3a5018ce7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7203,16 +7203,13 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, /* * Initialize security and acl for a already created inode. * Used for reflink a non-preserve-security file. - * - * It uses common api like ocfs2_xattr_set, so the caller - * must not hold any lock expect i_rwsem. 
*/ int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr) { int ret = 0; - struct buffer_head *dir_bh = NULL; ret = ocfs2_init_security_get(inode, dir, qstr, NULL); if (ret) { @@ -7220,17 +7217,10 @@ int ocfs2_init_security_and_acl(struct inode *dir, goto leave; } - ret = ocfs2_inode_lock(dir, &dir_bh, 0); - if (ret) { - mlog_errno(ret); - goto leave; - } ret = ocfs2_init_acl(NULL, inode, dir, NULL, dir_bh, NULL, NULL); if (ret) mlog_errno(ret); - ocfs2_inode_unlock(dir, 0); - brelse(dir_bh); leave: return ret; } diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f18..b27fd8ba00196a 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -83,6 +83,7 @@ int ocfs2_reflink_xattrs(struct inode *old_inode, struct buffer_head *new_bh, bool preserve_security); int ocfs2_init_security_and_acl(struct inode *dir, + struct buffer_head *dir_bh, struct inode *inode, const struct qstr *qstr); #endif /* OCFS2_XATTR_H */ From 00fa8ad741b31a922fb05b21688867d83b5010b1 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Wed, 20 Jul 2022 14:41:58 -0700 Subject: [PATCH 0881/1250] ocfs2: clear links count in ocfs2_mknod() if an error occurs In this condition, the inode cannot be wiped when an error happens. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() ->ocfs2_set_links_count() // i_links_count is 2 -> ... // an error occurs, goto roll_back or leave. ->ocfs2_commit_trans() ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() ->ocfs2_refresh_inode() ->set_nlink(); // inode->i_nlink is 2 now. /* if wipe is 0, it will goto bail_unlock_inode */ ->ocfs2_query_inode_wipe() ->if (inode->i_nlink) return; // wipe is 0. /* inode can not be wiped */ ->ocfs2_wipe_inode() So, we need to clear the links count before the transaction is committed.
Link: http://lkml.kernel.org/r/d8147c41-fb2b-bdf7-b660-1f3c8448c33f@huawei.com Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index e3dd30dd3547ff..ea27e63ec278fc 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -453,8 +453,12 @@ static int ocfs2_mknod(struct user_namespace *mnt_userns, leave: if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) @@ -598,6 +602,8 @@ static int __ocfs2_mknod_locked(struct inode *dir, leave: if (status < 0) { if (*new_fe_bh) { + if (fe) + ocfs2_set_links_count(fe, 0); brelse(*new_fe_bh); *new_fe_bh = NULL; } @@ -2027,8 +2033,12 @@ static int ocfs2_symlink(struct user_namespace *mnt_userns, ocfs2_clusters_to_bytes(osb->sb, 1)); if (status < 0 && did_quota_inode) dquot_free_inode(inode); - if (handle) + if (handle) { + if (status < 0 && new_fe_bh != NULL) + ocfs2_set_links_count((struct ocfs2_dinode *) + new_fe_bh->b_data, 0); ocfs2_commit_trans(osb, handle); + } ocfs2_inode_unlock(dir, 1); if (did_block_signals) From 07f5f687c3813a024b5d9a890a1c2b8948c6ad14 Mon Sep 17 00:00:00 2001 From: Wangyan Date: Wed, 20 Jul 2022 14:41:58 -0700 Subject: [PATCH 0882/1250] ocfs2: fix ocfs2 corrupt when iputting an inode In this condition, it will cause a bug on error. ocfs2_mkdir() ->ocfs2_mknod() ->ocfs2_mknod_locked() ->__ocfs2_mknod_locked() //Assume inode->i_generation is genN. ->inode->i_generation = osb->s_next_generation++; // The inode lockres has been initialized.
->ocfs2_populate_inode() ->ocfs2_create_new_inode_locks() ->An error happened, returned value is non-zero // free the start_bit x in bg_blkno ->ocfs2_free_suballoc_bits() ->... /* Another process execute mkdir success in this place, and it occupied the start_bit x in bg_blkno which has been freed before. Its inode->i_generation is genN + 1 */ ->iput(inode) ->evict() ->ocfs2_evict_inode() ->ocfs2_delete_inode() ->ocfs2_inode_lock() ->ocfs2_inode_lock_update() /* Bug on here, genN != genN + 1 */ ->mlog_bug_on_msg(inode->i_generation != le32_to_cpu(fe->i_generation)) So, we need not to reclaim the inode when the inode->ip_inode_lockres has been initialized. It will be freed in iput(). Link: http://lkml.kernel.org/r/ef080ca3-5d74-e276-17a1-d9e7c7e662c9@huawei.com Fixes: b1529a41f777 ("ocfs2: should reclaim the inode if '__ocfs2_mknod_locked' returns an error") Signed-off-by: Yan Wang Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Gang He Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ea27e63ec278fc..7d7f2b8f0554e0 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -640,7 +640,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh, parent_fe_bh, handle, inode_ac, fe_blkno, suballoc_loc, suballoc_bit); - if (status < 0) { + if (status < 0 && !(OCFS2_I(inode)->ip_inode_lockres.l_flags & + OCFS2_LOCK_INITIALIZED)) { u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit); int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode, inode_ac->ac_bh, suballoc_bit, bg_blkno, 1); From 364ae57439fe1cdc76dc91cd217e3221409615e0 Mon Sep 17 00:00:00 2001 From: Andrew Halaney Date: Wed, 20 Jul 2022 14:41:58 -0700 Subject: [PATCH 0883/1250] init/main.c: silence some -Wunused-parameter warnings There are a bunch of callbacks with unused 
arguments, go ahead and silence those so "make KCFLAGS=-W init/main.o" is a little quieter. Here's a little sample: init/main.c:182:43: warning: unused parameter 'str' [-Wunused-parameter] static int __init set_reset_devices(char *str) Link: https://lkml.kernel.org/r/20210519162341.1275452-1-ahalaney@redhat.com Signed-off-by: Andrew Halaney Cc: Rasmus Villemoes Signed-off-by: Andrew Morton --- init/main.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/init/main.c b/init/main.c index 0ee39cdcfcac97..0f452ae3b20f54 100644 --- a/init/main.c +++ b/init/main.c @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(static_key_initialized); unsigned int reset_devices; EXPORT_SYMBOL(reset_devices); -static int __init set_reset_devices(char *str) +static int __init set_reset_devices(char *str __always_unused) { reset_devices = 1; return 1; @@ -231,13 +231,13 @@ static bool __init obsolete_checksetup(char *line) unsigned long loops_per_jiffy = (1<<12); EXPORT_SYMBOL(loops_per_jiffy); -static int __init debug_kernel(char *str) +static int __init debug_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } -static int __init quiet_kernel(char *str) +static int __init quiet_kernel(char *str __always_unused) { console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; @@ -474,7 +474,7 @@ static void __init setup_boot_config(void) get_boot_config_from_initrd(NULL); } -static int __init warn_bootconfig(char *str) +static int __init warn_bootconfig(char *str __always_unused) { pr_warn("WARNING: 'bootconfig' found on the kernel command line but CONFIG_BOOT_CONFIG is not set.\n"); return 0; @@ -503,7 +503,8 @@ static void __init repair_env_string(char *param, char *val) /* Anything after -- gets handed straight to init. 
*/ static int __init set_init_arg(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { unsigned int i; @@ -528,7 +529,8 @@ static int __init set_init_arg(char *param, char *val, * unused parameters (modprobe will find them in /proc/cmdline). */ static int __init unknown_bootoption(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { size_t len = strlen(param); @@ -728,7 +730,8 @@ noinline void __ref rest_init(void) /* Check for early params. */ static int __init do_early_param(char *param, char *val, - const char *unused, void *arg) + const char *unused __always_unused, + void *arg __always_unused) { const struct obs_kernel_param *p; @@ -1347,8 +1350,10 @@ static const char *initcall_level_names[] __initdata = { "late", }; -static int __init ignore_unknown_bootoption(char *param, char *val, - const char *unused, void *arg) +static int __init ignore_unknown_bootoption(char *param __always_unused, + char *val __always_unused, + const char *unused __always_unused, + void *arg __always_unused) { return 0; } @@ -1487,7 +1492,7 @@ void __weak free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -static int __ref kernel_init(void *unused) +static int __ref kernel_init(void *unused __always_unused) { int ret; From 03fdeceace308e2a1a023c4161be690da6d0d68c Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 11 Jun 2022 21:06:34 +0800 Subject: [PATCH 0884/1250] lib/debugobjects: fix stat count and optimize debug_objects_mem_init. 1. Var debug_objects_allocated tracks valid kmem_cache_alloc calls, so track it in debug_objects_replace_static_objects. Do similar things in object_cpu_offline. 2. In debug_objects_mem_init, there is no need to call function cpuhp_setup_state_nocalls when debug_objects_enabled = 0 (out of memory). 
Link: https://lkml.kernel.org/r/20220611130634.99741-1-wuchi.zero@gmail.com Signed-off-by: wuchi Cc: Thomas Gleixner Cc: Christoph Hellwig Cc: Kees Cook Cc: Waiman Long Signed-off-by: Andrew Morton --- lib/debugobjects.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 337d797a714163..6f8e5dd1dcd0c8 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -437,6 +437,7 @@ static int object_cpu_offline(unsigned int cpu) struct debug_percpu_free *percpu_pool; struct hlist_node *tmp; struct debug_obj *obj; + unsigned long flags; /* Remote access is safe as the CPU is dead already */ percpu_pool = per_cpu_ptr(&percpu_obj_pool, cpu); @@ -444,6 +445,12 @@ static int object_cpu_offline(unsigned int cpu) hlist_del(&obj->node); kmem_cache_free(obj_cache, obj); } + + raw_spin_lock_irqsave(&pool_lock, flags); + obj_pool_used -= percpu_pool->obj_free; + debug_objects_freed += percpu_pool->obj_free; + raw_spin_unlock_irqrestore(&pool_lock, flags); + percpu_pool->obj_free = 0; return 0; @@ -1318,6 +1325,8 @@ static int __init debug_objects_replace_static_objects(void) hlist_add_head(&obj->node, &objects); } + debug_objects_allocated += i; + /* * debug_objects_mem_init() is now called early that only one CPU is up * and interrupts have been disabled, so it is safe to replace the @@ -1386,6 +1395,7 @@ void __init debug_objects_mem_init(void) debug_objects_enabled = 0; kmem_cache_destroy(obj_cache); pr_warn("out of memory.\n"); + return; } else debug_objects_selftest(); From d0b0d120f31890ee0d78a1546cc05f6011698912 Mon Sep 17 00:00:00 2001 From: Hsin-Yi Wang Date: Fri, 17 Jun 2022 16:38:09 +0800 Subject: [PATCH 0885/1250] Revert "squashfs: provide backing_dev_info in order to disable read-ahead" Patch series "Implement readahead for squashfs", v7. Commit 9eec1d897139("squashfs: provide backing_dev_info in order to disable read-ahead") mitigates the performance drop issue for squashfs by closing readahead for it. 
This series implements readahead callback for squashfs. This patch (of 4): This reverts 9eec1d897139e5 ("squashfs: provide backing_dev_info in order to disable read-ahead"). Revert closing the readahead to squashfs since the readahead callback for squashfs is implemented. Link: https://lkml.kernel.org/r/20220617083810.337573-1-hsinyi@chromium.org Link: https://lkml.kernel.org/r/20220617083810.337573-2-hsinyi@chromium.org Signed-off-by: Hsin-Yi Wang Suggested-by: Xiongwei Song Cc: Phillip Lougher Cc: Matthew Wilcox Cc: Marek Szyprowski Cc: Zheng Liang Cc: Zhang Yi Cc: Hou Tao Cc: Miao Xie Cc: kernel test robot Signed-off-by: Andrew Morton --- fs/squashfs/super.c | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 6d594ba2ed28ff..32565dafa7f3ba 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -113,24 +112,6 @@ static const struct squashfs_decompressor *supported_squashfs_filesystem( return decompressor; } -static int squashfs_bdi_init(struct super_block *sb) -{ - int err; - unsigned int major = MAJOR(sb->s_dev); - unsigned int minor = MINOR(sb->s_dev); - - bdi_put(sb->s_bdi); - sb->s_bdi = &noop_backing_dev_info; - - err = super_setup_bdi_name(sb, "squashfs_%u_%u", major, minor); - if (err) - return err; - - sb->s_bdi->ra_pages = 0; - sb->s_bdi->io_pages = 0; - - return 0; -} static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) { @@ -146,20 +127,6 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) TRACE("Entered squashfs_fill_superblock\n"); - /* - * squashfs provides 'backing_dev_info' in order to disable read-ahead. For - * squashfs, I/O is not deferred, it is done immediately in read_folio, - * which means the user would always have to wait their own I/O. So the effect - * of readahead is very weak for squashfs. 
squashfs_bdi_init will set - * sb->s_bdi->ra_pages and sb->s_bdi->io_pages to 0 and close readahead for - * squashfs. - */ - err = squashfs_bdi_init(sb); - if (err) { - errorf(fc, "squashfs init bdi failed"); - return err; - } - sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); if (sb->s_fs_info == NULL) { ERROR("Failed to allocate squashfs_sb_info\n"); From 618d914b017c4b0dd750aa1f5a1de0da2a051808 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 17 Jun 2022 16:38:11 +0800 Subject: [PATCH 0886/1250] squashfs: always build "file direct" version of page actor Squashfs_readahead uses the "file direct" version of the page actor, and so build it unconditionally. Link: https://lkml.kernel.org/r/20220617083810.337573-3-hsinyi@chromium.org Signed-off-by: Phillip Lougher Signed-off-by: Hsin-Yi Wang Reported-by: kernel test robot Cc: Hou Tao Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Miao Xie Cc: Xiongwei Song Cc: Zhang Yi Cc: Zheng Liang Signed-off-by: Andrew Morton --- fs/squashfs/Makefile | 4 ++-- fs/squashfs/page_actor.h | 46 ---------------------------------------- 2 files changed, 2 insertions(+), 48 deletions(-) diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 7bd9b8b856d0bf..477c89a519ee88 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -5,9 +5,9 @@ obj-$(CONFIG_SQUASHFS) += squashfs.o squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o -squashfs-y += namei.o super.o symlink.o decompressor.o +squashfs-y += namei.o super.o symlink.o decompressor.o page_actor.o squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o -squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o +squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o diff --git a/fs/squashfs/page_actor.h 
b/fs/squashfs/page_actor.h index 37523c54256fa7..24841d28bc0fb8 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -6,51 +6,6 @@ * Phillip Lougher */ -#ifndef CONFIG_SQUASHFS_FILE_DIRECT -struct squashfs_page_actor { - void **page; - int pages; - int length; - int next_page; -}; - -static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page, - int pages, int length) -{ - struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL); - - if (actor == NULL) - return NULL; - - actor->length = length ? : pages * PAGE_SIZE; - actor->page = page; - actor->pages = pages; - actor->next_page = 0; - return actor; -} - -static inline void *squashfs_first_page(struct squashfs_page_actor *actor) -{ - actor->next_page = 1; - return actor->page[0]; -} - -static inline void *squashfs_next_page(struct squashfs_page_actor *actor) -{ - return actor->next_page == actor->pages ? NULL : - actor->page[actor->next_page++]; -} - -static inline void squashfs_finish_page(struct squashfs_page_actor *actor) -{ - /* empty */ -} - -static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) -{ - /* empty */ -} -#else struct squashfs_page_actor { union { void **buffer; @@ -91,4 +46,3 @@ static inline void squashfs_actor_nobuff(struct squashfs_page_actor *actor) actor->alloc_buffer = 0; } #endif -#endif From b98ee2f54b9cb6bf3df6175beb801d0ae989c141 Mon Sep 17 00:00:00 2001 From: Hsin-Yi Wang Date: Fri, 17 Jun 2022 16:38:13 +0800 Subject: [PATCH 0887/1250] squashfs: implement readahead Implement readahead callback for squashfs. It will read datablocks which cover pages in readahead request. For a few cases it will not mark page as uptodate, including: - file end is 0. - zero filled blocks. - current batch of pages isn't in the same datablock. - decompressor error. Otherwise pages will be marked as uptodate. The unhandled pages will be updated by readpage later. 
Link: https://lkml.kernel.org/r/20220617083810.337573-4-hsinyi@chromium.org Signed-off-by: Hsin-Yi Wang Suggested-by: Matthew Wilcox Reported-by: Matthew Wilcox Reported-by: Phillip Lougher Reported-by: Xiongwei Song Reported-by: Andrew Morton Cc: Hou Tao Cc: kernel test robot Cc: Marek Szyprowski Cc: Miao Xie Cc: Zhang Yi Cc: Zheng Liang Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 92 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index a8e495d8eb8600..128ebe9aded87d 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -39,6 +39,7 @@ #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" +#include "page_actor.h" /* * Locate cache slot in range [offset, index] for specified inode. If @@ -495,7 +496,96 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) return 0; } +static void squashfs_readahead(struct readahead_control *ractl) +{ + struct inode *inode = ractl->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + size_t mask = (1UL << msblk->block_log) - 1; + unsigned short shift = msblk->block_log - PAGE_SHIFT; + loff_t start = readahead_pos(ractl) & ~mask; + size_t len = readahead_length(ractl) + readahead_pos(ractl) - start; + struct squashfs_page_actor *actor; + unsigned int nr_pages = 0; + struct page **pages; + int i, file_end = i_size_read(inode) >> msblk->block_log; + unsigned int max_pages = 1UL << shift; + + readahead_expand(ractl, start, (len | mask) + 1); + + if (file_end == 0) + return; + + pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return; + + for (;;) { + pgoff_t index; + int res, bsize; + u64 block = 0; + unsigned int expected; + + nr_pages = __readahead_batch(ractl, pages, max_pages); + if (!nr_pages) + break; + + if (readahead_pos(ractl) >= i_size_read(inode)) + goto skip_pages; + + index = pages[0]->index >> shift; + if 
((pages[nr_pages - 1]->index >> shift) != index) + goto skip_pages; + + expected = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + bsize = read_blocklist(inode, index, &block); + if (bsize == 0) + goto skip_pages; + + actor = squashfs_page_actor_init_special(msblk, pages, nr_pages, + expected); + if (!actor) + goto skip_pages; + + res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); + + kfree(actor); + + if (res == expected) { + int bytes; + + /* Last page (if present) may have trailing bytes not filled */ + bytes = res % PAGE_SIZE; + if (pages[nr_pages - 1]->index == file_end && bytes) + memzero_page(pages[nr_pages - 1], bytes, + PAGE_SIZE - bytes); + + for (i = 0; i < nr_pages; i++) { + flush_dcache_page(pages[i]); + SetPageUptodate(pages[i]); + } + } + + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + } + + kfree(pages); + return; + +skip_pages: + for (i = 0; i < nr_pages; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + kfree(pages); +} const struct address_space_operations squashfs_aops = { - .read_folio = squashfs_read_folio + .read_folio = squashfs_read_folio, + .readahead = squashfs_readahead }; From 69fea72945ea435191b9915d8adc5b5625d44878 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 17 Jun 2022 16:38:15 +0800 Subject: [PATCH 0888/1250] squashfs: support reading fragments in readahead call Add a function which can be used to read fragments in the readahead call. This function is necessary because filesystems built with the -tailends (or -always-use-fragments) option may have fragments present which cannot be currently handled. 
Link: https://lkml.kernel.org/r/20220617083810.337573-5-hsinyi@chromium.org Signed-off-by: Phillip Lougher Signed-off-by: Hsin-Yi Wang Cc: Hou Tao Cc: kernel test robot Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Miao Xie Cc: Xiongwei Song Cc: Zhang Yi Cc: Zheng Liang Signed-off-by: Andrew Morton --- fs/squashfs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index 128ebe9aded87d..7ff0b03cceab01 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -496,6 +496,41 @@ static int squashfs_read_folio(struct file *file, struct folio *folio) return 0; } +static int squashfs_readahead_fragment(struct page **page, + unsigned int pages, unsigned int expected) +{ + struct inode *inode = page[0]->mapping->host; + struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + + if (buffer->error) + goto out; + + expected += squashfs_i(inode)->fragment_offset; + + for (n = 0; n < pages; n++) { + unsigned int base = (page[n]->index & mask) << PAGE_SHIFT; + unsigned int offset = base + squashfs_i(inode)->fragment_offset; + + if (expected > offset) { + unsigned int avail = min_t(unsigned int, expected - + offset, PAGE_SIZE); + + squashfs_fill_page(page[n], buffer, offset, avail); + } + + unlock_page(page[n]); + put_page(page[n]); + } + +out: + squashfs_cache_put(buffer); + return buffer->error; +} + static void squashfs_readahead(struct readahead_control *ractl) { struct inode *inode = ractl->mapping->host; @@ -512,9 +547,6 @@ static void squashfs_readahead(struct readahead_control *ractl) readahead_expand(ractl, start, (len | mask) + 1); - if (file_end == 0) - return; - pages = kmalloc_array(max_pages, sizeof(void *), GFP_KERNEL); if (!pages) return; @@ 
-540,6 +572,15 @@ static void squashfs_readahead(struct readahead_control *ractl) (i_size_read(inode) & (msblk->block_size - 1)) : msblk->block_size; + if (index == file_end && squashfs_i(inode)->fragment_block != + SQUASHFS_INVALID_BLK) { + res = squashfs_readahead_fragment(pages, nr_pages, + expected); + if (res) + goto skip_pages; + continue; + } + bsize = read_blocklist(inode, index, &block); if (bsize == 0) goto skip_pages; From 1408bd8fa33ed23959cf7ce993a0c479a4307955 Mon Sep 17 00:00:00 2001 From: Jiangshan Yi Date: Thu, 14 Jul 2022 09:54:41 +0800 Subject: [PATCH 0889/1250] lib/lzo/lzo1x_compress.c: replace ternary operator with min() and min_t() Fix the following coccicheck warning: lib/lzo/lzo1x_compress.c:54: WARNING opportunity for min(). lib/lzo/lzo1x_compress.c:329: WARNING opportunity for min(). min() and min_t() macro is defined in include/linux/minmax.h. It avoids multiple evaluations of the arguments when non-constant and performs strict type-checking. Link: https://lkml.kernel.org/r/20220714015441.1313036-1-13667453960@163.com Signed-off-by: Jiangshan Yi Tested-by: Dave Rodgman Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- lib/lzo/lzo1x_compress.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c index 76758e9296ba65..9d31e7126606ac 100644 --- a/lib/lzo/lzo1x_compress.c +++ b/lib/lzo/lzo1x_compress.c @@ -50,9 +50,7 @@ lzo1x_1_do_compress(const unsigned char *in, size_t in_len, if (dv == 0 && bitstream_version) { const unsigned char *ir = ip + 4; - const unsigned char *limit = ip_end - < (ip + MAX_ZERO_RUN_LENGTH + 1) - ? 
ip_end : ip + MAX_ZERO_RUN_LENGTH + 1; + const unsigned char *limit = min(ip_end, ip + MAX_ZERO_RUN_LENGTH + 1); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && \ defined(LZO_FAST_64BIT_MEMORY_ACCESS) u64 dv64; @@ -326,7 +324,7 @@ static int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, data_start = op; while (l > 20) { - size_t ll = l <= (m4_max_offset + 1) ? l : (m4_max_offset + 1); + size_t ll = min_t(size_t, l, m4_max_offset + 1); uintptr_t ll_end = (uintptr_t) ip + ll; if ((ll_end + ((t + ll) >> 5)) <= ll_end) break; From 314f7a3467c6018825c2e84ad9d76d7a14215be5 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Thu, 14 Jul 2022 08:47:44 +0100 Subject: [PATCH 0890/1250] kernel/hung_task: fix address space of proc_dohung_task_timeout_secs The proc_dohung_task_timeout_secs() function is incorrectly marked as having a __user buffer as argument 3. However this is not the case and it is causing multiple sparse warnings. Fix the following warnings by removing __user from the argument: kernel/hung_task.c:237:52: warning: incorrect type in argument 3 (different address spaces) kernel/hung_task.c:237:52: expected void * kernel/hung_task.c:237:52: got void [noderef] __user *buffer kernel/hung_task.c:287:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces)) kernel/hung_task.c:287:35: expected int ( [usertype] *proc_handler )( ... ) kernel/hung_task.c:287:35: got int ( * )( ... ) kernel/hung_task.c:295:35: warning: incorrect type in initializer (incompatible argument 3 (different address spaces)) kernel/hung_task.c:295:35: expected int ( [usertype] *proc_handler )( ... ) kernel/hung_task.c:295:35: got int ( * )( ... 
) Link: https://lkml.kernel.org/r/20220714074744.189017-1-ben.dooks@sifive.com Signed-off-by: Ben Dooks Cc: Signed-off-by: Andrew Morton --- kernel/hung_task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index cff3ae8c818fd3..bb2354f73dedca 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -229,7 +229,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; From 18a5e6e6678b49392a87cebd48b76f8f42731254 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 19 Jul 2022 16:33:49 +0800 Subject: [PATCH 0891/1250] bdi: remove enum wb_congested_state enum wb_congested_state and the member 'congested' in bdi_writeback are useless since commit a88f2096d5a2 ("remove congestion tracking framework"), so remove it. Link: https://lkml.kernel.org/r/20220719083349.87547-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Cc: Jan Kara Cc: NeilBrown Signed-off-by: Andrew Morton --- include/linux/backing-dev-defs.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index e863c88df95f97..ae12696ec492c6 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -28,11 +28,6 @@ enum wb_state { WB_start_all, /* nr_pages == 0 (all) work pending */ }; -enum wb_congested_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ -}; - enum wb_stat_item { WB_RECLAIMABLE, WB_WRITEBACK, @@ -122,8 +117,6 @@ struct bdi_writeback { atomic_t writeback_inodes; /* number of inodes under writeback */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; - unsigned long congested; /* WB_[a]sync_congested flags */ - unsigned long bw_time_stamp; /* last time write 
bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ From 108d5cecd2a6eb037a876f727eeced185bb46ad4 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Tue, 19 Jul 2022 14:31:51 -0300 Subject: [PATCH 0892/1250] smb2: small refactor in smb2_check_message() If the command is SMB2_IOCTL, OutputLength and OutputContext are optional and can be zero, so return early and skip calculated length check. Move the mismatched length message to the end of the check, to avoid unnecessary logs when the check was not a real miscalculation. Also change the pr_warn_once() to a pr_warn() so we're sure to get a log for the real mismatches. Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- fs/cifs/connect.c | 13 ++++++------- fs/cifs/smb2misc.c | 47 +++++++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index fdd8452b8450d4..8859da70cb060d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1039,19 +1039,18 @@ int cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) { char *buf = server->large_buf ? server->bigbuf : server->smallbuf; - int length; + int rc; /* * We know that we received enough to get to the MID as we * checked the pdu_length earlier. Now check to see - * if the rest of the header is OK. We borrow the length - * var for the rest of the loop to avoid a new stack var. + * if the rest of the header is OK. * * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. 
*/ - length = server->ops->check_message(buf, server->total_read, server); - if (length != 0) + rc = server->ops->check_message(buf, server->total_read, server); + if (rc) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); @@ -1066,9 +1065,9 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) return -1; if (!mid) - return length; + return rc; - handle_mid(mid, server, buf, length); + handle_mid(mid, server, buf, rc); return 0; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index db0f27fd373b6f..818cc4dee0e2e1 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -132,15 +132,15 @@ static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, } int -smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) +smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) { struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)shdr; - __u64 mid; - __u32 clc_len; /* calculated length */ - int command; - int pdu_size = sizeof(struct smb2_pdu); int hdr_size = sizeof(struct smb2_hdr); + int pdu_size = sizeof(struct smb2_pdu); + int command; + __u32 calc_len; /* calculated length */ + __u64 mid; /* * Add function to do table lookup of StructureSize by command @@ -154,7 +154,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); - list_for_each_entry(iter, &srvr->smb_ses_list, smb_ses_list) { + list_for_each_entry(iter, &server->smb_ses_list, smb_ses_list) { if (iter->Suid == le64_to_cpu(thdr->SessionId)) { ses = iter; break; @@ -221,30 +221,33 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) } } - clc_len = smb2_calc_size(buf, srvr); + calc_len = smb2_calc_size(buf, server); + + /* For SMB2_IOCTL, OutputOffset and OutputLength are optional, so might + * be 0, and not a real miscalculation 
*/ + if (command == SMB2_IOCTL_HE && calc_len == 0) + return 0; - if (shdr->Command == SMB2_NEGOTIATE) - clc_len += get_neg_ctxt_len(shdr, len, clc_len); + if (command == SMB2_NEGOTIATE_HE) + calc_len += get_neg_ctxt_len(shdr, len, calc_len); - if (len != clc_len) { - cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n", - clc_len, len, mid); + if (len != calc_len) { /* create failed on symlink */ if (command == SMB2_CREATE_HE && shdr->Status == STATUS_STOPPED_ON_SYMLINK) return 0; /* Windows 7 server returns 24 bytes more */ - if (clc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE) + if (calc_len + 24 == len && command == SMB2_OPLOCK_BREAK_HE) return 0; /* server can return one byte more due to implied bcc[0] */ - if (clc_len == len + 1) + if (calc_len == len + 1) return 0; /* * Some windows servers (win2016) will pad also the final * PDU in a compound to 8 bytes. */ - if (((clc_len + 7) & ~7) == len) + if (((calc_len + 7) & ~7) == len) return 0; /* @@ -253,12 +256,18 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) * SMB2/SMB3 frame length (header + smb2 response specific data) * Some windows servers also pad up to 8 bytes when compounding. */ - if (clc_len < len) + if (calc_len < len) return 0; - pr_warn_once( - "srv rsp too short, len %d not %d. cmd:%d mid:%llu\n", - len, clc_len, command, mid); + /* Only log a message if len was really miscalculated */ + if (unlikely(cifsFYI)) + cifs_dbg(FYI, "Server response too short: calculated " + "length %u doesn't match read length %u (cmd=%d, mid=%llu)\n", + calc_len, len, command, mid); + else + pr_warn("Server response too short: calculated length " + "%u doesn't match read length %u (cmd=%d, mid=%llu)\n", + calc_len, len, command, mid); return 1; } From b8efc026a05c5f8ba5fdda7334ef64c9c976a81b Mon Sep 17 00:00:00 2001 From: "Fabio M. 
De Francesco" Date: Wed, 6 Jul 2022 13:15:19 +0200 Subject: [PATCH 0893/1250] highmem: Make __kunmap_{local,atomic}() take const void pointer __kunmap_{local,atomic}() currently take pointers to void. However, this is semantically incorrect, since these functions do not change the memory their arguments point to. Therefore, make these semantics explicit by modifying the __kunmap_{local,atomic}() prototypes to take pointers to const void. As a side effect, compilers may produce more efficient code. Acked-by: Andrew Morton Acked-by: Helge Deller # parisc Suggested-by: David Sterba Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Reviewed-by: David Sterba Signed-off-by: David Sterba --- arch/parisc/include/asm/cacheflush.h | 6 +++--- arch/parisc/kernel/cache.c | 2 +- include/linux/highmem-internal.h | 10 +++++----- mm/highmem.c | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 8d03b3b26229e7..0bdee672413206 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -22,7 +22,7 @@ void flush_kernel_icache_range_asm(unsigned long, unsigned long); void flush_user_dcache_range_asm(unsigned long, unsigned long); void flush_kernel_dcache_range_asm(unsigned long, unsigned long); void purge_kernel_dcache_range_asm(unsigned long, unsigned long); -void flush_kernel_dcache_page_asm(void *); +void flush_kernel_dcache_page_asm(const void *addr); void flush_kernel_icache_page(void *); /* Cache flush operations */ @@ -31,7 +31,7 @@ void flush_cache_all_local(void); void flush_cache_all(void); void flush_cache_mm(struct mm_struct *mm); -void flush_kernel_dcache_page_addr(void *addr); +void flush_kernel_dcache_page_addr(const void *addr); #define flush_kernel_dcache_range(start,size) \ flush_kernel_dcache_range_asm((start), (start)+(size)); @@ -75,7 +75,7 @@ void flush_dcache_page_asm(unsigned long phys_addr, 
unsigned long vaddr); void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr); #define ARCH_HAS_FLUSH_ON_KUNMAP -static inline void kunmap_flush_on_unmap(void *addr) +static inline void kunmap_flush_on_unmap(const void *addr) { flush_kernel_dcache_page_addr(addr); } diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index a9bc578e4c52e5..993999a65e5448 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -549,7 +549,7 @@ extern void purge_kernel_dcache_page_asm(unsigned long); extern void clear_user_page_asm(void *, unsigned long); extern void copy_user_page_asm(void *, void *, unsigned long); -void flush_kernel_dcache_page_addr(void *addr) +void flush_kernel_dcache_page_addr(const void *addr) { unsigned long flags; diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index cddb42ff04730b..034b1106d02285 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -8,7 +8,7 @@ #ifdef CONFIG_KMAP_LOCAL void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); -void kunmap_local_indexed(void *vaddr); +void kunmap_local_indexed(const void *vaddr); void kmap_local_fork(struct task_struct *tsk); void __kmap_local_sched_out(void); void __kmap_local_sched_in(void); @@ -89,7 +89,7 @@ static inline void *kmap_local_pfn(unsigned long pfn) return __kmap_local_pfn_prot(pfn, kmap_prot); } -static inline void __kunmap_local(void *vaddr) +static inline void __kunmap_local(const void *vaddr) { kunmap_local_indexed(vaddr); } @@ -121,7 +121,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn) return __kmap_local_pfn_prot(pfn, kmap_prot); } -static inline void __kunmap_atomic(void *addr) +static inline void __kunmap_atomic(const void *addr) { kunmap_local_indexed(addr); pagefault_enable(); @@ -197,7 +197,7 @@ static inline void *kmap_local_pfn(unsigned long pfn) return 
kmap_local_page(pfn_to_page(pfn)); } -static inline void __kunmap_local(void *addr) +static inline void __kunmap_local(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(addr); @@ -224,7 +224,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn) return kmap_atomic(pfn_to_page(pfn)); } -static inline void __kunmap_atomic(void *addr) +static inline void __kunmap_atomic(const void *addr) { #ifdef ARCH_HAS_FLUSH_ON_KUNMAP kunmap_flush_on_unmap(addr); diff --git a/mm/highmem.c b/mm/highmem.c index 1a692997fac4c9..e32083e4ce0d33 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -561,7 +561,7 @@ void *__kmap_local_page_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(__kmap_local_page_prot); -void kunmap_local_indexed(void *vaddr) +void kunmap_local_indexed(const void *vaddr) { unsigned long addr = (unsigned long) vaddr & PAGE_MASK; pte_t *kmap_pte; From 9e15f58d00880ab8049332d5f8af7e48b88c1d1b Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 6 Jul 2022 13:15:20 +0200 Subject: [PATCH 0894/1250] btrfs: zstd: replace kmap() with kmap_local_page() The use of kmap() is being deprecated in favor of kmap_local_page(). With kmap_local_page(), the mapping is per thread, CPU local and not globally visible. Therefore, use kmap_local_page() / kunmap_local() in zstd.c because in this file the mappings are per thread and are not visible in other contexts. In the meanwhile use plain page_address() on output pages allocated with the GFP_NOFS flag instead of calling kmap*() on them (since they are always allocated from ZONE_NORMAL). Tested with xfstests on QEMU + KVM 32 bits VM with 4GB of RAM, booting a kernel with HIGHMEM64G enabled. Suggested-by: Ira Weiny Signed-off-by: Fabio M. 
De Francesco Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zstd.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 0fe31a6f6e68f0..35a0224d4eb7fa 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -403,7 +403,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -415,7 +415,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -450,9 +450,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -462,7 +460,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -477,13 +475,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; - kunmap(in_page); + kunmap_local(workspace->in_buf.src); put_page(in_page); - start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> 
PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -510,9 +507,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -522,7 +517,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -537,13 +532,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = tot_out; out: *out_pages = nr_pages; - /* Cleanup */ - if (in_page) { - kunmap(in_page); + if (workspace->in_buf.src) { + kunmap_local(workspace->in_buf.src); put_page(in_page); } - if (out_page) - kunmap(out_page); return ret; } @@ -567,7 +559,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -603,14 +595,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - kunmap(pages_in[page_in_index++]); + kunmap_local(workspace->in_buf.src); + page_in_index++; if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ 
-619,7 +612,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) zero_fill_bio(cb->orig_bio); done: if (workspace->in_buf.src) - kunmap(pages_in[page_in_index]); + kunmap_local(workspace->in_buf.src); return ret; } From 8dc1477382490a978f70acf4121489e8b901c21d Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Mon, 27 Jun 2022 18:33:05 +0200 Subject: [PATCH 0895/1250] btrfs: zlib: replace kmap() with kmap_local_page() in zlib_compress_pages() The use of kmap() is being deprecated in favor of kmap_local_page(). With kmap_local_page(), the mapping is per thread, CPU local and not globally visible. Therefore, use kmap_local_page() / kunmap_local() in zlib_compress_pages() because in this function the mappings are per thread and are not visible in other contexts. Furthermore, drop the mappings of "out_page" which is allocated within zlib_compress_pages() with alloc_page(GFP_NOFS) and use page_address(). Tested with xfstests on a QEMU + KVM 32-bits VM with 4GB of RAM booting a kernel with HIGHMEM64G enabled. This patch passes 26/26 tests of group "compress". CC: Qu Wenruo Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Signed-off-by: Fabio M. 
De Francesco Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 767a0c6c9694b0..82a43ac9038789 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - char *data_in; + char *data_in = NULL; char *cpage_out; int nr_pages = 0; struct page *in_page = NULL; @@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,26 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -196,9 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -207,7 +205,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = 
kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -234,9 +232,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -245,7 +241,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -264,13 +260,11 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } + return ret; } From 24c3d5ef97909405326428d08e5ec40a1e44ff05 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Sat, 18 Jun 2022 11:19:01 +0200 Subject: [PATCH 0896/1250] btrfs: zlib: replace kmap() with kmap_local_page() in zlib_decompress_bio() The use of kmap() is being deprecated in favor of kmap_local_page(). With kmap_local_page(), the mapping is per thread, CPU local and not globally visible. Therefore, use kmap_local_page() / kunmap_local() in zlib_decompress_bio() because in this function the mappings are per thread and are not visible in other contexts. Tested with xfstests on QEMU + KVM 32-bits VM with 4GB of RAM and HIGHMEM64G enabled. This patch passes 26/26 tests of group "compress". Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Reviewed-by: Qu Wenruo Signed-off-by: Fabio M. 
De Francesco Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/zlib.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 82a43ac9038789..b4f44662cda7ce 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -281,7 +281,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -303,7 +303,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -330,13 +330,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -349,7 +349,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) done: zlib_inflateEnd(&workspace->strm); if (data_in) - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); if (!ret) zero_fill_bio(cb->orig_bio); return ret; From 6406ac4a931ea342b4bce42079c4c5dc02eab5ed Mon Sep 17 00:00:00 2001 From: "Fabio M. 
De Francesco" Date: Mon, 27 Jun 2022 19:48:49 +0200 Subject: [PATCH 0897/1250] btrfs: replace kmap_atomic() with kmap_local_page() kmap_atomic() is being deprecated in favor of kmap_local_page() where it is feasible. With kmap_local_page() mappings are per thread, CPU local, and not globally visible. The last use of kmap_atomic is in inode.c where the context is atomic [1] and can be safely replaced by kmap_local_page. Tested with xfstests on a QEMU + KVM 32-bits VM with 4GB RAM and booting a kernel with HIGHMEM64GB enabled. [1] https://lore.kernel.org/linux-btrfs/20220601132545.GM20633@twin.jikos.cz/ Suggested-by: Ira Weiny Reviewed-by: Ira Weiny Signed-off-by: Fabio M. De Francesco Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b04280a682316e..b26bb73d9b33c0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -333,9 +333,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = kmap_atomic(cpage); + kaddr = kmap_local_page(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); i++; ptr += cur_size; @@ -346,9 +346,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } else { page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); write_extent_buffer(leaf, kaddr, ptr, size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_page(page); } btrfs_mark_buffer_dirty(leaf); From a55a71ab9c8e465f4bb46ec711e1b31579ef7eba Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 11 Jul 2022 15:22:49 +0100 Subject: [PATCH 0898/1250] btrfs: set the objectid of the btree inode's location key We currently don't use the location key of the btree inode, its content is set to zeroes, 
as it's a special inode that is not persisted (it has no inode item stored in any btree). At btrfs_ino(), an inline function used extensively in btrfs, we have this special check if the given inode's location objectid is 0, and if it is, we return the value stored in the VFS' inode i_ino field instead (which is BTRFS_BTREE_INODE_OBJECTID for the btree inode). To reduce the code at btrfs_ino(), we can simply set the objectid of the btree inode to the value BTRFS_BTREE_INODE_OBJECTID. This eliminates the need to check for the special case of the objectid being zero, with the side effect of reducing the overall code size and having less code to execute, as btrfs_ino() is an inline function. Before: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1620502 189240 29032 1838774 1c0eb6 fs/btrfs/btrfs.ko After: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1617487 189240 29032 1835759 1c02ef fs/btrfs/btrfs.ko Reviewed-by: Nikolay Borisov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 7 ++----- fs/btrfs/disk-io.c | 4 +++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b467264bd1bbd4..a18f90ff16f12c 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -283,11 +283,8 @@ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->location.objectid; - /* - * !ino: btree_inode - * type == BTRFS_ROOT_ITEM_KEY: subvol dir - */ - if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY) + /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */ + if (inode->location.type == BTRFS_ROOT_ITEM_KEY) ino = inode->vfs_inode.i_ino; return ino; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index bcb6807ce19e86..494e55ed370991 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2327,7 +2327,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) extent_map_tree_init(&BTRFS_I(inode)->extent_tree); 
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); - memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); + BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; + BTRFS_I(inode)->location.type = 0; + BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_insert_inode_hash(inode); } From 5f2135526620854f8764c78e120b7a3083440bcf Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 11 Jul 2022 15:22:50 +0100 Subject: [PATCH 0899/1250] btrfs: add optimized btrfs_ino() version for 64 bits systems Currently btrfs_ino() tries to use first the objectid of the inode's location key. This is to avoid truncation of the inode number on 32 bits platforms because the i_ino field of struct inode has the unsigned long type, while the objectid is a 64 bits unsigned type (u64) on every system. This logic was added in commit 33345d01522f81 ("Btrfs: Always use 64bit inode number"). However if we are running on a 64 bits system, we can always directly return the i_ino value from struct inode, which eliminates the need for the special if statement that tests for a location key type of BTRFS_ROOT_ITEM_KEY - in which case i_ino may not have the same value as the objectid in the inode's location objectid, it may have a value of BTRFS_EMPTY_SUBVOL_DIR_OBJECTID, for the case of snapshots of trees with subvolumes/snapshots inside them. So add a special version for 64 bits system that directly returns i_ino of struct inode. This eliminates one branch and reduces the overall code size, since btrfs_ino() is an inline function that is extensively used. 
Before: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1617487 189240 29032 1835759 1c02ef fs/btrfs/btrfs.ko After: $ size fs/btrfs/btrfs.ko text data bss dec hex filename 1612028 189180 29032 1830240 1bed60 fs/btrfs/btrfs.ko Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 15 +++++++++++++++ fs/btrfs/tests/btrfs-tests.c | 1 + 2 files changed, 16 insertions(+) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a18f90ff16f12c..b160b8e124e011 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -279,6 +279,12 @@ static inline void btrfs_insert_inode_hash(struct inode *inode) __insert_inode_hash(inode, h); } +#if BITS_PER_LONG == 32 + +/* + * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so + * we use the inode's location objectid which is a u64 to avoid truncation. + */ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->location.objectid; @@ -289,6 +295,15 @@ static inline u64 btrfs_ino(const struct btrfs_inode *inode) return ino; } +#else + +static inline u64 btrfs_ino(const struct btrfs_inode *inode) +{ + return inode->vfs_inode.i_ino; +} + +#endif + static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { i_size_write(&inode->vfs_inode, size); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d8e56edd69910d..cc9377cf56a33f 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -59,6 +59,7 @@ struct inode *btrfs_new_test_inode(void) return NULL; inode->i_mode = S_IFREG; + inode->i_ino = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; From 8e0207f87eb923aaed426b45261ad020412bdafd Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:38 +0900 Subject: [PATCH 0900/1250] block: add 
bdev_max_segments() helper Add bdev_max_segments() like other queue parameters. Reviewed-by: Johannes Thumshirn Reviewed-by: Jens Axboe Reviewed-by: Christoph Hellwig Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- include/linux/blkdev.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f7b43444c5f8d..62e3ff52ab0337 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1206,6 +1206,11 @@ bdev_max_zone_append_sectors(struct block_device *bdev) return queue_max_zone_append_sectors(bdev_get_queue(bdev)); } +static inline unsigned int bdev_max_segments(struct block_device *bdev) +{ + return queue_max_segments(bdev_get_queue(bdev)); +} + static inline unsigned queue_logical_block_size(const struct request_queue *q) { int retval = 512; From 3ee390bfe8ea09aa4692332e9a1089cedbb20840 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:39 +0900 Subject: [PATCH 0901/1250] btrfs: zoned: revive max_zone_append_bytes This patch is basically a revert of commit 5a80d1c6a270 ("btrfs: zoned: remove max_zone_append_size logic"), but without unnecessary ASSERT and check. The max_zone_append_size will be used as a hint to estimate the number of extents to cover delalloc/writeback region in the later commits. The size of a ZONE APPEND bio is also limited by queue_max_segments(), so this commit considers it to calculate max_zone_append_size. Technically, a bio can be larger than queue_max_segments() * PAGE_SIZE if the pages are contiguous. But, it is safe to consider "queue_max_segments() * PAGE_SIZE" as an upper limit of an extent size to calculate the number of extents needed to write data. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/zoned.c | 17 +++++++++++++++++ fs/btrfs/zoned.h | 1 + 3 files changed, 20 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 64c65d0f7d0676..b2a161227ac518 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1070,6 +1070,8 @@ struct btrfs_fs_info { */ u64 zone_size; + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 79a2d48a525165..bdc533fa80ae68 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -415,6 +415,16 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); + /* + * We limit max_zone_append_size also by max_segments * + * PAGE_SIZE. Technically, we can have multiple pages per segment. But, + * since btrfs adds the pages one by one to a bio, and btrfs cannot + * increase the metadata reservation even if it increases the number of + * extents, it is safe to stick with the limit. 
+ */ + zone_info->max_zone_append_size = + min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, + (u64)bdev_max_segments(bdev) << PAGE_SHIFT); if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -640,6 +650,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 zoned_devices = 0; u64 nr_devices = 0; u64 zone_size = 0; + u64 max_zone_append_size = 0; const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; @@ -674,6 +685,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) ret = -EINVAL; goto out; } + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = + zone_info->max_zone_append_size; } nr_devices++; } @@ -723,6 +739,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; + fs_info->max_zone_append_size = max_zone_append_size; fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; /* diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 6b2eec99162bfc..9caeab07fd3808 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -19,6 +19,7 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; + u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; From baf8c86329f251828ccb5f04c6cd506f9f2d328c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:40 +0900 Subject: [PATCH 0902/1250] btrfs: replace BTRFS_MAX_EXTENT_SIZE with fs_info->max_extent_size On zoned filesystem, data write out is limited by max_zone_append_size, and a large ordered extent is split according the size of a bio. OTOH, the number of extents to be written is calculated using BTRFS_MAX_EXTENT_SIZE, and that estimated number is used to reserve the metadata bytes to update and/or create the metadata items. 
The metadata reservation is done at e.g, btrfs_buffered_write() and then released according to the estimation changes. Thus, if the number of extent increases massively, the reserved metadata can run out. The increase of the number of extents easily occurs on zoned filesystem if BTRFS_MAX_EXTENT_SIZE > max_zone_append_size. And, it causes the following warning on a small RAM environment with disabling metadata over-commit (in the following patch). [75721.498492] ------------[ cut here ]------------ [75721.505624] BTRFS: block rsv 1 returned -28 [75721.512230] WARNING: CPU: 24 PID: 2327559 at fs/btrfs/block-rsv.c:537 btrfs_use_block_rsv+0x560/0x760 [btrfs] [75721.581854] CPU: 24 PID: 2327559 Comm: kworker/u64:10 Kdump: loaded Tainted: G W 5.18.0-rc2-BTRFS-ZNS+ #109 [75721.597200] Hardware name: Supermicro Super Server/H12SSL-NT, BIOS 2.0 02/22/2021 [75721.607310] Workqueue: btrfs-endio-write btrfs_work_helper [btrfs] [75721.616209] RIP: 0010:btrfs_use_block_rsv+0x560/0x760 [btrfs] [75721.646649] RSP: 0018:ffffc9000fbdf3e0 EFLAGS: 00010286 [75721.654126] RAX: 0000000000000000 RBX: 0000000000004000 RCX: 0000000000000000 [75721.663524] RDX: 0000000000000004 RSI: 0000000000000008 RDI: fffff52001f7be6e [75721.672921] RBP: ffffc9000fbdf420 R08: 0000000000000001 R09: ffff889f8d1fc6c7 [75721.682493] R10: ffffed13f1a3f8d8 R11: 0000000000000001 R12: ffff88980a3c0e28 [75721.692284] R13: ffff889b66590000 R14: ffff88980a3c0e40 R15: ffff88980a3c0e8a [75721.701878] FS: 0000000000000000(0000) GS:ffff889f8d000000(0000) knlGS:0000000000000000 [75721.712601] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [75721.720726] CR2: 000055d12e05c018 CR3: 0000800193594000 CR4: 0000000000350ee0 [75721.730499] Call Trace: [75721.735166] [75721.739886] btrfs_alloc_tree_block+0x1e1/0x1100 [btrfs] [75721.747545] ? btrfs_alloc_logged_file_extent+0x550/0x550 [btrfs] [75721.756145] ? btrfs_get_32+0xea/0x2d0 [btrfs] [75721.762852] ? btrfs_get_32+0xea/0x2d0 [btrfs] [75721.769520] ? 
push_leaf_left+0x420/0x620 [btrfs] [75721.776431] ? memcpy+0x4e/0x60 [75721.781931] split_leaf+0x433/0x12d0 [btrfs] [75721.788392] ? btrfs_get_token_32+0x580/0x580 [btrfs] [75721.795636] ? push_for_double_split.isra.0+0x420/0x420 [btrfs] [75721.803759] ? leaf_space_used+0x15d/0x1a0 [btrfs] [75721.811156] btrfs_search_slot+0x1bc3/0x2790 [btrfs] [75721.818300] ? lock_downgrade+0x7c0/0x7c0 [75721.824411] ? free_extent_buffer.part.0+0x107/0x200 [btrfs] [75721.832456] ? split_leaf+0x12d0/0x12d0 [btrfs] [75721.839149] ? free_extent_buffer.part.0+0x14f/0x200 [btrfs] [75721.846945] ? free_extent_buffer+0x13/0x20 [btrfs] [75721.853960] ? btrfs_release_path+0x4b/0x190 [btrfs] [75721.861429] btrfs_csum_file_blocks+0x85c/0x1500 [btrfs] [75721.869313] ? rcu_read_lock_sched_held+0x16/0x80 [75721.876085] ? lock_release+0x552/0xf80 [75721.881957] ? btrfs_del_csums+0x8c0/0x8c0 [btrfs] [75721.888886] ? __kasan_check_write+0x14/0x20 [75721.895152] ? do_raw_read_unlock+0x44/0x80 [75721.901323] ? _raw_write_lock_irq+0x60/0x80 [75721.907983] ? btrfs_global_root+0xb9/0xe0 [btrfs] [75721.915166] ? btrfs_csum_root+0x12b/0x180 [btrfs] [75721.921918] ? btrfs_get_global_root+0x820/0x820 [btrfs] [75721.929166] ? _raw_write_unlock+0x23/0x40 [75721.935116] ? unpin_extent_cache+0x1e3/0x390 [btrfs] [75721.942041] btrfs_finish_ordered_io.isra.0+0xa0c/0x1dc0 [btrfs] [75721.949906] ? try_to_wake_up+0x30/0x14a0 [75721.955700] ? btrfs_unlink_subvol+0xda0/0xda0 [btrfs] [75721.962661] ? rcu_read_lock_sched_held+0x16/0x80 [75721.969111] ? lock_acquire+0x41b/0x4c0 [75721.974982] finish_ordered_fn+0x15/0x20 [btrfs] [75721.981639] btrfs_work_helper+0x1af/0xa80 [btrfs] [75721.988184] ? _raw_spin_unlock_irq+0x28/0x50 [75721.994643] process_one_work+0x815/0x1460 [75722.000444] ? pwq_dec_nr_in_flight+0x250/0x250 [75722.006643] ? do_raw_spin_trylock+0xbb/0x190 [75722.013086] worker_thread+0x59a/0xeb0 [75722.018511] kthread+0x2ac/0x360 [75722.023428] ? process_one_work+0x1460/0x1460 [75722.029431] ? 
kthread_complete_and_exit+0x30/0x30 [75722.036044] ret_from_fork+0x22/0x30 [75722.041255] [75722.045047] irq event stamp: 0 [75722.049703] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [75722.057610] hardirqs last disabled at (0): [] copy_process+0x1c1a/0x66b0 [75722.067533] softirqs last enabled at (0): [] copy_process+0x1c59/0x66b0 [75722.077423] softirqs last disabled at (0): [<0000000000000000>] 0x0 [75722.085335] ---[ end trace 0000000000000000 ]--- To fix the estimation, we need to introduce fs_info->max_extent_size to replace BTRFS_MAX_EXTENT_SIZE, which allows setting a different size for regular vs zoned filesystems. Set fs_info->max_extent_size to BTRFS_MAX_EXTENT_SIZE by default. On a zoned filesystem, it is set to fs_info->max_zone_append_size. CC: stable@vger.kernel.org # 5.12+ Fixes: d8e3fb106f39 ("btrfs: zoned: use ZONE_APPEND write for zoned mode") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ++++++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/extent_io.c | 4 +++- fs/btrfs/inode.c | 6 ++++-- fs/btrfs/zoned.c | 5 ++++- 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2a161227ac518..0ca83f72dbd9dd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1055,6 +1055,12 @@ struct btrfs_fs_info { u32 csums_per_leaf; u32 stripesize; + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + /* Block groups and devices containing active swapfiles.
*/ spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 494e55ed370991..90e513e54b48d5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3159,6 +3159,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; + spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 70fc7a65092422..fb09b83e2ab489 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2021,10 +2021,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; - u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + /* The sanity tests may not set a valid fs_info. */ + u64 max_bytes = fs_info ? 
fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; bool found; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b26bb73d9b33c0..abf5fca26e2c5e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2201,6 +2201,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 size; /* not delalloc, ignore it */ @@ -2208,7 +2209,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; size = orig->end - orig->start + 1; - if (size > BTRFS_MAX_EXTENT_SIZE) { + if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; @@ -2237,6 +2238,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size, old_size; u32 num_extents; @@ -2250,7 +2252,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ - if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + if (new_size <= fs_info->max_extent_size) { spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index bdc533fa80ae68..d8a0a522b3cad7 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -739,8 +739,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; - fs_info->max_zone_append_size = max_zone_append_size; + fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + if (fs_info->max_zone_append_size < fs_info->max_extent_size) + 
fs_info->max_extent_size = fs_info->max_zone_append_size; /* * Check mount options here, because we might change fs_info->zoned From b172a0b8733fa24fac0fa206a0576b3a294a87d2 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:41 +0900 Subject: [PATCH 0903/1250] btrfs: convert count_max_extents() to use fs_info->max_extent_size If count_max_extents() uses BTRFS_MAX_EXTENT_SIZE to calculate the number of extents needed, btrfs release the metadata reservation too much on its way to write out the data. Now that BTRFS_MAX_EXTENT_SIZE is replaced with fs_info->max_extent_size, convert count_max_extents() to use it instead, and fix the calculation of the metadata reservation. CC: stable@vger.kernel.org # 5.12+ Fixes: d8e3fb106f39 ("btrfs: zoned: use ZONE_APPEND write for zoned mode") Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 21 +++++++++++++-------- fs/btrfs/delalloc-space.c | 6 +++--- fs/btrfs/inode.c | 16 ++++++++-------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0ca83f72dbd9dd..7859635d876ec2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -107,14 +107,6 @@ struct btrfs_ioctl_encoded_io_args; #define BTRFS_STAT_CURR 0 #define BTRFS_STAT_PREV 1 -/* - * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size - */ -static inline u32 count_max_extents(u64 size) -{ - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -} - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -4058,6 +4050,19 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zone_size > 0; } +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + 
return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 36ab0859a26349..1e8f17ff829e35 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -273,7 +273,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 disk_num_bytes, u64 *meta_reserve, u64 *qgroup_reserve) { - u64 nr_extents = count_max_extents(num_bytes); + u64 nr_extents = count_max_extents(fs_info, num_bytes); u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); @@ -350,7 +350,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * needs to free the reservation we just made. */ spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); + nr_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); @@ -413,7 +413,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) unsigned num_extents; spin_lock(&inode->lock); - num_extents = count_max_extents(num_bytes); + num_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, -num_extents); btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index abf5fca26e2c5e..22e2597ce163a1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2218,10 +2218,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * applies here, just in reverse. 
*/ new_size = orig->end - split + 1; - num_extents = count_max_extents(new_size); + num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; - num_extents += count_max_extents(new_size); - if (count_max_extents(size) >= num_extents) + num_extents += count_max_extents(fs_info, new_size); + if (count_max_extents(fs_info, size) >= num_extents) return; } @@ -2278,10 +2278,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, * this case. */ old_size = other->end - other->start + 1; - num_extents = count_max_extents(old_size); + num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; - num_extents += count_max_extents(old_size); - if (count_max_extents(new_size) >= num_extents) + num_extents += count_max_extents(fs_info, old_size); + if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); @@ -2360,7 +2360,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); spin_lock(&BTRFS_I(inode)->lock); @@ -2402,7 +2402,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); From aa9ecd92c2af7cf79a6e1e0d20f21181c98e784d Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:42 +0900 Subject: [PATCH 0904/1250] btrfs: use fs_info->max_extent_size in 
get_extent_max_capacity() Use fs_info->max_extent_size also in get_extent_max_capacity() for completeness. This is only used for defrag and is not really necessary to fix the metadata reservation size. But it still suppresses unnecessary defrag operations. Signed-off-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7e1b4b0fbd6c6e..fe0cc816b4eba2 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1230,16 +1230,18 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, return em; } -static u32 get_extent_max_capacity(const struct extent_map *em) +static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) { if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) return BTRFS_MAX_COMPRESSED; - return BTRFS_MAX_EXTENT_SIZE; + return fs_info->max_extent_size; } static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, u32 extent_thresh, u64 newer_than, bool locked) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *next; bool ret = false; @@ -1263,7 +1265,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, * If the next extent is at its max capacity, defragging current extent * makes no sense, as the total number of extents won't change.
*/ - if (next->len >= get_extent_max_capacity(em)) + if (next->len >= get_extent_max_capacity(fs_info, em)) goto out; /* Skip older extent */ if (next->generation < newer_than) @@ -1400,6 +1402,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, bool locked, struct list_head *target_list, u64 *last_scanned_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; bool last_is_target = false; u64 cur = start; int ret = 0; @@ -1484,7 +1487,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * Skip extents already at its max capacity, this is mostly for * compressed extents, which max cap is only 128K. */ - if (em->len >= get_extent_max_capacity(em)) + if (em->len >= get_extent_max_capacity(fs_info, em)) goto next; /* From a0eaccd2ea3cc371afcffd764a60b71a0db9395c Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:43 +0900 Subject: [PATCH 0905/1250] btrfs: let can_allocate_chunk return error For the later patch, convert the return type from bool to int and return errors. No functional changes. 
Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a1696e3ffb1e38..166ce2c539aa52 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3965,12 +3965,12 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } -static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl) +static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - return true; + return 0; case BTRFS_EXTENT_ALLOC_ZONED: /* * If we have enough free space left in an already @@ -3980,8 +3980,8 @@ static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, */ if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) - return false; - return true; + return -ENOSPC; + return 0; default: BUG(); } @@ -4063,8 +4063,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, int exist = 0; /*Check if allocation policy allows to create a new chunk */ - if (!can_allocate_chunk(fs_info, ffe_ctl)) - return -ENOSPC; + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; trans = current->journal_info; if (trans) From 7ebc76dcfd08cd8729c9609112852f7da5d30bdb Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:44 +0900 Subject: [PATCH 0906/1250] btrfs: zoned: finish least available block group on data bg allocation When we run out of active zones and no sufficient space is left in any block groups, we need to finish one block group to make room to activate a new block group. However, we cannot do this for metadata block groups because we can cause a deadlock by waiting for a running transaction commit. 
So, do that only for a data block group. Furthermore, the block group to be finished has two requirements. First, the block group must not have reserved bytes left. Having reserved bytes means we have an allocated region but did not yet send bios for it. If that region is allocated by the thread calling btrfs_zone_finish(), it results in a deadlock. Second, the block group to be finished must not be a SYSTEM block group. Finishing a SYSTEM block group easily breaks further chunk allocation by nullifying the SYSTEM free space. In a certain case, we cannot find any zone finish candidate or btrfs_zone_finish() may fail. In that case, we fall back to split the allocation bytes and fill the last spaces left in the block groups. CC: stable@vger.kernel.org # 5.16+ Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 50 +++++++++++++++++++++++++++++++++--------- fs/btrfs/zoned.c | 40 +++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 7 ++++++ 3 files changed, 87 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 166ce2c539aa52..5b604c3dc357ec 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3965,6 +3965,45 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } +static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + /* If we can activate new zone, just allocate a chunk and use it */ + if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return 0; + + /* + * We already reached the max active zones. Try to finish one block + * group to make a room for a new block group. This is only possible + * for a data block group because btrfs_zone_finish() may need to wait + * for a running transaction which can cause a deadlock for metadata + * allocation. 
+ */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + int ret = btrfs_zone_finish_one_bg(fs_info); + + if (ret == 1) + return 0; + else if (ret < 0) + return ret; + } + + /* + * If we have enough free space left in an already active block group + * and we can't activate any other zone now, do not allow allocating a + * new chunk and let find_free_extent() retry with a smaller size. + */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) + return -ENOSPC; + + /* + * We cannot activate a new block group and no enough space left in any + * block groups. So, allocating a new block group may not help. But, + * there is nothing to do anyway, so let's go with it. + */ + return 0; +} + static int can_allocate_chunk(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { @@ -3972,16 +4011,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, case BTRFS_EXTENT_ALLOC_CLUSTERED: return 0; case BTRFS_EXTENT_ALLOC_ZONED: - /* - * If we have enough free space left in an already - * active block group and we can't activate any other - * zone now, do not allow allocating a new chunk and - * let find_free_extent() retry with a smaller size. 
- */ - if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && - !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) - return -ENOSPC; - return 0; + return can_allocate_chunk_zoned(fs_info, ffe_ctl); default: BUG(); } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d8a0a522b3cad7..35f6f8988494c5 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2180,3 +2180,43 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica spin_unlock(&block_group->lock); btrfs_put_block_group(block_group); } + +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_block_group *min_bg = NULL; + u64 min_avail = U64_MAX; + int ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + u64 avail; + + spin_lock(&block_group->lock); + if (block_group->reserved || + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + spin_unlock(&block_group->lock); + continue; + } + + avail = block_group->zone_capacity - block_group->alloc_offset; + if (min_avail > avail) { + if (min_bg) + btrfs_put_block_group(min_bg); + min_bg = block_group; + min_avail = avail; + btrfs_get_block_group(min_bg); + } + spin_unlock(&block_group->lock); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + + if (!min_bg) + return 0; + + ret = btrfs_zone_finish(min_bg); + btrfs_put_block_group(min_bg); + + return ret < 0 ? 
ret : 1; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 9caeab07fd3808..329d28e2fd8d67 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -80,6 +80,7 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -249,6 +250,12 @@ static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { } + +static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + return 1; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From b088778d620feb8fef9e33523074c5442f013bce Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:45 +0900 Subject: [PATCH 0907/1250] btrfs: zoned: introduce space_info->active_total_bytes The active_total_bytes, like the total_bytes, accounts for the total bytes of active block groups in the space_info. With an introduction of active_total_bytes, we can check if the reserved bytes can be written to the block groups without activating a new block group. The check is necessary for metadata allocation on zoned filesystem. We cannot finish a block group, which may require waiting for the current transaction, from the metadata allocation context. Instead, we need to ensure the ongoing allocation (reserved bytes) fits in active block groups. 
Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 12 +++++++++--- fs/btrfs/space-info.c | 41 ++++++++++++++++++++++++++++++++--------- fs/btrfs/space-info.h | 4 +++- fs/btrfs/zoned.c | 6 ++++++ 4 files changed, 50 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index e930749770ac58..51e7c1f1d93f87 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1051,8 +1051,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, < block_group->zone_unusable); WARN_ON(block_group->space_info->disk_total < block_group->length * factor); + WARN_ON(block_group->zone_is_active && + block_group->space_info->active_total_bytes + < block_group->length); } block_group->space_info->total_bytes -= block_group->length; + if (block_group->zone_is_active) + block_group->space_info->active_total_bytes -= block_group->length; block_group->space_info->bytes_readonly -= (block_group->length - block_group->zone_unusable); block_group->space_info->bytes_zone_unusable -= @@ -2107,7 +2112,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, trace_btrfs_add_block_group(info, cache, 0); btrfs_update_space_info(info, cache->flags, cache->length, cache->used, cache->bytes_super, - cache->zone_unusable, &space_info); + cache->zone_unusable, cache->zone_is_active, + &space_info); cache->space_info = space_info; @@ -2177,7 +2183,7 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) } btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, - 0, 0, &space_info); + 0, 0, false, &space_info); bg->space_info = space_info; link_block_group(bg); @@ -2558,7 +2564,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, cache->zone_unusable, - &cache->space_info); + cache->zone_is_active, &cache->space_info); 
btrfs_update_global_block_rsv(fs_info); link_block_group(cache); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 62d25112310d96..826193c31dff32 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -295,7 +295,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info) + bool active, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; int factor; @@ -306,6 +306,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, ASSERT(found); spin_lock(&found->lock); found->total_bytes += total_bytes; + if (active) + found->active_total_bytes += total_bytes; found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; @@ -369,6 +371,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, return avail; } +static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + /* + * On regular filesystem, all total_bytes are always writable. On zoned + * filesystem, there may be a limitation imposed by max_active_zones. + * For metadata allocation, we cannot finish an existing active block + * group to avoid a deadlock. Thus, we need to consider only the active + * groups to be writable for metadata space. 
+ */ + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return space_info->total_bytes; + + return space_info->active_total_bytes; +} + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) @@ -383,7 +401,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, used = btrfs_space_info_used(space_info, true); avail = calc_available_free_space(fs_info, space_info, flush); - if (used + bytes < space_info->total_bytes + avail) + if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) return 1; return 0; } @@ -419,7 +437,7 @@ void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, ticket = list_first_entry(head, struct reserve_ticket, list); /* Check and see if our ticket can be satisfied now. */ - if ((used + ticket->bytes <= space_info->total_bytes) || + if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(fs_info, @@ -750,6 +768,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; + u64 total; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -764,8 +783,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, * space. If that's the case add in our overage so we make sure to put * appropriate pressure on the flushing state machine. 
*/ - if (space_info->total_bytes + avail < used) - to_reclaim += used - (space_info->total_bytes + avail); + total = writable_total_bytes(fs_info, space_info); + if (total + avail < used) + to_reclaim += used - (total + avail); return to_reclaim; } @@ -775,9 +795,12 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, { u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 90); + u64 total = writable_total_bytes(fs_info, space_info); + u64 thresh; u64 used; + thresh = div_factor_fine(total, 90); + lockdep_assert_held(&space_info->lock); /* If we're just plain full then async reclaim just slows us down. */ @@ -839,8 +862,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; - if (used < space_info->total_bytes) - thresh += space_info->total_bytes - used; + if (used < total) + thresh += total - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -1557,7 +1580,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * can_overcommit() to ensure we can overcommit to continue. */ if (!pending_tickets && - ((used + orig_bytes <= space_info->total_bytes) || + ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index e7de24a529cfb5..12fd6147f92d60 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -19,6 +19,8 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + /* Total bytes in the space, but only accounts active block groups. 
*/ + u64 active_total_bytes; u64 bytes_zone_unusable; /* total bytes that are unusable until resetting the device zone */ @@ -124,7 +126,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info); + bool active, struct btrfs_space_info **space_info); void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 35f6f8988494c5..d0a0d62c527881 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1849,6 +1849,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; @@ -1860,6 +1861,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (block_group->zone_is_active) { ret = true; @@ -1888,7 +1890,10 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* Successfully activated all the zones */ block_group->zone_is_active = 1; + space_info->active_total_bytes += block_group->length; spin_unlock(&block_group->lock); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); @@ -1901,6 +1906,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); return ret; } From 9cbf95339dbcfe9dcd14d5bec8090cdeceb1e8bd Mon Sep 17 00:00:00 2001 From: 
Naohiro Aota Date: Sat, 9 Jul 2022 08:18:46 +0900 Subject: [PATCH 0908/1250] btrfs: zoned: disable metadata overcommit for zoned The metadata overcommit makes the space reservation flexible but it is also harmful to active zone tracking. Since we cannot finish a block group from the metadata allocation context, we might not activate a new block group and might not be able to actually write out the overcommit reservations. So, disable metadata overcommit for zoned filesystems. We will ensure the reservations are under active_total_bytes in the following patches. CC: stable@vger.kernel.org # 5.16+ Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") Signed-off-by: Naohiro Aota Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 826193c31dff32..5284312aad042c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -399,7 +399,10 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(fs_info, space_info, flush); + if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + avail = 0; + else + avail = calc_available_free_space(fs_info, space_info, flush); if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) return 1; From cc3835a383a8ef0d540132d9c87318e3308ab356 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:47 +0900 Subject: [PATCH 0909/1250] btrfs: zoned: activate metadata block group on flush_space For metadata space on zoned filesystem, reaching ALLOC_CHUNK{,_FORCE} means we don't have enough space left in the active_total_bytes. Before allocating a new chunk, we can try to activate an existing block group in this case. 
Also, allocating a chunk is not enough to grant a ticket for metadata space on zoned filesystem we need to activate the block group to increase the active_total_bytes. btrfs_zoned_activate_one_bg() implements the activation feature. It will activate a block group by (maybe) finishing a block group. It will give up activating a block group if it cannot finish any block group. CC: stable@vger.kernel.org # 5.16+ Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 30 ++++++++++++++++++++++++ fs/btrfs/zoned.c | 53 +++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/zoned.h | 10 ++++++++ 3 files changed, 93 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 5284312aad042c..d0cbeb7ae81c12 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -9,6 +9,7 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -724,6 +725,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: + /* + * For metadata space on zoned filesystem, reaching here means we + * don't have enough space left in active_total_bytes. Try to + * activate a block group first, because we may have inactive + * block group already allocated. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); + if (ret < 0) + break; + else if (ret == 1) + break; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -734,6 +747,23 @@ static void flush_space(struct btrfs_fs_info *fs_info, (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); + + /* + * For metadata space on zoned filesystem, allocating a new chunk + * is not enough. We still need to activate the block * group. + * Active the newly allocated block group by (maybe) finishing + * a block group. 
+ */ + if (ret == 1) { + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); + /* + * Revert to the original ret regardless we could finish + * one block group or not. + */ + if (ret >= 0) + ret = 1; + } + if (ret > 0 || ret == -ENOSPC) ret = 0; break; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index d0a0d62c527881..6c391b5b417284 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2226,3 +2226,56 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) return ret < 0 ? ret : 1; } + +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + struct btrfs_block_group *bg; + int index; + + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + /* No more block groups to activate */ + if (space_info->active_total_bytes == space_info->total_bytes) + return 0; + + for (;;) { + int ret; + bool need_finish = false; + + down_read(&space_info->groups_sem); + for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { + list_for_each_entry(bg, &space_info->block_groups[index], + list) { + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + + if (btrfs_zone_activate(bg)) { + up_read(&space_info->groups_sem); + return 1; + } + + need_finish = true; + } + } + up_read(&space_info->groups_sem); + + if (!do_finish || !need_finish) + break; + + ret = btrfs_zone_finish_one_bg(fs_info); + if (ret == 0) + break; + if (ret < 0) + return ret; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 329d28e2fd8d67..e17462db3a842c 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -81,6 +81,8 @@ bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, u64 length); int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); +int 
btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, bool do_finish); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -256,6 +258,14 @@ static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) return 1; } +static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + /* Consider all the block groups are active */ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 6b58aef2453871f7210b204871f7b50cde7821e9 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:48 +0900 Subject: [PATCH 0910/1250] btrfs: zoned: activate necessary block group There are two places where allocating a chunk is not enough. These two places are trying to ensure the space by allocating a chunk. To meet the condition for active_total_bytes, we also need to activate a block group there. CC: stable@vger.kernel.org # 5.16+ Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 51e7c1f1d93f87..c3aecfb0a71d2b 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2664,6 +2664,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; + /* + * We have allocated a new chunk. We also need to activate that chunk to + * grant metadata tickets for zoned filesystem. 
+ */ + ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); + if (ret < 0) + goto out; + ret = inc_block_group_ro(cache, 0); if (ret == -ETXTBSY) goto unlock_out; @@ -3889,6 +3897,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (IS_ERR(bg)) { ret = PTR_ERR(bg); } else { + /* + * We have a new chunk. We also need to activate it for + * zoned filesystem. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, info, true); + if (ret < 0) + return; + /* * If we fail to add the chunk item here, we end up * trying again at phase 2 of chunk allocation, at From 251ec57736e3433d696407ab773b29a49427e2c5 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:49 +0900 Subject: [PATCH 0911/1250] btrfs: zoned: write out partially allocated region cow_file_range() works in an all-or-nothing way: if it fails to allocate an extent for a part of the given region, it gives up the whole region, including the successfully allocated parts. On cow_file_range(), run_delalloc_zoned() writes data for the region only when it successfully allocates the whole region. This all-or-nothing allocation and write-out is problematic when the available space in all the block groups gets tight under the active zone restriction. btrfs_reserve_extent() tries hard to utilize the space left in the active block groups, and finally gives up and fails with -ENOSPC. However, if we send IOs for the successfully allocated region, we can finish a zone and can continue with the rest of the allocation on a newly allocated block group. This patch implements the partial write-out for run_delalloc_zoned(). With this patch applied, cow_file_range() returns -EAGAIN to tell the caller to do something to make further allocation progress, and reports the successfully allocated region via done_offset. Furthermore, the zoned extent allocator returns -EAGAIN to tell cow_file_range() to go back to the caller side.
Actually, we still need to wait for an IO to complete to continue the allocation. The next patch implements that part. CC: stable@vger.kernel.org # 5.16+ Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 10 +++++++ fs/btrfs/inode.c | 63 ++++++++++++++++++++++++++++++++---------- 2 files changed, 59 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5b604c3dc357ec..ea3ec1e761e846 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3996,6 +3996,16 @@ static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) return -ENOSPC; + /* + * Even min_alloc_size is not left in any block groups. Since we cannot + * activate a new block group, allocating it may not help. Let's tell a + * caller to try again and hope it progress something by writing some + * parts of the region. That is only possible for data block groups, + * where a part of the region can be written. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) + return -EAGAIN; + /* * We cannot activate a new block group and no enough space left in any * block groups. So, allocating a new block group may not help. 
But, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 22e2597ce163a1..a8f97283554421 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -117,7 +117,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + unsigned long *nr_written, int unlock, + u64 *done_offset); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -921,7 +922,7 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, * can directly submit them without interruption. */ ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0); + &nr_written, 0, NULL); /* Inline extent inserted, page gets unlocked and everything is done */ if (page_started) { ret = 0; @@ -1170,7 +1171,8 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock) + unsigned long *nr_written, int unlock, + u64 *done_offset) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1363,6 +1365,21 @@ static noinline int cow_file_range(struct btrfs_inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + /* + * If done_offset is non-NULL and ret == -EAGAIN, we expect the + * caller to write out the successfully allocated region and retry. + */ + if (done_offset && ret == -EAGAIN) { + if (orig_start < start) + *done_offset = start - 1; + else + *done_offset = start; + return ret; + } else if (ret == -EAGAIN) { + /* Convert to -ENOSPC since the caller cannot retry. 
*/ + ret = -ENOSPC; + } + /* * Now, we have three regions to clean up: * @@ -1608,19 +1625,37 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, u64 end, int *page_started, unsigned long *nr_written) { + u64 done_offset = end; int ret; + bool locked_page_done = false; - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0); - if (ret) - return ret; + while (start <= end) { + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0, &done_offset); + if (ret && ret != -EAGAIN) + return ret; - if (*page_started) - return 0; + if (*page_started) { + ASSERT(ret == 0); + return 0; + } + + if (ret == 0) + done_offset = end; + + if (done_offset == start) + return -ENOSPC; + + if (!locked_page_done) { + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + } + locked_page_done = true; + extent_write_locked_range(&inode->vfs_inode, start, done_offset); + + start = done_offset + 1; + } - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; @@ -1712,7 +1747,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, } return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1); + nr_written, 1, NULL); } struct can_nocow_file_extent_args { @@ -2185,7 +2220,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page page_started, nr_written); else ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, From 98dddc58c850f3fa52e33d5993a5af0fea79d368 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Sat, 9 Jul 2022 08:18:50 +0900 Subject: [PATCH 0912/1250] btrfs: zoned: wait until zone is finished 
when allocation didn't progress When the allocated position doesn't progress, we cannot submit IOs to finish a block group, but there should be ongoing IOs that will finish a block group. So, in that case, we wait for a zone to be finished and retry the allocation after that. Introduce a new flag BTRFS_FS_NEED_ZONE_FINISH for fs_info->flags to indicate we need a zone finish to have proceeded. The flag is set when the allocator detected it cannot activate a new block group. And, it is cleared once a zone is finished. CC: stable@vger.kernel.org # 5.16+ Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 5 +++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 9 +++++++-- fs/btrfs/zoned.c | 6 ++++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7859635d876ec2..202496172059c7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -638,6 +638,9 @@ enum { /* Indicate we have half completed snapshot deletions pending. */ BTRFS_FS_UNFINISHED_DROPS, + /* Indicate we have to finish a zone to do next allocation. 
*/ + BTRFS_FS_NEED_ZONE_FINISH, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -1086,6 +1089,8 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; + /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */ + wait_queue_head_t zone_finish_wait; /* Updates are not protected by any lock */ struct btrfs_commit_stats commit_stats; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 90e513e54b48d5..3fac429cf8a407 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3152,6 +3152,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); + init_waitqueue_head(&fs_info->zone_finish_wait); /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a8f97283554421..16789bbacf75a7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1643,8 +1643,13 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, if (ret == 0) done_offset = end; - if (done_offset == start) - return -ENOSPC; + if (done_offset == start) { + struct btrfs_fs_info *info = inode->root->fs_info; + + wait_var_event(&info->zone_finish_wait, + !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags)); + continue; + } if (!locked_page_done) { __set_page_dirty_nobuffers(locked_page); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 6c391b5b417284..b150b07ba1a766 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -2007,6 +2007,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ /* For active_bg_list */ btrfs_put_block_group(block_group); + clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + wake_up_all(&fs_info->zone_finish_wait); + return 0; } @@ -2043,6 +2046,9 @@ bool 
btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) } mutex_unlock(&fs_info->chunk_mutex); + if (!ret) + set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + return ret; } From df11a7e54f12c58c50e8ab2f05f4f18c21620b1d Mon Sep 17 00:00:00 2001 From: BingJing Chang Date: Tue, 12 Jul 2022 09:36:31 +0800 Subject: [PATCH 0913/1250] btrfs: send: introduce recorded_ref_alloc and recorded_ref_free Introduce wrappers to allocate and free recorded_ref structures. Reviewed-by: Robbie Ko Reviewed-by: Filipe Manana Signed-off-by: BingJing Chang Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3ed80da71dad29..5d95820b3c5d39 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2795,6 +2795,26 @@ struct recorded_ref { int name_len; }; +static struct recorded_ref *recorded_ref_alloc(void) +{ + struct recorded_ref *ref; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + return NULL; + INIT_LIST_HEAD(&ref->list); + return ref; +} + +static void recorded_ref_free(struct recorded_ref *ref) +{ + if (!ref) + return; + list_del(&ref->list); + fs_path_free(ref->full_path); + kfree(ref); +} + static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) { ref->full_path = path; @@ -2812,7 +2832,7 @@ static int __record_ref(struct list_head *head, u64 dir, { struct recorded_ref *ref; - ref = kmalloc(sizeof(*ref), GFP_KERNEL); + ref = recorded_ref_alloc(); if (!ref) return -ENOMEM; @@ -2827,14 +2847,12 @@ static int dup_ref(struct recorded_ref *ref, struct list_head *list) { struct recorded_ref *new; - new = kmalloc(sizeof(*ref), GFP_KERNEL); + new = recorded_ref_alloc(); if (!new) return -ENOMEM; new->dir = ref->dir; new->dir_gen = ref->dir_gen; - new->full_path = NULL; - INIT_LIST_HEAD(&new->list); list_add_tail(&new->list, list); return 0; } @@ -2845,9 +2863,7 @@ static void 
__free_recorded_refs(struct list_head *head) while (!list_empty(head)) { cur = list_entry(head->next, struct recorded_ref, list); - fs_path_free(cur->full_path); - list_del(&cur->list); - kfree(cur); + recorded_ref_free(cur); } } @@ -6484,9 +6500,7 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) ret = send_unlink(sctx, ref->full_path); if (ret < 0) goto out; - fs_path_free(ref->full_path); - list_del(&ref->list); - kfree(ref); + recorded_ref_free(ref); } ret = 0; out: From cf8743e8fbd5666f46e15658b44c9a7e1f9a71b5 Mon Sep 17 00:00:00 2001 From: BingJing Chang Date: Tue, 12 Jul 2022 09:36:32 +0800 Subject: [PATCH 0914/1250] btrfs: send: fix sending link commands for existing file paths There is a bug sending link commands for existing file paths. When we're processing an inode, we go over all references. All the new file paths are added to the "new_refs" list. And all the deleted file paths are added to the "deleted_refs" list. In the end, when we finish processing the inode, we iterate over all the items in the "new_refs" list and send link commands for those file paths. After that, we go over all the items in the "deleted_refs" list and send unlink commands for them. If there are duplicated file paths in both lists, we will try to create them before we remove them. Then the receiver gets an -EEXIST error when trying the link operations. 
Example for having duplicated file paths in both lists: $ btrfs subvolume create vol # create a file and 2000 hard links to the same inode $ touch vol/foo $ for i in {1..2000}; do link vol/foo vol/$i ; done # take a snapshot for a parent snapshot $ btrfs subvolume snapshot -r vol snap1 # remove 2000 hard links and re-create the last 1000 links $ for i in {1..2000}; do rm vol/$i; done; $ for i in {1001..2000}; do link vol/foo vol/$i; done # take another one for a send snapshot $ btrfs subvolume snapshot -r vol snap2 $ mkdir receive_dir $ btrfs send snap2 -p snap1 | btrfs receive receive_dir/ At subvol snap2 link 1238 -> foo ERROR: link 1238 -> foo failed: File exists In this case, we will have the same file paths added to both lists. In the parent snapshot, reference paths {1..1237} are stored in inode references, but reference paths {1238..2000} are stored in inode extended references. In the send snapshot, all reference paths {1001..2000} are stored in inode references. During the incremental send, we process their inode references first. In record_changed_ref(), we iterate over all its inode references in the send/parent snapshot. For every inode reference, we also use find_iref() to check whether the same file path also appears in the parent/send snapshot or not. Inode references {1238..2000} which appear in the send snapshot but not in the parent snapshot are added to the "new_refs" list. On the other hand, inode references {1..1000} which appear in the parent snapshot but not in the send snapshot are added to the "deleted_refs" list. Next, when we process their inode extended references, reference paths {1238..2000} are added to the "deleted_refs" list because all of them only appear in the parent snapshot. Now the two lists contain the following items: "new_refs" list: {1238..2000} "deleted_refs" list: {1..1000}, {1238..2000} Reference paths {1238..2000} appear in both lists.
And, given the processing order described above, the receiver gets an -EEXIST error when trying the link operations. To fix the bug, the idea is to process the "deleted_refs" list before the "new_refs" list. However, it's not easy to reshuffle the processing order. For one thing, if we do so, we may unlink all the existing paths first, leaving no valid path for the links. And it's inefficient because we do a bunch of unlinks followed by links for the same paths. Moreover, it makes little sense to have duplications in both lists. A reference path cannot both be regarded as new and have been seen in the past; otherwise we would not call it a new path. However, it's also not a good idea to make find_iref() check a reference against all inode references and all inode extended references because it may result in large disk reads. So we introduce two rbtrees to make the references easier to look up. And we also introduce record_new_ref_if_needed() and record_deleted_ref_if_needed() for changed_ref() to check and remove duplicated references early. Reviewed-by: Robbie Ko Reviewed-by: Filipe Manana Signed-off-by: BingJing Chang Signed-off-by: David Sterba --- fs/btrfs/send.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 152 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5d95820b3c5d39..83dd43593eca3c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -240,6 +240,9 @@ struct send_ctx { * Indexed by the inode number of the directory to be deleted.
*/ struct rb_root orphan_dirs; + + struct rb_root rbtree_new_refs; + struct rb_root rbtree_deleted_refs; }; struct pending_dir_move { @@ -2793,6 +2796,8 @@ struct recorded_ref { u64 dir; u64 dir_gen; int name_len; + struct rb_node node; + struct rb_root *root; }; static struct recorded_ref *recorded_ref_alloc(void) @@ -2802,6 +2807,7 @@ static struct recorded_ref *recorded_ref_alloc(void) ref = kzalloc(sizeof(*ref), GFP_KERNEL); if (!ref) return NULL; + RB_CLEAR_NODE(&ref->node); INIT_LIST_HEAD(&ref->list); return ref; } @@ -2810,6 +2816,8 @@ static void recorded_ref_free(struct recorded_ref *ref) { if (!ref) return; + if (!RB_EMPTY_NODE(&ref->node)) + rb_erase(&ref->node, ref->root); list_del(&ref->list); fs_path_free(ref->full_path); kfree(ref); @@ -4418,12 +4426,149 @@ static int __record_deleted_ref(int num, u64 dir, int index, &sctx->deleted_refs); } +static int rbtree_ref_comp(const void *k, const struct rb_node *node) +{ + const struct recorded_ref *data = k; + const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); + int result; + + if (data->dir > ref->dir) + return 1; + if (data->dir < ref->dir) + return -1; + if (data->dir_gen > ref->dir_gen) + return 1; + if (data->dir_gen < ref->dir_gen) + return -1; + if (data->name_len > ref->name_len) + return 1; + if (data->name_len < ref->name_len) + return -1; + result = strcmp(data->name, ref->name); + if (result > 0) + return 1; + if (result < 0) + return -1; + return 0; +} + +static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); + + return rbtree_ref_comp(entry, parent) < 0; +} + +static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, + struct fs_path *name, u64 dir, u64 dir_gen, + struct send_ctx *sctx) +{ + int ret = 0; + struct fs_path *path = NULL; + struct recorded_ref *ref = NULL; + + path = fs_path_alloc(); + if (!path) { + ret = -ENOMEM; + goto out; + } + 
+ ref = recorded_ref_alloc(); + if (!ref) { + ret = -ENOMEM; + goto out; + } + + ret = get_cur_path(sctx, dir, dir_gen, path); + if (ret < 0) + goto out; + ret = fs_path_add_path(path, name); + if (ret < 0) + goto out; + + ref->dir = dir; + ref->dir_gen = dir_gen; + set_ref_path(ref, path); + list_add_tail(&ref->list, refs); + rb_add(&ref->node, root, rbtree_ref_less); + ref->root = root; +out: + if (ret) { + if (path && (!ref || !ref->full_path)) + fs_path_free(path); + recorded_ref_free(ref); + } + return ret; +} + +static int record_new_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; + + ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_new_refs, + &sctx->new_refs, name, dir, dir_gen, + sctx); + } +out: + return ret; +} + +static int record_deleted_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) +{ + int ret = 0; + struct send_ctx *sctx = ctx; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; + + ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_deleted_refs, + &sctx->deleted_refs, name, dir, + dir_gen, 
sctx); + } +out: + return ret; +} + static int record_new_ref(struct send_ctx *sctx) { int ret; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_new_ref, sctx); + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) goto out; ret = 0; @@ -4437,7 +4582,8 @@ static int record_deleted_ref(struct send_ctx *sctx) int ret; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_deleted_ref, sctx); + sctx->cmp_key, 0, record_deleted_ref_if_needed, + sctx); if (ret < 0) goto out; ret = 0; @@ -4520,7 +4666,7 @@ static int __record_changed_new_ref(int num, u64 dir, int index, ret = find_iref(sctx->parent_root, sctx->right_path, sctx->cmp_key, dir, dir_gen, name); if (ret == -ENOENT) - ret = __record_new_ref(num, dir, index, name, sctx); + ret = record_new_ref_if_needed(num, dir, index, name, sctx); else if (ret > 0) ret = 0; @@ -4543,7 +4689,7 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index, ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, dir, dir_gen, name); if (ret == -ENOENT) - ret = __record_deleted_ref(num, dir, index, name, sctx); + ret = record_deleted_ref_if_needed(num, dir, index, name, sctx); else if (ret > 0) ret = 0; @@ -7871,6 +8017,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, From dba4c5c9b71d7aaba7f769ad0b50bd03522d6da6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 12 Jul 2022 16:31:22 +0100 Subject: [PATCH 0915/1250] btrfs: send: always use the rbtree based inode ref management infrastructure After the patch "btrfs: send: fix sending link commands for existing file paths", we now have two infrastructures to detect and eliminate 
duplicated inode references (due to names that got removed and re-added between the send and parent snapshots): 1) One that works on a single inode ref/extref item; 2) A new one that works across all ref/extref items for an inode, and it's also more efficient because even in the single ref/extref item case, it does not do a linear search for all the names encoded in the ref/extref item, it uses red black trees to speed up the search. There's no good reason to keep both infrastructures, we can use the new one everywhere, and it's always more efficient. So remove the old infrastructure and change all sites that are using it to use the new one. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 193 +++--------------------------------------------- 1 file changed, 12 insertions(+), 181 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 83dd43593eca3c..e7671afcee4f0e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2195,7 +2195,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in - * __record_new_ref + * record_new_ref_if_needed(). */ ret = is_inode_existent(sctx, ino, gen); if (ret < 0) @@ -2830,27 +2830,6 @@ static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) ref->name_len = ref->full_path->end - ref->name; } -/* - * We need to process new refs before deleted refs, but compare_tree gives us - * everything mixed. So we first record all refs and later process them. - * This function is a helper to record one ref. 
- */ -static int __record_ref(struct list_head *head, u64 dir, - u64 dir_gen, struct fs_path *path) -{ - struct recorded_ref *ref; - - ref = recorded_ref_alloc(); - if (!ref) - return -ENOMEM; - - ref->dir = dir; - ref->dir_gen = dir_gen; - set_ref_path(ref, path); - list_add_tail(&ref->list, head); - return 0; -} - static int dup_ref(struct recorded_ref *ref, struct list_head *list) { struct recorded_ref *new; @@ -4377,55 +4356,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) return ret; } -static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, - void *ctx, struct list_head *refs) -{ - int ret = 0; - struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, NULL, NULL, NULL); - if (ret < 0) - goto out; - - ret = get_cur_path(sctx, dir, gen, p); - if (ret < 0) - goto out; - ret = fs_path_add_path(p, name); - if (ret < 0) - goto out; - - ret = __record_ref(refs, dir, gen, p); - -out: - if (ret) - fs_path_free(p); - return ret; -} - -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - struct send_ctx *sctx = ctx; - return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); -} - - -static int __record_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - struct send_ctx *sctx = ctx; - return record_ref(sctx->parent_root, dir, name, ctx, - &sctx->deleted_refs); -} - static int rbtree_ref_comp(const void *k, const struct rb_node *node) { const struct recorded_ref *data = k; @@ -4592,120 +4522,16 @@ static int record_deleted_ref(struct send_ctx *sctx) return ret; } -struct find_ref_ctx { - u64 dir; - u64 dir_gen; - struct btrfs_root *root; - struct fs_path *name; - int found_idx; -}; - -static int __find_iref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx_) -{ - struct find_ref_ctx *ctx = ctx_; - u64 
dir_gen; - int ret; - - if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && - strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { - /* - * To avoid doing extra lookups we'll only do this if everything - * else matches. - */ - ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL, NULL); - if (ret) - return ret; - if (dir_gen != ctx->dir_gen) - return 0; - ctx->found_idx = num; - return 1; - } - return 0; -} - -static int find_iref(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 dir, u64 dir_gen, struct fs_path *name) -{ - int ret; - struct find_ref_ctx ctx; - - ctx.dir = dir; - ctx.name = name; - ctx.dir_gen = dir_gen; - ctx.found_idx = -1; - ctx.root = root; - - ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); - if (ret < 0) - return ret; - - if (ctx.found_idx == -1) - return -ENOENT; - - return ctx.found_idx; -} - -static int __record_changed_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - u64 dir_gen; - int ret; - struct send_ctx *sctx = ctx; - - ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL, NULL); - if (ret) - return ret; - - ret = find_iref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, dir, dir_gen, name); - if (ret == -ENOENT) - ret = record_new_ref_if_needed(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; - - return ret; -} - -static int __record_changed_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - u64 dir_gen; - int ret; - struct send_ctx *sctx = ctx; - - ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL, NULL); - if (ret) - return ret; - - ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, - dir, dir_gen, name); - if (ret == -ENOENT) - ret = record_deleted_ref_if_needed(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; - - return ret; -} - static int 
record_changed_ref(struct send_ctx *sctx) { int ret = 0; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_changed_new_ref, sctx); + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) goto out; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); + sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) goto out; ret = 0; @@ -4736,10 +4562,10 @@ static int process_all_refs(struct send_ctx *sctx, if (cmd == BTRFS_COMPARE_TREE_NEW) { root = sctx->send_root; - cb = __record_new_ref; + cb = record_new_ref_if_needed; } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { root = sctx->parent_root; - cb = __record_deleted_ref; + cb = record_deleted_ref_if_needed; } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); @@ -6591,8 +6417,13 @@ static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name, { struct parent_paths_ctx *ppctx = ctx; - return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx, - ppctx->refs); + /* + * Pass 0 as the generation for the directory, we don't care about it + * here as we have no new references to add, we just want to delete all + * references for an inode. + */ + return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs, + name, dir, 0, ppctx->sctx); } /* From 8f814141fa3b855964f34d4192f1297b033e5ed1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 14 Jul 2022 13:48:10 +0300 Subject: [PATCH 0916/1250] btrfs: simplify error handling in btrfs_lookup_dentry In btrfs_lookup_dentry releasing the reference of the sub_root and the running orphan cleanup should only happen if the dentry found actually represents a subvolume. 
This can only be true in the 'else' branch as otherwise either fixup_tree_root_location returned an ENOENT error, in which case sub_root wouldn't have been changed or if we got a different errno this means btrfs_get_fs_root couldn't have executed successfully again meaning sub_root will be equal to root. So simplify all the branches by moving the code into the 'else'. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 16789bbacf75a7..f20740812e5bf3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5866,14 +5866,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, sub_root); + inode = new_simple_dir(dir->i_sb, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); - } - if (root != sub_root) btrfs_put_root(sub_root); - if (!IS_ERR(inode) && root != sub_root) { + if (IS_ERR(inode)) + return inode; + down_read(&fs_info->cleanup_work_sem); if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); From bf4d69293be9bd762ddbba698ccb522d1b7eb571 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 17 Jul 2022 22:05:05 +0100 Subject: [PATCH 0917/1250] btrfs: join running log transaction when logging new name When logging a new name, in case of a rename, we pin the log before changing it. We then either delete a directory entry from the log or insert a key range item to mark the old name for deletion on log replay. However when doing one of those log changes we may have another task that started writing out the log (at btrfs_sync_log()) and it started before we pinned the log root. So we may end up changing a log tree while its writeback is being started by another task syncing the log. 
This can lead to inconsistencies in a log tree and other unexpected results during log replay, because we can get some committed node pointing to a node/leaf that ends up not getting written to disk before the next log commit. The problem, conceptually, started to happen in commit 88d2beec7e53fc ("btrfs: avoid logging all directory changes during renames"), because there we started to update the log without joining its current transaction first. However the problem only became visible with commit 259c4b96d78dda ("btrfs: stop doing unnecessary log updates during a rename"), and that is because we used to pin the log at btrfs_rename() and then before entering btrfs_log_new_name(), when unlinking the old dentry, we ended up at btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log(). Both of them join the current log transaction, effectively waiting for any log transaction writeout (due to acquiring the root's log_mutex). This made it safe even after leaving the current log transaction, because we remained with the log pinned when we called btrfs_log_new_name(). Then in commit 259c4b96d78dda ("btrfs: stop doing unnecessary log updates during a rename"), we removed the log pinning from btrfs_rename() and stopped calling btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log() during the rename, and started to do all the needed work at btrfs_log_new_name(), but without joining the current log transaction, only pinning the log, which is racy because another task may have started writeout of the log tree right before we pinned the log. Both commits landed in kernel 5.18, so it doesn't make any practical difference which should be blamed, but I'm blaming the second commit only because with the first one, by chance, the problem did not happen due to the fact we joined the log transaction after pinning the log and unpinned it only after calling btrfs_log_new_name(). 
So make btrfs_log_new_name() join the current log transaction instead of pinning it, so that we never do log updates if its writeout is starting. Fixes: 259c4b96d78dda ("btrfs: stop doing unnecessary log updates during a rename") CC: stable@vger.kernel.org # 5.18+ Reported-by: Zygo Blaxell Tested-by: Zygo Blaxell Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d898ba13285fb9..dcf75a8daa200b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7029,8 +7029,15 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * anyone from syncing the log until we have updated both inodes * in the log. */ + ret = join_running_log_trans(root); + /* + * At least one of the inodes was logged before, so this should + * not fail, but if it does, it's not serious, just bail out and + * mark the log for a full commit. + */ + if (WARN_ON_ONCE(ret < 0)) + goto out; log_pinned = true; - btrfs_pin_log_trans(root); path = btrfs_alloc_path(); if (!path) { From 9bf28cbaecc4574cb3ed7171a58dad72d29d6eef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Jul 2022 13:37:23 +0200 Subject: [PATCH 0918/1250] btrfs: merge btrfs_dev_stat_print_on_error with its only caller Fold it into the only caller. 
Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bf4e140f6bfc7e..272901514b0c14 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -245,7 +245,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -7842,11 +7841,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) { btrfs_dev_stat_inc(dev, index); - btrfs_dev_stat_print_on_error(dev); -} -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) -{ if (!dev->dev_stats_valid) return; btrfs_err_rl_in_rcu(dev->fs_info, From d096b965fdb020ccd090b179ffb105e2a1dfdbfe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jul 2022 07:33:26 +0200 Subject: [PATCH 0919/1250] btrfs: repair all known bad mirrors When there is more than a single level of redundancy there can also be multiple bad mirrors, and the current read repair code only repairs the last bad one. Restructure btrfs_repair_one_sector so that it records the originally failed mirror and the number of copies, and then repair all known bad copies until we reach the originally failed copy in clean_io_failure. Note that this also means the read repair reads will always start from the next bad mirror and not mirror 0. This fixes btrfs/265 in xfstests. 
Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 126 +++++++++++++++++++++---------------------- fs/btrfs/extent_io.h | 1 + 2 files changed, 61 insertions(+), 66 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fb09b83e2ab489..267b9acea7228d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2434,6 +2434,20 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) return ret; } +static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == failrec->num_copies) + return cur_mirror + 1 - failrec->num_copies; + return cur_mirror + 1; +} + +static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == 1) + return failrec->num_copies; + return cur_mirror - 1; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2446,7 +2460,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, u64 private; struct io_failure_record *failrec; struct extent_state *state; - int num_copies; + int mirror; int ret; private = 0; @@ -2470,20 +2484,19 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, EXTENT_LOCKED); spin_unlock(&io_tree->lock); - if (state && state->start <= failrec->start && - state->end >= failrec->start + failrec->len - 1) { - num_copies = btrfs_num_copies(fs_info, failrec->logical, - failrec->len); - if (num_copies > 1) { - repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, - failrec->failed_mirror); - } - } + if (!state || state->start > failrec->start || + state->end < failrec->start + failrec->len - 1) + goto out; + + mirror = failrec->this_mirror; + do { + mirror = prev_mirror(failrec, mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, mirror); + } 
while (mirror != failrec->failed_mirror); out: free_io_failure(failure_tree, io_tree, failrec); - return 0; } @@ -2522,7 +2535,8 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start) + u64 start, + int failed_mirror) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct io_failure_record *failrec; @@ -2544,7 +2558,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. */ - + ASSERT(failrec->this_mirror == failed_mirror); + ASSERT(failrec->len == fs_info->sectorsize); return failrec; } @@ -2554,7 +2569,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->start = start; failrec->len = sectorsize; - failrec->this_mirror = 0; + failrec->failed_mirror = failed_mirror; + failrec->this_mirror = failed_mirror; failrec->compress_type = BTRFS_COMPRESS_NONE; read_lock(&em_tree->lock); @@ -2589,6 +2605,20 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->logical = logical; free_extent_map(em); + failrec->num_copies = btrfs_num_copies(fs_info, logical, sectorsize); + if (failrec->num_copies == 1) { + /* + * We only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. No + * matter what the error is, it is very likely to persist. 
+ */ + btrfs_debug(fs_info, + "cannot repair logical %llu num_copies %d", + failrec->logical, failrec->num_copies); + kfree(failrec); + return ERR_PTR(-EIO); + } + /* Set the bits in the private failure tree */ ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, EXTENT_LOCKED | EXTENT_DIRTY); @@ -2605,54 +2635,6 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -static bool btrfs_check_repairable(struct inode *inode, - struct io_failure_record *failrec, - int failed_mirror) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - /* The failure record should only contain one sector */ - ASSERT(failrec->len == fs_info->sectorsize); - - /* - * There are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. 
- */ - ASSERT(failed_mirror); - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - return true; -} - int btrfs_repair_one_sector(struct inode *inode, struct bio *failed_bio, u32 bio_offset, struct page *page, unsigned int pgoff, @@ -2673,12 +2655,24 @@ int btrfs_repair_one_sector(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start); + failrec = btrfs_get_io_failure_record(inode, start, failed_mirror); if (IS_ERR(failrec)) return PTR_ERR(failrec); - - if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { + /* + * There are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + * + * Since we're only doing repair for one sector, we only need to get + * a good copy of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. 
+ */ + failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); + if (failrec->this_mirror == failrec->failed_mirror) { + btrfs_debug(fs_info, + "failed to repair num_copies %d this_mirror %d failed_mirror %d", + failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); free_io_failure(failure_tree, tree, failrec); return -EIO; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a76c6ef74cd3c5..280af70c049537 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -263,6 +263,7 @@ struct io_failure_record { enum btrfs_compression_type compress_type; int this_mirror; int failed_mirror; + int num_copies; }; int btrfs_repair_one_sector(struct inode *inode, From 0a1730bb43aba0d5c13439a048eb0801db93824c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jul 2022 07:33:27 +0200 Subject: [PATCH 0920/1250] btrfs: simplify the pending I/O counting in struct compressed_bio Instead of counting the sectors just count the bios, with an extra reference held during submission. This significantly simplifies the submission side error handling. This slightly changes completion and error handling of btrfs_submit_compressed_{read,write} because with the old code the compressed_bio could have been completed in submit_compressed_{read,write} only if there was an error during submission for one of the lower bios, whilst with the new code there is a chance for this to happen even for successful submission if all the lower bios complete before the end of the function is reached. 
Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/compression.c | 125 ++++++++++------------------------------- fs/btrfs/compression.h | 4 +- 2 files changed, 32 insertions(+), 97 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 907fc8a4c092cb..37676949a2b056 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -191,44 +191,6 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, return 0; } -/* - * Reduce bio and io accounting for a compressed_bio with its corresponding bio. - * - * Return true if there is no pending bio nor io. - * Return false otherwise. - */ -static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); - unsigned int bi_size = 0; - bool last_io = false; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * At endio time, bi_iter.bi_size doesn't represent the real bio size. - * Thus here we have to iterate through all segments to grab correct - * bio size. - */ - bio_for_each_segment_all(bvec, bio, iter_all) - bi_size += bvec->bv_len; - - if (bio->bi_status) - cb->status = bio->bi_status; - - ASSERT(bi_size && bi_size <= cb->compressed_len); - last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, - &cb->pending_sectors); - /* - * Here we must wake up the possible error handler after all other - * operations on @cb finished, or we can race with - * finish_compressed_bio_*() which may free @cb. 
- */ - wake_up_var(cb); - - return last_io; -} - static void finish_compressed_bio_read(struct compressed_bio *cb) { unsigned int index; @@ -288,7 +250,10 @@ static void end_compressed_bio_read(struct bio *bio) unsigned int mirror = btrfs_bio(bio)->mirror_num; int ret = 0; - if (!dec_and_test_compressed_bio(cb, bio)) + if (bio->bi_status) + cb->status = bio->bi_status; + + if (!refcount_dec_and_test(&cb->pending_ios)) goto out; /* @@ -417,7 +382,10 @@ static void end_compressed_bio_write(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; - if (dec_and_test_compressed_bio(cb, bio)) { + if (bio->bi_status) + cb->status = bio->bi_status; + + if (refcount_dec_and_test(&cb->pending_ios)) { struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); btrfs_record_physical_zoned(cb->inode, cb->start, bio); @@ -476,7 +444,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte return ERR_PTR(ret); } *next_stripe_start = disk_bytenr + geom.len; - + refcount_inc(&cb->pending_ios); return bio; } @@ -503,7 +471,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct compressed_bio *cb; u64 cur_disk_bytenr = disk_start; u64 next_stripe_start; - blk_status_t ret; + blk_status_t ret = BLK_STS_OK; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; const bool use_append = btrfs_use_zone_append(inode, disk_start); const unsigned int bio_op = use_append ? 
REQ_OP_ZONE_APPEND : REQ_OP_WRITE; @@ -513,7 +481,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); + refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; @@ -543,8 +511,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, &next_stripe_start); if (IS_ERR(bio)) { ret = errno_to_blk_status(PTR_ERR(bio)); - bio = NULL; - goto finish_cb; + break; } if (blkcg_css) bio->bi_opf |= REQ_CGROUP_PUNT; @@ -588,8 +555,11 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (submit) { if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, start, true); - if (ret) - goto finish_cb; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + break; + } } ASSERT(bio->bi_iter.bi_size); @@ -598,33 +568,12 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, } cond_resched(); } - if (blkcg_css) - kthread_associate_blkcg(NULL); - return 0; - -finish_cb: if (blkcg_css) kthread_associate_blkcg(NULL); - if (bio) { - bio->bi_status = ret; - bio_endio(bio); - } - /* Last byte of @cb is submitted, endio will free @cb */ - if (cur_disk_bytenr == disk_start + compressed_len) - return ret; - - wait_var_event(cb, refcount_read(&cb->pending_sectors) == - (disk_start + compressed_len - cur_disk_bytenr) >> - fs_info->sectorsize_bits); - /* - * Even with previous bio ended, we should still have io not yet - * submitted, thus need to finish manually. - */ - ASSERT(refcount_read(&cb->pending_sectors)); - /* Now we are the only one referring @cb, can finish it safely. 
*/ - finish_compressed_bio_write(cb); + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_write(cb); return ret; } @@ -830,7 +779,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto out; } - refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); + refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; cb->mirror_num = mirror_num; @@ -880,9 +829,8 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, REQ_OP_READ, end_compressed_bio_read, &next_stripe_start); if (IS_ERR(comp_bio)) { - ret = errno_to_blk_status(PTR_ERR(comp_bio)); - comp_bio = NULL; - goto finish_cb; + cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); + break; } } /* @@ -921,8 +869,11 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, unsigned int nr_sectors; ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - if (ret) - goto finish_cb; + if (ret) { + comp_bio->bi_status = ret; + bio_endio(comp_bio); + break; + } nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, fs_info->sectorsize); @@ -933,6 +884,9 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = NULL; } } + + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_read(cb); return; fail: @@ -950,25 +904,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio->bi_status = ret; bio_endio(bio); return; -finish_cb: - if (comp_bio) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - /* All bytes of @cb is submitted, endio will free @cb */ - if (cur_disk_byte == disk_bytenr + compressed_len) - return; - - wait_var_event(cb, refcount_read(&cb->pending_sectors) == - (disk_bytenr + compressed_len - cur_disk_byte) >> - fs_info->sectorsize_bits); - /* - * Even with previous bio ended, we should still have io not yet - * submitted, thus need to finish @cb manually. 
- */ - ASSERT(refcount_read(&cb->pending_sectors)); - /* Now we are the only one referring @cb, can finish it safely. */ - finish_compressed_bio_read(cb); } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 5fca7603e928a5..0e4cbf04fd8660 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of sectors with unfinished IO (unsubmitted or unfinished) */ - refcount_t pending_sectors; + /* Number of outstanding bios */ + refcount_t pending_ios; /* Number of compressed pages in the array */ unsigned int nr_pages; From c2b4e6a29708f9a4ed5e4fbeee4350414857563c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jul 2022 07:33:28 +0200 Subject: [PATCH 0921/1250] btrfs: pass a btrfs_bio to btrfs_repair_one_sector Pass the btrfs_bio instead of the plain bio to btrfs_repair_one_sector, and remove the start and failed_mirror arguments in favor of deriving them from the btrfs_bio. For this to work ensure that the file_offset field is also initialized for buffered I/O. 
Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 48 ++++++++++++++++++++++++-------------------- fs/btrfs/extent_io.h | 7 +++---- fs/btrfs/inode.c | 5 ++--- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 267b9acea7228d..4baf5cac7b192e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -182,6 +182,7 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; + struct bio_vec *bv; struct inode *inode; int mirror_num; @@ -189,12 +190,15 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) return; bio = bio_ctrl->bio; - inode = bio_first_page_all(bio)->mapping->host; + bv = bio_first_bvec_all(bio); + inode = bv->bv_page->mapping->host; mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); + btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; + if (!is_data_inode(inode)) btrfs_submit_metadata_bio(inode, bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) @@ -2535,10 +2539,11 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start, - int failed_mirror) + struct btrfs_bio *bbio, + unsigned int bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; struct extent_map *em; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -2558,7 +2563,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. 
*/ - ASSERT(failrec->this_mirror == failed_mirror); + ASSERT(failrec->this_mirror == bbio->mirror_num); ASSERT(failrec->len == fs_info->sectorsize); return failrec; } @@ -2569,8 +2574,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->start = start; failrec->len = sectorsize; - failrec->failed_mirror = failed_mirror; - failrec->this_mirror = failed_mirror; + failrec->failed_mirror = bbio->mirror_num; + failrec->this_mirror = bbio->mirror_num; failrec->compress_type = BTRFS_COMPRESS_NONE; read_lock(&em_tree->lock); @@ -2635,17 +2640,16 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook) { + u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); + struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; @@ -2655,7 +2659,7 @@ int btrfs_repair_one_sector(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start, failed_mirror); + failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); @@ -2751,9 +2755,10 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) offset + sectorsize - 1, &cached); } -static void submit_data_read_repair(struct 
inode *inode, struct bio *failed_bio, +static void submit_data_read_repair(struct inode *inode, + struct btrfs_bio *failed_bbio, u32 bio_offset, const struct bio_vec *bvec, - int failed_mirror, unsigned int error_bitmap) + unsigned int error_bitmap) { const unsigned int pgoff = bvec->bv_offset; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2764,7 +2769,7 @@ static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio, const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; int i; - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); /* This repair is only for data */ ASSERT(is_data_inode(inode)); @@ -2776,7 +2781,7 @@ static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio, * We only get called on buffered IO, thus page must be mapped and bio * must not be cloned. */ - ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); + ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); /* Iterate through all the sectors in the range */ for (i = 0; i < nr_bits; i++) { @@ -2793,10 +2798,9 @@ static void submit_data_read_repair(struct inode *inode, struct bio *failed_bio, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bio, - bio_offset + offset, - page, pgoff + offset, start + offset, - failed_mirror, btrfs_submit_data_read_bio); + ret = btrfs_repair_one_sector(inode, failed_bbio, + bio_offset + offset, page, pgoff + offset, + btrfs_submit_data_read_bio); if (!ret) { /* * We have submitted the read repair, the page release @@ -3130,8 +3134,8 @@ static void end_bio_extent_readpage(struct bio *bio) * submit_data_read_repair() will handle all the good * and bad sectors, we just continue to the next bvec. 
*/ - submit_data_read_repair(inode, bio, bio_offset, bvec, - mirror, error_bitmap); + submit_data_read_repair(inode, bbio, bio_offset, bvec, + error_bitmap); } else { /* Update page status and unlock */ end_page_read(page, uptodate, start, len); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 280af70c049537..b802ac85cb74cb 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -57,6 +57,7 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) +struct btrfs_bio; struct btrfs_root; struct btrfs_inode; struct btrfs_io_bio; @@ -266,10 +267,8 @@ struct io_failure_record { int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f20740812e5bf3..934cd10dac68b0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8012,9 +8012,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, } else { int ret; - ret = btrfs_repair_one_sector(inode, &bbio->bio, offset, - bv.bv_page, bv.bv_offset, start, - bbio->mirror_num, + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); From 358169f6f995415c7fcd51596daaf80f1f4c9065 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jul 2022 07:33:29 +0200 Subject: [PATCH 0922/1250] btrfs: remove the start argument to check_data_csum and export Derive the value of start from the btrfs_bio now that ->file_offset is always valid. Also export and rename the function so it's available outside of inode.c as we'll need that soon. 
Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/inode.c | 26 +++++++++++--------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 202496172059c7..c567c73f750963 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3296,6 +3296,8 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 934cd10dac68b0..18d397bfd28e59 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3440,20 +3440,18 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode - * @io_bio: btrfs_io_bio which contains the csum + * @bbio: btrfs_bio which contains the csum * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page - * @start: logical offset in the file * * The length of such check is always one sector size. * * When csum mismatch is detected, we will also report the error and fill the * corrupted range with zero. 
(Thus it needs the extra parameters) */ -static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff, - u64 start) +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u32 len = fs_info->sectorsize; @@ -3469,8 +3467,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - bbio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), + bbio->file_offset + bio_offset, + csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -3539,8 +3538,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, - page_offset(page) + pg_off); + ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; @@ -8004,8 +8002,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, u64 start = bbio->file_offset + offset; if (uptodate && - (!csum || !check_data_csum(inode, bbio, offset, bv.bv_page, - bv.bv_offset, start))) { + (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, + bv.bv_offset))) { clean_io_failure(fs_info, failure_tree, io_tree, start, bv.bv_page, btrfs_ino(BTRFS_I(inode)), bv.bv_offset); @@ -10387,7 +10385,6 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) u32 sectorsize = fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; - u64 start = priv->file_offset; u32 bio_offset = 0; if (priv->skip_csum || !uptodate) @@ -10400,10 +10397,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio 
*bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, - bvec->bv_page, pgoff, start)) + if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + bvec->bv_page, pgoff)) return BLK_STS_IOERR; - start += sectorsize; bio_offset += sectorsize; pgoff += sectorsize; } From 5bbe423cf8ed915cacdad30e4da7ee02e389f6e6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jul 2022 07:33:30 +0200 Subject: [PATCH 0923/1250] btrfs: fix repair of compressed extents Currently the checksum of compressed extents is verified based on the compressed data and the lower btrfs_bio, but the actual repair process is driven by end_bio_extent_readpage on the upper btrfs_bio for the decompressed data. This has a bunch of issues, including not being able to properly communicate the failed mirror up in case that the I/O submission got preempted, losing track of whether an error was an I/O error or a checksum verification failure, but most importantly that this design causes btrfs_clean_io_failure to eventually write back the uncompressed good data onto the disk sectors that are supposed to contain compressed data. Fix this by moving the repair to the lower btrfs_bio. To do so, a fair amount of code has to be reshuffled: a) the lower btrfs_bio now needs a valid csum pointer. The easiest way to achieve that is to pass NULL to btrfs_lookup_bio_sums and just use the btrfs_bio management of csums. For a compressed_bio that is split into multiple btrfs_bios this means additional memory allocations, but the code becomes a lot more regular. b) checksum verification now runs directly on the lower btrfs_bio instead of the compressed_bio. This actually nicely simplifies the end I/O processing.
c) btrfs_repair_one_sector can't just look up the logical address for the file offset any more, as there are no corresponding relative offsets that apply to the file offset and the logical address for compressed extents. Instead require that the saved bvec_iter in the btrfs_bio is filled out for all read bios and use that, which again removes a fair amount of code. Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/compression.c | 171 ++++++++++++++--------------------------- fs/btrfs/compression.h | 7 -- fs/btrfs/ctree.h | 2 + fs/btrfs/extent_io.c | 45 +++-------- fs/btrfs/extent_io.h | 1 - fs/btrfs/inode.c | 7 ++ 6 files changed, 76 insertions(+), 157 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 37676949a2b056..8124cd3d0b6bfb 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -136,66 +136,14 @@ static int compression_decompress(int type, struct list_head *ws, static int btrfs_decompress_bio(struct compressed_bio *cb); -static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, - unsigned long disk_size) -{ - return sizeof(struct compressed_bio) + - (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size; -} - -static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, - u64 disk_start) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - const u32 csum_size = fs_info->csum_size; - const u32 sectorsize = fs_info->sectorsize; - struct page *page; - unsigned int i; - u8 csum[BTRFS_CSUM_SIZE]; - struct compressed_bio *cb = bio->bi_private; - u8 *cb_sum = cb->sums; - - if ((inode->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) - return 0; - - for (i = 0; i < cb->nr_pages; i++) { - u32 pg_offset; - u32 bytes_left = PAGE_SIZE; - page = cb->compressed_pages[i]; - - /* Determine the remaining bytes inside the page first */ - if (i == cb->nr_pages - 1) - bytes_left =
cb->compressed_len - i * PAGE_SIZE; - - /* Hash through the page sector by sector */ - for (pg_offset = 0; pg_offset < bytes_left; - pg_offset += sectorsize) { - int ret; - - ret = btrfs_check_sector_csum(fs_info, page, pg_offset, - csum, cb_sum); - if (ret) { - btrfs_print_data_csum_error(inode, disk_start, - csum, cb_sum, cb->mirror_num); - if (btrfs_bio(bio)->device) - btrfs_dev_stat_inc_and_print( - btrfs_bio(bio)->device, - BTRFS_DEV_STAT_CORRUPTION_ERRS); - return -EIO; - } - cb_sum += csum_size; - disk_start += sectorsize; - } - } - return 0; -} - static void finish_compressed_bio_read(struct compressed_bio *cb) { unsigned int index; struct page *page; + if (cb->status == BLK_STS_OK) + cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); + /* Release the compressed pages */ for (index = 0; index < cb->nr_pages; index++) { page = cb->compressed_pages[index]; @@ -233,59 +181,54 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) kfree(cb); } -/* when we finish reading compressed pages from the disk, we - * decompress them and then run the bio end_io routines on the - * decompressed pages (in the inode address space). - * - * This allows the checksumming and other IO error handling routines - * to work normally - * - * The compressed pages are freed here, and it must be run - * in process context +/* + * Verify the checksums and kick off repair if needed on the uncompressed data + * before decompressing it into the original bio and freeing the uncompressed + * pages. */ static void end_compressed_bio_read(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - unsigned int mirror = btrfs_bio(bio)->mirror_num; - int ret = 0; - - if (bio->bi_status) - cb->status = bio->bi_status; - - if (!refcount_dec_and_test(&cb->pending_ios)) - goto out; - - /* - * Record the correct mirror_num in cb->orig_bio so that - * read-repair can work properly. 
- */ - btrfs_bio(cb->orig_bio)->mirror_num = mirror; - cb->mirror_num = mirror; - - /* - * Some IO in this cb have failed, just skip checksum as there - * is no way it could be correct. - */ - if (cb->status != BLK_STS_OK) - goto csum_failed; + struct inode *inode = cb->inode; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *bi = BTRFS_I(inode); + bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); + blk_status_t status = bio->bi_status; + struct btrfs_bio *bbio = btrfs_bio(bio); + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; + + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; + + if (!status && + (!csum || !btrfs_check_data_csum(inode, bbio, offset, + bv.bv_page, bv.bv_offset))) { + clean_io_failure(fs_info, &bi->io_failure_tree, + &bi->io_tree, start, bv.bv_page, + btrfs_ino(bi), bv.bv_offset); + } else { + int ret; - inode = cb->inode; - ret = check_compressed_csum(BTRFS_I(inode), bio, - bio->bi_iter.bi_sector << 9); - if (ret) - goto csum_failed; + refcount_inc(&cb->pending_ios); + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, + btrfs_submit_data_read_bio); + if (ret) { + refcount_dec(&cb->pending_ios); + status = errno_to_blk_status(ret); + } + } + } - /* ok, we're the last bio for this extent, lets start - * the decompression. 
- */ - ret = btrfs_decompress_bio(cb); + if (status) + cb->status = status; -csum_failed: - if (ret) - cb->status = errno_to_blk_status(ret); - finish_compressed_bio_read(cb); -out: + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_read(cb); + btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -478,7 +421,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); - cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; refcount_set(&cb->pending_ios, 1); @@ -486,7 +429,6 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; - cb->mirror_num = 0; cb->compressed_pages = compressed_pages; cb->compressed_len = compressed_len; cb->writeback = writeback; @@ -755,7 +697,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, blk_status_t ret; int ret2; int i; - u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -773,7 +714,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; - cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) { ret = BLK_STS_RESOURCE; goto out; @@ -782,8 +723,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; - cb->mirror_num = mirror_num; - sums = cb->sums; cb->start = em->orig_start; em_len = em->len; @@ -866,19 +805,25 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, submit = true; if (submit) { - unsigned int nr_sectors; + /* Save the original iter for read repair */ + if 
(bio_op(comp_bio) == REQ_OP_READ) + btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; + + /* + * Save the initial offset of this chunk, as there + * is no direct correlation between compressed pages and + * the original file offset. The field is only used for + * priting error messages. + */ + btrfs_bio(comp_bio)->file_offset = file_offset; - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); if (ret) { comp_bio->bi_status = ret; bio_endio(comp_bio); break; } - nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - fs_info->sectorsize); - sums += fs_info->csum_size * nr_sectors; - ASSERT(comp_bio->bi_iter.bi_size); btrfs_submit_bio(fs_info, comp_bio, mirror_num); comp_bio = NULL; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 0e4cbf04fd8660..e9ef24034cad0a 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -59,19 +59,12 @@ struct compressed_bio { /* IO errors */ blk_status_t status; - int mirror_num; union { /* For reads, this is the bio we are copying the data into */ struct bio *orig_bio; struct work_struct write_end_work; }; - - /* - * the start of a variable length array of checksums only - * used by reads - */ - u8 sums[]; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c567c73f750963..4db85b9dc7edd6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3293,6 +3293,8 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); diff --git 
a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4baf5cac7b192e..b290bd1b38b085 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2545,13 +2545,10 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct extent_map *em; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; const u32 sectorsize = fs_info->sectorsize; int ret; - u64 logical; failrec = get_state_failrec(failure_tree, start); if (!IS_ERR(failrec)) { @@ -2576,41 +2573,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->len = sectorsize; failrec->failed_mirror = bbio->mirror_num; failrec->this_mirror = bbio->mirror_num; - failrec->compress_type = BTRFS_COMPRESS_NONE; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return ERR_PTR(-EIO); - } - - if (em->start > start || em->start + em->len <= start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - if (!em) { - kfree(failrec); - return ERR_PTR(-EIO); - } - - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->compress_type = em->compress_type; - } + failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; btrfs_debug(fs_info, - "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", - logical, start, failrec->len); - - failrec->logical = logical; - free_extent_map(em); + "new io failure record logical %llu start %llu", + failrec->logical, start); - failrec->num_copies = btrfs_num_copies(fs_info, logical, sectorsize); + 
failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); if (failrec->num_copies == 1) { /* * We only have a single copy of the data, so don't bother with @@ -2709,7 +2678,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, * will be handled by the endio on the repair_bio, so we can't return an * error here. */ - submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type); + submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); return BLK_STS_OK; } @@ -3117,6 +3086,10 @@ static void end_bio_extent_readpage(struct bio *bio) * Only try to repair bios that actually made it to a * device. If the bio failed to be submitted mirror * is 0 and we need to fail it without retrying. + * + * This also includes the high level bios for compressed + * extents - these never make it to a device and repair + * is already handled on the lower compressed bio. */ if (mirror > 0) repair = true; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b802ac85cb74cb..4bc72a87b9a99c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -261,7 +261,6 @@ struct io_failure_record { u64 start; u64 len; u64 logical; - enum btrfs_compression_type compress_type; int this_mirror; int failed_mirror; int num_copies; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 18d397bfd28e59..e8021d52c846c4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2749,6 +2749,9 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, return; } + /* Save the original iter for read repair */ + btrfs_bio(bio)->iter = bio->bi_iter; + /* * Lookup bio sums does extra checks around whether we need to csum or * not, which is why we ignore skip_sum here. 
@@ -8060,6 +8063,10 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, struct btrfs_dio_private *dip = bio->bi_private; blk_status_t ret; + /* Save the original iter for read repair */ + if (btrfs_op(bio) == BTRFS_MAP_READ) + btrfs_bio(bio)->iter = bio->bi_iter; + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; From 9b198f41a0d17e40a29febf4012da71cbf432cd2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Jul 2022 07:33:31 +0200 Subject: [PATCH 0924/1250] btrfs: don't call btrfs_page_set_checked in finish_compressed_bio_read This flag was used to communicate that the low-level compression code already did verify the checksum to the high-level I/O completion code. But it has been unused for a long time as the upper btrfs_bio for the decompressed data had a NULL csum pointer basically since that pointer existed and the code already checks for that a little later. Note that this does not affect the other use of the checked flag, which is only used for the COW fixup worker. 
Reviewed-by: Nikolay Borisov Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/compression.c | 24 ++---------------------- fs/btrfs/inode.c | 5 ----- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8124cd3d0b6bfb..f3df9b9b438165 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -152,29 +152,9 @@ static void finish_compressed_bio_read(struct compressed_bio *cb) } /* Do io completion on the original bio */ - if (cb->status != BLK_STS_OK) { + if (cb->status != BLK_STS_OK) cb->orig_bio->bi_status = cb->status; - bio_endio(cb->orig_bio); - } else { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * We have verified the checksum already, set page checked so - * the end_io handlers know about it - */ - ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { - u64 bvec_start = page_offset(bvec->bv_page) + - bvec->bv_offset; - - btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb), - bvec->bv_page, bvec_start, - bvec->bv_len); - } - - bio_endio(cb->orig_bio); - } + bio_endio(cb->orig_bio); /* Finally free the cb struct */ kfree(cb->compressed_pages); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e8021d52c846c4..ecc5fa3343fc5e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3504,11 +3504,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 pg_off; unsigned int result = 0; - if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { - btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); - return 0; - } - /* * This only happens for NODATASUM or compressed read. 
* Normally this should be covered by above check for compressed read From 7f40c1eca12f00a427039871515d5deca5d8f381 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 21 Jul 2022 15:17:53 +0200 Subject: [PATCH 0925/1250] soc: document merges Signed-off-by: Arnd Bergmann --- arch/arm/arm-soc-for-next-contents.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/arch/arm/arm-soc-for-next-contents.txt b/arch/arm/arm-soc-for-next-contents.txt index 81efe0bb17072f..bbbe368b332d4c 100644 --- a/arch/arm/arm-soc-for-next-contents.txt +++ b/arch/arm/arm-soc-for-next-contents.txt @@ -21,6 +21,8 @@ arm/soc https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/soc-part2 mvebu/soc git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu tags/mvebu-arm-5.20-1 + at91/soc + git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux tags/at91-soc-5.20 arm/dt samsung/dt @@ -113,6 +115,14 @@ arm/dt git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu tags/mvebu-dt-5.20-1 mvebu/dt64 git://git.kernel.org/pub/scm/linux/kernel/git/gclement/mvebu tags/mvebu-dt64-5.20-1 + patch + ARM: dts: aspeed: centriq2400: drop the board + aspeed/dt-bindings + git://git.kernel.org/pub/scm/linux/kernel/git/krzk/linux-dt tags/dt-bindings-aspeed-5.20 + qcom/dts-2 + git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-dts-for-5.20-2 + qcom/dt64-2 + git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-arm64-for-5.20-2 arm/drivers renesas/drivers @@ -141,6 +151,9 @@ arm/drivers git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-drivers-for-5.20 patch soc: fujitsu: Add A64FX diagnostic interrupt driver + soc: a64fx-diag: disable modular build + qcom/drivers-2 + git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-drivers-for-5.20-2 arm/defconfig renesas/defconfig @@ -157,10 +170,16 @@ arm/defconfig https://github.com/Broadcom/stblinux tags/arm-soc/for-5.20/defconfig-arm64 qcom/defconfig 
git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-arm64-defconfig-for-5.20 + qcom/defconfig-2 + git://git.kernel.org/pub/scm/linux/kernel/git/qcom/linux tags/qcom-arm64-defconfig-for-5.20-2 arm/late arm/fixes + patch + mailmap: update Baolin Wang's email + at91/fixes-3 + git://git.kernel.org/pub/scm/linux/kernel/git/at91/linux tags/at91-fixes-5.19-3 arm/newsoc sunplus/newsoc From c8d80924ae025cc6340a1fcdae0a8353db1f5478 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Wed, 22 Jun 2022 17:17:58 -0400 Subject: [PATCH 0926/1250] virtio_fs: Modify format for virtio_fs_direct_access We should isolate operators with spaces. Signed-off-by: Deming Wang --- fs/fuse/virtio_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 0991199d19c1af..4d8d4f16c727b9 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -756,7 +756,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); - size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; + size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff; if (kaddr) *kaddr = fs->window_kaddr + offset; From 877ef7557c75e984a70190ff7449fcff3bf550b4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 16:56:21 -0600 Subject: [PATCH 0927/1250] io_uring: define a 'prep' and 'issue' handler for each opcode Rather than have two giant switches for doing request preparation and then for doing request issue, add a prep and issue handler for each of them in the io_op_defs[] request definition. 
Signed-off-by: Jens Axboe --- fs/io_uring.c | 838 ++++++++++++++++++++++---------------------------- 1 file changed, 365 insertions(+), 473 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index e8e769be9ed058..63cad0e12d8b37 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1110,231 +1110,13 @@ struct io_op_def { unsigned iopoll : 1; /* size of async data needed, if any */ unsigned short async_size; -}; -static const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_READV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITEV] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FSYNC] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_READ_FIXED] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE_FIXED] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_POLL_ADD] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_POLL_REMOVE] = { - .audit_skip = 1, - }, - [IORING_OP_SYNC_FILE_RANGE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SENDMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .ioprio = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_RECVMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - 
.pollin = 1, - .buffer_select = 1, - .needs_async_setup = 1, - .ioprio = 1, - .async_size = sizeof(struct io_async_msghdr), - }, - [IORING_OP_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_TIMEOUT_REMOVE] = { - /* used by timeout updates' prep() */ - .audit_skip = 1, - }, - [IORING_OP_ACCEPT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .poll_exclusive = 1, - .ioprio = 1, /* used for flags */ - }, - [IORING_OP_ASYNC_CANCEL] = { - .audit_skip = 1, - }, - [IORING_OP_LINK_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - }, - [IORING_OP_CONNECT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .needs_async_setup = 1, - .async_size = sizeof(struct io_async_connect), - }, - [IORING_OP_FALLOCATE] = { - .needs_file = 1, - }, - [IORING_OP_OPENAT] = {}, - [IORING_OP_CLOSE] = {}, - [IORING_OP_FILES_UPDATE] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_STATX] = { - .audit_skip = 1, - }, - [IORING_OP_READ] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_WRITE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - }, - [IORING_OP_FADVISE] = { - .needs_file = 1, - .audit_skip = 1, - }, - [IORING_OP_MADVISE] = {}, - [IORING_OP_SEND] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .audit_skip = 1, - .ioprio = 1, - }, - [IORING_OP_RECV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .audit_skip = 1, - .ioprio = 1, - }, - [IORING_OP_OPENAT2] = { - }, - [IORING_OP_EPOLL_CTL] = { - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SPLICE] = { - .needs_file = 1, - 
.hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_PROVIDE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_REMOVE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - }, - [IORING_OP_TEE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - }, - [IORING_OP_SHUTDOWN] = { - .needs_file = 1, - }, - [IORING_OP_RENAMEAT] = {}, - [IORING_OP_UNLINKAT] = {}, - [IORING_OP_MKDIRAT] = {}, - [IORING_OP_SYMLINKAT] = {}, - [IORING_OP_LINKAT] = {}, - [IORING_OP_MSG_RING] = { - .needs_file = 1, - .iopoll = 1, - }, - [IORING_OP_FSETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_SETXATTR] = {}, - [IORING_OP_FGETXATTR] = { - .needs_file = 1 - }, - [IORING_OP_GETXATTR] = {}, - [IORING_OP_SOCKET] = { - .audit_skip = 1, - }, - [IORING_OP_URING_CMD] = { - .needs_file = 1, - .plug = 1, - .needs_async_setup = 1, - .async_size = uring_cmd_pdu_size(1), - }, + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); + int (*issue)(struct io_kiocb *, unsigned int); }; +static const struct io_op_def io_op_defs[]; + /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) @@ -8039,96 +7821,33 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_req_prep_async(struct io_kiocb *req) { + const struct io_op_def *def = &io_op_defs[req->opcode]; + + /* assign early for deferred execution for non-fixed file */ + if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) + req->file = io_file_get_normal(req, req->cqe.fd); + if (!def->needs_async_setup) + return 0; + if (WARN_ON_ONCE(req_has_async_data(req))) + return -EFAULT; + if (io_alloc_async_data(req)) + return -EAGAIN; + switch (req->opcode) { - case IORING_OP_NOP: - return 
io_nop_prep(req, sqe); case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: + return io_readv_prep_async(req); case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - return io_prep_rw(req, sqe); - case IORING_OP_POLL_ADD: - return io_poll_add_prep(req, sqe); - case IORING_OP_POLL_REMOVE: - return io_poll_remove_prep(req, sqe); - case IORING_OP_FSYNC: - return io_fsync_prep(req, sqe); - case IORING_OP_SYNC_FILE_RANGE: - return io_sfr_prep(req, sqe); + return io_writev_prep_async(req); case IORING_OP_SENDMSG: - case IORING_OP_SEND: - return io_sendmsg_prep(req, sqe); + return io_sendmsg_prep_async(req); case IORING_OP_RECVMSG: - case IORING_OP_RECV: - return io_recvmsg_prep(req, sqe); + return io_recvmsg_prep_async(req); case IORING_OP_CONNECT: - return io_connect_prep(req, sqe); - case IORING_OP_TIMEOUT: - return io_timeout_prep(req, sqe); - case IORING_OP_TIMEOUT_REMOVE: - return io_timeout_remove_prep(req, sqe); - case IORING_OP_ASYNC_CANCEL: - return io_async_cancel_prep(req, sqe); - case IORING_OP_LINK_TIMEOUT: - return io_link_timeout_prep(req, sqe); - case IORING_OP_ACCEPT: - return io_accept_prep(req, sqe); - case IORING_OP_FALLOCATE: - return io_fallocate_prep(req, sqe); - case IORING_OP_OPENAT: - return io_openat_prep(req, sqe); - case IORING_OP_CLOSE: - return io_close_prep(req, sqe); - case IORING_OP_FILES_UPDATE: - return io_files_update_prep(req, sqe); - case IORING_OP_STATX: - return io_statx_prep(req, sqe); - case IORING_OP_FADVISE: - return io_fadvise_prep(req, sqe); - case IORING_OP_MADVISE: - return io_madvise_prep(req, sqe); - case IORING_OP_OPENAT2: - return io_openat2_prep(req, sqe); - case IORING_OP_EPOLL_CTL: - return io_epoll_ctl_prep(req, sqe); - case IORING_OP_SPLICE: - return io_splice_prep(req, sqe); - case IORING_OP_PROVIDE_BUFFERS: - return io_provide_buffers_prep(req, sqe); - case IORING_OP_REMOVE_BUFFERS: - return io_remove_buffers_prep(req, sqe); - case IORING_OP_TEE: - return 
io_tee_prep(req, sqe); - case IORING_OP_SHUTDOWN: - return io_shutdown_prep(req, sqe); - case IORING_OP_RENAMEAT: - return io_renameat_prep(req, sqe); - case IORING_OP_UNLINKAT: - return io_unlinkat_prep(req, sqe); - case IORING_OP_MKDIRAT: - return io_mkdirat_prep(req, sqe); - case IORING_OP_SYMLINKAT: - return io_symlinkat_prep(req, sqe); - case IORING_OP_LINKAT: - return io_linkat_prep(req, sqe); - case IORING_OP_MSG_RING: - return io_msg_ring_prep(req, sqe); - case IORING_OP_FSETXATTR: - return io_fsetxattr_prep(req, sqe); - case IORING_OP_SETXATTR: - return io_setxattr_prep(req, sqe); - case IORING_OP_FGETXATTR: - return io_fgetxattr_prep(req, sqe); - case IORING_OP_GETXATTR: - return io_getxattr_prep(req, sqe); - case IORING_OP_SOCKET: - return io_socket_prep(req, sqe); + return io_connect_prep_async(req); case IORING_OP_URING_CMD: - return io_uring_cmd_prep(req, sqe); + return io_uring_cmd_prep_async(req); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -8136,39 +7855,6 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; } -static int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - - /* assign early for deferred execution for non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->needs_async_setup) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (io_alloc_async_data(req)) - return -EAGAIN; - - switch (req->opcode) { - case IORING_OP_READV: - return io_readv_prep_async(req); - case IORING_OP_WRITEV: - return io_writev_prep_async(req); - case IORING_OP_SENDMSG: - return io_sendmsg_prep_async(req); - case IORING_OP_RECVMSG: - return io_recvmsg_prep_async(req); - case IORING_OP_CONNECT: - return io_connect_prep_async(req); - case IORING_OP_URING_CMD: - return io_uring_cmd_prep_async(req); - } - printk_once(KERN_WARNING "io_uring: 
prep_async() bad opcode %d\n", - req->opcode); - return -EFAULT; -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; @@ -8335,141 +8021,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!def->audit_skip) audit_uring_entry(req->opcode); - switch (req->opcode) { - case IORING_OP_NOP: - ret = io_nop(req, issue_flags); - break; - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - ret = io_read(req, issue_flags); - break; - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case IORING_OP_WRITE: - ret = io_write(req, issue_flags); - break; - case IORING_OP_FSYNC: - ret = io_fsync(req, issue_flags); - break; - case IORING_OP_POLL_ADD: - ret = io_poll_add(req, issue_flags); - break; - case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, issue_flags); - break; - case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, issue_flags); - break; - case IORING_OP_SENDMSG: - ret = io_sendmsg(req, issue_flags); - break; - case IORING_OP_SEND: - ret = io_send(req, issue_flags); - break; - case IORING_OP_RECVMSG: - ret = io_recvmsg(req, issue_flags); - break; - case IORING_OP_RECV: - ret = io_recv(req, issue_flags); - break; - case IORING_OP_TIMEOUT: - ret = io_timeout(req, issue_flags); - break; - case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, issue_flags); - break; - case IORING_OP_ACCEPT: - ret = io_accept(req, issue_flags); - break; - case IORING_OP_CONNECT: - ret = io_connect(req, issue_flags); - break; - case IORING_OP_ASYNC_CANCEL: - ret = io_async_cancel(req, issue_flags); - break; - case IORING_OP_FALLOCATE: - ret = io_fallocate(req, issue_flags); - break; - case IORING_OP_OPENAT: - ret = io_openat(req, issue_flags); - break; - case IORING_OP_CLOSE: - ret = io_close(req, issue_flags); - break; - case IORING_OP_FILES_UPDATE: - ret = io_files_update(req, issue_flags); - break; - case IORING_OP_STATX: - ret = io_statx(req, issue_flags); - break; - case 
IORING_OP_FADVISE: - ret = io_fadvise(req, issue_flags); - break; - case IORING_OP_MADVISE: - ret = io_madvise(req, issue_flags); - break; - case IORING_OP_OPENAT2: - ret = io_openat2(req, issue_flags); - break; - case IORING_OP_EPOLL_CTL: - ret = io_epoll_ctl(req, issue_flags); - break; - case IORING_OP_SPLICE: - ret = io_splice(req, issue_flags); - break; - case IORING_OP_PROVIDE_BUFFERS: - ret = io_provide_buffers(req, issue_flags); - break; - case IORING_OP_REMOVE_BUFFERS: - ret = io_remove_buffers(req, issue_flags); - break; - case IORING_OP_TEE: - ret = io_tee(req, issue_flags); - break; - case IORING_OP_SHUTDOWN: - ret = io_shutdown(req, issue_flags); - break; - case IORING_OP_RENAMEAT: - ret = io_renameat(req, issue_flags); - break; - case IORING_OP_UNLINKAT: - ret = io_unlinkat(req, issue_flags); - break; - case IORING_OP_MKDIRAT: - ret = io_mkdirat(req, issue_flags); - break; - case IORING_OP_SYMLINKAT: - ret = io_symlinkat(req, issue_flags); - break; - case IORING_OP_LINKAT: - ret = io_linkat(req, issue_flags); - break; - case IORING_OP_MSG_RING: - ret = io_msg_ring(req, issue_flags); - break; - case IORING_OP_FSETXATTR: - ret = io_fsetxattr(req, issue_flags); - break; - case IORING_OP_SETXATTR: - ret = io_setxattr(req, issue_flags); - break; - case IORING_OP_FGETXATTR: - ret = io_fgetxattr(req, issue_flags); - break; - case IORING_OP_GETXATTR: - ret = io_getxattr(req, issue_flags); - break; - case IORING_OP_SOCKET: - ret = io_socket(req, issue_flags); - break; - case IORING_OP_URING_CMD: - ret = io_uring_cmd(req, issue_flags); - break; - default: - ret = -EINVAL; - break; - } + ret = def->issue(req, issue_flags); if (!def->audit_skip) audit_uring_exit(!ret, ret); @@ -8898,7 +8450,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->flags |= REQ_F_CREDS; } - return io_req_prep(req, sqe); + return def->prep(req, sqe); } static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, @@ -13200,8 +12752,343 @@ 
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, return ret; } +static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) +{ + WARN_ON_ONCE(1); + return -ECANCELED; +} + +static const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_nop_prep, + .issue = io_nop, + }, + [IORING_OP_READV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .needs_async_setup = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITEV] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_FSYNC] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_fsync_prep, + .issue = io_fsync, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_poll_add_prep, + .issue = io_poll_add, + }, + [IORING_OP_POLL_REMOVE] = { + .audit_skip = 1, + .prep = io_poll_remove_prep, + .issue = io_poll_remove, + }, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_sfr_prep, + .issue = io_sync_file_range, + }, + [IORING_OP_SENDMSG] = { + .needs_file = 
1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .ioprio = 1, + .async_size = sizeof(struct io_async_msghdr), + .prep = io_sendmsg_prep, + .issue = io_sendmsg, + }, + [IORING_OP_RECVMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .needs_async_setup = 1, + .ioprio = 1, + .async_size = sizeof(struct io_async_msghdr), + .prep = io_recvmsg_prep, + .issue = io_recvmsg, + }, + [IORING_OP_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .prep = io_timeout_prep, + .issue = io_timeout, + }, + [IORING_OP_TIMEOUT_REMOVE] = { + /* used by timeout updates' prep() */ + .audit_skip = 1, + .prep = io_timeout_remove_prep, + .issue = io_timeout_remove, + }, + [IORING_OP_ACCEPT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ + .prep = io_accept_prep, + .issue = io_accept, + }, + [IORING_OP_ASYNC_CANCEL] = { + .audit_skip = 1, + .prep = io_async_cancel_prep, + .issue = io_async_cancel, + }, + [IORING_OP_LINK_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .prep = io_link_timeout_prep, + .issue = io_no_issue, + }, + [IORING_OP_CONNECT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .needs_async_setup = 1, + .async_size = sizeof(struct io_async_connect), + .prep = io_connect_prep, + .issue = io_connect, + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + .prep = io_fallocate_prep, + .issue = io_fallocate, + }, + [IORING_OP_OPENAT] = { + .prep = io_openat_prep, + .issue = io_openat, + }, + [IORING_OP_CLOSE] = { + .prep = io_close_prep, + .issue = io_close, + }, + [IORING_OP_FILES_UPDATE] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_files_update_prep, + .issue = io_files_update, + }, + [IORING_OP_STATX] = { + .audit_skip = 1, + .prep = io_statx_prep, + .issue = io_statx, + }, + [IORING_OP_READ] = { + .needs_file = 1, + .unbound_nonreg_file = 1, 
+ .pollin = 1, + .buffer_select = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_fadvise_prep, + .issue = io_fadvise, + }, + [IORING_OP_MADVISE] = { + .prep = io_madvise_prep, + .issue = io_madvise, + }, + [IORING_OP_SEND] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, + .prep = io_sendmsg_prep, + .issue = io_send, + }, + [IORING_OP_RECV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .audit_skip = 1, + .ioprio = 1, + .prep = io_recvmsg_prep, + .issue = io_recv, + }, + [IORING_OP_OPENAT2] = { + .prep = io_openat2_prep, + .issue = io_openat2, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_epoll_ctl_prep, + .issue = io_epoll_ctl, + }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_splice_prep, + .issue = io_splice, + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_provide_buffers_prep, + .issue = io_provide_buffers, + }, + [IORING_OP_REMOVE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .prep = io_remove_buffers_prep, + .issue = io_remove_buffers, + }, + [IORING_OP_TEE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .prep = io_tee_prep, + .issue = io_tee, + }, + [IORING_OP_SHUTDOWN] = { + .needs_file = 1, + .prep = io_shutdown_prep, + .issue = io_shutdown, + }, + [IORING_OP_RENAMEAT] = { + .prep = io_renameat_prep, + .issue = 
io_renameat, + }, + [IORING_OP_UNLINKAT] = { + .prep = io_unlinkat_prep, + .issue = io_unlinkat, + }, + [IORING_OP_MKDIRAT] = { + .prep = io_mkdirat_prep, + .issue = io_mkdirat, + }, + [IORING_OP_SYMLINKAT] = { + .prep = io_symlinkat_prep, + .issue = io_symlinkat, + }, + [IORING_OP_LINKAT] = { + .prep = io_linkat_prep, + .issue = io_linkat, + }, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + .iopoll = 1, + .prep = io_msg_ring_prep, + .issue = io_msg_ring, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1, + .prep = io_fsetxattr_prep, + .issue = io_fsetxattr, + }, + [IORING_OP_SETXATTR] = { + .prep = io_setxattr_prep, + .issue = io_setxattr, + }, + [IORING_OP_FGETXATTR] = { + .needs_file = 1, + .prep = io_fgetxattr_prep, + .issue = io_fgetxattr, + }, + [IORING_OP_GETXATTR] = { + .prep = io_getxattr_prep, + .issue = io_getxattr, + }, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + .prep = io_socket_prep, + .issue = io_socket, + }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .needs_async_setup = 1, + .async_size = uring_cmd_pdu_size(1), + .prep = io_uring_cmd_prep, + .issue = io_uring_cmd, + }, +}; + static int __init io_uring_init(void) { + int i; + #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ @@ -13266,6 +13153,11 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); + for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { + BUG_ON(!io_op_defs[i].prep); + BUG_ON(!io_op_defs[i].issue); + } + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); return 0; From f38d5d5d9bebd110ef1a0839f37a0e500cd06c82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 17:05:03 -0600 Subject: [PATCH 0928/1250] io_uring: move to separate directory In preparation for splitting io_uring up a bit, move it into its own top level directory. 
It didn't really belong in fs/ anyway, as it's not a file system only API. This adds io_uring/ and moves the core files in there, and updates the MAINTAINERS file for the new location. Signed-off-by: Jens Axboe --- MAINTAINERS | 7 +------ Makefile | 1 + fs/Makefile | 2 -- io_uring/Makefile | 6 ++++++ {fs => io_uring}/io-wq.c | 0 {fs => io_uring}/io-wq.h | 0 {fs => io_uring}/io_uring.c | 2 +- kernel/sched/core.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) create mode 100644 io_uring/Makefile rename {fs => io_uring}/io-wq.c (100%) rename {fs => io_uring}/io-wq.h (100%) rename {fs => io_uring}/io_uring.c (99%) diff --git a/MAINTAINERS b/MAINTAINERS index 651616ed8ae25e..6ae97d7708940a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7773,9 +7773,6 @@ F: include/linux/fs.h F: include/linux/fs_types.h F: include/uapi/linux/fs.h F: include/uapi/linux/openat2.h -X: fs/io-wq.c -X: fs/io-wq.h -X: fs/io_uring.c FINTEK F75375S HARDWARE MONITOR AND FAN CONTROLLER DRIVER M: Riku Voipio @@ -10476,9 +10473,7 @@ L: io-uring@vger.kernel.org S: Maintained T: git git://git.kernel.dk/linux-block T: git git://git.kernel.dk/liburing -F: fs/io-wq.c -F: fs/io-wq.h -F: fs/io_uring.c +F: io_uring/ F: include/linux/io_uring.h F: include/uapi/linux/io_uring.h F: tools/io_uring/ diff --git a/Makefile b/Makefile index 00fd80c5dd6e65..c03b80d16f0379 100644 --- a/Makefile +++ b/Makefile @@ -1097,6 +1097,7 @@ export MODULES_NSDEPS := $(extmod_prefix)modules.nsdeps ifeq ($(KBUILD_EXTMOD),) core-y += kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ core-$(CONFIG_BLOCK) += block/ +core-$(CONFIG_IO_URING) += io_uring/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/fs/Makefile b/fs/Makefile index 208a74e0b00e12..93b80529f8e827 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -34,8 +34,6 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o 
-obj-$(CONFIG_IO_URING) += io_uring.o -obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ diff --git a/io_uring/Makefile b/io_uring/Makefile new file mode 100644 index 00000000000000..3680425df9478b --- /dev/null +++ b/io_uring/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for io_uring + +obj-$(CONFIG_IO_URING) += io_uring.o +obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/fs/io-wq.c b/io_uring/io-wq.c similarity index 100% rename from fs/io-wq.c rename to io_uring/io-wq.c diff --git a/fs/io-wq.h b/io_uring/io-wq.h similarity index 100% rename from fs/io-wq.h rename to io_uring/io-wq.h diff --git a/fs/io_uring.c b/io_uring/io_uring.c similarity index 99% rename from fs/io_uring.c rename to io_uring/io_uring.c index 63cad0e12d8b37..f429b68d1fc295 100644 --- a/fs/io_uring.c +++ b/io_uring/io_uring.c @@ -87,7 +87,7 @@ #include -#include "internal.h" +#include "../fs/internal.h" #include "io-wq.h" #define IORING_MAX_ENTRIES 32768 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da0bf6fe9ecdcf..f35674e89621b2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -91,7 +91,7 @@ #include "stats.h" #include "../workqueue_internal.h" -#include "../../fs/io-wq.h" +#include "../../io_uring/io-wq.h" #include "../smpboot.h" /* From c7e32b1e6dcefed613e944fa6ef107d24bd4bc72 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 23 May 2022 17:30:37 -0600 Subject: [PATCH 0929/1250] io_uring: move req async preparation into opcode handler Define an io_op_def->prep_async() handler and push the async preparation to there. Since we now have that, we can drop ->needs_async_setup, as they mean the same thing. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f429b68d1fc295..f353822436c4ed 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1098,8 +1098,6 @@ struct io_op_def { unsigned poll_exclusive : 1; /* op supports buffer selection */ unsigned buffer_select : 1; - /* do prep async if is going to be punted */ - unsigned needs_async_setup : 1; /* opcode is not supported by this kernel */ unsigned not_supported : 1; /* skip auditing */ @@ -1113,6 +1111,7 @@ struct io_op_def { int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); int (*issue)(struct io_kiocb *, unsigned int); + int (*prep_async)(struct io_kiocb *); }; static const struct io_op_def io_op_defs[]; @@ -3916,7 +3915,7 @@ static inline bool io_alloc_async_data(struct io_kiocb *req) static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, struct io_rw_state *s, bool force) { - if (!force && !io_op_defs[req->opcode].needs_async_setup) + if (!force && !io_op_defs[req->opcode].prep_async) return 0; if (!req_has_async_data(req)) { struct io_async_rw *iorw; @@ -7828,31 +7827,14 @@ static int io_req_prep_async(struct io_kiocb *req) /* assign early for deferred execution for non-fixed file */ if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE)) req->file = io_file_get_normal(req, req->cqe.fd); - if (!def->needs_async_setup) + if (!def->prep_async) return 0; if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; if (io_alloc_async_data(req)) return -EAGAIN; - switch (req->opcode) { - case IORING_OP_READV: - return io_readv_prep_async(req); - case IORING_OP_WRITEV: - return io_writev_prep_async(req); - case IORING_OP_SENDMSG: - return io_sendmsg_prep_async(req); - case IORING_OP_RECVMSG: - return io_recvmsg_prep_async(req); - case IORING_OP_CONNECT: - return io_connect_prep_async(req); - case IORING_OP_URING_CMD: - 
return io_uring_cmd_prep_async(req); - } - - printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", - req->opcode); - return -EINVAL; + return def->prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -12770,7 +12752,6 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_async_setup = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, @@ -12778,13 +12759,13 @@ static const struct io_op_def io_op_defs[] = { .async_size = sizeof(struct io_async_rw), .prep = io_prep_rw, .issue = io_read, + .prep_async = io_readv_prep_async, }, [IORING_OP_WRITEV] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_setup = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, @@ -12792,6 +12773,7 @@ static const struct io_op_def io_op_defs[] = { .async_size = sizeof(struct io_async_rw), .prep = io_prep_rw, .issue = io_write, + .prep_async = io_writev_prep_async, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -12846,22 +12828,22 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_setup = 1, .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, + .prep_async = io_sendmsg_prep_async, }, [IORING_OP_RECVMSG] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .needs_async_setup = 1, .ioprio = 1, .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, + .prep_async = io_recvmsg_prep_async, }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, @@ -12899,10 +12881,10 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .needs_async_setup = 1, .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, + .prep_async = io_connect_prep_async, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -13078,10 +13060,10 @@ static const 
struct io_op_def io_op_defs[] = { [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, - .needs_async_setup = 1, .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, + .prep_async = io_uring_cmd_prep_async, }, }; From 1208b72e7901ec8ee4dd5126a1639fe7932be2c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 08:32:05 -0600 Subject: [PATCH 0930/1250] io_uring: add generic command payload type to struct io_kiocb Each opcode generally has a command structure in io_kiocb which it can use to store data associated with that request. In preparation for having the core layer not know about what's inside these fields, add a generic io_cmd_data type and put it in the union as well. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f353822436c4ed..c596ffd92a0a00 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -954,14 +954,27 @@ enum { }; /* - * NOTE! Each of the iocb union members has the file pointer - * as the first entry in their struct definition. So you can - * access the file pointer through any of the sub-structs, - * or directly as just 'file' in this struct. + * Each request type overlays its private data structure on top of this one. + * They must not exceed this one in size. */ +struct io_cmd_data { + struct file *file; + /* each command gets 56 bytes of data */ + __u8 data[56]; +}; + +#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) + struct io_kiocb { union { + /* + * NOTE! Each of the io_kiocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'file' in this struct.
+ */ struct file *file; + struct io_cmd_data cmd; struct io_rw rw; struct io_poll_iocb poll; struct io_poll_update poll_update; From 245406a9986b21e86c080effa2bfde957a06f34c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 06:57:44 -0600 Subject: [PATCH 0931/1250] io_uring: convert read/write path to use io_cmd_type Remove struct io_rw from io_kiocb, and convert the read/write path to use the io_cmd_type approach instead. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 194 ++++++++++++++++++++++++-------------------- 1 file changed, 106 insertions(+), 88 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c596ffd92a0a00..db2880ccb22778 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -964,6 +964,7 @@ struct io_cmd_data { }; #define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) +#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) struct io_kiocb { union { @@ -975,7 +976,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_rw rw; struct io_poll_iocb poll; struct io_poll_update poll_update; struct io_accept accept; @@ -3032,7 +3032,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) wq_list_for_each(pos, start, &ctx->iopoll_list) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); int ret; /* @@ -3043,7 +3043,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (READ_ONCE(req->iopoll_completed)) break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); + ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) @@ -3188,11 +3188,11 @@ static void kiocb_end_write(struct io_kiocb *req) #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req) { - struct io_async_rw *rw = req->async_data; + struct io_async_rw *io = req->async_data; if 
(!req_has_async_data(req)) return !io_req_prep_async(req); - iov_iter_restore(&rw->s.iter, &rw->s.iter_state); + iov_iter_restore(&io->s.iter, &io->s.iter_state); return true; } @@ -3234,7 +3234,9 @@ static bool io_rw_should_reissue(struct io_kiocb *req) static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - if (req->rw.kiocb.ki_flags & IOCB_WRITE) { + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (rw->kiocb.ki_flags & IOCB_WRITE) { kiocb_end_write(req); fsnotify_modify(req->file); } else { @@ -3276,7 +3278,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, static void io_complete_rw(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); if (__io_complete_rw_common(req, res)) return; @@ -3287,7 +3290,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res) static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) { - struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); @@ -3418,11 +3422,11 @@ static inline bool io_file_supports_nowait(struct io_kiocb *req) static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); unsigned ioprio; int ret; - kiocb->ki_pos = READ_ONCE(sqe->off); + rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); @@ -3444,14 +3448,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ret) return ret; - kiocb->ki_ioprio = ioprio; + rw->kiocb.ki_ioprio = ioprio; } else { - kiocb->ki_ioprio = get_current_ioprio(); + rw->kiocb.ki_ioprio = 
get_current_ioprio(); } - req->rw.addr = READ_ONCE(sqe->addr); - req->rw.len = READ_ONCE(sqe->len); - req->rw.flags = READ_ONCE(sqe->rw_flags); + rw->addr = READ_ONCE(sqe->addr); + rw->len = READ_ONCE(sqe->len); + rw->flags = READ_ONCE(sqe->rw_flags); return 0; } @@ -3478,18 +3482,18 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); - if (kiocb->ki_pos != -1) - return &kiocb->ki_pos; + if (rw->kiocb.ki_pos != -1) + return &rw->kiocb.ki_pos; if (!(req->file->f_mode & FMODE_STREAM)) { req->flags |= REQ_F_CUR_POS; - kiocb->ki_pos = req->file->f_pos; - return &kiocb->ki_pos; + rw->kiocb.ki_pos = req->file->f_pos; + return &rw->kiocb.ki_pos; } - kiocb->ki_pos = 0; + rw->kiocb.ki_pos = 0; return NULL; } @@ -3497,6 +3501,7 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req); /* add previously done IO, if any */ if (req_has_async_data(req) && io->bytes_done > 0) { @@ -3507,11 +3512,11 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, } if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = req->rw.kiocb.ki_pos; - if (ret >= 0 && (req->rw.kiocb.ki_complete == io_complete_rw)) + req->file->f_pos = rw->kiocb.ki_pos; + if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) __io_complete_rw(req, ret, issue_flags); else - io_rw_done(&req->rw.kiocb, ret); + io_rw_done(&rw->kiocb, ret); if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; @@ -3522,11 +3527,12 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, } } -static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - struct io_mapped_ubuf *imu) +static int __io_import_fixed(struct io_kiocb *req, int ddir, + struct iov_iter *iter, struct io_mapped_ubuf *imu) { - size_t len = req->rw.len; 
- u64 buf_end, buf_addr = req->rw.addr; + struct io_rw *rw = io_kiocb_to_cmd(req); + size_t len = rw->len; + u64 buf_end, buf_addr = rw->addr; size_t offset; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) @@ -3540,7 +3546,7 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter * and advance us to the beginning. */ offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); if (offset) { /* @@ -3682,12 +3688,13 @@ static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct compat_iovec __user *uiov; compat_ssize_t clen; void __user *buf; size_t len; - uiov = u64_to_user_ptr(req->rw.addr); + uiov = u64_to_user_ptr(rw->addr); if (!access_ok(uiov, sizeof(*uiov))) return -EFAULT; if (__get_user(clen, &uiov->iov_len)) @@ -3699,9 +3706,9 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; - req->rw.addr = (unsigned long) buf; + rw->addr = (unsigned long) buf; iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = (compat_size_t) len; + rw->len = iov[0].iov_len = (compat_size_t) len; return 0; } #endif @@ -3709,7 +3716,8 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { - struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); + struct io_rw *rw = io_kiocb_to_cmd(req); + struct iovec __user *uiov = u64_to_user_ptr(rw->addr); void __user *buf; ssize_t len; @@ -3722,21 +3730,23 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; 
- req->rw.addr = (unsigned long) buf; + rw->addr = (unsigned long) buf; iov[0].iov_base = buf; - req->rw.len = iov[0].iov_len = len; + rw->len = iov[0].iov_len = len; return 0; } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - iov[0].iov_base = u64_to_user_ptr(req->rw.addr); - iov[0].iov_len = req->rw.len; + iov[0].iov_base = u64_to_user_ptr(rw->addr); + iov[0].iov_len = rw->len; return 0; } - if (req->rw.len != 1) + if (rw->len != 1) return -EINVAL; #ifdef CONFIG_COMPAT @@ -3754,10 +3764,11 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } -static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, +static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct iov_iter *iter = &s->iter; u8 opcode = req->opcode; struct iovec *iovec; @@ -3766,25 +3777,25 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, ssize_t ret; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, rw, iter, issue_flags); + ret = io_import_fixed(req, ddir, iter, issue_flags); if (ret) return ERR_PTR(ret); return NULL; } - buf = u64_to_user_ptr(req->rw.addr); - sqe_len = req->rw.len; + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (io_do_buffer_select(req)) { buf = io_buffer_select(req, &sqe_len, issue_flags); if (!buf) return ERR_PTR(-ENOBUFS); - req->rw.addr = (unsigned long) buf; - req->rw.len = sqe_len; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; } - ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); + ret = import_single_range(ddir, buf, sqe_len, s->fast_iov, iter); 
if (ret) return ERR_PTR(ret); return NULL; @@ -3795,11 +3806,11 @@ static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, ret = io_iov_buffer_select(req, iovec, issue_flags); if (ret) return ERR_PTR(ret); - iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); + iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); return NULL; } - ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, + ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, req->ctx->compat); if (unlikely(ret < 0)) return ERR_PTR(ret); @@ -3827,10 +3838,10 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) * For files that don't have ->read_iter() and ->write_iter(), handle them * by looping over ->read() or ->write() manually. */ -static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) +static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { - struct kiocb *kiocb = &req->rw.kiocb; - struct file *file = req->file; + struct kiocb *kiocb = &rw->kiocb; + struct file *file = kiocb->ki_filp; ssize_t ret = 0; loff_t *ppos; @@ -3854,11 +3865,11 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (!iov_iter_is_bvec(iter)) { iovec = iov_iter_iovec(iter); } else { - iovec.iov_base = u64_to_user_ptr(req->rw.addr); - iovec.iov_len = req->rw.len; + iovec.iov_base = u64_to_user_ptr(rw->addr); + iovec.iov_len = rw->len; } - if (rw == READ) { + if (ddir == READ) { nr = file->f_op->read(file, iovec.iov_base, iovec.iov_len, ppos); } else { @@ -3875,9 +3886,9 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) if (!iov_iter_is_bvec(iter)) { iov_iter_advance(iter, nr); } else { - req->rw.addr += nr; - req->rw.len -= nr; - if (!req->rw.len) + rw->addr += nr; + rw->len -= nr; + if (!rw->len) break; } if (nr != iovec.iov_len) @@ -3890,24 +3901,24 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) static void io_req_map_rw(struct 
io_kiocb *req, const struct iovec *iovec, const struct iovec *fast_iov, struct iov_iter *iter) { - struct io_async_rw *rw = req->async_data; + struct io_async_rw *io = req->async_data; - memcpy(&rw->s.iter, iter, sizeof(*iter)); - rw->free_iovec = iovec; - rw->bytes_done = 0; + memcpy(&io->s.iter, iter, sizeof(*iter)); + io->free_iovec = iovec; + io->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ if (iov_iter_is_bvec(iter)) return; if (!iovec) { unsigned iov_off = 0; - rw->s.iter.iov = rw->s.fast_iov; + io->s.iter.iov = io->s.fast_iov; if (iter->iov != fast_iov) { iov_off = iter->iov - fast_iov; - rw->s.iter.iov += iov_off; + io->s.iter.iov += iov_off; } - if (rw->s.fast_iov != fast_iov) - memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, + if (io->s.fast_iov != fast_iov) + memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { req->flags |= REQ_F_NEED_CLEANUP; @@ -3989,6 +4000,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, { struct wait_page_queue *wpq; struct io_kiocb *req = wait->private; + struct io_rw *rw = io_kiocb_to_cmd(req); struct wait_page_key *key = arg; wpq = container_of(wait, struct wait_page_queue, wait); @@ -3996,7 +4008,7 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, if (!wake_page_match(wpq, key)) return 0; - req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; + rw->kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); io_req_task_queue(req); return 1; @@ -4016,9 +4028,10 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, */ static bool io_rw_should_retry(struct io_kiocb *req) { - struct io_async_rw *rw = req->async_data; - struct wait_page_queue *wait = &rw->wpq; - struct kiocb *kiocb = &req->rw.kiocb; + struct io_async_rw *io = req->async_data; + struct wait_page_queue *wait = &io->wpq; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; /* never retry for 
NOWAIT, we just complete with -EAGAIN */ if (req->flags & REQ_F_NOWAIT) @@ -4045,12 +4058,14 @@ static bool io_rw_should_retry(struct io_kiocb *req) return true; } -static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) +static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) { - if (likely(req->file->f_op->read_iter)) - return call_read_iter(req->file, &req->rw.kiocb, iter); - else if (req->file->f_op->read) - return loop_rw_iter(READ, req, iter); + struct file *file = rw->kiocb.ki_filp; + + if (likely(file->f_op->read_iter)) + return call_read_iter(file, &rw->kiocb, iter); + else if (file->f_op->read) + return loop_rw_iter(READ, rw, iter); else return -EINVAL; } @@ -4063,7 +4078,8 @@ static bool need_read_all(struct io_kiocb *req) static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) { - struct kiocb *kiocb = &req->rw.kiocb; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; struct io_ring_ctx *ctx = req->ctx; struct file *file = req->file; int ret; @@ -4075,7 +4091,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, req->rw.flags); + ret = kiocb_set_rw_flags(kiocb, rw->flags); if (unlikely(ret)) return ret; @@ -4107,11 +4123,12 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) static int io_read(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct io_rw_state __s, *s = &__s; struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; + struct kiocb *kiocb = &rw->kiocb; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *rw; + struct io_async_rw *io; ssize_t ret, ret2; loff_t *ppos; @@ -4120,8 +4137,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } else { - rw = req->async_data; - s = 
&rw->s; + io = req->async_data; + s = &io->s; /* * Safe and required to re-import if we're using provided @@ -4168,7 +4185,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return ret; } - ret = io_iter_do_read(req, &s->iter); + ret = io_iter_do_read(rw, &s->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; @@ -4202,8 +4219,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) return ret2; iovec = NULL; - rw = req->async_data; - s = &rw->s; + io = req->async_data; + s = &io->s; /* * Now use our persistent iterator and state, if we aren't already. * We've restored and mapped the iter to match. @@ -4218,7 +4235,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_advance(&s->iter, ret); if (!iov_iter_count(&s->iter)) break; - rw->bytes_done += ret; + io->bytes_done += ret; iov_iter_save_state(&s->iter, &s->iter_state); /* if we can retry, do so with the callbacks armed */ @@ -4233,7 +4250,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(req, &s->iter); + ret = io_iter_do_read(rw, &s->iter); if (ret == -EIOCBQUEUED) return 0; /* we got some bytes, but not all. retry. 
*/ @@ -4251,9 +4268,10 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) static int io_write(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req); struct io_rw_state __s, *s = &__s; struct iovec *iovec; - struct kiocb *kiocb = &req->rw.kiocb; + struct kiocb *kiocb = &rw->kiocb; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; loff_t *ppos; @@ -4263,9 +4281,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < 0)) return ret; } else { - struct io_async_rw *rw = req->async_data; + struct io_async_rw *io = req->async_data; - s = &rw->s; + s = &io->s; iov_iter_restore(&s->iter, &s->iter_state); iovec = NULL; } @@ -4315,7 +4333,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) if (likely(req->file->f_op->write_iter)) ret2 = call_write_iter(req->file, kiocb, &s->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req, &s->iter); + ret2 = loop_rw_iter(WRITE, rw, &s->iter); else ret2 = -EINVAL; From 6c1ecd8af0108b32a6c0c1ef285d1f8206179eac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:13:46 -0600 Subject: [PATCH 0932/1250] io_uring: convert poll path to use io_cmd_type Remove struct io_poll_iocb from io_kiocb, and convert the poll path to use the io_cmd_type approach instead. While at it, rename io_poll_iocb to io_poll which is consistent with the other request type private structures. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 53 ++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index db2880ccb22778..7459a535060c33 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -556,7 +556,7 @@ struct io_uring_task { * First field must be the file pointer in all the * iocb unions! 
See also 'struct kiocb' in <linux/fs.h> */ -struct io_poll_iocb { +struct io_poll { struct file *file; struct wait_queue_head *head; __poll_t events; @@ -919,8 +919,8 @@ enum { }; struct async_poll { - struct io_poll_iocb poll; - struct io_poll_iocb *double_poll; + struct io_poll poll; + struct io_poll *double_poll; }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); @@ -976,7 +976,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_poll_iocb poll; struct io_poll_update poll_update; struct io_accept accept; struct io_sync sync; @@ -6579,7 +6578,7 @@ static void io_poll_mark_cancelled(struct io_kiocb *req) atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); } -static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) +static struct io_poll *io_poll_get_double(struct io_kiocb *req) { /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ if (req->opcode == IORING_OP_POLL_ADD) @@ -6587,10 +6586,10 @@ static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) return req->apoll->double_poll; } -static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) +static struct io_poll *io_poll_get_single(struct io_kiocb *req) { if (req->opcode == IORING_OP_POLL_ADD) - return &req->poll; + return io_kiocb_to_cmd(req); return &req->apoll->poll; } @@ -6603,7 +6602,7 @@ static void io_poll_req_insert(struct io_kiocb *req) hlist_add_head(&req->hash_node, list); } -static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, +static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, wait_queue_func_t wake_func) { poll->head = NULL; @@ -6614,7 +6613,7 @@ static void io_init_poll_iocb(struct io_poll_iocb *poll, __poll_t events, init_waitqueue_func_entry(&poll->wait, wake_func); } -static inline void io_poll_remove_entry(struct io_poll_iocb *poll) +static inline void io_poll_remove_entry(struct io_poll *poll) { struct wait_queue_head *head = smp_load_acquire(&poll->head); @@ -6740,7
+6739,9 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) return; if (!ret) { - req->cqe.res = mangle_poll(req->cqe.res & req->poll.events); + struct io_poll *poll = io_kiocb_to_cmd(req); + + req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else { req->cqe.res = ret; req_set_fail(req); @@ -6816,8 +6817,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wqe_to_req(wait); - struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, - wait); + struct io_poll *poll = container_of(wait, struct io_poll, wait); __poll_t mask = key_to_poll(key); if (unlikely(mask & POLLFREE)) { @@ -6863,20 +6863,20 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return 1; } -static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, +static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, struct wait_queue_head *head, - struct io_poll_iocb **poll_ptr) + struct io_poll **poll_ptr) { struct io_kiocb *req = pt->req; unsigned long wqe_private = (unsigned long) req; /* * The file being polled uses multiple waitqueues for poll handling - * (e.g. one for read, one for write). Setup a separate io_poll_iocb + * (e.g. one for read, one for write). Setup a separate io_poll * if this happens. 
*/ if (unlikely(pt->nr_entries)) { - struct io_poll_iocb *first = poll; + struct io_poll *first = poll; /* double add on the same waitqueue head, ignore */ if (first->head == head) @@ -6918,13 +6918,14 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, struct poll_table_struct *p) { struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct io_poll *poll = io_kiocb_to_cmd(pt->req); - __io_queue_proc(&pt->req->poll, pt, head, - (struct io_poll_iocb **) &pt->req->async_data); + __io_queue_proc(poll, pt, head, + (struct io_poll **) &pt->req->async_data); } static int __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll_iocb *poll, + struct io_poll *poll, struct io_poll_table *ipt, __poll_t mask) { struct io_ring_ctx *ctx = req->ctx; @@ -7207,7 +7208,7 @@ static int io_poll_remove_prep(struct io_kiocb *req, static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_poll_iocb *poll = &req->poll; + struct io_poll *poll = io_kiocb_to_cmd(req); u32 flags; if (sqe->buf_index || sqe->off || sqe->addr) @@ -7225,13 +7226,13 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) { - struct io_poll_iocb *poll = &req->poll; + struct io_poll *poll = io_kiocb_to_cmd(req); struct io_poll_table ipt; int ret; ipt.pt._qproc = io_poll_queue_proc; - ret = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events); + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); if (!ret && ipt.error) req_set_fail(req); ret = ret ?: ipt.error; @@ -7260,9 +7261,11 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) if (req->poll_update.update_events || req->poll_update.update_user_data) { /* only mask one event flags, keep behavior flags */ if (req->poll_update.update_events) { - preq->poll.events &= ~0xffff; - preq->poll.events |= req->poll_update.events & 0xffff; - 
preq->poll.events |= IO_POLL_UNMASK; + struct io_poll *poll = io_kiocb_to_cmd(preq); + + poll->events &= ~0xffff; + poll->events |= req->poll_update.events & 0xffff; + poll->events |= IO_POLL_UNMASK; } if (req->poll_update.update_user_data) preq->cqe.user_data = req->poll_update.new_user_data; From 35bc16416609bf8faf5ff3d42bb30170726d9edc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:16:40 -0600 Subject: [PATCH 0933/1250] io_uring: convert poll_update path to use io_cmd_type Remove struct io_poll_update from io_kiocb, and convert the poll update path to use the io_cmd_type approach instead. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7459a535060c33..9114d4a42f2bca 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -976,7 +976,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_poll_update poll_update; struct io_accept accept; struct io_sync sync; struct io_cancel cancel; @@ -7178,7 +7177,7 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, static int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_poll_update *upd = &req->poll_update; + struct io_poll_update *upd = io_kiocb_to_cmd(req); u32 flags; if (sqe->buf_index || sqe->splice_fd_in) @@ -7243,7 +7242,8 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) { - struct io_cancel_data cd = { .data = req->poll_update.old_user_data, }; + struct io_poll_update *poll_update = io_kiocb_to_cmd(req); + struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *preq; int ret2, ret = 0; @@ -7258,17 +7258,17 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) } spin_unlock(&ctx->completion_lock); -
if (req->poll_update.update_events || req->poll_update.update_user_data) { + if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ - if (req->poll_update.update_events) { + if (poll_update->update_events) { struct io_poll *poll = io_kiocb_to_cmd(preq); poll->events &= ~0xffff; - poll->events |= req->poll_update.events & 0xffff; + poll->events |= poll_update->events & 0xffff; poll->events |= IO_POLL_UNMASK; } - if (req->poll_update.update_user_data) - preq->cqe.user_data = req->poll_update.new_user_data; + if (poll_update->update_user_data) + preq->cqe.user_data = poll_update->new_user_data; ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ From b59188be43a90d5e0cddd5cc426a8a6e0e0a158b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:24:42 -0600 Subject: [PATCH 0934/1250] io_uring: remove recvmsg knowledge from io_arm_poll_handler() There's a special case for recvmsg with MSG_ERRQUEUE set. This is problematic as it means the core needs to know about this special request type. For now, just add a generic flag for it. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9114d4a42f2bca..2c061424797d7c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -846,6 +846,7 @@ enum { REQ_F_PARTIAL_IO_BIT, REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, + REQ_F_CLEAR_POLLIN_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -916,6 +917,8 @@ enum { REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), /* ->extra1 and ->extra2 are initialised */ REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), + /* recvmsg special flag, clear EPOLLIN */ + REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), }; struct async_poll { @@ -6145,6 +6148,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (sr->msg_flags & MSG_ERRQUEUE) + req->flags |= REQ_F_CLEAR_POLLIN; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -7023,8 +7028,7 @@ static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) mask |= EPOLLIN | EPOLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if ((req->opcode == IORING_OP_RECVMSG) && - (req->sr_msg.msg_flags & MSG_ERRQUEUE)) + if (req->flags & REQ_F_CLEAR_POLLIN) mask &= ~EPOLLIN; } else { mask |= EPOLLOUT | EPOLLWRNORM; From ffca591a6e3ef263a51ca25ae86dc99c38ce941a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:27:38 -0600 Subject: [PATCH 0935/1250] io_uring: convert net related opcodes to use io_cmd_type This converts accept, connect, send/recv, sendmsg/recvmsg, shutdown, and socket to use io_cmd_type. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 53 +++++++++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2c061424797d7c..6798859c601d65 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,13 +979,10 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_accept accept; struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; struct io_timeout_rem timeout_rem; - struct io_connect connect; - struct io_sr_msg sr_msg; struct io_open open; struct io_close close; struct io_rsrc_update rsrc_update; @@ -995,7 +992,6 @@ struct io_kiocb { struct io_splice splice; struct io_provide_buf pbuf; struct io_statx statx; - struct io_shutdown shutdown; struct io_rename rename; struct io_unlink unlink; struct io_mkdir mkdir; @@ -1003,7 +999,6 @@ struct io_kiocb { struct io_hardlink hardlink; struct io_msg msg; struct io_xattr xattr; - struct io_socket sock; struct io_uring_cmd uring_cmd; }; @@ -5824,16 +5819,19 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; - req->shutdown.how = READ_ONCE(sqe->len); + shutdown->how = READ_ONCE(sqe->len); return 0; } static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) { + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); struct socket *sock; int ret; @@ -5844,7 +5842,7 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - ret = __sys_shutdown_sock(sock, req->shutdown.how); + ret = __sys_shutdown_sock(sock, shutdown->how); io_req_complete(req, ret); return 0; } @@ -5881,10 +5879,12 @@ static int io_setup_async_msg(struct io_kiocb *req, 
static int io_sendmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + iomsg->msg.msg_name = &iomsg->addr; iomsg->free_iov = iomsg->fast_iov; - return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg, - req->sr_msg.msg_flags, &iomsg->free_iov); + return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, + &iomsg->free_iov); } static int io_sendmsg_prep_async(struct io_kiocb *req) @@ -5899,7 +5899,7 @@ static int io_sendmsg_prep_async(struct io_kiocb *req) static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -5923,8 +5923,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct io_async_msghdr iomsg, *kmsg; - struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned flags; int min_ret = 0; @@ -5981,7 +5981,7 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) static int io_send(struct io_kiocb *req, unsigned int issue_flags) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct msghdr msg; struct iovec iov; struct socket *sock; @@ -6039,7 +6039,7 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct iovec __user *uiov; size_t iov_len; int ret; @@ -6072,7 +6072,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct 
compat_iovec __user *uiov; compat_uptr_t ptr; compat_size_t len; @@ -6135,7 +6135,7 @@ static int io_recvmsg_prep_async(struct io_kiocb *req) static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -6161,8 +6161,8 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct io_async_msghdr iomsg, *kmsg; - struct io_sr_msg *sr = &req->sr_msg; struct socket *sock; unsigned int cflags; unsigned flags; @@ -6238,7 +6238,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) static int io_recv(struct io_kiocb *req, unsigned int issue_flags) { - struct io_sr_msg *sr = &req->sr_msg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req); struct msghdr msg; struct socket *sock; struct iovec iov; @@ -6314,7 +6314,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_accept *accept = &req->accept; + struct io_accept *accept = io_kiocb_to_cmd(req); unsigned flags; if (sqe->len || sqe->buf_index) @@ -6348,7 +6348,7 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_accept(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_accept *accept = &req->accept; + struct io_accept *accept = io_kiocb_to_cmd(req); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; bool fixed = !!accept->file_slot; @@ -6413,7 +6413,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_socket *sock = &req->sock; + struct io_socket *sock = io_kiocb_to_cmd(req); if (sqe->addr || sqe->rw_flags || sqe->buf_index) return -EINVAL; @@ -6434,7 +6434,7 @@ static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_socket(struct io_kiocb *req, unsigned int issue_flags) { - struct io_socket *sock = &req->sock; + struct io_socket *sock = io_kiocb_to_cmd(req); bool fixed = !!sock->file_slot; struct file *file; int ret, fd; @@ -6468,14 +6468,14 @@ static int io_socket(struct io_kiocb *req, unsigned int issue_flags) static int io_connect_prep_async(struct io_kiocb *req) { struct io_async_connect *io = req->async_data; - struct io_connect *conn = &req->connect; + struct io_connect *conn = io_kiocb_to_cmd(req); return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); } static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_connect *conn = &req->connect; + struct io_connect *conn = io_kiocb_to_cmd(req); if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; @@ -6487,6 +6487,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_connect(struct io_kiocb *req, unsigned int issue_flags) { + struct io_connect *connect = io_kiocb_to_cmd(req); struct io_async_connect __io, *io; unsigned file_flags; int ret; @@ -6495,8 +6496,8 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { io = req->async_data; } else { - ret = move_addr_to_kernel(req->connect.addr, - req->connect.addr_len, + ret = move_addr_to_kernel(connect->addr, + connect->addr_len, &__io.address); if (ret) goto out; @@ -6506,7 +6507,7 @@ static int io_connect(struct 
io_kiocb *req, unsigned int issue_flags) file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_connect_file(req->file, &io->address, - req->connect.addr_len, file_flags); + connect->addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { if (req_has_async_data(req)) return -EAGAIN; From 206933c3b5fae2ae2b0acf0b1391a21f6b7bf2e8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:30:45 -0600 Subject: [PATCH 0936/1250] io_uring: convert the sync and fallocate paths to use io_cmd_type They all share the same struct io_sync, convert them to use the io_cmd_type approach instead. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6798859c601d65..b59edae29956f2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_sync sync; struct io_cancel cancel; struct io_timeout timeout; struct io_timeout_rem timeout_rem; @@ -5085,30 +5084,32 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sync *sync = io_kiocb_to_cmd(req); + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; - req->sync.flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) + sync->flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) return -EINVAL; - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->len); + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); return 0; } static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) { - loff_t end = req->sync.off + req->sync.len; + struct io_sync *sync = io_kiocb_to_cmd(req); + loff_t end = 
sync->off + sync->len; int ret; /* fsync always requires a blocking context */ if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = vfs_fsync_range(req->file, req->sync.off, - end > 0 ? end : LLONG_MAX, - req->sync.flags & IORING_FSYNC_DATASYNC); + ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, + sync->flags & IORING_FSYNC_DATASYNC); io_req_complete(req, ret); return 0; } @@ -5116,24 +5117,26 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sync *sync = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->addr); - req->sync.mode = READ_ONCE(sqe->len); + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->addr); + sync->mode = READ_ONCE(sqe->len); return 0; } static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sync *sync = io_kiocb_to_cmd(req); int ret; /* fallocate always requiring blocking context */ if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, - req->sync.len); + ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); io_req_complete(req, ret); @@ -5792,25 +5795,27 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_sync *sync = io_kiocb_to_cmd(req); + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; - req->sync.off = READ_ONCE(sqe->off); - req->sync.len = READ_ONCE(sqe->len); - req->sync.flags = READ_ONCE(sqe->sync_range_flags); + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); + sync->flags = READ_ONCE(sqe->sync_range_flags); return 0; } static int 
io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) { + struct io_sync *sync = io_kiocb_to_cmd(req); int ret; /* sync_file_range always requires a blocking context */ if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = sync_file_range(req->file, req->sync.off, req->sync.len, - req->sync.flags); + ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_complete(req, ret); return 0; } From 70c0c7cc2c4435953d5b2130c6612e45840091d1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:33:01 -0600 Subject: [PATCH 0937/1250] io_uring: convert cancel path to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b59edae29956f2..ef7b5430bb14bb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_cancel cancel; struct io_timeout timeout; struct io_timeout_rem timeout_rem; struct io_open open; @@ -7699,19 +7698,21 @@ static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) static int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_cancel *cancel = io_kiocb_to_cmd(req); + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sqe->off || sqe->len || sqe->splice_fd_in) return -EINVAL; - req->cancel.addr = READ_ONCE(sqe->addr); - req->cancel.flags = READ_ONCE(sqe->cancel_flags); - if (req->cancel.flags & ~CANCEL_FLAGS) + cancel->addr = READ_ONCE(sqe->addr); + cancel->flags = READ_ONCE(sqe->cancel_flags); + if (cancel->flags & ~CANCEL_FLAGS) return -EINVAL; - if (req->cancel.flags & IORING_ASYNC_CANCEL_FD) { - if (req->cancel.flags & IORING_ASYNC_CANCEL_ANY) + if (cancel->flags & IORING_ASYNC_CANCEL_FD) { + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) return -EINVAL; - req->cancel.fd = 
READ_ONCE(sqe->fd); + cancel->fd = READ_ONCE(sqe->fd); } return 0; @@ -7753,20 +7754,21 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { + struct io_cancel *cancel = io_kiocb_to_cmd(req); struct io_cancel_data cd = { .ctx = req->ctx, - .data = req->cancel.addr, - .flags = req->cancel.flags, + .data = cancel->addr, + .flags = cancel->flags, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, req->cancel.fd, + req->file = io_file_get_fixed(req, cancel->fd, issue_flags); else - req->file = io_file_get_normal(req, req->cancel.fd); + req->file = io_file_get_normal(req, cancel->fd); if (!req->file) { ret = -EBADF; goto done; From ae28dd1c71d119e0990c4ea57dad8a275c33fb05 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:45:22 -0600 Subject: [PATCH 0938/1250] io_uring: convert timeout path to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 117 +++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 49 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ef7b5430bb14bb..d0daa65f7138cc 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,8 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_timeout timeout; - struct io_timeout_rem timeout_rem; struct io_open open; struct io_close close; struct io_rsrc_update rsrc_update; @@ -1652,7 +1650,9 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) static inline bool io_is_timeout_noseq(struct io_kiocb *req) { - return !req->timeout.off; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + + return !timeout->off; } static __cold void io_fallback_req_func(struct work_struct *work) @@ -1898,11 +1898,13 @@ static void io_kill_timeout(struct io_kiocb *req, int 
status) struct io_timeout_data *io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) != -1) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); + if (status) req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&req->timeout.list); + list_del_init(&timeout->list); io_req_tw_post_queue(req, status, 0); } } @@ -1925,10 +1927,11 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - struct io_kiocb *req, *tmp; + struct io_timeout *timeout, *tmp; spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); u32 events_needed, events_got; if (io_is_timeout_noseq(req)) @@ -1941,7 +1944,7 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) * these subtractions won't have wrapped, so we can check if * target is in [last_seq, current_seq] by comparing the two. 
*/ - events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; + events_needed = timeout->target_seq - ctx->cq_last_tm_flush; events_got = seq - ctx->cq_last_tm_flush; if (events_got < events_needed) break; @@ -2546,11 +2549,12 @@ static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { struct io_timeout_data *io = link->async_data; + struct io_timeout *timeout = io_kiocb_to_cmd(link); io_remove_next_linked(req); - link->timeout.head = NULL; + timeout->head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { - list_del(&link->timeout.list); + list_del(&timeout->list); return link; } } @@ -7302,11 +7306,12 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; unsigned long flags; spin_lock_irqsave(&ctx->timeout_lock, flags); - list_del_init(&req->timeout.list); + list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); spin_unlock_irqrestore(&ctx->timeout_lock, flags); @@ -7324,29 +7329,32 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, struct io_cancel_data *cd) __must_hold(&ctx->timeout_lock) { + struct io_timeout *timeout; struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->timeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - list_for_each_entry(req, &ctx->timeout_list, timeout.list) { if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != req->cqe.user_data) + cd->data != tmp->cqe.user_data) continue; if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) + if (cd->seq == tmp->work.cancel_seq) continue; - req->work.cancel_seq = 
cd->seq; + tmp->work.cancel_seq = cd->seq; } - found = true; + req = tmp; break; } - if (!found) + if (!req) return ERR_PTR(-ENOENT); io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return ERR_PTR(-EALREADY); - list_del_init(&req->timeout.list); + timeout = io_kiocb_to_cmd(req); + list_del_init(&timeout->list); return req; } @@ -7386,15 +7394,18 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; - struct io_kiocb *req; - bool found = false; + struct io_timeout *timeout; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->ltimeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { - found = user_data == req->cqe.user_data; - if (found) + if (user_data == tmp->cqe.user_data) { + req = tmp; break; + } } - if (!found) + if (!req) return -ENOENT; io = req->async_data; @@ -7412,14 +7423,15 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, { struct io_cancel_data cd = { .data = user_data, }; struct io_kiocb *req = io_timeout_extract(ctx, &cd); + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_timeout_data *data; if (IS_ERR(req)) return PTR_ERR(req); - req->timeout.off = 0; /* noseq */ + timeout->off = 0; /* noseq */ data = req->async_data; - list_add_tail(&req->timeout.list, &ctx->timeout_list); + list_add_tail(&timeout->list, &ctx->timeout_list); hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); @@ -7429,7 +7441,7 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, static int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_timeout_rem *tr = &req->timeout_rem; + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); if (unlikely(req->flags & (REQ_F_FIXED_FILE | 
REQ_F_BUFFER_SELECT))) return -EINVAL; @@ -7469,11 +7481,11 @@ static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) */ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) { - struct io_timeout_rem *tr = &req->timeout_rem; + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; int ret; - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { struct io_cancel_data cd = { .data = tr->addr, }; spin_lock(&ctx->completion_lock); @@ -7500,6 +7512,7 @@ static int __io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool is_timeout_link) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_timeout_data *data; unsigned flags; u32 off = READ_ONCE(sqe->off); @@ -7516,8 +7529,8 @@ static int __io_timeout_prep(struct io_kiocb *req, if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; - INIT_LIST_HEAD(&req->timeout.list); - req->timeout.off = off; + INIT_LIST_HEAD(&timeout->list); + timeout->off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; @@ -7536,7 +7549,7 @@ static int __io_timeout_prep(struct io_kiocb *req, if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; - INIT_LIST_HEAD(&req->timeout.list); + INIT_LIST_HEAD(&timeout->list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); @@ -7547,7 +7560,7 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; if (link->last->opcode == IORING_OP_LINK_TIMEOUT) return -EINVAL; - req->timeout.head = link->last; + timeout->head = link->last; link->last->flags |= REQ_F_ARM_LTIMEOUT; } return 0; @@ -7567,10 +7580,11 @@ static int io_link_timeout_prep(struct io_kiocb *req, static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = 
req->ctx; struct io_timeout_data *data = req->async_data; struct list_head *entry; - u32 tail, off = req->timeout.off; + u32 tail, off = timeout->off; spin_lock_irq(&ctx->timeout_lock); @@ -7585,7 +7599,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) } tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - req->timeout.target_seq = tail + off; + timeout->target_seq = tail + off; /* Update the last seq here in case io_flush_timeouts() hasn't. * This is safe because ->completion_lock is held, and submissions @@ -7598,17 +7612,17 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) * the one we need first. */ list_for_each_prev(entry, &ctx->timeout_list) { - struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, - timeout.list); + struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); + struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); if (io_is_timeout_noseq(nxt)) continue; /* nxt.seq is behind @tail, otherwise would've been completed */ - if (off >= nxt->timeout.target_seq - tail) + if (off >= nextt->target_seq - tail) break; } add: - list_add(&req->timeout.list, entry); + list_add(&timeout->list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->timeout_lock); @@ -8198,7 +8212,8 @@ static struct file *io_file_get_normal(struct io_kiocb *req, int fd) static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { - struct io_kiocb *prev = req->timeout.prev; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; if (prev) { @@ -8222,12 +8237,13 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_timeout_data *data = container_of(timer, struct io_timeout_data, timer); struct io_kiocb *prev, *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; unsigned long 
flags; spin_lock_irqsave(&ctx->timeout_lock, flags); - prev = req->timeout.head; - req->timeout.head = NULL; + prev = timeout->head; + timeout->head = NULL; /* * We don't expect the list to be empty, that will only happen if we @@ -8238,8 +8254,8 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } - list_del(&req->timeout.list); - req->timeout.prev = prev; + list_del(&timeout->list); + timeout->prev = prev; spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; @@ -8249,6 +8265,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) static void io_queue_linked_timeout(struct io_kiocb *req) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; spin_lock_irq(&ctx->timeout_lock); @@ -8256,13 +8273,13 @@ static void io_queue_linked_timeout(struct io_kiocb *req) * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer */ - if (req->timeout.head) { + if (timeout->head) { struct io_timeout_data *data = req->async_data; data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - list_add_tail(&req->timeout.list, &ctx->ltimeout_list); + list_add_tail(&timeout->list, &ctx->ltimeout_list); } spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ @@ -10930,12 +10947,14 @@ static __cold void io_ring_exit_work(struct work_struct *work) static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) { - struct io_kiocb *req, *tmp; + struct io_timeout *timeout, *tmp; int canceled = 0; spin_lock(&ctx->completion_lock); spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); + if 
(io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; From b6c2e5ee093c2ad9ec9d9a46ea348da1629b34bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:49:25 -0600 Subject: [PATCH 0939/1250] io_uring: convert open/close path to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 72 +++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d0daa65f7138cc..65eb41a60d741d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,8 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_open open; - struct io_close close; struct io_rsrc_update rsrc_update; struct io_fadvise fadvise; struct io_madvise madvise; @@ -5148,6 +5146,7 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_open *open = io_kiocb_to_cmd(req); const char __user *fname; int ret; @@ -5157,38 +5156,40 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return -EBADF; /* open.how should be already initialised */ - if (!(req->open.how.flags & O_PATH) && force_o_largefile()) - req->open.how.flags |= O_LARGEFILE; + if (!(open->how.flags & O_PATH) && force_o_largefile()) + open->how.flags |= O_LARGEFILE; - req->open.dfd = READ_ONCE(sqe->fd); + open->dfd = READ_ONCE(sqe->fd); fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->open.filename = getname(fname); - if (IS_ERR(req->open.filename)) { - ret = PTR_ERR(req->open.filename); - req->open.filename = NULL; + open->filename = getname(fname); + if (IS_ERR(open->filename)) { + ret = PTR_ERR(open->filename); + open->filename = NULL; return ret; } - req->open.file_slot = READ_ONCE(sqe->file_index); - if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) + open->file_slot = READ_ONCE(sqe->file_index); + if 
(open->file_slot && (open->how.flags & O_CLOEXEC)) return -EINVAL; - req->open.nofile = rlimit(RLIMIT_NOFILE); + open->nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; } static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_open *open = io_kiocb_to_cmd(req); u64 mode = READ_ONCE(sqe->len); u64 flags = READ_ONCE(sqe->open_flags); - req->open.how = build_open_how(flags, mode); + open->how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_open *open = io_kiocb_to_cmd(req); struct open_how __user *how; size_t len; int ret; @@ -5198,8 +5199,7 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (len < OPEN_HOW_SIZE_VER0) return -EINVAL; - ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how, - len); + ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); if (ret) return ret; @@ -5261,35 +5261,36 @@ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { + struct io_open *open = io_kiocb_to_cmd(req); struct open_flags op; struct file *file; bool resolve_nonblock, nonblock_set; - bool fixed = !!req->open.file_slot; + bool fixed = !!open->file_slot; int ret; - ret = build_open_flags(&req->open.how, &op); + ret = build_open_flags(&open->how, &op); if (ret) goto err; nonblock_set = op.open_flag & O_NONBLOCK; - resolve_nonblock = req->open.how.resolve & RESOLVE_CACHED; + resolve_nonblock = open->how.resolve & RESOLVE_CACHED; if (issue_flags & IO_URING_F_NONBLOCK) { /* * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, * it'll always -EAGAIN */ - if (req->open.how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) return -EAGAIN; op.lookup_flags |= LOOKUP_CACHED; op.open_flag |= 
O_NONBLOCK; } if (!fixed) { - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); + ret = __get_unused_fd_flags(open->how.flags, open->nofile); if (ret < 0) goto err; } - file = do_filp_open(req->open.dfd, req->open.filename, &op); + file = do_filp_open(open->dfd, open->filename, &op); if (IS_ERR(file)) { /* * We could hang on to this 'fd' on retrying, but seems like @@ -5315,9 +5316,9 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) fd_install(ret, file); else ret = io_fixed_fd_install(req, issue_flags, file, - req->open.file_slot); + open->file_slot); err: - putname(req->open.filename); + putname(open->filename); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); @@ -5737,14 +5738,16 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_close *close = io_kiocb_to_cmd(req); + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; - req->close.fd = READ_ONCE(sqe->fd); - req->close.file_slot = READ_ONCE(sqe->file_index); - if (req->close.file_slot && req->close.fd) + close->fd = READ_ONCE(sqe->fd); + close->file_slot = READ_ONCE(sqe->file_index); + if (close->file_slot && close->fd) return -EINVAL; return 0; @@ -5753,12 +5756,12 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_close(struct io_kiocb *req, unsigned int issue_flags) { struct files_struct *files = current->files; - struct io_close *close = &req->close; + struct io_close *close = io_kiocb_to_cmd(req); struct fdtable *fdt; struct file *file; int ret = -EBADF; - if (req->close.file_slot) { + if (close->file_slot) { ret = io_close_fixed(req, issue_flags); goto err; } @@ -7982,10 +7985,13 @@ static void io_clean_op(struct io_kiocb *req) break; } case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: - if 
(req->open.filename) - putname(req->open.filename); + case IORING_OP_OPENAT2: { + struct io_open *open = io_kiocb_to_cmd(req); + + if (open->filename) + putname(open->filename); break; + } case IORING_OP_RENAMEAT: putname(req->rename.oldpath); putname(req->rename.newpath); @@ -9831,7 +9837,9 @@ static int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { - return __io_close_fixed(req, issue_flags, req->close.file_slot - 1); + struct io_close *close = io_kiocb_to_cmd(req); + + return __io_close_fixed(req, issue_flags, close->file_slot - 1); } static int __io_sqe_files_update(struct io_ring_ctx *ctx, From 318ad765e898811bd0a23e204e150948078cb1b9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:51:05 -0600 Subject: [PATCH 0940/1250] io_uring: convert madvise/fadvise to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 65eb41a60d741d..60d462d7c8479b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,8 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_fadvise fadvise; - struct io_madvise madvise; struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; @@ -5629,12 +5627,14 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; - req->madvise.addr = READ_ONCE(sqe->addr); - req->madvise.len = READ_ONCE(sqe->len); - req->madvise.advice = READ_ONCE(sqe->fadvise_advice); + ma->addr = READ_ONCE(sqe->addr); + ma->len = 
READ_ONCE(sqe->len); + ma->advice = READ_ONCE(sqe->fadvise_advice); return 0; #else return -EOPNOTSUPP; @@ -5644,7 +5644,7 @@ static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = &req->madvise; + struct io_madvise *ma = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -5660,18 +5660,20 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_fadvise *fa = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; - req->fadvise.offset = READ_ONCE(sqe->off); - req->fadvise.len = READ_ONCE(sqe->len); - req->fadvise.advice = READ_ONCE(sqe->fadvise_advice); + fa->offset = READ_ONCE(sqe->off); + fa->len = READ_ONCE(sqe->len); + fa->advice = READ_ONCE(sqe->fadvise_advice); return 0; } static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) { - struct io_fadvise *fa = &req->fadvise; + struct io_fadvise *fa = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) { From 70ebe5d44e3d9ca6ebd4f8964874c981b680b828 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 09:59:28 -0600 Subject: [PATCH 0941/1250] io_uring: convert file system request types to use io_cmd_type This converts statx, rename, unlink, mkdir, symlink, and hardlink to use io_cmd_type. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 102 +++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 45 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 60d462d7c8479b..38c34da088b5ac 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -983,12 +983,6 @@ struct io_kiocb { struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; - struct io_statx statx; - struct io_rename rename; - struct io_unlink unlink; - struct io_mkdir mkdir; - struct io_symlink symlink; - struct io_hardlink hardlink; struct io_msg msg; struct io_xattr xattr; struct io_uring_cmd uring_cmd; @@ -4367,7 +4361,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) static int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_rename *ren = &req->rename; + struct io_rename *ren = io_kiocb_to_cmd(req); const char __user *oldf, *newf; if (sqe->buf_index || sqe->splice_fd_in) @@ -4397,7 +4391,7 @@ static int io_renameat_prep(struct io_kiocb *req, static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_rename *ren = &req->rename; + struct io_rename *ren = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4654,7 +4648,7 @@ static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_unlink *un = &req->unlink; + struct io_unlink *un = io_kiocb_to_cmd(req); const char __user *fname; if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) @@ -4679,7 +4673,7 @@ static int io_unlinkat_prep(struct io_kiocb *req, static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_unlink *un = &req->unlink; + struct io_unlink *un = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4698,7 +4692,7 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) 
static int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_mkdir *mkd = &req->mkdir; + struct io_mkdir *mkd = io_kiocb_to_cmd(req); const char __user *fname; if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4720,7 +4714,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_mkdir *mkd = &req->mkdir; + struct io_mkdir *mkd = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4736,7 +4730,7 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) static int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_symlink *sl = &req->symlink; + struct io_symlink *sl = io_kiocb_to_cmd(req); const char __user *oldpath, *newpath; if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4764,7 +4758,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_symlink *sl = &req->symlink; + struct io_symlink *sl = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4780,7 +4774,7 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) static int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_hardlink *lnk = &req->hardlink; + struct io_hardlink *lnk = io_kiocb_to_cmd(req); const char __user *oldf, *newf; if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4810,7 +4804,7 @@ static int io_linkat_prep(struct io_kiocb *req, static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_hardlink *lnk = &req->hardlink; + struct io_hardlink *lnk = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -5696,6 +5690,7 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) static int io_statx_prep(struct io_kiocb *req, const struct 
io_uring_sqe *sqe) { + struct io_statx *sx = io_kiocb_to_cmd(req); const char __user *path; if (sqe->buf_index || sqe->splice_fd_in) @@ -5703,20 +5698,20 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->flags & REQ_F_FIXED_FILE) return -EBADF; - req->statx.dfd = READ_ONCE(sqe->fd); - req->statx.mask = READ_ONCE(sqe->len); + sx->dfd = READ_ONCE(sqe->fd); + sx->mask = READ_ONCE(sqe->len); path = u64_to_user_ptr(READ_ONCE(sqe->addr)); - req->statx.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - req->statx.flags = READ_ONCE(sqe->statx_flags); + sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + sx->flags = READ_ONCE(sqe->statx_flags); - req->statx.filename = getname_flags(path, - getname_statx_lookup_flags(req->statx.flags), - NULL); + sx->filename = getname_flags(path, + getname_statx_lookup_flags(sx->flags), + NULL); - if (IS_ERR(req->statx.filename)) { - int ret = PTR_ERR(req->statx.filename); + if (IS_ERR(sx->filename)) { + int ret = PTR_ERR(sx->filename); - req->statx.filename = NULL; + sx->filename = NULL; return ret; } @@ -5726,14 +5721,13 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_statx(struct io_kiocb *req, unsigned int issue_flags) { - struct io_statx *ctx = &req->statx; + struct io_statx *sx = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) return -EAGAIN; - ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask, - ctx->buffer); + ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_complete(req, ret); return 0; } @@ -7994,28 +7988,46 @@ static void io_clean_op(struct io_kiocb *req) putname(open->filename); break; } - case IORING_OP_RENAMEAT: - putname(req->rename.oldpath); - putname(req->rename.newpath); + case IORING_OP_RENAMEAT: { + struct io_rename *ren = io_kiocb_to_cmd(req); + + putname(ren->oldpath); + putname(ren->newpath); break; - case IORING_OP_UNLINKAT: - putname(req->unlink.filename); + } + 
case IORING_OP_UNLINKAT: { + struct io_unlink *ul = io_kiocb_to_cmd(req); + + putname(ul->filename); break; - case IORING_OP_MKDIRAT: - putname(req->mkdir.filename); + } + case IORING_OP_MKDIRAT: { + struct io_mkdir *md = io_kiocb_to_cmd(req); + + putname(md->filename); break; - case IORING_OP_SYMLINKAT: - putname(req->symlink.oldpath); - putname(req->symlink.newpath); + } + case IORING_OP_SYMLINKAT: { + struct io_symlink *sl = io_kiocb_to_cmd(req); + + putname(sl->oldpath); + putname(sl->newpath); break; - case IORING_OP_LINKAT: - putname(req->hardlink.oldpath); - putname(req->hardlink.newpath); + } + case IORING_OP_LINKAT: { + struct io_hardlink *hl = io_kiocb_to_cmd(req); + + putname(hl->oldpath); + putname(hl->newpath); break; - case IORING_OP_STATX: - if (req->statx.filename) - putname(req->statx.filename); + } + case IORING_OP_STATX: { + struct io_statx *sx = io_kiocb_to_cmd(req); + + if (sx->filename) + putname(sx->filename); break; + } case IORING_OP_SETXATTR: case IORING_OP_FSETXATTR: case IORING_OP_GETXATTR: From c08c06be76290b43aca854e03d1a043dac68b3f1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:01:09 -0600 Subject: [PATCH 0942/1250] io_uring: convert epoll to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 38c34da088b5ac..bcdc6ed7f46bb7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,7 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_epoll epoll; struct io_splice splice; struct io_provide_buf pbuf; struct io_msg msg; @@ -5577,18 +5576,20 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) + struct io_epoll *epoll = io_kiocb_to_cmd(req); + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; - req->epoll.epfd = READ_ONCE(sqe->fd); - 
req->epoll.op = READ_ONCE(sqe->len); - req->epoll.fd = READ_ONCE(sqe->off); + epoll->epfd = READ_ONCE(sqe->fd); + epoll->op = READ_ONCE(sqe->len); + epoll->fd = READ_ONCE(sqe->off); - if (ep_op_has_event(req->epoll.op)) { + if (ep_op_has_event(epoll->op)) { struct epoll_event __user *ev; ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (copy_from_user(&req->epoll.event, ev, sizeof(*ev))) + if (copy_from_user(&epoll->event, ev, sizeof(*ev))) return -EFAULT; } @@ -5601,7 +5602,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) { #if defined(CONFIG_EPOLL) - struct io_epoll *ie = &req->epoll; + struct io_epoll *ie = io_kiocb_to_cmd(req); int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; From 6048498179fc8ac018c4bc90fea61eabf655dd28 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:01:47 -0600 Subject: [PATCH 0943/1250] io_uring: convert splice to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bcdc6ed7f46bb7..d0251d0744494d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,7 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_splice splice; struct io_provide_buf pbuf; struct io_msg msg; struct io_xattr xattr; @@ -4918,7 +4917,7 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) static int __io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice *sp = &req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; sp->len = READ_ONCE(sqe->len); @@ -4939,7 +4938,7 @@ static int io_tee_prep(struct io_kiocb *req, static int io_tee(struct io_kiocb *req, unsigned int issue_flags) { - struct io_splice *sp = &req->splice; + struct 
io_splice *sp = io_kiocb_to_cmd(req); struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; struct file *in; @@ -4971,7 +4970,7 @@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_splice *sp = &req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); sp->off_in = READ_ONCE(sqe->splice_off_in); sp->off_out = READ_ONCE(sqe->off); @@ -4980,7 +4979,7 @@ static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_splice(struct io_kiocb *req, unsigned int issue_flags) { - struct io_splice *sp = &req->splice; + struct io_splice *sp = io_kiocb_to_cmd(req); struct file *out = sp->file_out; unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; loff_t *poff_in, *poff_out; From 4d856c254feeeeac0aa8edfcc929c8d1be629e8b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:03:49 -0600 Subject: [PATCH 0944/1250] io_uring: convert msg and nop to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d0251d0744494d..41253ef58ada7e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -980,8 +980,6 @@ struct io_kiocb { struct file *file; struct io_cmd_data cmd; struct io_rsrc_update rsrc_update; - struct io_provide_buf pbuf; - struct io_msg msg; struct io_xattr xattr; struct io_uring_cmd uring_cmd; }; @@ -5030,19 +5028,21 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_msg *msg = io_kiocb_to_cmd(req); + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || sqe->buf_index || sqe->personality)) return -EINVAL; - req->msg.user_data = READ_ONCE(sqe->off); - req->msg.len = READ_ONCE(sqe->len); + msg->user_data = 
READ_ONCE(sqe->off); + msg->len = READ_ONCE(sqe->len); return 0; } static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { + struct io_msg *msg = io_kiocb_to_cmd(req); struct io_ring_ctx *target_ctx; - struct io_msg *msg = &req->msg; bool filled; int ret; @@ -5324,7 +5324,7 @@ static int io_openat(struct io_kiocb *req, unsigned int issue_flags) static int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); u64 tmp; if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || @@ -5381,7 +5381,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) { - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; @@ -5409,7 +5409,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); u64 tmp; if (sqe->rw_flags || sqe->splice_fd_in) @@ -5528,7 +5528,7 @@ static __cold int io_init_bl_list(struct io_ring_ctx *ctx) static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) { - struct io_provide_buf *p = &req->pbuf; + struct io_provide_buf *p = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; int ret = 0; From 23406d4bf843efbedda70a64bf2cba706403a95a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:05:49 -0600 Subject: [PATCH 0945/1250] io_uring: convert rsrc_update to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 41253ef58ada7e..be8da26f70d6d6 100644 --- 
a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_rsrc_update rsrc_update; struct io_xattr xattr; struct io_uring_cmd uring_cmd; }; @@ -7800,23 +7799,26 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; if (sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; - req->rsrc_update.offset = READ_ONCE(sqe->off); - req->rsrc_update.nr_args = READ_ONCE(sqe->len); - if (!req->rsrc_update.nr_args) + up->offset = READ_ONCE(sqe->off); + up->nr_args = READ_ONCE(sqe->len); + if (!up->nr_args) return -EINVAL; - req->rsrc_update.arg = READ_ONCE(sqe->addr); + up->arg = READ_ONCE(sqe->addr); return 0; } static int io_files_update_with_index_alloc(struct io_kiocb *req, unsigned int issue_flags) { - __s32 __user *fds = u64_to_user_ptr(req->rsrc_update.arg); + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + __s32 __user *fds = u64_to_user_ptr(up->arg); unsigned int done; struct file *file; int ret, fd; @@ -7824,7 +7826,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, if (!req->ctx->file_data) return -ENXIO; - for (done = 0; done < req->rsrc_update.nr_args; done++) { + for (done = 0; done < up->nr_args; done++) { if (copy_from_user(&fd, &fds[done], sizeof(fd))) { ret = -EFAULT; break; @@ -7853,23 +7855,24 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rsrc_update *up = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update2 up; + struct io_uring_rsrc_update2 up2; int ret; - up.offset = req->rsrc_update.offset; - up.data = req->rsrc_update.arg; - up.nr = 0; - up.tags = 0; - 
up.resv = 0; - up.resv2 = 0; + up2.offset = up->offset; + up2.data = up->arg; + up2.nr = 0; + up2.tags = 0; + up2.resv = 0; + up2.resv2 = 0; - if (req->rsrc_update.offset == IORING_FILE_INDEX_ALLOC) { + if (up->offset == IORING_FILE_INDEX_ALLOC) { ret = io_files_update_with_index_alloc(req, issue_flags); } else { io_ring_submit_lock(ctx, issue_flags); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up, req->rsrc_update.nr_args); + &up2, up->nr_args); io_ring_submit_unlock(ctx, issue_flags); } From 2736926825b9858eeeab03df03274b28b89ba033 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:06:46 -0600 Subject: [PATCH 0946/1250] io_uring: convert xattr to use io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index be8da26f70d6d6..0bb3c63f38696c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_xattr xattr; struct io_uring_cmd uring_cmd; }; @@ -4402,7 +4401,7 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) static inline void __io_xattr_finish(struct io_kiocb *req) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); if (ix->filename) putname(ix->filename); @@ -4422,7 +4421,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) static int __io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *name; int ret; @@ -4465,7 +4464,7 @@ static int io_fgetxattr_prep(struct io_kiocb *req, static int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *path; int ret; @@ -4486,7 +4485,7 @@ static int 
io_getxattr_prep(struct io_kiocb *req, static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4502,7 +4501,7 @@ static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int ret; @@ -4531,7 +4530,7 @@ static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) static int __io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *name; int ret; @@ -4562,7 +4561,7 @@ static int __io_setxattr_prep(struct io_kiocb *req, static int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); const char __user *path; int ret; @@ -4590,7 +4589,7 @@ static int io_fsetxattr_prep(struct io_kiocb *req, static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, struct path *path) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); int ret; ret = mnt_want_write(path->mnt); @@ -4617,7 +4616,7 @@ static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) { - struct io_xattr *ix = &req->xattr; + struct io_xattr *ix = io_kiocb_to_cmd(req); unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; int ret; From 584c06fa3cad7eb6d62e0648baa049879675eb8d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:09:32 -0600 Subject: [PATCH 0947/1250] io_uring: convert iouring_cmd to io_cmd_type Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 
++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0bb3c63f38696c..21246d2e6221ff 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -979,7 +979,6 @@ struct io_kiocb { */ struct file *file; struct io_cmd_data cmd; - struct io_uring_cmd uring_cmd; }; u8 opcode; @@ -4814,15 +4813,17 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { - req->uring_cmd.task_work_cb(&req->uring_cmd); + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + + ioucmd->task_work_cb(ioucmd); } void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *)) { - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - req->uring_cmd.task_work_cb = task_work_cb; + ioucmd->task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; io_req_task_work_add(req); } @@ -4842,7 +4843,7 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, */ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) { - struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd); + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); if (ret < 0) req_set_fail(req); @@ -4855,18 +4856,19 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_done); static int io_uring_cmd_prep_async(struct io_kiocb *req) { + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); size_t cmd_size; cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); - memcpy(req->async_data, req->uring_cmd.cmd, cmd_size); + memcpy(req->async_data, ioucmd->cmd, cmd_size); return 0; } static int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); if (sqe->rw_flags || sqe->__pad1) return -EINVAL; @@ -4877,7 
+4879,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req, static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) { - struct io_uring_cmd *ioucmd = &req->uring_cmd; + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; struct file *file = req->file; int ret; From 6273e727d185763d5f59f89e814174eada645913 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:19:47 -0600 Subject: [PATCH 0948/1250] io_uring: unify struct io_symlink and io_hardlink They are really just a subset of each other, just use the one type. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 27 +++++++-------------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 21246d2e6221ff..808acb854b660d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -761,14 +761,7 @@ struct io_mkdir { struct filename *filename; }; -struct io_symlink { - struct file *file; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; -}; - -struct io_hardlink { +struct io_link { struct file *file; int old_dfd; int new_dfd; @@ -4723,7 +4716,7 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) static int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_symlink *sl = io_kiocb_to_cmd(req); + struct io_link *sl = io_kiocb_to_cmd(req); const char __user *oldpath, *newpath; if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4751,7 +4744,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_symlink *sl = io_kiocb_to_cmd(req); + struct io_link *sl = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -4767,7 +4760,7 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) static int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct 
io_hardlink *lnk = io_kiocb_to_cmd(req); + struct io_link *lnk = io_kiocb_to_cmd(req); const char __user *oldf, *newf; if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) @@ -4797,7 +4790,7 @@ static int io_linkat_prep(struct io_kiocb *req, static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { - struct io_hardlink *lnk = io_kiocb_to_cmd(req); + struct io_link *lnk = io_kiocb_to_cmd(req); int ret; if (issue_flags & IO_URING_F_NONBLOCK) @@ -8011,15 +8004,9 @@ static void io_clean_op(struct io_kiocb *req) putname(md->filename); break; } - case IORING_OP_SYMLINKAT: { - struct io_symlink *sl = io_kiocb_to_cmd(req); - - putname(sl->oldpath); - putname(sl->newpath); - break; - } + case IORING_OP_SYMLINKAT: case IORING_OP_LINKAT: { - struct io_hardlink *hl = io_kiocb_to_cmd(req); + struct io_link *hl = io_kiocb_to_cmd(req); putname(hl->oldpath); putname(hl->newpath); From 24210d3b02f920fab6474e1aea59301029133453 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:26:28 -0600 Subject: [PATCH 0949/1250] io_uring: define a request type cleanup handler This can move request type specific cleanup into a private handler, removing the need for the core io_uring parts to know what types they are dealing with. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 155 ++++++++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 69 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 808acb854b660d..75d8c31a59d50e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1093,6 +1093,7 @@ struct io_op_def { int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); int (*issue)(struct io_kiocb *, unsigned int); int (*prep_async)(struct io_kiocb *); + void (*cleanup)(struct io_kiocb *); }; static const struct io_op_def io_op_defs[]; @@ -3433,6 +3434,13 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static void io_readv_writev_cleanup(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + kfree(io->free_iovec); +} + static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) { switch (ret) { @@ -4391,7 +4399,15 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static inline void __io_xattr_finish(struct io_kiocb *req) +static void io_renameat_cleanup(struct io_kiocb *req) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + + putname(ren->oldpath); + putname(ren->newpath); +} + +static inline void io_xattr_cleanup(struct io_kiocb *req) { struct io_xattr *ix = io_kiocb_to_cmd(req); @@ -4406,7 +4422,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) { req->flags &= ~REQ_F_NEED_CLEANUP; - __io_xattr_finish(req); + io_xattr_cleanup(req); io_req_complete(req, ret); } @@ -4675,6 +4691,13 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_unlinkat_cleanup(struct io_kiocb *req) +{ + struct io_unlink *ul = io_kiocb_to_cmd(req); + + putname(ul->filename); +} + static int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4713,6 +4736,13 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void 
io_mkdirat_cleanup(struct io_kiocb *req) +{ + struct io_mkdir *md = io_kiocb_to_cmd(req); + + putname(md->filename); +} + static int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4804,6 +4834,14 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_link_cleanup(struct io_kiocb *req) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + + putname(sl->oldpath); + putname(sl->newpath); +} + static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); @@ -5314,6 +5352,14 @@ static int io_openat(struct io_kiocb *req, unsigned int issue_flags) return io_openat2(req, issue_flags); } +static void io_open_cleanup(struct io_kiocb *req) +{ + struct io_open *open = io_kiocb_to_cmd(req); + + if (open->filename) + putname(open->filename); +} + static int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -5725,6 +5771,14 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static void io_statx_cleanup(struct io_kiocb *req) +{ + struct io_statx *sx = io_kiocb_to_cmd(req); + + if (sx->filename) + putname(sx->filename); +} + static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_close *close = io_kiocb_to_cmd(req); @@ -5897,6 +5951,13 @@ static int io_sendmsg_prep_async(struct io_kiocb *req) return ret; } +static void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) +{ + struct io_async_msghdr *io = req->async_data; + + kfree(io->free_iov); +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -7958,74 +8019,10 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_NEED_CLEANUP) { - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - case 
IORING_OP_WRITE: { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); - break; - } - case IORING_OP_RECVMSG: - case IORING_OP_SENDMSG: { - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); - break; - } - case IORING_OP_OPENAT: - case IORING_OP_OPENAT2: { - struct io_open *open = io_kiocb_to_cmd(req); - - if (open->filename) - putname(open->filename); - break; - } - case IORING_OP_RENAMEAT: { - struct io_rename *ren = io_kiocb_to_cmd(req); - - putname(ren->oldpath); - putname(ren->newpath); - break; - } - case IORING_OP_UNLINKAT: { - struct io_unlink *ul = io_kiocb_to_cmd(req); - - putname(ul->filename); - break; - } - case IORING_OP_MKDIRAT: { - struct io_mkdir *md = io_kiocb_to_cmd(req); - - putname(md->filename); - break; - } - case IORING_OP_SYMLINKAT: - case IORING_OP_LINKAT: { - struct io_link *hl = io_kiocb_to_cmd(req); - - putname(hl->oldpath); - putname(hl->newpath); - break; - } - case IORING_OP_STATX: { - struct io_statx *sx = io_kiocb_to_cmd(req); + const struct io_op_def *def = &io_op_defs[req->opcode]; - if (sx->filename) - putname(sx->filename); - break; - } - case IORING_OP_SETXATTR: - case IORING_OP_FSETXATTR: - case IORING_OP_GETXATTR: - case IORING_OP_FGETXATTR: - __io_xattr_finish(req); - break; - } + if (def->cleanup) + def->cleanup(req); } if ((req->flags & REQ_F_POLLED) && req->apoll) { kfree(req->apoll->double_poll); @@ -12838,6 +12835,7 @@ static const struct io_op_def io_op_defs[] = { .prep = io_prep_rw, .issue = io_read, .prep_async = io_readv_prep_async, + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -12852,6 +12850,7 @@ static const struct io_op_def io_op_defs[] = { .prep = io_prep_rw, .issue = io_write, .prep_async = io_writev_prep_async, + .cleanup = io_readv_writev_cleanup, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -12911,6 +12910,9 @@ static const struct io_op_def io_op_defs[] = { .prep = io_sendmsg_prep, .issue = io_sendmsg, .prep_async = 
io_sendmsg_prep_async, +#if defined(CONFIG_NET) + .cleanup = io_sendmsg_recvmsg_cleanup, +#endif }, [IORING_OP_RECVMSG] = { .needs_file = 1, @@ -12922,6 +12924,9 @@ static const struct io_op_def io_op_defs[] = { .prep = io_recvmsg_prep, .issue = io_recvmsg, .prep_async = io_recvmsg_prep_async, +#if defined(CONFIG_NET) + .cleanup = io_sendmsg_recvmsg_cleanup, +#endif }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, @@ -12972,6 +12977,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT] = { .prep = io_openat_prep, .issue = io_openat, + .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { .prep = io_close_prep, @@ -12987,6 +12993,7 @@ static const struct io_op_def io_op_defs[] = { .audit_skip = 1, .prep = io_statx_prep, .issue = io_statx, + .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { .needs_file = 1, @@ -13046,6 +13053,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_OPENAT2] = { .prep = io_openat2_prep, .issue = io_openat2, + .cleanup = io_open_cleanup, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, @@ -13089,22 +13097,27 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_RENAMEAT] = { .prep = io_renameat_prep, .issue = io_renameat, + .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { .prep = io_unlinkat_prep, .issue = io_unlinkat, + .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { .prep = io_mkdirat_prep, .issue = io_mkdirat, + .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { .prep = io_symlinkat_prep, .issue = io_symlinkat, + .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { .prep = io_linkat_prep, .issue = io_linkat, + .cleanup = io_link_cleanup, }, [IORING_OP_MSG_RING] = { .needs_file = 1, @@ -13116,19 +13129,23 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .prep = io_fsetxattr_prep, .issue = io_fsetxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { .prep = io_setxattr_prep, .issue = io_setxattr, + .cleanup = io_xattr_cleanup, }, 
[IORING_OP_FGETXATTR] = { .needs_file = 1, .prep = io_fgetxattr_prep, .issue = io_fgetxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { .prep = io_getxattr_prep, .issue = io_getxattr, + .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .audit_skip = 1, From 1137696137ef6e67549c84d128cdf7f454ef8d28 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 10:56:14 -0600 Subject: [PATCH 0950/1250] io_uring: add io_uring_types.h This adds definitions of structs that both the core and the various opcode handlers need to know about. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 491 +------------------------------------ io_uring/io_uring_types.h | 496 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 499 insertions(+), 488 deletions(-) create mode 100644 io_uring/io_uring_types.h diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 75d8c31a59d50e..ff7886c35490ab 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -90,6 +90,8 @@ #include "../fs/internal.h" #include "io-wq.h" +#include "io_uring_types.h" + #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 @@ -122,89 +124,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_uring { - u32 head ____cacheline_aligned_in_smp; - u32 tail ____cacheline_aligned_in_smp; -}; - -/* - * This data is shared with the application through the mmap at offsets - * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. - * - * The offsets to the member fields are published through struct - * io_sqring_offsets when calling io_uring_setup. - */ -struct io_rings { - /* - * Head and tail offsets into the ring; the offsets need to be - * masked to get valid indices. - * - * The kernel controls head of the sq ring and the tail of the cq ring, - * and the application controls tail of the sq ring and the head of the - * cq ring. 
- */ - struct io_uring sq, cq; - /* - * Bitmasks to apply to head and tail offsets (constant, equals - * ring_entries - 1) - */ - u32 sq_ring_mask, cq_ring_mask; - /* Ring sizes (constant, power of 2) */ - u32 sq_ring_entries, cq_ring_entries; - /* - * Number of invalid entries dropped by the kernel due to - * invalid index stored in array - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. get number of "new events" by comparing to - * cached value). - * - * After a new SQ head value was read by the application this - * counter includes all submissions that were dropped reaching - * the new SQ head (and possibly more). - */ - u32 sq_dropped; - /* - * Runtime SQ flags - * - * Written by the kernel, shouldn't be modified by the - * application. - * - * The application needs a full memory barrier before checking - * for IORING_SQ_NEED_WAKEUP after updating the sq tail. - */ - atomic_t sq_flags; - /* - * Runtime CQ flags - * - * Written by the application, shouldn't be modified by the - * kernel. - */ - u32 cq_flags; - /* - * Number of completion events lost because the queue was full; - * this should be avoided by the application by making sure - * there are not more requests pending than there is space in - * the completion queue. - * - * Written by the kernel, shouldn't be modified by the - * application (i.e. get number of "new events" by comparing to - * cached value). - * - * As completion events come in out of order this counter is not - * ordered with any other data. - */ - u32 cq_overflow; - /* - * Ring buffer of completion events. - * - * The kernel writes completion events fresh every time they are - * produced, so the application is allowed to modify pending - * entries. 
- */ - struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; -}; - struct io_mapped_ubuf { u64 ubuf; u64 ubuf_end; @@ -252,12 +171,6 @@ struct io_rsrc_put { }; }; -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - struct io_rsrc_node { struct percpu_ref refs; struct list_head node; @@ -310,14 +223,6 @@ struct io_buffer { __u16 bgid; }; -struct io_restriction { - DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); - DECLARE_BITMAP(sqe_op, IORING_OP_LAST); - u8 sqe_flags_allowed; - u8 sqe_flags_required; - bool registered; -}; - enum { IO_SQ_THREAD_SHOULD_STOP = 0, IO_SQ_THREAD_SHOULD_PARK, @@ -347,186 +252,7 @@ struct io_sq_data { #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -struct io_submit_link { - struct io_kiocb *head; - struct io_kiocb *last; -}; - -struct io_submit_state { - /* inline/task_work completion list, under ->uring_lock */ - struct io_wq_work_node free_list; - /* batch completion logic */ - struct io_wq_work_list compl_reqs; - struct io_submit_link link; - - bool plug_started; - bool need_plug; - bool flush_cqes; - unsigned short submit_nr; - struct blk_plug plug; -}; - -struct io_ev_fd { - struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; -}; - -#define BGID_ARRAY 64 - -struct io_ring_ctx { - /* const or read-mostly hot data */ - struct { - struct percpu_ref refs; - - struct io_rings *rings; - unsigned int flags; - enum task_work_notify_mode notify_method; - unsigned int compat: 1; - unsigned int drain_next: 1; - unsigned int restricted: 1; - unsigned int off_timeout_used: 1; - unsigned int drain_active: 1; - unsigned int drain_disabled: 1; - unsigned int has_evfd: 1; - unsigned int syscall_iopoll: 1; - } ____cacheline_aligned_in_smp; - - /* submission data */ - struct { - struct mutex uring_lock; - - /* - * Ring buffer of indices into array of io_uring_sqe, which is - * mmapped by the application using the IORING_OFF_SQES offset. 
- * - * This indirection could e.g. be used to assign fixed - * io_uring_sqe entries to operations and only submit them to - * the queue when needed. - * - * The kernel modifies neither the indices array nor the entries - * array. - */ - u32 *sq_array; - struct io_uring_sqe *sq_sqes; - unsigned cached_sq_head; - unsigned sq_entries; - struct list_head defer_list; - - /* - * Fixed resources fast path, should be accessed only under - * uring_lock, and updated through io_uring_register(2) - */ - struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; - atomic_t cancel_seq; - struct io_file_table file_table; - unsigned nr_user_files; - unsigned nr_user_bufs; - struct io_mapped_ubuf **user_bufs; - - struct io_submit_state submit_state; - - struct io_buffer_list *io_bl; - struct xarray io_bl_xa; - struct list_head io_buffers_cache; - - struct list_head timeout_list; - struct list_head ltimeout_list; - struct list_head cq_overflow_list; - struct list_head apoll_cache; - struct xarray personalities; - u32 pers_next; - unsigned sq_thread_idle; - } ____cacheline_aligned_in_smp; - - /* IRQ completion list, under ->completion_lock */ - struct io_wq_work_list locked_free_list; - unsigned int locked_free_nr; - - const struct cred *sq_creds; /* cred used for __io_sq_thread() */ - struct io_sq_data *sq_data; /* if using sq thread polling */ - - struct wait_queue_head sqo_sq_wait; - struct list_head sqd_list; - - unsigned long check_cq; - - struct { - /* - * We cache a range of free CQEs we can use, once exhausted it - * should go through a slower range setup, see __io_get_cqe() - */ - struct io_uring_cqe *cqe_cached; - struct io_uring_cqe *cqe_sentinel; - - unsigned cached_cq_tail; - unsigned cq_entries; - struct io_ev_fd __rcu *io_ev_fd; - struct wait_queue_head cq_wait; - unsigned cq_extra; - atomic_t cq_timeouts; - unsigned cq_last_tm_flush; - } ____cacheline_aligned_in_smp; - - struct { - spinlock_t completion_lock; - - spinlock_t timeout_lock; - - /* - * ->iopoll_list is 
protected by the ctx->uring_lock for - * io_uring instances that don't use IORING_SETUP_SQPOLL. - * For SQPOLL, only the single threaded io_sq_thread() will - * manipulate the list, hence no extra locking is needed there. - */ - struct io_wq_work_list iopoll_list; - struct hlist_head *cancel_hash; - unsigned cancel_hash_bits; - bool poll_multi_queue; - - struct list_head io_buffers_comp; - } ____cacheline_aligned_in_smp; - - struct io_restriction restrictions; - - /* slow path rsrc auxilary data, used by update/register */ - struct { - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - - struct list_head io_buffers_pages; - }; - - /* Keep this last, we don't need it for the fast path */ - struct { - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; - - /* Only used for accounting purposes */ - struct user_struct *user; - struct mm_struct *mm_account; - - /* ctx exit and cancelation */ - struct llist_head fallback_llist; - struct delayed_work fallback_work; - struct work_struct exit_work; - struct list_head tctx_list; - struct completion ref_comp; - u32 iowq_limits[2]; - bool iowq_limits_set; - }; -}; +#define BGID_ARRAY 64 /* * Arbitrary limit, can be raised if need be @@ -808,232 +534,21 @@ struct io_xattr { struct filename *filename; }; -enum { - REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, - REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, - REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, - REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, - REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, - REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, - - /* first byte is taken by user flags, shift it to not overlap */ - REQ_F_FAIL_BIT = 8, - 
REQ_F_INFLIGHT_BIT, - REQ_F_CUR_POS_BIT, - REQ_F_NOWAIT_BIT, - REQ_F_LINK_TIMEOUT_BIT, - REQ_F_NEED_CLEANUP_BIT, - REQ_F_POLLED_BIT, - REQ_F_BUFFER_SELECTED_BIT, - REQ_F_BUFFER_RING_BIT, - REQ_F_COMPLETE_INLINE_BIT, - REQ_F_REISSUE_BIT, - REQ_F_CREDS_BIT, - REQ_F_REFCOUNT_BIT, - REQ_F_ARM_LTIMEOUT_BIT, - REQ_F_ASYNC_DATA_BIT, - REQ_F_SKIP_LINK_CQES_BIT, - REQ_F_SINGLE_POLL_BIT, - REQ_F_DOUBLE_POLL_BIT, - REQ_F_PARTIAL_IO_BIT, - REQ_F_CQE32_INIT_BIT, - REQ_F_APOLL_MULTISHOT_BIT, - REQ_F_CLEAR_POLLIN_BIT, - /* keep async read/write and isreg together and in order */ - REQ_F_SUPPORT_NOWAIT_BIT, - REQ_F_ISREG_BIT, - - /* not a real bit, just to check we're not overflowing the space */ - __REQ_F_LAST_BIT, -}; - -enum { - /* ctx owns file */ - REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), - /* drain existing IO first */ - REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), - /* linked sqes */ - REQ_F_LINK = BIT(REQ_F_LINK_BIT), - /* doesn't sever on completion < 0 */ - REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), - /* IOSQE_ASYNC */ - REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), - /* IOSQE_BUFFER_SELECT */ - REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), - /* IOSQE_CQE_SKIP_SUCCESS */ - REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), - - /* fail rest of links */ - REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), - /* on inflight list, should be cancelled and waited on exit reliably */ - REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), - /* read/write uses file position */ - REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), - /* must not punt to workers */ - REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), - /* has or had linked timeout */ - REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), - /* needs cleanup */ - REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), - /* already went through poll handler */ - REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), - /* buffer already selected */ - REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* buffer selected from ring, needs commit */ - REQ_F_BUFFER_RING = 
BIT(REQ_F_BUFFER_RING_BIT), - /* completion is deferred through io_comp_state */ - REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), - /* caller should reissue async */ - REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), - /* supports async reads/writes */ - REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), - /* regular file */ - REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), - /* has creds assigned */ - REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), - /* skip refcounting if not set */ - REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), - /* there is a linked timeout that has to be armed */ - REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), - /* ->async_data allocated */ - REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), - /* don't post CQEs while failing linked requests */ - REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), - /* single poll may be active */ - REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), - /* double poll may active */ - REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), - /* request has already done partial IO */ - REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), - /* fast poll multishot mode */ - REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), - /* ->extra1 and ->extra2 are initialised */ - REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), - /* recvmsg special flag, clear EPOLLIN */ - REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), -}; - struct async_poll { struct io_poll poll; struct io_poll *double_poll; }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); - -struct io_task_work { - union { - struct io_wq_work_node node; - struct llist_node fallback_node; - }; - io_req_tw_func_t func; -}; - enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, }; -struct io_cqe { - __u64 user_data; - __s32 res; - /* fd initially, then cflags for completion */ - union { - __u32 flags; - int fd; - }; -}; - enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, }; -/* - * Each request type overlays its private data structure on top of this one. 
- * They must not exceed this one in size. - */ -struct io_cmd_data { - struct file *file; - /* each command gets 56 bytes of data */ - __u8 data[56]; -}; - -#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) -#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) - -struct io_kiocb { - union { - /* - * NOTE! Each of the io_kiocb union members has the file pointer - * as the first entry in their struct definition. So you can - * access the file pointer through any of the sub-structs, - * or directly as just 'file' in this struct. - */ - struct file *file; - struct io_cmd_data cmd; - }; - - u8 opcode; - /* polled IO has completed */ - u8 iopoll_completed; - /* - * Can be either a fixed buffer index, or used with provided buffers. - * For the latter, before issue it points to the buffer group ID, - * and after selection it points to the buffer ID itself. - */ - u16 buf_index; - unsigned int flags; - - struct io_cqe cqe; - - struct io_ring_ctx *ctx; - struct task_struct *task; - - struct io_rsrc_node *rsrc_node; - - union { - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; - - /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ - struct io_buffer *kbuf; - - /* - * stores buffer ID for ring provided buffers, valid IFF - * REQ_F_BUFFER_RING is set. - */ - struct io_buffer_list *buf_list; - }; - - union { - /* used by request caches, completion batching and iopoll */ - struct io_wq_work_node comp_list; - /* cache ->apoll->events */ - __poll_t apoll_events; - }; - atomic_t refs; - atomic_t poll_refs; - struct io_task_work io_task_work; - /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ - union { - struct hlist_node hash_node; - struct { - u64 extra1; - u64 extra2; - }; - }; - /* internal polling, see IORING_FEAT_FAST_POLL */ - struct async_poll *apoll; - /* opcode allocated if it needs to store data for async defer */ - void *async_data; - /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ - struct io_kiocb *link; - /* custom credentials, valid IFF REQ_F_CREDS is set */ - const struct cred *creds; - struct io_wq_work work; -}; - struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h new file mode 100644 index 00000000000000..1a0f592ff6fc35 --- /dev/null +++ b/io_uring/io_uring_types.h @@ -0,0 +1,496 @@ +#ifndef IO_URING_TYPES_H +#define IO_URING_TYPES_H + +#include +#include + +#include "io-wq.h" + +struct io_uring { + u32 head ____cacheline_aligned_in_smp; + u32 tail ____cacheline_aligned_in_smp; +}; + +/* + * This data is shared with the application through the mmap at offsets + * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING. + * + * The offsets to the member fields are published through struct + * io_sqring_offsets when calling io_uring_setup. + */ +struct io_rings { + /* + * Head and tail offsets into the ring; the offsets need to be + * masked to get valid indices. + * + * The kernel controls head of the sq ring and the tail of the cq ring, + * and the application controls tail of the sq ring and the head of the + * cq ring. + */ + struct io_uring sq, cq; + /* + * Bitmasks to apply to head and tail offsets (constant, equals + * ring_entries - 1) + */ + u32 sq_ring_mask, cq_ring_mask; + /* Ring sizes (constant, power of 2) */ + u32 sq_ring_entries, cq_ring_entries; + /* + * Number of invalid entries dropped by the kernel due to + * invalid index stored in array + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). 
+ * + * After a new SQ head value was read by the application this + * counter includes all submissions that were dropped reaching + * the new SQ head (and possibly more). + */ + u32 sq_dropped; + /* + * Runtime SQ flags + * + * Written by the kernel, shouldn't be modified by the + * application. + * + * The application needs a full memory barrier before checking + * for IORING_SQ_NEED_WAKEUP after updating the sq tail. + */ + atomic_t sq_flags; + /* + * Runtime CQ flags + * + * Written by the application, shouldn't be modified by the + * kernel. + */ + u32 cq_flags; + /* + * Number of completion events lost because the queue was full; + * this should be avoided by the application by making sure + * there are not more requests pending than there is space in + * the completion queue. + * + * Written by the kernel, shouldn't be modified by the + * application (i.e. get number of "new events" by comparing to + * cached value). + * + * As completion events come in out of order this counter is not + * ordered with any other data. + */ + u32 cq_overflow; + /* + * Ring buffer of completion events. + * + * The kernel writes completion events fresh every time they are + * produced, so the application is allowed to modify pending + * entries. 
+ */ + struct io_uring_cqe cqes[] ____cacheline_aligned_in_smp; +}; + +struct io_restriction { + DECLARE_BITMAP(register_op, IORING_REGISTER_LAST); + DECLARE_BITMAP(sqe_op, IORING_OP_LAST); + u8 sqe_flags_allowed; + u8 sqe_flags_required; + bool registered; +}; + +struct io_submit_link { + struct io_kiocb *head; + struct io_kiocb *last; +}; + +struct io_submit_state { + /* inline/task_work completion list, under ->uring_lock */ + struct io_wq_work_node free_list; + /* batch completion logic */ + struct io_wq_work_list compl_reqs; + struct io_submit_link link; + + bool plug_started; + bool need_plug; + bool flush_cqes; + unsigned short submit_nr; + struct blk_plug plug; +}; + +struct io_ev_fd { + struct eventfd_ctx *cq_ev_fd; + unsigned int eventfd_async: 1; + struct rcu_head rcu; +}; + +struct io_file_table { + struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; +}; + +struct io_ring_ctx { + /* const or read-mostly hot data */ + struct { + struct percpu_ref refs; + + struct io_rings *rings; + unsigned int flags; + enum task_work_notify_mode notify_method; + unsigned int compat: 1; + unsigned int drain_next: 1; + unsigned int restricted: 1; + unsigned int off_timeout_used: 1; + unsigned int drain_active: 1; + unsigned int drain_disabled: 1; + unsigned int has_evfd: 1; + unsigned int syscall_iopoll: 1; + } ____cacheline_aligned_in_smp; + + /* submission data */ + struct { + struct mutex uring_lock; + + /* + * Ring buffer of indices into array of io_uring_sqe, which is + * mmapped by the application using the IORING_OFF_SQES offset. + * + * This indirection could e.g. be used to assign fixed + * io_uring_sqe entries to operations and only submit them to + * the queue when needed. + * + * The kernel modifies neither the indices array nor the entries + * array. 
+ */ + u32 *sq_array; + struct io_uring_sqe *sq_sqes; + unsigned cached_sq_head; + unsigned sq_entries; + struct list_head defer_list; + + /* + * Fixed resources fast path, should be accessed only under + * uring_lock, and updated through io_uring_register(2) + */ + struct io_rsrc_node *rsrc_node; + int rsrc_cached_refs; + atomic_t cancel_seq; + struct io_file_table file_table; + unsigned nr_user_files; + unsigned nr_user_bufs; + struct io_mapped_ubuf **user_bufs; + + struct io_submit_state submit_state; + + struct io_buffer_list *io_bl; + struct xarray io_bl_xa; + struct list_head io_buffers_cache; + + struct list_head timeout_list; + struct list_head ltimeout_list; + struct list_head cq_overflow_list; + struct list_head apoll_cache; + struct xarray personalities; + u32 pers_next; + unsigned sq_thread_idle; + } ____cacheline_aligned_in_smp; + + /* IRQ completion list, under ->completion_lock */ + struct io_wq_work_list locked_free_list; + unsigned int locked_free_nr; + + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ + struct io_sq_data *sq_data; /* if using sq thread polling */ + + struct wait_queue_head sqo_sq_wait; + struct list_head sqd_list; + + unsigned long check_cq; + + struct { + /* + * We cache a range of free CQEs we can use, once exhausted it + * should go through a slower range setup, see __io_get_cqe() + */ + struct io_uring_cqe *cqe_cached; + struct io_uring_cqe *cqe_sentinel; + + unsigned cached_cq_tail; + unsigned cq_entries; + struct io_ev_fd __rcu *io_ev_fd; + struct wait_queue_head cq_wait; + unsigned cq_extra; + atomic_t cq_timeouts; + unsigned cq_last_tm_flush; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t completion_lock; + + spinlock_t timeout_lock; + + /* + * ->iopoll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. 
+ */ + struct io_wq_work_list iopoll_list; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; + bool poll_multi_queue; + + struct list_head io_buffers_comp; + } ____cacheline_aligned_in_smp; + + struct io_restriction restrictions; + + /* slow path rsrc auxilary data, used by update/register */ + struct { + struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; + struct io_rsrc_data *file_data; + struct io_rsrc_data *buf_data; + + struct delayed_work rsrc_put_work; + struct llist_head rsrc_put_llist; + struct list_head rsrc_ref_list; + spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; + }; + + /* Keep this last, we don't need it for the fast path */ + struct { + #if defined(CONFIG_UNIX) + struct socket *ring_sock; + #endif + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + + /* Only used for accounting purposes */ + struct user_struct *user; + struct mm_struct *mm_account; + + /* ctx exit and cancelation */ + struct llist_head fallback_llist; + struct delayed_work fallback_work; + struct work_struct exit_work; + struct list_head tctx_list; + struct completion ref_comp; + u32 iowq_limits[2]; + bool iowq_limits_set; + }; +}; + +enum { + REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, + REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, + REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT, + REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT, + REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, + REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, + REQ_F_CQE_SKIP_BIT = IOSQE_CQE_SKIP_SUCCESS_BIT, + + /* first byte is taken by user flags, shift it to not overlap */ + REQ_F_FAIL_BIT = 8, + REQ_F_INFLIGHT_BIT, + REQ_F_CUR_POS_BIT, + REQ_F_NOWAIT_BIT, + REQ_F_LINK_TIMEOUT_BIT, + REQ_F_NEED_CLEANUP_BIT, + REQ_F_POLLED_BIT, + REQ_F_BUFFER_SELECTED_BIT, + REQ_F_BUFFER_RING_BIT, + REQ_F_COMPLETE_INLINE_BIT, + REQ_F_REISSUE_BIT, + REQ_F_CREDS_BIT, + REQ_F_REFCOUNT_BIT, + REQ_F_ARM_LTIMEOUT_BIT, + REQ_F_ASYNC_DATA_BIT, + REQ_F_SKIP_LINK_CQES_BIT, + 
REQ_F_SINGLE_POLL_BIT, + REQ_F_DOUBLE_POLL_BIT, + REQ_F_PARTIAL_IO_BIT, + REQ_F_CQE32_INIT_BIT, + REQ_F_APOLL_MULTISHOT_BIT, + REQ_F_CLEAR_POLLIN_BIT, + /* keep async read/write and isreg together and in order */ + REQ_F_SUPPORT_NOWAIT_BIT, + REQ_F_ISREG_BIT, + + /* not a real bit, just to check we're not overflowing the space */ + __REQ_F_LAST_BIT, +}; + +enum { + /* ctx owns file */ + REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT), + /* drain existing IO first */ + REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT), + /* linked sqes */ + REQ_F_LINK = BIT(REQ_F_LINK_BIT), + /* doesn't sever on completion < 0 */ + REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT), + /* IOSQE_ASYNC */ + REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT), + /* IOSQE_BUFFER_SELECT */ + REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT), + /* IOSQE_CQE_SKIP_SUCCESS */ + REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT), + + /* fail rest of links */ + REQ_F_FAIL = BIT(REQ_F_FAIL_BIT), + /* on inflight list, should be cancelled and waited on exit reliably */ + REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT), + /* read/write uses file position */ + REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT), + /* must not punt to workers */ + REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT), + /* has or had linked timeout */ + REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT), + /* needs cleanup */ + REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT), + /* already went through poll handler */ + REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), + /* buffer already selected */ + REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), + /* buffer selected from ring, needs commit */ + REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), + /* completion is deferred through io_comp_state */ + REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), + /* caller should reissue async */ + REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), + /* supports async reads/writes */ + REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), + /* regular file */ + REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), + /* has creds 
assigned */ + REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + /* skip refcounting if not set */ + REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + /* there is a linked timeout that has to be armed */ + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + /* ->async_data allocated */ + REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), + /* don't post CQEs while failing linked requests */ + REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT), + /* single poll may be active */ + REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT), + /* double poll may active */ + REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT), + /* request has already done partial IO */ + REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + /* fast poll multishot mode */ + REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT), + /* ->extra1 and ->extra2 are initialised */ + REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), + /* recvmsg special flag, clear EPOLLIN */ + REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), +}; + +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); + +struct io_task_work { + union { + struct io_wq_work_node node; + struct llist_node fallback_node; + }; + io_req_tw_func_t func; +}; + +struct io_cqe { + __u64 user_data; + __s32 res; + /* fd initially, then cflags for completion */ + union { + __u32 flags; + int fd; + }; +}; + +/* + * Each request type overlays its private data structure on top of this one. + * They must not exceed this one in size. + */ +struct io_cmd_data { + struct file *file; + /* each command gets 56 bytes of data */ + __u8 data[56]; +}; + +#define io_kiocb_to_cmd(req) ((void *) &(req)->cmd) +#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) + +struct io_kiocb { + union { + /* + * NOTE! Each of the io_kiocb union members has the file pointer + * as the first entry in their struct definition. So you can + * access the file pointer through any of the sub-structs, + * or directly as just 'file' in this struct. 
+ */ + struct file *file; + struct io_cmd_data cmd; + }; + + u8 opcode; + /* polled IO has completed */ + u8 iopoll_completed; + /* + * Can be either a fixed buffer index, or used with provided buffers. + * For the latter, before issue it points to the buffer group ID, + * and after selection it points to the buffer ID itself. + */ + u16 buf_index; + unsigned int flags; + + struct io_cqe cqe; + + struct io_ring_ctx *ctx; + struct task_struct *task; + + struct io_rsrc_node *rsrc_node; + + union { + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; + + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; + + /* + * stores buffer ID for ring provided buffers, valid IFF + * REQ_F_BUFFER_RING is set. + */ + struct io_buffer_list *buf_list; + }; + + union { + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + /* cache ->apoll->events */ + __poll_t apoll_events; + }; + atomic_t refs; + atomic_t poll_refs; + struct io_task_work io_task_work; + /* for polled requests, i.e. 
IORING_OP_POLL_ADD and async armed poll */ + union { + struct hlist_node hash_node; + struct { + u64 extra1; + u64 extra2; + }; + }; + /* internal polling, see IORING_FEAT_FAST_POLL */ + struct async_poll *apoll; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; + /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */ + struct io_kiocb *link; + /* custom credentials, valid IFF REQ_F_CREDS is set */ + const struct cred *creds; + struct io_wq_work work; +}; + +#endif From dda7decff9413b5a6e628a1f8a11b7d527428423 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 12:45:38 -0600 Subject: [PATCH 0951/1250] io_uring: set completion results upfront Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++------------ io_uring/io_uring.h | 13 +++++++++++++ 2 files changed, 22 insertions(+), 12 deletions(-) create mode 100644 io_uring/io_uring.h diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ff7886c35490ab..92e51bbb769c96 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -91,6 +91,7 @@ #include "io-wq.h" #include "io_uring_types.h" +#include "io_uring.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -1876,21 +1877,15 @@ static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) io_cqring_ev_posted(ctx); } -static inline void io_req_complete_state(struct io_kiocb *req, s32 res, - u32 cflags) -{ - req->cqe.res = res; - req->cqe.flags = cflags; - req->flags |= REQ_F_COMPLETE_INLINE; -} - static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, s32 res, u32 cflags) { - if (issue_flags & IO_URING_F_COMPLETE_DEFER) - io_req_complete_state(req, res, cflags); - else + if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + io_req_set_res(req, res, cflags); + req->flags |= REQ_F_COMPLETE_INLINE; + } else { io_req_complete_post(req, res, cflags); + } } static inline void io_req_complete(struct io_kiocb *req, s32 
res) @@ -2749,7 +2744,8 @@ static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) int res = req->cqe.res; if (*locked) { - io_req_complete_state(req, res, io_put_kbuf(req, 0)); + io_req_set_res(req, res, io_put_kbuf(req, 0)); + req->flags |= REQ_F_COMPLETE_INLINE; io_req_add_compl_list(req); } else { io_req_complete_post(req, res, @@ -4394,6 +4390,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) if (ret < 0) req_set_fail(req); + io_req_set_res(req, 0, ret); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); io_req_complete(req, ret); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h new file mode 100644 index 00000000000000..522e652197572e --- /dev/null +++ b/io_uring/io_uring.h @@ -0,0 +1,13 @@ +#ifndef IOU_CORE_H +#define IOU_CORE_H + +#include +#include "io_uring_types.h" + +static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) +{ + req->cqe.res = res; + req->cqe.flags = cflags; +} + +#endif From 0245ca65ee1264b931de78c4db89a3f9e09aabc3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 15:21:00 -0600 Subject: [PATCH 0952/1250] io_uring: handle completions in the core Normally request handlers complete requests themselves, if they don't return an error. For the latter case, the core will complete it for them. This is unhandy for pushing opcode handlers further out, as we don't want a bunch of inline completion code and we don't want to make the completion path slower than it is now. Let the core handle any completion, unless the handler explicitly asks us not to. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 271 ++++++++++++++++++++++---------------------- io_uring/io_uring.h | 5 + 2 files changed, 142 insertions(+), 134 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92e51bbb769c96..7b8f1c9b7b48ea 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -625,7 +625,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static void __io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags); static void io_dismantle_req(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1126,7 +1125,7 @@ static inline void req_set_fail(struct io_kiocb *req) static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); - req->cqe.res = res; + io_req_set_res(req, res, 0); } static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx) @@ -1855,50 +1854,37 @@ static void __io_req_complete_put(struct io_kiocb *req) } } -static void __io_req_complete_post(struct io_kiocb *req, s32 res, - u32 cflags) +static void __io_req_complete_post(struct io_kiocb *req) { - if (!(req->flags & REQ_F_CQE_SKIP)) { - req->cqe.res = res; - req->cqe.flags = cflags; + if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(req->ctx, req); - } __io_req_complete_put(req); } -static void io_req_complete_post(struct io_kiocb *req, s32 res, u32 cflags) +static void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; spin_lock(&ctx->completion_lock); - __io_req_complete_post(req, res, cflags); + __io_req_complete_post(req); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); } -static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - s32 res, u32 cflags) +static inline void 
__io_req_complete(struct io_kiocb *req, unsigned issue_flags) { - if (issue_flags & IO_URING_F_COMPLETE_DEFER) { - io_req_set_res(req, res, cflags); + if (issue_flags & IO_URING_F_COMPLETE_DEFER) req->flags |= REQ_F_COMPLETE_INLINE; - } else { - io_req_complete_post(req, res, cflags); - } -} - -static inline void io_req_complete(struct io_kiocb *req, s32 res) -{ - if (res < 0) - req_set_fail(req); - __io_req_complete(req, 0, res, 0); + else + io_req_complete_post(req); } static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); - io_req_complete_post(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); + io_req_complete_post(req); } /* @@ -2071,7 +2057,8 @@ static void io_fail_links(struct io_kiocb *req) link->flags |= REQ_F_CQE_SKIP; else link->flags &= ~REQ_F_CQE_SKIP; - __io_req_complete_post(link, res, 0); + io_req_set_res(link, res, 0); + __io_req_complete_post(link); link = nxt; } } @@ -2185,11 +2172,12 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (unlikely(!*uring_locked)) spin_lock(&(*ctx)->completion_lock); } - if (likely(*uring_locked)) + if (likely(*uring_locked)) { req->io_task_work.func(req, uring_locked); - else - __io_req_complete_post(req, req->cqe.res, - io_put_kbuf_comp(req)); + } else { + req->cqe.flags = io_put_kbuf_comp(req); + __io_req_complete_post(req); + } node = next; } while (node); @@ -2317,13 +2305,12 @@ static void io_req_task_prio_work_add(struct io_kiocb *req) static void io_req_tw_post(struct io_kiocb *req, bool *locked) { - io_req_complete_post(req, req->cqe.res, req->cqe.flags); + io_req_complete_post(req); } static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) { - req->cqe.res = res; - req->cqe.flags = cflags; + io_req_set_res(req, res, cflags); req->io_task_work.func = io_req_tw_post; io_req_task_work_add(req); } @@ -2347,7 +2334,7 @@ static void io_req_task_submit(struct io_kiocb *req, bool 
*locked) static void io_req_task_queue_fail(struct io_kiocb *req, int ret) { - req->cqe.res = ret; + io_req_set_res(req, ret, 0); req->io_task_work.func = io_req_task_cancel; io_req_task_work_add(req); } @@ -2741,15 +2728,13 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { - int res = req->cqe.res; - if (*locked) { - io_req_set_res(req, res, io_put_kbuf(req, 0)); + req->cqe.flags |= io_put_kbuf(req, 0); req->flags |= REQ_F_COMPLETE_INLINE; io_req_add_compl_list(req); } else { - io_req_complete_post(req, res, - io_put_kbuf(req, IO_URING_F_UNLOCKED)); + req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED); + io_req_complete_post(req); } } @@ -2758,8 +2743,8 @@ static void __io_complete_rw(struct io_kiocb *req, long res, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, issue_flags, req->cqe.res, - io_put_kbuf(req, issue_flags)); + io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); + __io_req_complete(req, issue_flags); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -2769,7 +2754,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) if (__io_complete_rw_common(req, res)) return; - req->cqe.res = res; + io_req_set_res(req, res, 0); req->io_task_work.func = io_req_task_complete; io_req_task_prio_work_add(req); } @@ -3745,7 +3730,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) */ ret = io_iter_do_read(rw, &s->iter); if (ret == -EIOCBQUEUED) - return 0; + return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. 
*/ kiocb->ki_flags &= ~IOCB_WAITQ; iov_iter_restore(&s->iter, &s->iter_state); @@ -3756,7 +3741,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) /* it's faster to check here then delegate to kfree */ if (iovec) kfree(iovec); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -3850,6 +3835,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) goto copy_iov; done: kiocb_done(req, ret2, issue_flags); + ret = IOU_ISSUE_SKIP_COMPLETE; } else { copy_iov: iov_iter_restore(&s->iter, &s->iter_state); @@ -3906,8 +3892,8 @@ static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) ren->newpath, ren->flags); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_renameat_cleanup(struct io_kiocb *req) @@ -3934,7 +3920,7 @@ static void io_xattr_finish(struct io_kiocb *req, int ret) req->flags &= ~REQ_F_NEED_CLEANUP; io_xattr_cleanup(req); - io_req_complete(req, ret); + io_req_set_res(req, ret, 0); } static int __io_getxattr_prep(struct io_kiocb *req, @@ -4015,7 +4001,7 @@ static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) &ix->ctx); io_xattr_finish(req, ret); - return 0; + return IOU_OK; } static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -4043,7 +4029,7 @@ static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) } io_xattr_finish(req, ret); - return 0; + return IOU_OK; } static int __io_setxattr_prep(struct io_kiocb *req, @@ -4129,8 +4115,7 @@ static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = __io_setxattr(req, issue_flags, &req->file->f_path); io_xattr_finish(req, ret); - - return 0; + return IOU_OK; } static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -4155,7 +4140,7 @@ static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) } io_xattr_finish(req, 
ret); - return 0; + return IOU_OK; } static int io_unlinkat_prep(struct io_kiocb *req, @@ -4198,8 +4183,8 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_unlinkat(un->dfd, un->filename); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_unlinkat_cleanup(struct io_kiocb *req) @@ -4243,8 +4228,8 @@ static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_mkdirat_cleanup(struct io_kiocb *req) @@ -4294,8 +4279,8 @@ static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_linkat_prep(struct io_kiocb *req, @@ -4341,8 +4326,8 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) lnk->newpath, lnk->flags); req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_link_cleanup(struct io_kiocb *req) @@ -4393,7 +4378,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) io_req_set_res(req, 0, ret); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); - io_req_complete(req, ret); + __io_req_complete(req, 0); } EXPORT_SYMBOL_GPL(io_uring_cmd_done); @@ -4450,9 +4435,12 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - if (ret != -EIOCBQUEUED) + if (ret != -EIOCBQUEUED) { io_uring_cmd_done(ioucmd, ret, 0); - return 0; + return IOU_OK; + } + + return IOU_ISSUE_SKIP_COMPLETE; } static int __io_splice_prep(struct io_kiocb *req, @@ -4505,8 +4493,8 
@@ static int io_tee(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - __io_req_complete(req, 0, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4550,8 +4538,8 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) done: if (ret != sp->len) req_set_fail(req); - __io_req_complete(req, 0, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4564,8 +4552,8 @@ static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) */ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - __io_req_complete(req, issue_flags, 0, 0); - return 0; + io_req_set_res(req, 0, 0); + return IOU_OK; } static int io_msg_ring_prep(struct io_kiocb *req, @@ -4609,11 +4597,11 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); + io_req_set_res(req, ret, 0); /* put file to avoid an attempt to IOPOLL the req */ io_put_file(req->file); req->file = NULL; - return 0; + return IOU_OK; } static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4644,8 +4632,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, sync->off, end > 0 ? 
end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_fallocate_prep(struct io_kiocb *req, @@ -4673,8 +4661,8 @@ static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); if (ret >= 0) fsnotify_modify(req->file); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -4855,8 +4843,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_openat(struct io_kiocb *req, unsigned int issue_flags) @@ -4951,9 +4939,10 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); io_ring_submit_unlock(ctx, issue_flags); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static int io_provide_buffers_prep(struct io_kiocb *req, @@ -5117,9 +5106,10 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ - __io_req_complete(req, issue_flags, ret, 0); + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); io_ring_submit_unlock(ctx, issue_flags); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static int io_epoll_ctl_prep(struct io_kiocb *req, @@ -5162,8 +5152,8 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, 
ret, 0); + return IOU_OK; #else return -EOPNOTSUPP; #endif @@ -5196,8 +5186,8 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; #else return -EOPNOTSUPP; #endif @@ -5235,8 +5225,8 @@ static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5279,8 +5269,8 @@ static int io_statx(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static void io_statx_cleanup(struct io_kiocb *req) @@ -5350,8 +5340,8 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) err: if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5377,8 +5367,8 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } #if defined(CONFIG_NET) @@ -5409,8 +5399,8 @@ static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; ret = __sys_shutdown_sock(sock, shutdown->how); - io_req_complete(req, ret); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static bool io_net_retry(struct socket *sock, int flags) @@ -5548,8 +5538,8 @@ static int 
io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_send(struct io_kiocb *req, unsigned int issue_flags) @@ -5605,8 +5595,8 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int __io_recvmsg_copy_hdr(struct io_kiocb *req, @@ -5805,8 +5795,8 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; + io_req_set_res(req, ret, cflags); + return IOU_OK; } static int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -5881,8 +5871,8 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - __io_req_complete(req, issue_flags, ret, cflags); - return 0; + io_req_set_res(req, ret, cflags); + return IOU_OK; } static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -5948,7 +5938,7 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) */ if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) - ret = 0; + ret = IOU_ISSUE_SKIP_COMPLETE; return ret; } if (ret == -ERESTARTSYS) @@ -5963,8 +5953,8 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) } if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } if (ret >= 0) { bool filled; @@ -6034,8 +6024,8 @@ static int io_socket(struct io_kiocb *req, unsigned int issue_flags) ret = 
io_fixed_fd_install(req, issue_flags, file, sock->file_slot); } - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_connect_prep_async(struct io_kiocb *req) @@ -6096,8 +6086,8 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) out: if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } #else /* !CONFIG_NET */ #define IO_NETOP_FN(op) \ @@ -6328,7 +6318,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) io_poll_remove_entries(req); spin_lock(&ctx->completion_lock); hash_del(&req->hash_node); - __io_req_complete_post(req, req->cqe.res, 0); + req->cqe.flags = 0; + __io_req_complete_post(req); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -6357,7 +6348,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) static void __io_poll_execute(struct io_kiocb *req, int mask, __poll_t __maybe_unused events) { - req->cqe.res = mask; + io_req_set_res(req, mask, 0); /* * This is useful for poll that is armed on behalf of another * request, and where the wakeup path could be on a different @@ -6810,12 +6801,16 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); - if (!ret && ipt.error) + if (ret) { + io_req_set_res(req, ret, 0); + return IOU_OK; + } + if (ipt.error) { req_set_fail(req); - ret = ret ?: ipt.error; - if (ret) - __io_req_complete(req, issue_flags, ret, 0); - return 0; + return ipt.error; + } + + return IOU_ISSUE_SKIP_COMPLETE; } static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) @@ -6850,20 +6845,22 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) ret2 = io_poll_add(preq, issue_flags); /* successfully updated, don't complete poll request */ - if 
(!ret2) + if (!ret2 || ret2 == -EIOCBQUEUED) goto out; } req_set_fail(preq); - preq->cqe.res = -ECANCELED; + io_req_set_res(preq, -ECANCELED, 0); locked = !(issue_flags & IO_URING_F_UNLOCKED); io_req_task_complete(preq, &locked); out: - if (ret < 0) + if (ret < 0) { req_set_fail(req); + return ret; + } /* complete update request, we're done with it */ - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) @@ -6884,7 +6881,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); - req->cqe.res = -ETIME; + io_req_set_res(req, -ETIME, 0); req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return HRTIMER_NORESTART; @@ -7069,8 +7066,8 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int __io_timeout_prep(struct io_kiocb *req, @@ -7191,7 +7188,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->timeout_lock); - return 0; + return IOU_ISSUE_SKIP_COMPLETE; } static bool io_cancel_cb(struct io_wq_work *work, void *data) @@ -7359,8 +7356,8 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) done: if (ret < 0) req_set_fail(req); - io_req_complete_post(req, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } static int io_files_update_prep(struct io_kiocb *req, @@ -7445,8 +7442,8 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); - __io_req_complete(req, issue_flags, ret, 0); - return 0; + io_req_set_res(req, ret, 0); + return IOU_OK; } 
static int io_req_prep_async(struct io_kiocb *req) @@ -7590,8 +7587,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); - if (ret) + + if (ret == IOU_OK) + __io_req_complete(req, issue_flags); + else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; + /* If the op doesn't have a file, we're not polling for it */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) io_iopoll_req_issued(req, issue_flags); @@ -7668,7 +7669,7 @@ static void io_wq_submit_work(struct io_wq_work *work) } while (1); /* avoid locking problems by failing it from a clean context */ - if (ret) + if (ret < 0) io_req_task_queue_fail(req, ret); } @@ -7745,10 +7746,12 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req, &cd); } - io_req_complete_post(req, ret ?: -ETIME, 0); + io_req_set_res(req, ret ?: -ETIME, 0); + io_req_complete_post(req); io_put_req(prev); } else { - io_req_complete_post(req, -ETIME, 0); + io_req_set_res(req, -ETIME, 0); + io_req_complete_post(req); } } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 522e652197572e..73943dbe884e5a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -4,6 +4,11 @@ #include #include "io_uring_types.h" +enum { + IOU_OK = 0, + IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, +}; + static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) { req->cqe.res = res; From ecd1bf726b211a18157623fa2c4cbccb28e47dff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 11:46:43 -0600 Subject: [PATCH 0953/1250] io_uring: move xattr related opcodes to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 248 +----------------------------------------- io_uring/xattr.c | 259 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/xattr.h | 15 +++ 4 files changed, 277 insertions(+), 247 deletions(-) create mode 100644 io_uring/xattr.c create mode 100644 io_uring/xattr.h diff --git 
a/io_uring/Makefile b/io_uring/Makefile index 3680425df9478b..479b6957b85ff3 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7b8f1c9b7b48ea..1af97e4a78b7f0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -80,7 +80,6 @@ #include #include #include -#include #define CREATE_TRACE_POINTS #include @@ -93,6 +92,8 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "xattr.h" + #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 @@ -529,12 +530,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct io_xattr { - struct file *file; - struct xattr_ctx ctx; - struct filename *filename; -}; - struct async_poll { struct io_poll poll; struct io_poll *double_poll; @@ -3904,245 +3899,6 @@ static void io_renameat_cleanup(struct io_kiocb *req) putname(ren->newpath); } -static inline void io_xattr_cleanup(struct io_kiocb *req) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - - if (ix->filename) - putname(ix->filename); - - kfree(ix->ctx.kname); - kvfree(ix->ctx.kvalue); -} - -static void io_xattr_finish(struct io_kiocb *req, int ret) -{ - req->flags &= ~REQ_F_NEED_CLEANUP; - - io_xattr_cleanup(req); - io_req_set_res(req, ret, 0); -} - -static int __io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - ix->ctx.kvalue = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - if (ix->ctx.flags) - return -EINVAL; - - 
ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = strncpy_from_user(ix->ctx.kname->name, name, - sizeof(ix->ctx.kname->name)); - if (!ret || ret == sizeof(ix->ctx.kname->name)) - ret = -ERANGE; - if (ret < 0) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_fgetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_getxattr_prep(req, sqe); -} - -static int io_getxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *path; - int ret; - - ret = __io_getxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), - req->file->f_path.dentry, - &ix->ctx); - - io_xattr_finish(req, ret); - return IOU_OK; -} - -static int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = do_getxattr(mnt_user_ns(path.mnt), - path.dentry, - &ix->ctx); - - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return IOU_OK; -} - -static int __io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - 
struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *name; - int ret; - - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ix->filename = NULL; - name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ix->ctx.kvalue = NULL; - ix->ctx.size = READ_ONCE(sqe->len); - ix->ctx.flags = READ_ONCE(sqe->xattr_flags); - - ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); - if (!ix->ctx.kname) - return -ENOMEM; - - ret = setxattr_copy(name, &ix->ctx); - if (ret) { - kfree(ix->ctx.kname); - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_setxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - const char __user *path; - int ret; - - ret = __io_setxattr_prep(req, sqe); - if (ret) - return ret; - - path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - - ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } - - return ret; -} - -static int io_fsetxattr_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_setxattr_prep(req, sqe); -} - -static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, - struct path *path) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - int ret; - - ret = mnt_want_write(path->mnt); - if (!ret) { - ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); - mnt_drop_write(path->mnt); - } - - return ret; -} - -static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = __io_setxattr(req, issue_flags, &req->file->f_path); - io_xattr_finish(req, ret); - return IOU_OK; -} - -static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req); - unsigned int lookup_flags = LOOKUP_FOLLOW; - 
struct path path; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = __io_setxattr(req, issue_flags, &path); - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - - io_xattr_finish(req, ret); - return IOU_OK; -} - static int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { diff --git a/io_uring/xattr.c b/io_uring/xattr.c new file mode 100644 index 00000000000000..79adf4efba0184 --- /dev/null +++ b/io_uring/xattr.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "xattr.h" + +struct io_xattr { + struct file *file; + struct xattr_ctx ctx; + struct filename *filename; +}; + +void io_xattr_cleanup(struct io_kiocb *req) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + + if (ix->filename) + putname(ix->filename); + + kfree(ix->ctx.kname); + kvfree(ix->ctx.kvalue); +} + +static void io_xattr_finish(struct io_kiocb *req, int ret) +{ + req->flags &= ~REQ_F_NEED_CLEANUP; + + io_xattr_cleanup(req); + io_req_set_res(req, ret, 0); +} + +static int __io_getxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ix->filename = NULL; + ix->ctx.kvalue = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + if (ix->ctx.flags) + return -EINVAL; + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = 
strncpy_from_user(ix->ctx.kname->name, name, + sizeof(ix->ctx.kname->name)); + if (!ret || ret == sizeof(ix->ctx.kname->name)) + ret = -ERANGE; + if (ret < 0) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_getxattr_prep(req, sqe); +} + +int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *path; + int ret; + + ret = __io_getxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt), + req->file->f_path.dentry, + &ix->ctx); + + io_xattr_finish(req, ret); + return IOU_OK; +} + +int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); + if (!ret) { + ret = do_getxattr(mnt_user_ns(path.mnt), + path.dentry, + &ix->ctx); + + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return IOU_OK; +} + +static int __io_setxattr_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *name; + int ret; + + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return 
-EBADF; + + ix->filename = NULL; + name = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.kvalue = NULL; + ix->ctx.size = READ_ONCE(sqe->len); + ix->ctx.flags = READ_ONCE(sqe->xattr_flags); + + ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL); + if (!ix->ctx.kname) + return -ENOMEM; + + ret = setxattr_copy(name, &ix->ctx); + if (ret) { + kfree(ix->ctx.kname); + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + const char __user *path; + int ret; + + ret = __io_setxattr_prep(req, sqe); + if (ret) + return ret; + + path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + + ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL); + if (IS_ERR(ix->filename)) { + ret = PTR_ERR(ix->filename); + ix->filename = NULL; + } + + return ret; +} + +int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_setxattr_prep(req, sqe); +} + +static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, + struct path *path) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + int ret; + + ret = mnt_want_write(path->mnt); + if (!ret) { + ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx); + mnt_drop_write(path->mnt); + } + + return ret; +} + +int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = __io_setxattr(req, issue_flags, &req->file->f_path); + io_xattr_finish(req, ret); + return IOU_OK; +} + +int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_xattr *ix = io_kiocb_to_cmd(req); + unsigned int lookup_flags = LOOKUP_FOLLOW; + struct path path; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + +retry: + ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); 
+ if (!ret) { + ret = __io_setxattr(req, issue_flags, &path); + path_put(&path); + if (retry_estale(ret, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } + } + + io_xattr_finish(req, ret); + return IOU_OK; +} diff --git a/io_uring/xattr.h b/io_uring/xattr.h new file mode 100644 index 00000000000000..9b459d2ae90c5a --- /dev/null +++ b/io_uring/xattr.h @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +void io_xattr_cleanup(struct io_kiocb *req); + +int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags); + +int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_setxattr(struct io_kiocb *req, unsigned int issue_flags); + +int io_fgetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags); + +int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_getxattr(struct io_kiocb *req, unsigned int issue_flags); From 5e6941bb2ceba26f03c4fdc21e80378ded897f42 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 11:56:42 -0600 Subject: [PATCH 0954/1250] io_uring: move nop into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 15 +-------------- io_uring/nop.c | 26 ++++++++++++++++++++++++++ io_uring/nop.h | 4 ++++ 4 files changed, 32 insertions(+), 15 deletions(-) create mode 100644 io_uring/nop.c create mode 100644 io_uring/nop.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 479b6957b85ff3..32c02a38f83b42 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1af97e4a78b7f0..3e74925bd4b454 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c 
@@ -93,6 +93,7 @@ #include "io_uring.h" #include "xattr.h" +#include "nop.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -4298,20 +4299,6 @@ static int io_splice(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return 0; -} - -/* - * IORING_OP_NOP just posts a completion event, nothing else. - */ -static int io_nop(struct io_kiocb *req, unsigned int issue_flags) -{ - io_req_set_res(req, 0, 0); - return IOU_OK; -} - static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { diff --git a/io_uring/nop.c b/io_uring/nop.c new file mode 100644 index 00000000000000..d363d8ce70a3b7 --- /dev/null +++ b/io_uring/nop.c @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "nop.h" + +int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return 0; +} + +/* + * IORING_OP_NOP just posts a completion event, nothing else. + */ +int io_nop(struct io_kiocb *req, unsigned int issue_flags) +{ + io_req_set_res(req, 0, 0); + return IOU_OK; +} diff --git a/io_uring/nop.h b/io_uring/nop.h new file mode 100644 index 00000000000000..97f1535c9dec42 --- /dev/null +++ b/io_uring/nop.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_nop(struct io_kiocb *req, unsigned int issue_flags); From 98b979583fd436556290cf7055a90fe348a2c63f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:13:00 -0600 Subject: [PATCH 0955/1250] io_uring: split out filesystem related operations This splits out renameat, unlinkat, mkdirat, symlinkat, and linkat. 
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/fs.c | 294 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/fs.h | 20 +++ io_uring/io_uring.c | 283 +----------------------------------------- 4 files changed, 316 insertions(+), 283 deletions(-) create mode 100644 io_uring/fs.c create mode 100644 io_uring/fs.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 32c02a38f83b42..50e68c9a4d1bb6 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/fs.c b/io_uring/fs.c new file mode 100644 index 00000000000000..aac1bc5255b07f --- /dev/null +++ b/io_uring/fs.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "fs.h" + +struct io_rename { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + +struct io_unlink { + struct file *file; + int dfd; + int flags; + struct filename *filename; +}; + +struct io_mkdir { + struct file *file; + int dfd; + umode_t mode; + struct filename *filename; +}; + +struct io_link { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + +int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + const char __user *oldf, *newf; + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + ren->old_dfd = READ_ONCE(sqe->fd); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ren->new_dfd = READ_ONCE(sqe->len); + 
ren->flags = READ_ONCE(sqe->rename_flags); + + ren->oldpath = getname(oldf); + if (IS_ERR(ren->oldpath)) + return PTR_ERR(ren->oldpath); + + ren->newpath = getname(newf); + if (IS_ERR(ren->newpath)) { + putname(ren->oldpath); + return PTR_ERR(ren->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_renameat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, + ren->newpath, ren->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_renameat_cleanup(struct io_kiocb *req) +{ + struct io_rename *ren = io_kiocb_to_cmd(req); + + putname(ren->oldpath); + putname(ren->newpath); +} + +int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_unlink *un = io_kiocb_to_cmd(req); + const char __user *fname; + + if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + un->dfd = READ_ONCE(sqe->fd); + + un->flags = READ_ONCE(sqe->unlink_flags); + if (un->flags & ~AT_REMOVEDIR) + return -EINVAL; + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + un->filename = getname(fname); + if (IS_ERR(un->filename)) + return PTR_ERR(un->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_unlink *un = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + if (un->flags & AT_REMOVEDIR) + ret = do_rmdir(un->dfd, un->filename); + else + ret = do_unlinkat(un->dfd, un->filename); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_unlinkat_cleanup(struct io_kiocb *req) +{ + struct io_unlink *ul = io_kiocb_to_cmd(req); + + 
putname(ul->filename); +} + +int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_mkdir *mkd = io_kiocb_to_cmd(req); + const char __user *fname; + + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + mkd->dfd = READ_ONCE(sqe->fd); + mkd->mode = READ_ONCE(sqe->len); + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + mkd->filename = getname(fname); + if (IS_ERR(mkd->filename)) + return PTR_ERR(mkd->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_mkdir *mkd = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_mkdirat_cleanup(struct io_kiocb *req) +{ + struct io_mkdir *md = io_kiocb_to_cmd(req); + + putname(md->filename); +} + +int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + const char __user *oldpath, *newpath; + + if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + sl->new_dfd = READ_ONCE(sqe->fd); + oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + + sl->oldpath = getname(oldpath); + if (IS_ERR(sl->oldpath)) + return PTR_ERR(sl->oldpath); + + sl->newpath = getname(newpath); + if (IS_ERR(sl->newpath)) { + putname(sl->oldpath); + return PTR_ERR(sl->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + 
ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_link *lnk = io_kiocb_to_cmd(req); + const char __user *oldf, *newf; + + if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + lnk->old_dfd = READ_ONCE(sqe->fd); + lnk->new_dfd = READ_ONCE(sqe->len); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + lnk->flags = READ_ONCE(sqe->hardlink_flags); + + lnk->oldpath = getname(oldf); + if (IS_ERR(lnk->oldpath)) + return PTR_ERR(lnk->oldpath); + + lnk->newpath = getname(newf); + if (IS_ERR(lnk->newpath)) { + putname(lnk->oldpath); + return PTR_ERR(lnk->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_linkat(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_link *lnk = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, + lnk->newpath, lnk->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_link_cleanup(struct io_kiocb *req) +{ + struct io_link *sl = io_kiocb_to_cmd(req); + + putname(sl->oldpath); + putname(sl->newpath); +} diff --git a/io_uring/fs.h b/io_uring/fs.h new file mode 100644 index 00000000000000..0bb5efe3d6bbdf --- /dev/null +++ b/io_uring/fs.h @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_renameat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_renameat(struct io_kiocb *req, unsigned int issue_flags); +void io_renameat_cleanup(struct io_kiocb *req); + +int io_unlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags); +void 
io_unlinkat_cleanup(struct io_kiocb *req); + +int io_mkdirat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags); +void io_mkdirat_cleanup(struct io_kiocb *req); + +int io_symlinkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags); + +int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_linkat(struct io_kiocb *req, unsigned int issue_flags); +void io_link_cleanup(struct io_kiocb *req); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3e74925bd4b454..5b1e67ff0faadf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -94,6 +94,7 @@ #include "xattr.h" #include "nop.h" +#include "fs.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -467,38 +468,6 @@ struct io_shutdown { int how; }; -struct io_rename { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - -struct io_unlink { - struct file *file; - int dfd; - int flags; - struct filename *filename; -}; - -struct io_mkdir { - struct file *file; - int dfd; - umode_t mode; - struct filename *filename; -}; - -struct io_link { - struct file *file; - int old_dfd; - int new_dfd; - struct filename *oldpath; - struct filename *newpath; - int flags; -}; - struct io_msg { struct file *file; u64 user_data; @@ -3845,256 +3814,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static int io_renameat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_rename *ren = io_kiocb_to_cmd(req); - const char __user *oldf, *newf; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - ren->old_dfd = READ_ONCE(sqe->fd); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = 
u64_to_user_ptr(READ_ONCE(sqe->addr2)); - ren->new_dfd = READ_ONCE(sqe->len); - ren->flags = READ_ONCE(sqe->rename_flags); - - ren->oldpath = getname(oldf); - if (IS_ERR(ren->oldpath)) - return PTR_ERR(ren->oldpath); - - ren->newpath = getname(newf); - if (IS_ERR(ren->newpath)) { - putname(ren->oldpath); - return PTR_ERR(ren->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_renameat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rename *ren = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_renameat2(ren->old_dfd, ren->oldpath, ren->new_dfd, - ren->newpath, ren->flags); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_renameat_cleanup(struct io_kiocb *req) -{ - struct io_rename *ren = io_kiocb_to_cmd(req); - - putname(ren->oldpath); - putname(ren->newpath); -} - -static int io_unlinkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_unlink *un = io_kiocb_to_cmd(req); - const char __user *fname; - - if (sqe->off || sqe->len || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - un->dfd = READ_ONCE(sqe->fd); - - un->flags = READ_ONCE(sqe->unlink_flags); - if (un->flags & ~AT_REMOVEDIR) - return -EINVAL; - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - un->filename = getname(fname); - if (IS_ERR(un->filename)) - return PTR_ERR(un->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_unlink *un = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (un->flags & AT_REMOVEDIR) - ret = do_rmdir(un->dfd, un->filename); - else - ret = do_unlinkat(un->dfd, un->filename); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - 
-static void io_unlinkat_cleanup(struct io_kiocb *req) -{ - struct io_unlink *ul = io_kiocb_to_cmd(req); - - putname(ul->filename); -} - -static int io_mkdirat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_mkdir *mkd = io_kiocb_to_cmd(req); - const char __user *fname; - - if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - mkd->dfd = READ_ONCE(sqe->fd); - mkd->mode = READ_ONCE(sqe->len); - - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - mkd->filename = getname(fname); - if (IS_ERR(mkd->filename)) - return PTR_ERR(mkd->filename); - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_mkdir *mkd = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_mkdirat_cleanup(struct io_kiocb *req) -{ - struct io_mkdir *md = io_kiocb_to_cmd(req); - - putname(md->filename); -} - -static int io_symlinkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_link *sl = io_kiocb_to_cmd(req); - const char __user *oldpath, *newpath; - - if (sqe->len || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - sl->new_dfd = READ_ONCE(sqe->fd); - oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - - sl->oldpath = getname(oldpath); - if (IS_ERR(sl->oldpath)) - return PTR_ERR(sl->oldpath); - - sl->newpath = getname(newpath); - if (IS_ERR(sl->newpath)) { - putname(sl->oldpath); - return PTR_ERR(sl->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_symlinkat(struct io_kiocb *req, 
unsigned int issue_flags) -{ - struct io_link *sl = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_linkat_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_link *lnk = io_kiocb_to_cmd(req); - const char __user *oldf, *newf; - - if (sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - lnk->old_dfd = READ_ONCE(sqe->fd); - lnk->new_dfd = READ_ONCE(sqe->len); - oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - lnk->flags = READ_ONCE(sqe->hardlink_flags); - - lnk->oldpath = getname(oldf); - if (IS_ERR(lnk->oldpath)) - return PTR_ERR(lnk->oldpath); - - lnk->newpath = getname(newf); - if (IS_ERR(lnk->newpath)) { - putname(lnk->oldpath); - return PTR_ERR(lnk->newpath); - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_link *lnk = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, - lnk->newpath, lnk->flags); - - req->flags &= ~REQ_F_NEED_CLEANUP; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_link_cleanup(struct io_kiocb *req) -{ - struct io_link *sl = io_kiocb_to_cmd(req); - - putname(sl->oldpath); - putname(sl->newpath); -} - static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); From 7291afb55e2778fa170aa49952ed5ab39e7c1169 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:19:47 -0600 Subject: [PATCH 0956/1250] io_uring: split out splice related operations This splits out splice and tee support. 
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 133 ++------------------------------------------ io_uring/io_uring.h | 19 +++++++ io_uring/splice.c | 123 ++++++++++++++++++++++++++++++++++++++++ io_uring/splice.h | 7 +++ 5 files changed, 154 insertions(+), 130 deletions(-) create mode 100644 io_uring/splice.c create mode 100644 io_uring/splice.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 50e68c9a4d1bb6..c6aca2af6f4ab9 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,5 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5b1e67ff0faadf..43d4044d3bb958 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "xattr.h" #include "nop.h" #include "fs.h" +#include "splice.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -436,15 +437,6 @@ struct io_epoll { struct epoll_event event; }; -struct io_splice { - struct file *file_out; - loff_t off_out; - loff_t off_in; - u64 len; - int splice_fd_in; - unsigned int flags; -}; - struct io_provide_buf { struct file *file; __u64 addr; @@ -596,9 +588,6 @@ static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned issue_flags); -static struct file *io_file_get_normal(struct io_kiocb *req, int fd); static void io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); @@ -1078,15 +1067,6 @@ static inline bool req_has_async_data(struct io_kiocb *req) return req->flags & REQ_F_ASYNC_DATA; } -static inline void req_set_fail(struct io_kiocb *req) -{ - req->flags |= REQ_F_FAIL; - if 
(req->flags & REQ_F_CQE_SKIP) { - req->flags &= ~REQ_F_CQE_SKIP; - req->flags |= REQ_F_SKIP_LINK_CQES; - } -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); @@ -1941,12 +1921,6 @@ static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) return container_of(node, struct io_kiocb, comp_list); } -static inline void io_put_file(struct file *file) -{ - if (file) - fput(file); -} - static inline void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; @@ -3919,105 +3893,6 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) return IOU_ISSUE_SKIP_COMPLETE; } -static int __io_splice_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; - - sp->len = READ_ONCE(sqe->len); - sp->flags = READ_ONCE(sqe->splice_flags); - if (unlikely(sp->flags & ~valid_flags)) - return -EINVAL; - sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); - return 0; -} - -static int io_tee_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) - return -EINVAL; - return __io_splice_prep(req, sqe); -} - -static int io_tee(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - struct file *out = sp->file_out; - unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; - struct file *in; - long ret = 0; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); - if (!in) { - ret = -EBADF; - goto done; - } - - if (sp->len) - ret = do_tee(in, out, sp->len, flags); - - if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); -done: - if (ret != sp->len) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return 
IOU_OK; -} - -static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - - sp->off_in = READ_ONCE(sqe->splice_off_in); - sp->off_out = READ_ONCE(sqe->off); - return __io_splice_prep(req, sqe); -} - -static int io_splice(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_splice *sp = io_kiocb_to_cmd(req); - struct file *out = sp->file_out; - unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; - loff_t *poff_in, *poff_out; - struct file *in; - long ret = 0; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); - if (!in) { - ret = -EBADF; - goto done; - } - - poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; - poff_out = (sp->off_out == -1) ? NULL : &sp->off_out; - - if (sp->len) - ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); - - if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) - io_put_file(in); -done: - if (ret != sp->len) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -7157,8 +7032,8 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file file_slot->file_ptr = file_ptr; } -static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned int issue_flags) +inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct file *file = NULL; @@ -7181,7 +7056,7 @@ static inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, return file; } -static struct file *io_file_get_normal(struct io_kiocb *req, int fd) +struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 
73943dbe884e5a..02c00122b97a31 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -9,10 +9,29 @@ enum { IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, }; +static inline void req_set_fail(struct io_kiocb *req) +{ + req->flags |= REQ_F_FAIL; + if (req->flags & REQ_F_CQE_SKIP) { + req->flags &= ~REQ_F_CQE_SKIP; + req->flags |= REQ_F_SKIP_LINK_CQES; + } +} + static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) { req->cqe.res = res; req->cqe.flags = cflags; } +static inline void io_put_file(struct file *file) +{ + if (file) + fput(file); +} + +struct file *io_file_get_normal(struct io_kiocb *req, int fd); +struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned issue_flags); + #endif diff --git a/io_uring/splice.c b/io_uring/splice.c new file mode 100644 index 00000000000000..0e19d63303452a --- /dev/null +++ b/io_uring/splice.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "splice.h" + +struct io_splice { + struct file *file_out; + loff_t off_out; + loff_t off_in; + u64 len; + int splice_fd_in; + unsigned int flags; +}; + +static int __io_splice_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL; + + sp->len = READ_ONCE(sqe->len); + sp->flags = READ_ONCE(sqe->splice_flags); + if (unlikely(sp->flags & ~valid_flags)) + return -EINVAL; + sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); + return 0; +} + +int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + if (READ_ONCE(sqe->splice_off_in) || READ_ONCE(sqe->off)) + return -EINVAL; + return __io_splice_prep(req, sqe); +} + +int io_tee(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + struct file *out = sp->file_out; 
+ unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + struct file *in; + long ret = 0; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + + if (sp->len) + ret = do_tee(in, out, sp->len, flags); + + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(in); +done: + if (ret != sp->len) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + + sp->off_in = READ_ONCE(sqe->splice_off_in); + sp->off_out = READ_ONCE(sqe->off); + return __io_splice_prep(req, sqe); +} + +int io_splice(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_splice *sp = io_kiocb_to_cmd(req); + struct file *out = sp->file_out; + unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED; + loff_t *poff_in, *poff_out; + struct file *in; + long ret = 0; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + if (sp->flags & SPLICE_F_FD_IN_FIXED) + in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); + else + in = io_file_get_normal(req, sp->splice_fd_in); + if (!in) { + ret = -EBADF; + goto done; + } + + poff_in = (sp->off_in == -1) ? NULL : &sp->off_in; + poff_out = (sp->off_out == -1) ? 
NULL : &sp->off_out; + + if (sp->len) + ret = do_splice(in, poff_in, out, poff_out, sp->len, flags); + + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + io_put_file(in); +done: + if (ret != sp->len) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/splice.h b/io_uring/splice.h new file mode 100644 index 00000000000000..542f94168ad3a1 --- /dev/null +++ b/io_uring/splice.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_tee(struct io_kiocb *req, unsigned int issue_flags); + +int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_splice(struct io_kiocb *req, unsigned int issue_flags); From d956679750fd47c2fb5288e993eac162dc0baf6b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:25:19 -0600 Subject: [PATCH 0957/1250] io_uring: split out fs related sync/fallocate functions This splits out sync_file_range, fsync, and fallocate. 
Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 97 +------------------------------------- io_uring/sync.c | 111 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/sync.h | 10 ++++ 4 files changed, 124 insertions(+), 97 deletions(-) create mode 100644 io_uring/sync.c create mode 100644 io_uring/sync.h diff --git a/io_uring/Makefile b/io_uring/Makefile index c6aca2af6f4ab9..7285c6aef8da0f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,5 +2,6 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o +obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ + sync.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43d4044d3bb958..7bcf8da7dd4063 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -96,6 +96,7 @@ #include "nop.h" #include "fs.h" #include "splice.h" +#include "sync.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -336,14 +337,6 @@ struct io_socket { unsigned long nofile; }; -struct io_sync { - struct file *file; - loff_t len; - loff_t off; - int flags; - int mode; -}; - struct io_cancel { struct file *file; u64 addr; @@ -3941,67 +3934,6 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - - if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - sync->flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) - return -EINVAL; - - sync->off = READ_ONCE(sqe->off); - sync->len = READ_ONCE(sqe->len); - return 0; -} - -static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - loff_t end = sync->off + sync->len; - int ret; - - /* fsync always requires a blocking context */ - if 
(issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, - sync->flags & IORING_FSYNC_DATASYNC); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_fallocate_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - sync->off = READ_ONCE(sqe->off); - sync->len = READ_ONCE(sqe->addr); - sync->mode = READ_ONCE(sqe->len); - return 0; -} - -static int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - int ret; - - /* fallocate always requiring blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); - if (ret >= 0) - fsnotify_modify(req->file); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_open *open = io_kiocb_to_cmd(req); @@ -4681,33 +4613,6 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - - if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - sync->off = READ_ONCE(sqe->off); - sync->len = READ_ONCE(sqe->len); - sync->flags = READ_ONCE(sqe->sync_range_flags); - return 0; -} - -static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sync *sync = io_kiocb_to_cmd(req); - int ret; - - /* sync_file_range always requires a blocking context */ - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - #if defined(CONFIG_NET) static int 
io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/sync.c b/io_uring/sync.c new file mode 100644 index 00000000000000..9ee8ff865521f3 --- /dev/null +++ b/io_uring/sync.c @@ -0,0 +1,111 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "sync.h" + +struct io_sync { + struct file *file; + loff_t len; + loff_t off; + int flags; + int mode; +}; + +int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); + sync->flags = READ_ONCE(sqe->sync_range_flags); + return 0; +} + +int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + int ret; + + /* sync_file_range always requires a blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + + if (unlikely(sqe->addr || sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + sync->flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(sync->flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->len); + return 0; +} + +int io_fsync(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + loff_t end = sync->off + sync->len; + int ret; + + /* fsync always requires a blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = vfs_fsync_range(req->file, 
sync->off, end > 0 ? end : LLONG_MAX, + sync->flags & IORING_FSYNC_DATASYNC); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + sync->off = READ_ONCE(sqe->off); + sync->len = READ_ONCE(sqe->addr); + sync->mode = READ_ONCE(sqe->len); + return 0; +} + +int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sync *sync = io_kiocb_to_cmd(req); + int ret; + + /* fallocate always requiring blocking context */ + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + ret = vfs_fallocate(req->file, sync->mode, sync->off, sync->len); + if (ret >= 0) + fsnotify_modify(req->file); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/sync.h b/io_uring/sync.h new file mode 100644 index 00000000000000..e873c888da79b1 --- /dev/null +++ b/io_uring/sync.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags); + +int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fsync(struct io_kiocb *req, unsigned int issue_flags); + +int io_fallocate(struct io_kiocb *req, unsigned int issue_flags); +int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); From f3b1ae30857f7cfb1f723c710fa01ffdaa1bf3af Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:28:33 -0600 Subject: [PATCH 0958/1250] io_uring: split out fadvise/madvise operations Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/advise.c | 100 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/advise.h | 7 ++++ io_uring/io_uring.c | 85 +------------------------------------ 4 files changed, 109 insertions(+), 85 deletions(-) create mode 100644 io_uring/advise.c 
create mode 100644 io_uring/advise.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 7285c6aef8da0f..4492aa24397efe 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -3,5 +3,5 @@ # Makefile for io_uring obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o + sync.o advise.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/advise.c b/io_uring/advise.c new file mode 100644 index 00000000000000..8870fdf66ffbbd --- /dev/null +++ b/io_uring/advise.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "advise.h" + +struct io_fadvise { + struct file *file; + u64 offset; + u32 len; + u32 advice; +}; + +struct io_madvise { + struct file *file; + u64 addr; + u32 len; + u32 advice; +}; + +int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->off || sqe->splice_fd_in) + return -EINVAL; + + ma->addr = READ_ONCE(sqe->addr); + ma->len = READ_ONCE(sqe->len); + ma->advice = READ_ONCE(sqe->fadvise_advice); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +int io_madvise(struct io_kiocb *req, unsigned int issue_flags) +{ +#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) + struct io_madvise *ma = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); + io_req_set_res(req, ret, 0); + return IOU_OK; +#else + return -EOPNOTSUPP; +#endif +} + +int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_fadvise *fa = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) + return -EINVAL; + + fa->offset = READ_ONCE(sqe->off); + fa->len = 
READ_ONCE(sqe->len); + fa->advice = READ_ONCE(sqe->fadvise_advice); + return 0; +} + +int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_fadvise *fa = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) { + switch (fa->advice) { + case POSIX_FADV_NORMAL: + case POSIX_FADV_RANDOM: + case POSIX_FADV_SEQUENTIAL: + break; + default: + return -EAGAIN; + } + } + + ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/advise.h b/io_uring/advise.h new file mode 100644 index 00000000000000..5ece2a045185ff --- /dev/null +++ b/io_uring/advise.h @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_madvise(struct io_kiocb *req, unsigned int issue_flags); + +int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_fadvise(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7bcf8da7dd4063..c2041fb10aa2c4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -97,6 +97,7 @@ #include "fs.h" #include "splice.h" #include "sync.h" +#include "advise.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -408,20 +409,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_fadvise { - struct file *file; - u64 offset; - u32 len; - u32 advice; -}; - -struct io_madvise { - struct file *file; - u64 addr; - u32 len; - u32 advice; -}; - struct io_epoll { struct file *file; int epfd; @@ -4428,76 +4415,6 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) #endif } -static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->off || 
sqe->splice_fd_in) - return -EINVAL; - - ma->addr = READ_ONCE(sqe->addr); - ma->len = READ_ONCE(sqe->len); - ma->advice = READ_ONCE(sqe->fadvise_advice); - return 0; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) -{ -#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - struct io_madvise *ma = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); - io_req_set_res(req, ret, 0); - return IOU_OK; -#else - return -EOPNOTSUPP; -#endif -} - -static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_fadvise *fa = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->addr || sqe->splice_fd_in) - return -EINVAL; - - fa->offset = READ_ONCE(sqe->off); - fa->len = READ_ONCE(sqe->len); - fa->advice = READ_ONCE(sqe->fadvise_advice); - return 0; -} - -static int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_fadvise *fa = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) { - switch (fa->advice) { - case POSIX_FADV_NORMAL: - case POSIX_FADV_RANDOM: - case POSIX_FADV_SEQUENTIAL: - break; - default: - return -EAGAIN; - } - } - - ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req); From fce59eb5e202af0bbd07015555336c7568db3022 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:43:10 -0600 Subject: [PATCH 0959/1250] io_uring: separate out file table handling code Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/filetable.c | 57 ++++++++++++++++++++++++++ io_uring/filetable.h | 58 ++++++++++++++++++++++++++ io_uring/io_uring.c | 86 --------------------------------------- 
io_uring/io_uring_types.h | 7 +--- 5 files changed, 117 insertions(+), 93 deletions(-) create mode 100644 io_uring/filetable.c create mode 100644 io_uring/filetable.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 4492aa24397efe..5efc4fe565a172 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -3,5 +3,5 @@ # Makefile for io_uring obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o + sync.o advise.o filetable.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/filetable.c b/io_uring/filetable.c new file mode 100644 index 00000000000000..560629a93c04b0 --- /dev/null +++ b/io_uring/filetable.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" + +int io_file_bitmap_get(struct io_ring_ctx *ctx) +{ + struct io_file_table *table = &ctx->file_table; + unsigned long nr = ctx->nr_user_files; + int ret; + + do { + ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); + if (ret != nr) + return ret; + + if (!table->alloc_hint) + break; + + nr = table->alloc_hint; + table->alloc_hint = 0; + } while (1); + + return -ENFILE; +} + +bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) +{ + table->files = kvcalloc(nr_files, sizeof(table->files[0]), + GFP_KERNEL_ACCOUNT); + if (unlikely(!table->files)) + return false; + + table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); + if (unlikely(!table->bitmap)) { + kvfree(table->files); + return false; + } + + return true; +} + +void io_free_file_tables(struct io_file_table *table) +{ + kvfree(table->files); + bitmap_free(table->bitmap); + table->files = NULL; + table->bitmap = NULL; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h new file mode 100644 index 00000000000000..fe1ec581958dda --- /dev/null +++ b/io_uring/filetable.h @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_FILE_TABLE_H 
+#define IOU_FILE_TABLE_H + +struct io_ring_ctx; + +/* + * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 + * and define IO_URING_SCM_ALL. For this case, we use SCM for all files as we + * can't safely always dereference the file when the task has exited and ring + * cleanup is done. If a file is tracked and part of SCM, then unix gc on + * process exit may reap it before __io_sqe_files_unregister() is run. + */ +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#if defined(CONFIG_64BIT) +#define FFS_SCM 0x4UL +#else +#define IO_URING_SCM_ALL +#define FFS_SCM 0x0UL +#endif +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) + +struct io_fixed_file { + /* file * with additional FFS_* flags */ + unsigned long file_ptr; +}; + +struct io_file_table { + struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; +}; + +bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); +void io_free_file_tables(struct io_file_table *table); +int io_file_bitmap_get(struct io_ring_ctx *ctx); + +static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) +{ + __clear_bit(bit, table->bitmap); + table->alloc_hint = bit; +} + +static inline void io_file_bitmap_set(struct io_file_table *table, int bit) +{ + WARN_ON_ONCE(test_bit(bit, table->bitmap)); + __set_bit(bit, table->bitmap); + table->alloc_hint = bit + 1; +} + +static inline struct io_fixed_file * +io_fixed_file_slot(struct io_file_table *table, unsigned i) +{ + return &table->files[i]; +} + +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c2041fb10aa2c4..4b4d6fd509d13f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -146,28 +146,6 @@ struct io_overflow_cqe { struct io_uring_cqe cqe; }; -/* - * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 - * and define IO_URING_SCM_ALL. 
For this case, we use SCM for all files as we - * can't safely always dereference the file when the task has exited and ring - * cleanup is done. If a file is tracked and part of SCM, then unix gc on - * process exit may reap it before __io_sqe_files_unregister() is run. - */ -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#if defined(CONFIG_64BIT) -#define FFS_SCM 0x4UL -#else -#define IO_URING_SCM_ALL -#define FFS_SCM 0x0UL -#endif -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) - -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - struct io_rsrc_put { struct list_head list; u64 tag; @@ -3983,27 +3961,6 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_openat_prep(req, sqe); } -static int io_file_bitmap_get(struct io_ring_ctx *ctx) -{ - struct io_file_table *table = &ctx->file_table; - unsigned long nr = ctx->nr_user_files; - int ret; - - do { - ret = find_next_zero_bit(table->bitmap, nr, table->alloc_hint); - if (ret != nr) - return ret; - - if (!table->alloc_hint) - break; - - nr = table->alloc_hint; - table->alloc_hint = 0; - } while (1); - - return -ENFILE; -} - /* * Note when io_fixed_fd_install() returns error value, it will ensure * fput() is called correspondingly. 
@@ -6832,12 +6789,6 @@ static void io_wq_submit_work(struct io_wq_work *work) io_req_task_queue_fail(req, ret); } -static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, - unsigned i) -{ - return &table->files[i]; -} - static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, int index) { @@ -7934,43 +7885,6 @@ static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_pu return ret; } -static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) -{ - table->files = kvcalloc(nr_files, sizeof(table->files[0]), - GFP_KERNEL_ACCOUNT); - if (unlikely(!table->files)) - return false; - - table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); - if (unlikely(!table->bitmap)) { - kvfree(table->files); - return false; - } - - return true; -} - -static void io_free_file_tables(struct io_file_table *table) -{ - kvfree(table->files); - bitmap_free(table->bitmap); - table->files = NULL; - table->bitmap = NULL; -} - -static inline void io_file_bitmap_set(struct io_file_table *table, int bit) -{ - WARN_ON_ONCE(test_bit(bit, table->bitmap)); - __set_bit(bit, table->bitmap); - table->alloc_hint = bit + 1; -} - -static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) -{ - __clear_bit(bit, table->bitmap); - table->alloc_hint = bit; -} - static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) { #if !defined(IO_URING_SCM_ALL) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 1a0f592ff6fc35..dba72113c59d67 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -5,6 +5,7 @@ #include #include "io-wq.h" +#include "filetable.h" struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -122,12 +123,6 @@ struct io_ev_fd { struct rcu_head rcu; }; -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - struct io_ring_ctx { /* const or read-mostly hot data */ struct { From 
6c92b7c6e6764b4315c9049da3a74d60c3114f11 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 May 2022 21:54:43 -0600 Subject: [PATCH 0960/1250] io_uring: split out open/close operations Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 311 ++----------------------------------------- io_uring/io_uring.h | 32 +++++ io_uring/openclose.c | 283 +++++++++++++++++++++++++++++++++++++++ io_uring/openclose.h | 14 ++ 5 files changed, 345 insertions(+), 298 deletions(-) create mode 100644 io_uring/openclose.c create mode 100644 io_uring/openclose.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 5efc4fe565a172..e60def39ca2c6a 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -3,5 +3,6 @@ # Makefile for io_uring obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o filetable.o + sync.o advise.o filetable.o \ + openclose.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4b4d6fd509d13f..a79186ba8c44ec 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -98,6 +98,7 @@ #include "splice.h" #include "sync.h" #include "advise.h" +#include "openclose.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -283,12 +284,6 @@ struct io_poll_update { bool update_user_data; }; -struct io_close { - struct file *file; - int fd; - u32 file_slot; -}; - struct io_timeout_data { struct io_kiocb *req; struct hrtimer timer; @@ -371,15 +366,6 @@ struct io_sr_msg { unsigned int flags; }; -struct io_open { - struct file *file; - int dfd; - u32 file_slot; - struct filename *filename; - struct open_how how; - unsigned long nofile; -}; - struct io_rsrc_update { struct file *file; u64 arg; @@ -555,9 +541,6 @@ static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); -static int __io_close_fixed(struct io_kiocb *req, unsigned 
int issue_flags, - unsigned int offset); -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); @@ -670,10 +653,15 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } +bool io_is_uring_fops(struct file *file) +{ + return file->f_op == &io_uring_fops; +} + struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) - if (file->f_op == &io_uring_fops) { + if (io_is_uring_fops(file)) { struct io_ring_ctx *ctx = file->private_data; return ctx->ring_sock->sk; @@ -699,26 +687,6 @@ static inline bool io_file_need_scm(struct file *filp) } #endif -static void io_ring_submit_unlock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - lockdep_assert_held(&ctx->uring_lock); - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); -} - -static void io_ring_submit_lock(struct io_ring_ctx *ctx, unsigned issue_flags) -{ - /* - * "Normal" inline submissions always hold the uring_lock, since we - * grab it from the system call. Same is true for the SQPOLL offload. - * The only exception is when we've detached the request and issue it - * from an async worker thread, grab the lock for that case. 
- */ - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_lock(&ctx->uring_lock); - lockdep_assert_held(&ctx->uring_lock); -} - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -3899,74 +3867,12 @@ static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_open *open = io_kiocb_to_cmd(req); - const char __user *fname; - int ret; - - if (unlikely(sqe->buf_index)) - return -EINVAL; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - - /* open.how should be already initialised */ - if (!(open->how.flags & O_PATH) && force_o_largefile()) - open->how.flags |= O_LARGEFILE; - - open->dfd = READ_ONCE(sqe->fd); - fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); - open->filename = getname(fname); - if (IS_ERR(open->filename)) { - ret = PTR_ERR(open->filename); - open->filename = NULL; - return ret; - } - - open->file_slot = READ_ONCE(sqe->file_index); - if (open->file_slot && (open->how.flags & O_CLOEXEC)) - return -EINVAL; - - open->nofile = rlimit(RLIMIT_NOFILE); - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_open *open = io_kiocb_to_cmd(req); - u64 mode = READ_ONCE(sqe->len); - u64 flags = READ_ONCE(sqe->open_flags); - - open->how = build_open_how(flags, mode); - return __io_openat_prep(req, sqe); -} - -static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_open *open = io_kiocb_to_cmd(req); - struct open_how __user *how; - size_t len; - int ret; - - how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - len = READ_ONCE(sqe->len); - if (len < OPEN_HOW_SIZE_VER0) - return -EINVAL; - - ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); - if (ret) - return ret; - - return __io_openat_prep(req, sqe); -} - /* * Note when io_fixed_fd_install() returns 
error value, it will ensure * fput() is called correspondingly. */ -static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) { bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; struct io_ring_ctx *ctx = req->ctx; @@ -3993,86 +3899,6 @@ static int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, return ret; } -static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_open *open = io_kiocb_to_cmd(req); - struct open_flags op; - struct file *file; - bool resolve_nonblock, nonblock_set; - bool fixed = !!open->file_slot; - int ret; - - ret = build_open_flags(&open->how, &op); - if (ret) - goto err; - nonblock_set = op.open_flag & O_NONBLOCK; - resolve_nonblock = open->how.resolve & RESOLVE_CACHED; - if (issue_flags & IO_URING_F_NONBLOCK) { - /* - * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, - * it'll always -EAGAIN - */ - if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) - return -EAGAIN; - op.lookup_flags |= LOOKUP_CACHED; - op.open_flag |= O_NONBLOCK; - } - - if (!fixed) { - ret = __get_unused_fd_flags(open->how.flags, open->nofile); - if (ret < 0) - goto err; - } - - file = do_filp_open(open->dfd, open->filename, &op); - if (IS_ERR(file)) { - /* - * We could hang on to this 'fd' on retrying, but seems like - * marginal gain for something that is now known to be a slower - * path. So just put it, and we'll get a new one when we retry. 
- */ - if (!fixed) - put_unused_fd(ret); - - ret = PTR_ERR(file); - /* only retry if RESOLVE_CACHED wasn't already set by application */ - if (ret == -EAGAIN && - (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) - return -EAGAIN; - goto err; - } - - if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) - file->f_flags &= ~O_NONBLOCK; - fsnotify_open(file); - - if (!fixed) - fd_install(ret, file); - else - ret = io_fixed_fd_install(req, issue_flags, file, - open->file_slot); -err: - putname(open->filename); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_openat(struct io_kiocb *req, unsigned int issue_flags) -{ - return io_openat2(req, issue_flags); -} - -static void io_open_cleanup(struct io_kiocb *req) -{ - struct io_open *open = io_kiocb_to_cmd(req); - - if (open->filename) - putname(open->filename); -} - static int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -4424,69 +4250,6 @@ static void io_statx_cleanup(struct io_kiocb *req) putname(sx->filename); } -static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_close *close = io_kiocb_to_cmd(req); - - if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - close->fd = READ_ONCE(sqe->fd); - close->file_slot = READ_ONCE(sqe->file_index); - if (close->file_slot && close->fd) - return -EINVAL; - - return 0; -} - -static int io_close(struct io_kiocb *req, unsigned int issue_flags) -{ - struct files_struct *files = current->files; - struct io_close *close = io_kiocb_to_cmd(req); - struct fdtable *fdt; - struct file *file; - int ret = -EBADF; - - if (close->file_slot) { - ret = io_close_fixed(req, issue_flags); - goto err; - } - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - if (close->fd >= fdt->max_fds) { - 
spin_unlock(&files->file_lock); - goto err; - } - file = rcu_dereference_protected(fdt->fd[close->fd], - lockdep_is_held(&files->file_lock)); - if (!file || file->f_op == &io_uring_fops) { - spin_unlock(&files->file_lock); - goto err; - } - - /* if the file has a flush method, be safe and punt to async */ - if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { - spin_unlock(&files->file_lock); - return -EAGAIN; - } - - file = __close_fd_get_file(close->fd); - spin_unlock(&files->file_lock); - if (!file) - goto err; - - /* No ->flush() or already async, safely close from here */ - ret = filp_close(file, current->files); -err: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - #if defined(CONFIG_NET) static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -7744,8 +7507,8 @@ static struct io_rsrc_node *io_rsrc_node_alloc(void) return ref_node; } -static void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill) __must_hold(&ctx->uring_lock) { WARN_ON_ONCE(!ctx->rsrc_backup_node); @@ -7772,7 +7535,7 @@ static void io_rsrc_node_switch(struct io_ring_ctx *ctx, } } -static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; @@ -8319,8 +8082,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc) { u64 *tag_slot = io_get_tag_slot(data, idx); struct io_rsrc_put *prsrc; @@ -8386,52 +8149,6 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, return ret; } -static int __io_close_fixed(struct io_kiocb *req, 
unsigned int issue_flags, - unsigned int offset) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *file_slot; - struct file *file; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - ret = -ENXIO; - if (unlikely(!ctx->file_data)) - goto out; - ret = -EINVAL; - if (offset >= ctx->nr_user_files) - goto out; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto out; - - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - ret = -EBADF; - if (!file_slot->file_ptr) - goto out; - - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); - if (ret) - goto out; - - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); - ret = 0; -out: - io_ring_submit_unlock(ctx, issue_flags); - return ret; -} - -static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_close *close = io_kiocb_to_cmd(req); - - return __io_close_fixed(req, issue_flags, close->file_slot - 1); -} - static int __io_sqe_files_update(struct io_ring_ctx *ctx, struct io_uring_rsrc_update2 *up, unsigned nr_args) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 02c00122b97a31..ebb225e8501251 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -2,6 +2,7 @@ #define IOU_CORE_H #include +#include #include "io_uring_types.h" enum { @@ -30,8 +31,39 @@ static inline void io_put_file(struct file *file) fput(file); } +static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx, + unsigned issue_flags) +{ + lockdep_assert_held(&ctx->uring_lock); + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); +} + +static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, + unsigned issue_flags) +{ + /* + * "Normal" inline submissions always hold the uring_lock, since we + * grab it from the system call. 
Same is true for the SQPOLL offload. + * The only exception is when we've detached the request and issue it + * from an async worker thread, grab the lock for that case. + */ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_lock(&ctx->uring_lock); + lockdep_assert_held(&ctx->uring_lock); +} + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot); + +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc); +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill); +bool io_is_uring_fops(struct file *file); #endif diff --git a/io_uring/openclose.c b/io_uring/openclose.c new file mode 100644 index 00000000000000..fa35bd56a33086 --- /dev/null +++ b/io_uring/openclose.c @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "openclose.h" + +struct io_open { + struct file *file; + int dfd; + u32 file_slot; + struct filename *filename; + struct open_how how; + unsigned long nofile; +}; + +struct io_close { + struct file *file; + int fd; + u32 file_slot; +}; + +static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_open *open = io_kiocb_to_cmd(req); + const char __user *fname; + int ret; + + if (unlikely(sqe->buf_index)) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + /* open.how should be already initialised */ + if (!(open->how.flags & O_PATH) && force_o_largefile()) + open->how.flags |= O_LARGEFILE; + + open->dfd = READ_ONCE(sqe->fd); + fname = 
u64_to_user_ptr(READ_ONCE(sqe->addr)); + open->filename = getname(fname); + if (IS_ERR(open->filename)) { + ret = PTR_ERR(open->filename); + open->filename = NULL; + return ret; + } + + open->file_slot = READ_ONCE(sqe->file_index); + if (open->file_slot && (open->how.flags & O_CLOEXEC)) + return -EINVAL; + + open->nofile = rlimit(RLIMIT_NOFILE); + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_open *open = io_kiocb_to_cmd(req); + u64 mode = READ_ONCE(sqe->len); + u64 flags = READ_ONCE(sqe->open_flags); + + open->how = build_open_how(flags, mode); + return __io_openat_prep(req, sqe); +} + +int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_open *open = io_kiocb_to_cmd(req); + struct open_how __user *how; + size_t len; + int ret; + + how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + len = READ_ONCE(sqe->len); + if (len < OPEN_HOW_SIZE_VER0) + return -EINVAL; + + ret = copy_struct_from_user(&open->how, sizeof(open->how), how, len); + if (ret) + return ret; + + return __io_openat_prep(req, sqe); +} + +int io_openat2(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_open *open = io_kiocb_to_cmd(req); + struct open_flags op; + struct file *file; + bool resolve_nonblock, nonblock_set; + bool fixed = !!open->file_slot; + int ret; + + ret = build_open_flags(&open->how, &op); + if (ret) + goto err; + nonblock_set = op.open_flag & O_NONBLOCK; + resolve_nonblock = open->how.resolve & RESOLVE_CACHED; + if (issue_flags & IO_URING_F_NONBLOCK) { + /* + * Don't bother trying for O_TRUNC, O_CREAT, or O_TMPFILE open, + * it'll always -EAGAIN + */ + if (open->how.flags & (O_TRUNC | O_CREAT | O_TMPFILE)) + return -EAGAIN; + op.lookup_flags |= LOOKUP_CACHED; + op.open_flag |= O_NONBLOCK; + } + + if (!fixed) { + ret = __get_unused_fd_flags(open->how.flags, open->nofile); + if (ret < 0) + goto err; + } + + file = do_filp_open(open->dfd, 
open->filename, &op); + if (IS_ERR(file)) { + /* + * We could hang on to this 'fd' on retrying, but seems like + * marginal gain for something that is now known to be a slower + * path. So just put it, and we'll get a new one when we retry. + */ + if (!fixed) + put_unused_fd(ret); + + ret = PTR_ERR(file); + /* only retry if RESOLVE_CACHED wasn't already set by application */ + if (ret == -EAGAIN && + (!resolve_nonblock && (issue_flags & IO_URING_F_NONBLOCK))) + return -EAGAIN; + goto err; + } + + if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) + file->f_flags &= ~O_NONBLOCK; + fsnotify_open(file); + + if (!fixed) + fd_install(ret, file); + else + ret = io_fixed_fd_install(req, issue_flags, file, + open->file_slot); +err: + putname(open->filename); + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_openat(struct io_kiocb *req, unsigned int issue_flags) +{ + return io_openat2(req, issue_flags); +} + +void io_open_cleanup(struct io_kiocb *req) +{ + struct io_open *open = io_kiocb_to_cmd(req); + + if (open->filename) + putname(open->filename); +} + +int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_fixed_file *file_slot; + struct file *file; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + ret = -ENXIO; + if (unlikely(!ctx->file_data)) + goto out; + ret = -EINVAL; + if (offset >= ctx->nr_user_files) + goto out; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto out; + + offset = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, offset); + ret = -EBADF; + if (!file_slot->file_ptr) + goto out; + + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + if (ret) + goto out; + + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, 
offset); + io_rsrc_node_switch(ctx, ctx->file_data); + ret = 0; +out: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_close *close = io_kiocb_to_cmd(req); + + return __io_close_fixed(req, issue_flags, close->file_slot - 1); +} + +int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_close *close = io_kiocb_to_cmd(req); + + if (sqe->off || sqe->addr || sqe->len || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + if (req->flags & REQ_F_FIXED_FILE) + return -EBADF; + + close->fd = READ_ONCE(sqe->fd); + close->file_slot = READ_ONCE(sqe->file_index); + if (close->file_slot && close->fd) + return -EINVAL; + + return 0; +} + +int io_close(struct io_kiocb *req, unsigned int issue_flags) +{ + struct files_struct *files = current->files; + struct io_close *close = io_kiocb_to_cmd(req); + struct fdtable *fdt; + struct file *file; + int ret = -EBADF; + + if (close->file_slot) { + ret = io_close_fixed(req, issue_flags); + goto err; + } + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + if (close->fd >= fdt->max_fds) { + spin_unlock(&files->file_lock); + goto err; + } + file = rcu_dereference_protected(fdt->fd[close->fd], + lockdep_is_held(&files->file_lock)); + if (!file || io_is_uring_fops(file)) { + spin_unlock(&files->file_lock); + goto err; + } + + /* if the file has a flush method, be safe and punt to async */ + if (file->f_op->flush && (issue_flags & IO_URING_F_NONBLOCK)) { + spin_unlock(&files->file_lock); + return -EAGAIN; + } + + file = __close_fd_get_file(close->fd); + spin_unlock(&files->file_lock); + if (!file) + goto err; + + /* No ->flush() or already async, safely close from here */ + ret = filp_close(file, current->files); +err: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h new file mode 100644 index 
00000000000000..9f578f3fad870d --- /dev/null +++ b/io_uring/openclose.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, + unsigned int offset); + +int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_openat(struct io_kiocb *req, unsigned int issue_flags); +void io_open_cleanup(struct io_kiocb *req); + +int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_openat2(struct io_kiocb *req, unsigned int issue_flags); + +int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_close(struct io_kiocb *req, unsigned int issue_flags); From 651de3ae6b88ec54bb35820cbc7f38adff54750a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 05:59:19 -0600 Subject: [PATCH 0961/1250] io_uring: move uring_cmd handling to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 127 ++----------------------------------------- io_uring/io_uring.h | 9 +++ io_uring/uring_cmd.c | 115 +++++++++++++++++++++++++++++++++++++++ io_uring/uring_cmd.h | 13 +++++ 5 files changed, 142 insertions(+), 124 deletions(-) create mode 100644 io_uring/uring_cmd.c create mode 100644 io_uring/uring_cmd.h diff --git a/io_uring/Makefile b/io_uring/Makefile index e60def39ca2c6a..2e2cbeb272a890 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ - openclose.o + openclose.o uring_cmd.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a79186ba8c44ec..469d89f3cf0cd1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -99,6 +99,7 @@ #include "sync.h" #include "advise.h" #include "openclose.h" +#include "uring_cmd.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -473,14 +474,6 @@ struct io_cancel_data { int 
seq; }; -/* - * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into - * the following sqe if SQE128 is used. - */ -#define uring_cmd_pdu_size(is_sqe128) \ - ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ - offsetof(struct io_uring_sqe, cmd)) - struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -988,11 +981,6 @@ static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, return matched; } -static inline bool req_has_async_data(struct io_kiocb *req) -{ - return req->flags & REQ_F_ASYNC_DATA; -} - static inline void req_fail_link_node(struct io_kiocb *req, int res) { req_set_fail(req); @@ -1743,7 +1731,7 @@ static void io_req_complete_post(struct io_kiocb *req) io_cqring_ev_posted(ctx); } -static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) +inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) req->flags |= REQ_F_COMPLETE_INLINE; @@ -2151,7 +2139,7 @@ static void __io_req_task_work_add(struct io_kiocb *req, } } -static void io_req_task_work_add(struct io_kiocb *req) +void io_req_task_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; @@ -3268,7 +3256,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, } } -static inline bool io_alloc_async_data(struct io_kiocb *req) +bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); @@ -3714,111 +3702,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - - ioucmd->task_work_cb(ioucmd); -} - -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *)) -{ - struct io_kiocb *req 
= cmd_to_io_kiocb(ioucmd); - - ioucmd->task_work_cb = task_work_cb; - req->io_task_work.func = io_uring_cmd_work; - io_req_task_work_add(req); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); - -static inline void io_req_set_cqe32_extra(struct io_kiocb *req, - u64 extra1, u64 extra2) -{ - req->extra1 = extra1; - req->extra2 = extra2; - req->flags |= REQ_F_CQE32_INIT; -} - -/* - * Called by consumers of io_uring_cmd, if they originally returned - * -EIOCBQUEUED upon receiving the command. - */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) -{ - struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - - if (ret < 0) - req_set_fail(req); - - io_req_set_res(req, 0, ret); - if (req->ctx->flags & IORING_SETUP_CQE32) - io_req_set_cqe32_extra(req, res2, 0); - __io_req_complete(req, 0); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_done); - -static int io_uring_cmd_prep_async(struct io_kiocb *req) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - size_t cmd_size; - - cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); - - memcpy(req->async_data, ioucmd->cmd, cmd_size); - return 0; -} - -static int io_uring_cmd_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - - if (sqe->rw_flags || sqe->__pad1) - return -EINVAL; - ioucmd->cmd = sqe->cmd; - ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return 0; -} - -static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (!req->file->f_op->uring_cmd) - return -EOPNOTSUPP; - - if (ctx->flags & IORING_SETUP_SQE128) - issue_flags |= IO_URING_F_SQE128; - if (ctx->flags & IORING_SETUP_CQE32) - issue_flags |= IO_URING_F_CQE32; - if (ctx->flags & IORING_SETUP_IOPOLL) - issue_flags |= IO_URING_F_IOPOLL; - - if (req_has_async_data(req)) - ioucmd->cmd = req->async_data; - - ret 
= file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - if (!req_has_async_data(req)) { - if (io_alloc_async_data(req)) - return -ENOMEM; - io_uring_cmd_prep_async(req); - } - return -EAGAIN; - } - - if (ret != -EIOCBQUEUED) { - io_uring_cmd_done(ioucmd, ret, 0); - return IOU_OK; - } - - return IOU_ISSUE_SKIP_COMPLETE; -} - static int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -11533,8 +11416,6 @@ static int __init io_uring_init(void) BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32)); - BUILD_BUG_ON(sizeof(struct io_uring_cmd) > 64); - for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { BUG_ON(!io_op_defs[i].prep); BUG_ON(!io_op_defs[i].issue); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ebb225e8501251..6a07e902120ab3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -25,6 +25,11 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline bool req_has_async_data(struct io_kiocb *req) +{ + return req->flags & REQ_F_ASYNC_DATA; +} + static inline void io_put_file(struct file *file) { if (file) @@ -53,6 +58,8 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, lockdep_assert_held(&ctx->uring_lock); } +void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -65,5 +72,7 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill); bool io_is_uring_fops(struct file *file); +bool io_alloc_async_data(struct io_kiocb *req); +void io_req_task_work_add(struct io_kiocb *req); #endif diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c new file mode 100644 index 00000000000000..abf78918a0995f --- /dev/null +++ b/io_uring/uring_cmd.c @@ -0,0 +1,115 @@ +// 
SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "uring_cmd.h" + +static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + + ioucmd->task_work_cb(ioucmd); +} + +void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *)) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + ioucmd->task_work_cb = task_work_cb; + req->io_task_work.func = io_uring_cmd_work; + io_req_task_work_add(req); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); + +static inline void io_req_set_cqe32_extra(struct io_kiocb *req, + u64 extra1, u64 extra2) +{ + req->extra1 = extra1; + req->extra2 = extra2; + req->flags |= REQ_F_CQE32_INIT; +} + +/* + * Called by consumers of io_uring_cmd, if they originally returned + * -EIOCBQUEUED upon receiving the command. + */ +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2) +{ + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + + if (ret < 0) + req_set_fail(req); + + io_req_set_res(req, 0, ret); + if (req->ctx->flags & IORING_SETUP_CQE32) + io_req_set_cqe32_extra(req, res2, 0); + __io_req_complete(req, 0); +} +EXPORT_SYMBOL_GPL(io_uring_cmd_done); + +int io_uring_cmd_prep_async(struct io_kiocb *req) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + size_t cmd_size; + + cmd_size = uring_cmd_pdu_size(req->ctx->flags & IORING_SETUP_SQE128); + + memcpy(req->async_data, ioucmd->cmd, cmd_size); + return 0; +} + +int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); + + if (sqe->rw_flags || sqe->__pad1) + return -EINVAL; + ioucmd->cmd = sqe->cmd; + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); + return 0; +} + +int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req); 
+ struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (!req->file->f_op->uring_cmd) + return -EOPNOTSUPP; + + if (ctx->flags & IORING_SETUP_SQE128) + issue_flags |= IO_URING_F_SQE128; + if (ctx->flags & IORING_SETUP_CQE32) + issue_flags |= IO_URING_F_CQE32; + if (ctx->flags & IORING_SETUP_IOPOLL) + issue_flags |= IO_URING_F_IOPOLL; + + if (req_has_async_data(req)) + ioucmd->cmd = req->async_data; + + ret = file->f_op->uring_cmd(ioucmd, issue_flags); + if (ret == -EAGAIN) { + if (!req_has_async_data(req)) { + if (io_alloc_async_data(req)) + return -ENOMEM; + io_uring_cmd_prep_async(req); + } + return -EAGAIN; + } + + if (ret != -EIOCBQUEUED) { + io_uring_cmd_done(ioucmd, ret, 0); + return IOU_OK; + } + + return IOU_ISSUE_SKIP_COMPLETE; +} diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h new file mode 100644 index 00000000000000..7c6697d13cb2e4 --- /dev/null +++ b/io_uring/uring_cmd.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); +int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_uring_cmd_prep_async(struct io_kiocb *req); + +/* + * The URING_CMD payload starts at 'cmd' in the first sqe, and continues into + * the following sqe if SQE128 is used. + */ +#define uring_cmd_pdu_size(is_sqe128) \ + ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ + offsetof(struct io_uring_sqe, cmd)) From 1a8cf9162ee272698b44307b0ddb1bd4154665c3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:04:14 -0600 Subject: [PATCH 0962/1250] io_uring: add a dummy -EOPNOTSUPP prep handler Add it and use it for the epoll handling, if epoll isn't configured. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 469d89f3cf0cd1..15d0377b7d9d86 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -4034,10 +4034,16 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) return IOU_ISSUE_SKIP_COMPLETE; } +static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, + const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +#if defined(CONFIG_EPOLL) static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { -#if defined(CONFIG_EPOLL) struct io_epoll *epoll = io_kiocb_to_cmd(req); if (sqe->buf_index || sqe->splice_fd_in) @@ -4056,14 +4062,10 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, } return 0; -#else - return -EOPNOTSUPP; -#endif } static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) { -#if defined(CONFIG_EPOLL) struct io_epoll *ie = io_kiocb_to_cmd(req); int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; @@ -4076,10 +4078,8 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_set_res(req, ret, 0); return IOU_OK; -#else - return -EOPNOTSUPP; -#endif } +#endif static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -11246,8 +11246,12 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, +#if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_SPLICE] = { .needs_file = 1, @@ -11418,7 +11422,8 @@ static int __init io_uring_init(void) for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { BUG_ON(!io_op_defs[i].prep); - BUG_ON(!io_op_defs[i].issue); + if (io_op_defs[i].prep != io_eopnotsupp_prep) + BUG_ON(!io_op_defs[i].issue); } req_cachep = KMEM_CACHE(io_kiocb, 
SLAB_HWCACHE_ALIGN | SLAB_PANIC | From 45fa2196462cc69f8de945a65048b0665ff5b1c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:09:18 -0600 Subject: [PATCH 0963/1250] io_uring: move epoll handler to its own file Would be nice to sort out Kconfig for this and don't even compile epoll.c if we don't have epoll configured. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/epoll.c | 62 +++++++++++++++++++++++++++++++++++++++++++++ io_uring/epoll.h | 6 +++++ io_uring/io_uring.c | 50 +----------------------------------- 4 files changed, 70 insertions(+), 50 deletions(-) create mode 100644 io_uring/epoll.c create mode 100644 io_uring/epoll.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 2e2cbeb272a890..59e70f2d8c56d4 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,5 +4,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ - openclose.o uring_cmd.o + openclose.o uring_cmd.o epoll.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/epoll.c b/io_uring/epoll.c new file mode 100644 index 00000000000000..acbb32498127ad --- /dev/null +++ b/io_uring/epoll.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "epoll.h" + +#if defined(CONFIG_EPOLL) +struct io_epoll { + struct file *file; + int epfd; + int op; + int fd; + struct epoll_event event; +}; + +int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_epoll *epoll = io_kiocb_to_cmd(req); + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + + epoll->epfd = READ_ONCE(sqe->fd); + epoll->op = READ_ONCE(sqe->len); + epoll->fd = READ_ONCE(sqe->off); + + if (ep_op_has_event(epoll->op)) { + struct epoll_event __user *ev; + + ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); + if (copy_from_user(&epoll->event, ev, sizeof(*ev))) + return 
-EFAULT; + } + + return 0; +} + +int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_epoll *ie = io_kiocb_to_cmd(req); + int ret; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); + if (force_nonblock && ret == -EAGAIN) + return -EAGAIN; + + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} +#endif diff --git a/io_uring/epoll.h b/io_uring/epoll.h new file mode 100644 index 00000000000000..870cce11ba982d --- /dev/null +++ b/io_uring/epoll.h @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 + +#if defined(CONFIG_EPOLL) +int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags); +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 15d0377b7d9d86..78828e294e53b7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -100,6 +100,7 @@ #include "advise.h" #include "openclose.h" #include "uring_cmd.h" +#include "epoll.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -374,14 +375,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_epoll { - struct file *file; - int epfd; - int op; - int fd; - struct epoll_event event; -}; - struct io_provide_buf { struct file *file; __u64 addr; @@ -4040,47 +4033,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -#if defined(CONFIG_EPOLL) -static int io_epoll_ctl_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_epoll *epoll = io_kiocb_to_cmd(req); - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - - epoll->epfd = READ_ONCE(sqe->fd); - epoll->op = READ_ONCE(sqe->len); - epoll->fd = READ_ONCE(sqe->off); - - if (ep_op_has_event(epoll->op)) { - struct epoll_event __user *ev; - - ev = u64_to_user_ptr(READ_ONCE(sqe->addr)); - if (copy_from_user(&epoll->event, 
ev, sizeof(*ev))) - return -EFAULT; - } - - return 0; -} - -static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_epoll *ie = io_kiocb_to_cmd(req); - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock); - if (force_nonblock && ret == -EAGAIN) - return -EAGAIN; - - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} -#endif - static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req); From eaa3f16eca5b33689bcc1baedd0465cb5f85ed14 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:12:18 -0600 Subject: [PATCH 0964/1250] io_uring: move statx handling to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 62 +------------------------------------ io_uring/statx.c | 74 +++++++++++++++++++++++++++++++++++++++++++++ io_uring/statx.h | 5 +++ 4 files changed, 82 insertions(+), 62 deletions(-) create mode 100644 io_uring/statx.c create mode 100644 io_uring/statx.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 59e70f2d8c56d4..de953c022c6ebd 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ - openclose.o uring_cmd.o epoll.o + openclose.o uring_cmd.o epoll.o \ + statx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 78828e294e53b7..eb01d1aadeb486 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -101,6 +101,7 @@ #include "openclose.h" #include "uring_cmd.h" #include "epoll.h" +#include "statx.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -384,15 +385,6 @@ struct io_provide_buf { __u16 bid; }; -struct io_statx { - struct file *file; - int dfd; - unsigned int mask; - unsigned 
int flags; - struct filename *filename; - struct statx __user *buffer; -}; - struct io_shutdown { struct file *file; int how; @@ -4033,58 +4025,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_statx *sx = io_kiocb_to_cmd(req); - const char __user *path; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - if (req->flags & REQ_F_FIXED_FILE) - return -EBADF; - - sx->dfd = READ_ONCE(sqe->fd); - sx->mask = READ_ONCE(sqe->len); - path = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - sx->flags = READ_ONCE(sqe->statx_flags); - - sx->filename = getname_flags(path, - getname_statx_lookup_flags(sx->flags), - NULL); - - if (IS_ERR(sx->filename)) { - int ret = PTR_ERR(sx->filename); - - sx->filename = NULL; - return ret; - } - - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_statx(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_statx *sx = io_kiocb_to_cmd(req); - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static void io_statx_cleanup(struct io_kiocb *req) -{ - struct io_statx *sx = io_kiocb_to_cmd(req); - - if (sx->filename) - putname(sx->filename); -} - #if defined(CONFIG_NET) static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/statx.c b/io_uring/statx.c new file mode 100644 index 00000000000000..83b15687e9c5ee --- /dev/null +++ b/io_uring/statx.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include + +#include "../fs/internal.h" + +#include "io_uring_types.h" +#include "io_uring.h" +#include "statx.h" + +struct io_statx { + struct file *file; + int dfd; + unsigned int mask; + unsigned 
int flags; + struct filename *filename; + struct statx __user *buffer; +}; + +int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_statx *sx = io_kiocb_to_cmd(req); + const char __user *path; + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (req->flags & REQ_F_FIXED_FILE) + return -EBADF; + + sx->dfd = READ_ONCE(sqe->fd); + sx->mask = READ_ONCE(sqe->len); + path = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + sx->flags = READ_ONCE(sqe->statx_flags); + + sx->filename = getname_flags(path, + getname_statx_lookup_flags(sx->flags), + NULL); + + if (IS_ERR(sx->filename)) { + int ret = PTR_ERR(sx->filename); + + sx->filename = NULL; + return ret; + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_statx(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_statx *sx = io_kiocb_to_cmd(req); + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +void io_statx_cleanup(struct io_kiocb *req) +{ + struct io_statx *sx = io_kiocb_to_cmd(req); + + if (sx->filename) + putname(sx->filename); +} diff --git a/io_uring/statx.h b/io_uring/statx.h new file mode 100644 index 00000000000000..9a17f4d45a7d5f --- /dev/null +++ b/io_uring/statx.h @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_statx(struct io_kiocb *req, unsigned int issue_flags); +void io_statx_cleanup(struct io_kiocb *req); From cf97007d28b4e4907f3fbf5e1f5a501422173a71 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:25:13 -0600 Subject: [PATCH 0965/1250] io_uring: split network related opcodes into its own file While at it, convert the handlers to just use io_eopnotsupp_prep() if CONFIG_NET isn't set. 
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 873 ++------------------------------------------ io_uring/io_uring.h | 22 ++ io_uring/net.c | 779 +++++++++++++++++++++++++++++++++++++++ io_uring/net.h | 43 +++ 5 files changed, 884 insertions(+), 835 deletions(-) create mode 100644 io_uring/net.c create mode 100644 io_uring/net.h diff --git a/io_uring/Makefile b/io_uring/Makefile index de953c022c6ebd..c9ec1bbabfbd36 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o + statx.o net.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eb01d1aadeb486..cbc20985cd6f34 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -102,6 +102,7 @@ #include "uring_cmd.h" #include "epoll.h" #include "statx.h" +#include "net.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -131,8 +132,6 @@ #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ IO_REQ_CLEAN_FLAGS) -#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) - #define IO_TCTX_REFS_CACHE_NR (1U << 10) struct io_mapped_ubuf { @@ -295,25 +294,6 @@ struct io_timeout_data { u32 flags; }; -struct io_accept { - struct file *file; - struct sockaddr __user *addr; - int __user *addr_len; - int flags; - u32 file_slot; - unsigned long nofile; -}; - -struct io_socket { - struct file *file; - int domain; - int type; - int protocol; - int flags; - u32 file_slot; - unsigned long nofile; -}; - struct io_cancel { struct file *file; u64 addr; @@ -350,25 +330,6 @@ struct io_rw { rwf_t flags; }; -struct io_connect { - struct file *file; - struct sockaddr __user *addr; - int addr_len; -}; - -struct io_sr_msg { - struct file *file; - union { - struct compat_msghdr __user *umsg_compat; - struct user_msghdr __user *umsg; - void 
__user *buf; - }; - int msg_flags; - size_t len; - size_t done_io; - unsigned int flags; -}; - struct io_rsrc_update { struct file *file; u64 arg; @@ -385,30 +346,12 @@ struct io_provide_buf { __u16 bid; }; -struct io_shutdown { - struct file *file; - int how; -}; - struct io_msg { struct file *file; u64 user_data; u32 len; }; -struct io_async_connect { - struct sockaddr_storage address; -}; - -struct io_async_msghdr { - struct iovec fast_iov[UIO_FASTIOV]; - /* points to an allocated iov, if NULL we use fast_iov instead */ - struct iovec *free_iov; - struct sockaddr __user *uaddr; - struct msghdr msg; - struct sockaddr_storage addr; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -517,9 +460,6 @@ static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index); - static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); @@ -808,8 +748,7 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) return __io_put_kbuf(req, &req->ctx->io_buffers_comp); } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, - unsigned issue_flags) +inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { unsigned int cflags; @@ -1291,12 +1230,6 @@ static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static inline void io_commit_cqring(struct io_ring_ctx *ctx) -{ - /* order cqe stores with ring update */ - smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); -} - static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used || ctx->drain_active) { @@ -1418,7 
+1351,7 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx) * 1:1 relationship between how many times this function is called (and * hence the eventfd count) and number of CQEs posted to the CQ ring. */ -static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) +void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) @@ -1639,8 +1572,8 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, } } -static noinline bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags) +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags) { struct io_uring_cqe *cqe; @@ -2980,8 +2913,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, return u64_to_user_ptr(buf->addr); } -static void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -3073,13 +3006,6 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, return __io_iov_buffer_select(req, iov, issue_flags); } -static inline bool io_do_buffer_select(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return false; - return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); -} - static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, struct io_rw_state *s, unsigned int issue_flags) @@ -4025,755 +3951,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -#if defined(CONFIG_NET) -static int io_shutdown_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_shutdown *shutdown = io_kiocb_to_cmd(req); - - if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index || sqe->splice_fd_in)) - return -EINVAL; - - shutdown->how = 
READ_ONCE(sqe->len); - return 0; -} - -static int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_shutdown *shutdown = io_kiocb_to_cmd(req); - struct socket *sock; - int ret; - - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = __sys_shutdown_sock(sock, shutdown->how); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static bool io_net_retry(struct socket *sock, int flags) -{ - if (!(flags & MSG_WAITALL)) - return false; - return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; -} - -static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg) -{ - struct io_async_msghdr *async_msg = req->async_data; - - if (async_msg) - return -EAGAIN; - if (io_alloc_async_data(req)) { - kfree(kmsg->free_iov); - return -ENOMEM; - } - async_msg = req->async_data; - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(async_msg, kmsg, sizeof(*kmsg)); - async_msg->msg.msg_name = &async_msg->addr; - /* if were using fast_iov, set it to the new one */ - if (!async_msg->free_iov) - async_msg->msg.msg_iter.iov = async_msg->fast_iov; - - return -EAGAIN; -} - -static int io_sendmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - - iomsg->msg.msg_name = &iomsg->addr; - iomsg->free_iov = iomsg->fast_iov; - return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, - &iomsg->free_iov); -} - -static int io_sendmsg_prep_async(struct io_kiocb *req) -{ - int ret; - - ret = io_sendmsg_copy_hdr(req, req->async_data); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; -} - -static void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) -{ - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); -} - -static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - - if 
(unlikely(sqe->file_index || sqe->addr2)) - return -EINVAL; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) - return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - if (sr->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - sr->done_io = 0; - return 0; -} - -static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct io_async_msghdr iomsg, *kmsg; - struct socket *sock; - unsigned flags; - int min_ret = 0; - int ret; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); - - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_send(struct io_kiocb *req, unsigned int 
issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct msghdr msg; - struct iovec iov; - struct socket *sock; - unsigned flags; - int min_ret = 0; - int ret; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - return ret; - - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); - if (ret < min_ret) { - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int __io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct iovec __user *uiov; - size_t iov_len; - int ret; - - ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, - &iomsg->uaddr, &uiov, &iov_len); - if (ret) - return ret; - - if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len > 1) - return -EINVAL; - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, - false); - if (ret > 0) - ret = 0; - } - - return 
ret; -} - -#ifdef CONFIG_COMPAT -static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct compat_iovec __user *uiov; - compat_uptr_t ptr; - compat_size_t len; - int ret; - - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, - &ptr, &len); - if (ret) - return ret; - - uiov = compat_ptr(ptr); - if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - - if (len > 1) - return -EINVAL; - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; - iomsg->free_iov = NULL; - } else { - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, (struct iovec __user *)uiov, len, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, true); - if (ret < 0) - return ret; - } - - return 0; -} -#endif - -static int io_recvmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) -{ - iomsg->msg.msg_name = &iomsg->addr; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return __io_compat_recvmsg_copy_hdr(req, iomsg); -#endif - - return __io_recvmsg_copy_hdr(req, iomsg); -} - -static int io_recvmsg_prep_async(struct io_kiocb *req) -{ - int ret; - - ret = io_recvmsg_copy_hdr(req, req->async_data); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; -} - -static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - - if (unlikely(sqe->file_index || sqe->addr2)) - return -EINVAL; - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); - sr->len = READ_ONCE(sqe->len); - sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) - return -EINVAL; - sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; - if (sr->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - if (sr->msg_flags & MSG_ERRQUEUE) - req->flags |= 
REQ_F_CLEAR_POLLIN; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - sr->done_io = 0; - return 0; -} - -static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct io_async_msghdr iomsg, *kmsg; - struct socket *sock; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); - if (ret) - return ret; - kmsg = &iomsg; - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; - iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); - } - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&kmsg->msg.msg_iter); - - kmsg->msg.msg_get_inq = 1; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { - req_set_fail(req); - } - - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) - ret += sr->done_io; - else if 
(sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (kmsg->msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; -} - -static int io_recv(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct msghdr msg; - struct socket *sock; - struct iovec iov; - unsigned int cflags; - unsigned flags; - int ret, min_ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; - - sock = sock_from_file(req->file); - if (unlikely(!sock)) - return -ENOTSOCK; - - if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &sr->len, issue_flags); - if (!buf) - return -ENOBUFS; - sr->buf = buf; - } - - ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - goto out_free; - - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; - msg.msg_flags = 0; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - - flags = sr->msg_flags; - if (force_nonblock) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); - - ret = sock_recvmsg(sock, &msg, flags); - if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret > 0 && io_net_retry(sock, flags)) { - sr->len -= ret; - sr->buf += ret; - sr->done_io += ret; - req->flags |= REQ_F_PARTIAL_IO; - return -EAGAIN; - } - req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { -out_free: - req_set_fail(req); - } - - if (ret >= 0) - ret += sr->done_io; - else if (sr->done_io) - ret = sr->done_io; - cflags = io_put_kbuf(req, issue_flags); - if (msg.msg_inq) - cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; -} - -static 
int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_accept *accept = io_kiocb_to_cmd(req); - unsigned flags; - - if (sqe->len || sqe->buf_index) - return -EINVAL; - - accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - accept->flags = READ_ONCE(sqe->accept_flags); - accept->nofile = rlimit(RLIMIT_NOFILE); - flags = READ_ONCE(sqe->ioprio); - if (flags & ~IORING_ACCEPT_MULTISHOT) - return -EINVAL; - - accept->file_slot = READ_ONCE(sqe->file_index); - if (accept->file_slot) { - if (accept->flags & SOCK_CLOEXEC) - return -EINVAL; - if (flags & IORING_ACCEPT_MULTISHOT && - accept->file_slot != IORING_FILE_INDEX_ALLOC) - return -EINVAL; - } - if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) - accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - if (flags & IORING_ACCEPT_MULTISHOT) - req->flags |= REQ_F_APOLL_MULTISHOT; - return 0; -} - -static int io_accept(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_accept *accept = io_kiocb_to_cmd(req); - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; - bool fixed = !!accept->file_slot; - struct file *file; - int ret, fd; - -retry: - if (!fixed) { - fd = __get_unused_fd_flags(accept->flags, accept->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, - accept->flags); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) { - /* - * if it's multishot and polled, we don't need to - * return EAGAIN to arm the poll infra since it - * has already been done - */ - if ((req->flags & IO_APOLL_MULTI_POLLED) == - IO_APOLL_MULTI_POLLED) - ret = IOU_ISSUE_SKIP_COMPLETE; - return ret; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - accept->file_slot); - } - - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, 0); - return IOU_OK; - } - if (ret >= 0) { - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, - IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - goto retry; - } - ret = -ECANCELED; - } - - return ret; -} - -static int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_socket *sock = io_kiocb_to_cmd(req); - - if (sqe->addr || sqe->rw_flags || sqe->buf_index) - return -EINVAL; - - sock->domain = READ_ONCE(sqe->fd); - sock->type = READ_ONCE(sqe->off); - sock->protocol = READ_ONCE(sqe->len); - sock->file_slot = READ_ONCE(sqe->file_index); - sock->nofile = rlimit(RLIMIT_NOFILE); - - sock->flags = sock->type & ~SOCK_TYPE_MASK; - if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) - return -EINVAL; - if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - return 0; -} - -static int io_socket(struct io_kiocb *req, unsigned int 
issue_flags) -{ - struct io_socket *sock = io_kiocb_to_cmd(req); - bool fixed = !!sock->file_slot; - struct file *file; - int ret, fd; - - if (!fixed) { - fd = __get_unused_fd_flags(sock->flags, sock->nofile); - if (unlikely(fd < 0)) - return fd; - } - file = __sys_socket_file(sock->domain, sock->type, sock->protocol); - if (IS_ERR(file)) { - if (!fixed) - put_unused_fd(fd); - ret = PTR_ERR(file); - if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - req_set_fail(req); - } else if (!fixed) { - fd_install(fd, file); - ret = fd; - } else { - ret = io_fixed_fd_install(req, issue_flags, file, - sock->file_slot); - } - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_connect_prep_async(struct io_kiocb *req) -{ - struct io_async_connect *io = req->async_data; - struct io_connect *conn = io_kiocb_to_cmd(req); - - return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); -} - -static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_connect *conn = io_kiocb_to_cmd(req); - - if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); - conn->addr_len = READ_ONCE(sqe->addr2); - return 0; -} - -static int io_connect(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_connect *connect = io_kiocb_to_cmd(req); - struct io_async_connect __io, *io; - unsigned file_flags; - int ret; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - - if (req_has_async_data(req)) { - io = req->async_data; - } else { - ret = move_addr_to_kernel(connect->addr, - connect->addr_len, - &__io.address); - if (ret) - goto out; - io = &__io; - } - - file_flags = force_nonblock ? 
O_NONBLOCK : 0; - - ret = __sys_connect_file(req->file, &io->address, - connect->addr_len, file_flags); - if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req_has_async_data(req)) - return -EAGAIN; - if (io_alloc_async_data(req)) { - ret = -ENOMEM; - goto out; - } - memcpy(req->async_data, &__io, sizeof(__io)); - return -EAGAIN; - } - if (ret == -ERESTARTSYS) - ret = -EINTR; -out: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} -#else /* !CONFIG_NET */ -#define IO_NETOP_FN(op) \ -static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ -{ \ - return -EOPNOTSUPP; \ -} - -#define IO_NETOP_PREP(op) \ -IO_NETOP_FN(op) \ -static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ -{ \ - return -EOPNOTSUPP; \ -} \ - -#define IO_NETOP_PREP_ASYNC(op) \ -IO_NETOP_PREP(op) \ -static int io_##op##_prep_async(struct io_kiocb *req) \ -{ \ - return -EOPNOTSUPP; \ -} - -IO_NETOP_PREP_ASYNC(sendmsg); -IO_NETOP_PREP_ASYNC(recvmsg); -IO_NETOP_PREP_ASYNC(connect); -IO_NETOP_PREP(accept); -IO_NETOP_PREP(socket); -IO_NETOP_PREP(shutdown); -IO_NETOP_FN(send); -IO_NETOP_FN(recv); -#endif /* CONFIG_NET */ - struct io_poll_table { struct poll_table_struct pt; struct io_kiocb *req; @@ -7874,8 +7051,8 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, return 0; } -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) +int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) __must_hold(&req->ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; @@ -10986,12 +10163,14 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, +#if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, .prep_async = io_sendmsg_prep_async, -#if defined(CONFIG_NET) .cleanup = 
io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RECVMSG] = { @@ -11000,12 +10179,14 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, +#if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, .prep_async = io_recvmsg_prep_async, -#if defined(CONFIG_NET) .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_TIMEOUT] = { @@ -11026,8 +10207,12 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ +#if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, @@ -11044,10 +10229,14 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, +#if defined(CONFIG_NET) .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, .issue = io_connect, .prep_async = io_connect_prep_async, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -11117,8 +10306,12 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, +#if defined(CONFIG_NET) .prep = io_sendmsg_prep, .issue = io_send, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_RECV] = { .needs_file = 1, @@ -11127,8 +10320,12 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .audit_skip = 1, .ioprio = 1, +#if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_OPENAT2] = { .prep = io_openat2_prep, @@ -11175,8 +10372,12 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, +#if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, +#else + .prep = io_eopnotsupp_prep, +#endif }, 
[IORING_OP_RENAMEAT] = { .prep = io_renameat_prep, @@ -11233,8 +10434,12 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_SOCKET] = { .audit_skip = 1, +#if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, +#else + .prep = io_eopnotsupp_prep, +#endif }, [IORING_OP_URING_CMD] = { .needs_file = 1, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6a07e902120ab3..4b46385720c585 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -58,13 +58,35 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx, lockdep_assert_held(&ctx->uring_lock); } +static inline void io_commit_cqring(struct io_ring_ctx *ctx) +{ + /* order cqe stores with ring update */ + smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); +} + void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags); +void io_cqring_ev_posted(struct io_ring_ctx *ctx); +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags); +unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); + +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); +int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, diff --git a/io_uring/net.c b/io_uring/net.c new file mode 100644 index 00000000000000..2434548d0c1fb3 --- /dev/null +++ b/io_uring/net.c @@ -0,0 
+1,779 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "net.h" + +#if defined(CONFIG_NET) +struct io_shutdown { + struct file *file; + int how; +}; + +struct io_accept { + struct file *file; + struct sockaddr __user *addr; + int __user *addr_len; + int flags; + u32 file_slot; + unsigned long nofile; +}; + +struct io_socket { + struct file *file; + int domain; + int type; + int protocol; + int flags; + u32 file_slot; + unsigned long nofile; +}; + +struct io_connect { + struct file *file; + struct sockaddr __user *addr; + int addr_len; +}; + +struct io_sr_msg { + struct file *file; + union { + struct compat_msghdr __user *umsg_compat; + struct user_msghdr __user *umsg; + void __user *buf; + }; + int msg_flags; + size_t len; + size_t done_io; + unsigned int flags; +}; + +#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) + +int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); + + if (unlikely(sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) + return -EINVAL; + + shutdown->how = READ_ONCE(sqe->len); + return 0; +} + +int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_shutdown *shutdown = io_kiocb_to_cmd(req); + struct socket *sock; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = __sys_shutdown_sock(sock, shutdown->how); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +static bool io_net_retry(struct socket *sock, int flags) +{ + if (!(flags & MSG_WAITALL)) + return false; + return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +} + +static int io_setup_async_msg(struct io_kiocb *req, + struct io_async_msghdr *kmsg) +{ + struct 
io_async_msghdr *async_msg = req->async_data; + + if (async_msg) + return -EAGAIN; + if (io_alloc_async_data(req)) { + kfree(kmsg->free_iov); + return -ENOMEM; + } + async_msg = req->async_data; + req->flags |= REQ_F_NEED_CLEANUP; + memcpy(async_msg, kmsg, sizeof(*kmsg)); + async_msg->msg.msg_name = &async_msg->addr; + /* if were using fast_iov, set it to the new one */ + if (!async_msg->free_iov) + async_msg->msg.msg_iter.iov = async_msg->fast_iov; + + return -EAGAIN; +} + +static int io_sendmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + iomsg->msg.msg_name = &iomsg->addr; + iomsg->free_iov = iomsg->fast_iov; + return sendmsg_copy_msghdr(&iomsg->msg, sr->umsg, sr->msg_flags, + &iomsg->free_iov); +} + +int io_sendmsg_prep_async(struct io_kiocb *req) +{ + int ret; + + ret = io_sendmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) +{ + struct io_async_msghdr *io = req->async_data; + + kfree(io->free_iov); +} + +int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + if (unlikely(sqe->file_index || sqe->addr2)) + return -EINVAL; + + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->ioprio); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + sr->done_io = 0; + return 0; +} + +int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct io_async_msghdr iomsg, *kmsg; + struct socket *sock; + unsigned flags; + int min_ret = 0; + int ret; + + sock = 
sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg); + + flags = sr->msg_flags; + if (issue_flags & IO_URING_F_NONBLOCK) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } + req_set_fail(req); + } + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_send(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct msghdr msg; + struct iovec iov; + struct socket *sock; + unsigned flags; + int min_ret = 0; + int ret; + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + flags = sr->msg_flags; + if (issue_flags & IO_URING_F_NONBLOCK) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + msg.msg_flags = 
flags; + ret = sock_sendmsg(sock, &msg); + if (ret < min_ret) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } + req_set_fail(req); + } + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +static int __io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct iovec __user *uiov; + size_t iov_len; + int ret; + + ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, + &iomsg->uaddr, &uiov, &iov_len); + if (ret) + return ret; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (iov_len > 1) + return -EINVAL; + if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) + return -EFAULT; + sr->len = iomsg->fast_iov[0].iov_len; + iomsg->free_iov = NULL; + } else { + iomsg->free_iov = iomsg->fast_iov; + ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + &iomsg->free_iov, &iomsg->msg.msg_iter, + false); + if (ret > 0) + ret = 0; + } + + return ret; +} + +#ifdef CONFIG_COMPAT +static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct compat_iovec __user *uiov; + compat_uptr_t ptr; + compat_size_t len; + int ret; + + ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, + &ptr, &len); + if (ret) + return ret; + + uiov = compat_ptr(ptr); + if (req->flags & REQ_F_BUFFER_SELECT) { + compat_ssize_t clen; + + if (len > 1) + return -EINVAL; + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = clen; + iomsg->free_iov = NULL; + } else { + iomsg->free_iov = iomsg->fast_iov; + 
ret = __import_iovec(READ, (struct iovec __user *)uiov, len, + UIO_FASTIOV, &iomsg->free_iov, + &iomsg->msg.msg_iter, true); + if (ret < 0) + return ret; + } + + return 0; +} +#endif + +static int io_recvmsg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg) +{ + iomsg->msg.msg_name = &iomsg->addr; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return __io_compat_recvmsg_copy_hdr(req, iomsg); +#endif + + return __io_recvmsg_copy_hdr(req, iomsg); +} + +int io_recvmsg_prep_async(struct io_kiocb *req) +{ + int ret; + + ret = io_recvmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + +int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + if (unlikely(sqe->file_index || sqe->addr2)) + return -EINVAL; + + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->len = READ_ONCE(sqe->len); + sr->flags = READ_ONCE(sqe->ioprio); + if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; + sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + if (sr->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + if (sr->msg_flags & MSG_ERRQUEUE) + req->flags |= REQ_F_CLEAR_POLLIN; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; +#endif + sr->done_io = 0; + return 0; +} + +int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct io_async_msghdr iomsg, *kmsg; + struct socket *sock; + unsigned int cflags; + unsigned flags; + int ret, min_ret = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { + ret = io_recvmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return 
io_setup_async_msg(req, kmsg); + + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, issue_flags); + if (!buf) + return -ENOBUFS; + kmsg->fast_iov[0].iov_base = buf; + kmsg->fast_iov[0].iov_len = sr->len; + iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, + sr->len); + } + + flags = sr->msg_flags; + if (force_nonblock) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + + kmsg->msg.msg_get_inq = 1; + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + req_set_fail(req); + } + + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + cflags = io_put_kbuf(req, issue_flags); + if (kmsg->msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + io_req_set_res(req, ret, cflags); + return IOU_OK; +} + +int io_recv(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct msghdr msg; + struct socket *sock; + struct iovec iov; + unsigned int cflags; + unsigned flags; + int ret, min_ret = 0; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (io_do_buffer_select(req)) { + void __user *buf; + + buf = io_buffer_select(req, &sr->len, 
issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; + } + + ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + goto out_free; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_get_inq = 1; + msg.msg_flags = 0; + msg.msg_controllen = 0; + msg.msg_iocb = NULL; + + flags = sr->msg_flags; + if (force_nonblock) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + ret = sock_recvmsg(sock, &msg, flags); + if (ret < min_ret) { + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { +out_free: + req_set_fail(req); + } + + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + cflags = io_put_kbuf(req, issue_flags); + if (msg.msg_inq) + cflags |= IORING_CQE_F_SOCK_NONEMPTY; + io_req_set_res(req, ret, cflags); + return IOU_OK; +} + +int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_accept *accept = io_kiocb_to_cmd(req); + unsigned flags; + + if (sqe->len || sqe->buf_index) + return -EINVAL; + + accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + accept->flags = READ_ONCE(sqe->accept_flags); + accept->nofile = rlimit(RLIMIT_NOFILE); + flags = READ_ONCE(sqe->ioprio); + if (flags & ~IORING_ACCEPT_MULTISHOT) + return -EINVAL; + + accept->file_slot = READ_ONCE(sqe->file_index); + if (accept->file_slot) { + if (accept->flags & SOCK_CLOEXEC) + return -EINVAL; + if (flags & IORING_ACCEPT_MULTISHOT && + accept->file_slot != IORING_FILE_INDEX_ALLOC) + return -EINVAL; + } + if (accept->flags & ~(SOCK_CLOEXEC | 
SOCK_NONBLOCK)) + return -EINVAL; + if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) + accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + if (flags & IORING_ACCEPT_MULTISHOT) + req->flags |= REQ_F_APOLL_MULTISHOT; + return 0; +} + +int io_accept(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_accept *accept = io_kiocb_to_cmd(req); + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; + bool fixed = !!accept->file_slot; + struct file *file; + int ret, fd; + +retry: + if (!fixed) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + accept->flags); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && force_nonblock) { + /* + * if it's multishot and polled, we don't need to + * return EAGAIN to arm the poll infra since it + * has already been done + */ + if ((req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) + ret = IOU_ISSUE_SKIP_COMPLETE; + return ret; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_fixed_fd_install(req, issue_flags, file, + accept->file_slot); + } + + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + io_req_set_res(req, ret, 0); + return IOU_OK; + } + if (ret >= 0) { + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, + IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + goto retry; + } + ret = -ECANCELED; + } + + return ret; +} + +int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_socket *sock = io_kiocb_to_cmd(req); + + if (sqe->addr || 
sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + sock->domain = READ_ONCE(sqe->fd); + sock->type = READ_ONCE(sqe->off); + sock->protocol = READ_ONCE(sqe->len); + sock->file_slot = READ_ONCE(sqe->file_index); + sock->nofile = rlimit(RLIMIT_NOFILE); + + sock->flags = sock->type & ~SOCK_TYPE_MASK; + if (sock->file_slot && (sock->flags & SOCK_CLOEXEC)) + return -EINVAL; + if (sock->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + return 0; +} + +int io_socket(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_socket *sock = io_kiocb_to_cmd(req); + bool fixed = !!sock->file_slot; + struct file *file; + int ret, fd; + + if (!fixed) { + fd = __get_unused_fd_flags(sock->flags, sock->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = __sys_socket_file(sock->domain, sock->type, sock->protocol); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_fixed_fd_install(req, issue_flags, file, + sock->file_slot); + } + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_connect_prep_async(struct io_kiocb *req) +{ + struct io_async_connect *io = req->async_data; + struct io_connect *conn = io_kiocb_to_cmd(req); + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); +} + +int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_connect *conn = io_kiocb_to_cmd(req); + + if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); + conn->addr_len = READ_ONCE(sqe->addr2); + return 0; +} + +int io_connect(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_connect *connect = io_kiocb_to_cmd(req); + struct io_async_connect __io, *io; + unsigned 
file_flags; + int ret; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + + if (req_has_async_data(req)) { + io = req->async_data; + } else { + ret = move_addr_to_kernel(connect->addr, + connect->addr_len, + &__io.address); + if (ret) + goto out; + io = &__io; + } + + file_flags = force_nonblock ? O_NONBLOCK : 0; + + ret = __sys_connect_file(req->file, &io->address, + connect->addr_len, file_flags); + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { + if (req_has_async_data(req)) + return -EAGAIN; + if (io_alloc_async_data(req)) { + ret = -ENOMEM; + goto out; + } + memcpy(req->async_data, &__io, sizeof(__io)); + return -EAGAIN; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; +out: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} +#endif diff --git a/io_uring/net.h b/io_uring/net.h new file mode 100644 index 00000000000000..81d71d1647704d --- /dev/null +++ b/io_uring/net.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#if defined(CONFIG_NET) +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + /* points to an allocated iov, if NULL we use fast_iov instead */ + struct iovec *free_iov; + struct sockaddr __user *uaddr; + struct msghdr msg; + struct sockaddr_storage addr; +}; + +struct io_async_connect { + struct sockaddr_storage address; +}; + +int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); + +int io_sendmsg_prep_async(struct io_kiocb *req); +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req); +int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); +int io_send(struct io_kiocb *req, unsigned int issue_flags); + +int io_recvmsg_prep_async(struct io_kiocb *req); +int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_recvmsg(struct io_kiocb *req, unsigned 
int issue_flags); +int io_recv(struct io_kiocb *req, unsigned int issue_flags); + +int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_accept(struct io_kiocb *req, unsigned int issue_flags); + +int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_socket(struct io_kiocb *req, unsigned int issue_flags); + +int io_connect_prep_async(struct io_kiocb *req); +int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_connect(struct io_kiocb *req, unsigned int issue_flags); +#endif From c620cd2f844c47353dd458ad1328247c31b43a04 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 06:42:08 -0600 Subject: [PATCH 0966/1250] io_uring: move msg_ring into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 55 +------------------------------------- io_uring/msg_ring.c | 65 +++++++++++++++++++++++++++++++++++++++++++++ io_uring/msg_ring.h | 4 +++ 4 files changed, 71 insertions(+), 55 deletions(-) create mode 100644 io_uring/msg_ring.c create mode 100644 io_uring/msg_ring.h diff --git a/io_uring/Makefile b/io_uring/Makefile index c9ec1bbabfbd36..d7cf992c841a38 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o net.o + statx.o net.o msg_ring.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cbc20985cd6f34..a0173ab5178c95 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -103,6 +103,7 @@ #include "epoll.h" #include "statx.h" #include "net.h" +#include "msg_ring.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -346,12 +347,6 @@ struct io_provide_buf { __u16 bid; }; -struct io_msg { - struct file *file; - u64 user_data; - u32 len; -}; - struct io_rw_state { struct iov_iter iter; struct 
iov_iter_state iter_state; @@ -3613,54 +3608,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static int io_msg_ring_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_msg *msg = io_kiocb_to_cmd(req); - - if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || - sqe->buf_index || sqe->personality)) - return -EINVAL; - - msg->user_data = READ_ONCE(sqe->off); - msg->len = READ_ONCE(sqe->len); - return 0; -} - -static int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_msg *msg = io_kiocb_to_cmd(req); - struct io_ring_ctx *target_ctx; - bool filled; - int ret; - - ret = -EBADFD; - if (req->file->f_op != &io_uring_fops) - goto done; - - ret = -EOVERFLOW; - target_ctx = req->file->private_data; - - spin_lock(&target_ctx->completion_lock); - filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); - io_commit_cqring(target_ctx); - spin_unlock(&target_ctx->completion_lock); - - if (filled) { - io_cqring_ev_posted(target_ctx); - ret = 0; - } - -done: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - /* put file to avoid an attempt to IOPOLL the req */ - io_put_file(req->file); - req->file = NULL; - return IOU_OK; -} - /* * Note when io_fixed_fd_install() returns error value, it will ensure * fput() is called correspondingly. 
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c new file mode 100644 index 00000000000000..3b89f9a0a0b459 --- /dev/null +++ b/io_uring/msg_ring.c @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "msg_ring.h" + +struct io_msg { + struct file *file; + u64 user_data; + u32 len; +}; + +int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_msg *msg = io_kiocb_to_cmd(req); + + if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || + sqe->buf_index || sqe->personality)) + return -EINVAL; + + msg->user_data = READ_ONCE(sqe->off); + msg->len = READ_ONCE(sqe->len); + return 0; +} + +int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_msg *msg = io_kiocb_to_cmd(req); + struct io_ring_ctx *target_ctx; + bool filled; + int ret; + + ret = -EBADFD; + if (!io_is_uring_fops(req->file)) + goto done; + + ret = -EOVERFLOW; + target_ctx = req->file->private_data; + + spin_lock(&target_ctx->completion_lock); + filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); + io_commit_cqring(target_ctx); + spin_unlock(&target_ctx->completion_lock); + + if (filled) { + io_cqring_ev_posted(target_ctx); + ret = 0; + } + +done: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + /* put file to avoid an attempt to IOPOLL the req */ + io_put_file(req->file); + req->file = NULL; + return IOU_OK; +} diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h new file mode 100644 index 00000000000000..fb9601f202d07d --- /dev/null +++ b/io_uring/msg_ring.h @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); From 4b8271494264eade6d9697fc6b2a39fd48a4db9a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 08:56:52 -0600 
Subject: [PATCH 0967/1250] io_uring: move our reference counting into a header Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 43 +--------------------------------------- io_uring/refs.h | 48 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 42 deletions(-) create mode 100644 io_uring/refs.h diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a0173ab5178c95..eea5282b1ca21b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -91,6 +91,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "refs.h" #include "xattr.h" #include "nop.h" @@ -611,54 +612,12 @@ static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -/* - * Shamelessly stolen from the mm implementation of page reference checking, - * see commit f958d7b528b1 for details. - */ -#define req_ref_zero_or_close_to_overflow(req) \ - ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) - -static inline bool req_ref_inc_not_zero(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - return atomic_inc_not_zero(&req->refs); -} - -static inline bool req_ref_put_and_test(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_REFCOUNT))) - return true; - - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_dec_and_test(&req->refs); -} - -static inline void req_ref_get(struct io_kiocb *req) -{ - WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - atomic_inc(&req->refs); -} - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs)) __io_submit_flush_completions(ctx); } -static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) -{ - if (!(req->flags & REQ_F_REFCOUNT)) { - req->flags |= REQ_F_REFCOUNT; - atomic_set(&req->refs, nr); - } -} - -static inline void io_req_set_refcount(struct 
io_kiocb *req) -{ - __io_req_set_refcount(req, 1); -} - #define IO_RSRC_REF_BATCH 100 static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) diff --git a/io_uring/refs.h b/io_uring/refs.h new file mode 100644 index 00000000000000..334c5ead4c43d9 --- /dev/null +++ b/io_uring/refs.h @@ -0,0 +1,48 @@ +#ifndef IOU_REQ_REF_H +#define IOU_REQ_REF_H + +#include +#include "io_uring_types.h" + +/* + * Shamelessly stolen from the mm implementation of page reference checking, + * see commit f958d7b528b1 for details. + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + return atomic_inc_not_zero(&req->refs); +} + +static inline bool req_ref_put_and_test(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_REFCOUNT))) + return true; + + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + +static inline void req_ref_get(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_inc(&req->refs); +} + +static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) +{ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags |= REQ_F_REFCOUNT; + atomic_set(&req->refs, nr); + } +} + +static inline void io_req_set_refcount(struct io_kiocb *req) +{ + __io_req_set_refcount(req, 1); +} +#endif From 237ab11eb0e79423dbec3ebe6ca85a012a3e492f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 08:57:27 -0600 Subject: [PATCH 0968/1250] io_uring: move timeout opcodes and handling into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 667 +------------------------------------- io_uring/io_uring.h | 13 +- io_uring/io_uring_types.h | 10 + io_uring/timeout.c | 634 ++++++++++++++++++++++++++++++++++++ io_uring/timeout.h | 35 
++ 6 files changed, 701 insertions(+), 660 deletions(-) create mode 100644 io_uring/timeout.c create mode 100644 io_uring/timeout.h diff --git a/io_uring/Makefile b/io_uring/Makefile index d7cf992c841a38..6ae4e45a15dbfc 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o + statx.o net.o msg_ring.o timeout.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eea5282b1ca21b..3fc59a22d54e89 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -105,6 +105,7 @@ #include "statx.h" #include "net.h" #include "msg_ring.h" +#include "timeout.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -288,14 +289,6 @@ struct io_poll_update { bool update_user_data; }; -struct io_timeout_data { - struct io_kiocb *req; - struct hrtimer timer; - struct timespec64 ts; - enum hrtimer_mode mode; - u32 flags; -}; - struct io_cancel { struct file *file; u64 addr; @@ -303,27 +296,6 @@ struct io_cancel { s32 fd; }; -struct io_timeout { - struct file *file; - u32 off; - u32 target_seq; - struct list_head list; - /* head of the link, used by linked timeouts only */ - struct io_kiocb *head; - /* for linked completions */ - struct io_kiocb *prev; -}; - -struct io_timeout_rem { - struct file *file; - u64 addr; - - /* timeout update */ - struct timespec64 ts; - u32 flags; - bool ltimeout; -}; - struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -388,16 +360,6 @@ struct io_defer_entry { u32 seq; }; -struct io_cancel_data { - struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; - u32 flags; - int seq; -}; - struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; @@ -436,7 +398,6 @@ static const struct io_op_def io_op_defs[]; #define 
IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) -static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, @@ -444,7 +405,6 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static void io_dismantle_req(struct io_kiocb *req); -static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); @@ -456,9 +416,7 @@ static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static void io_eventfd_signal(struct io_ring_ctx *ctx); -static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); static struct kmem_cache *req_cachep; @@ -609,9 +567,6 @@ static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) } } -#define io_for_each_link(pos, head) \ - for (pos = (head); pos; pos = pos->link) - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs)) @@ -803,24 +758,6 @@ static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) io_ring_submit_unlock(ctx, issue_flags); } -static bool io_match_task(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *req; - - if (task && head->task != task) - return false; - if (cancel_all) - return true; - - io_for_each_link(req, head) { - if (req->flags & REQ_F_INFLIGHT) - return true; - } - return false; -} - static bool io_match_linked(struct 
io_kiocb *head) { struct io_kiocb *req; @@ -877,13 +814,6 @@ static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) complete(&ctx->ref_comp); } -static inline bool io_is_timeout_noseq(struct io_kiocb *req) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - - return !timeout->off; -} - static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -1120,24 +1050,6 @@ static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) io_queue_linked_timeout(link); } -static void io_kill_timeout(struct io_kiocb *req, int status) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_timeout_data *io = req->async_data; - - if (hrtimer_try_to_cancel(&io->timer) != -1) { - struct io_timeout *timeout = io_kiocb_to_cmd(req); - - if (status) - req_set_fail(req); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&timeout->list); - io_req_tw_post_queue(req, status, 0); - } -} - static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { @@ -1152,38 +1064,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) - __must_hold(&ctx->completion_lock) -{ - u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - struct io_timeout *timeout, *tmp; - - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { - struct io_kiocb *req = cmd_to_io_kiocb(timeout); - u32 events_needed, events_got; - - if (io_is_timeout_noseq(req)) - break; - - /* - * Since seq can easily wrap around over time, subtract - * the last seq at which timeouts were flushed before comparing. 
- * Assuming not more than 2^31-1 events have happened since, - * these subtractions won't have wrapped, so we can check if - * target is in [last_seq, current_seq] by comparing the two. - */ - events_needed = timeout->target_seq - ctx->cq_last_tm_flush; - events_got = seq - ctx->cq_last_tm_flush; - if (events_got < events_needed) - break; - - io_kill_timeout(req, 0); - } - ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); -} - static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used || ctx->drain_active) { @@ -1585,14 +1465,14 @@ static void __io_req_complete_put(struct io_kiocb *req) } } -static void __io_req_complete_post(struct io_kiocb *req) +void __io_req_complete_post(struct io_kiocb *req) { if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(req->ctx, req); __io_req_complete_put(req); } -static void io_req_complete_post(struct io_kiocb *req) +void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1717,7 +1597,7 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -static __cold void io_free_req(struct io_kiocb *req) +__cold void io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1731,96 +1611,6 @@ static __cold void io_free_req(struct io_kiocb *req) spin_unlock(&ctx->completion_lock); } -static inline void io_remove_next_linked(struct io_kiocb *req) -{ - struct io_kiocb *nxt = req->link; - - req->link = nxt->link; - nxt->link = NULL; -} - -static struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) - __must_hold(&req->ctx->timeout_lock) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - struct io_timeout_data *io = link->async_data; - struct io_timeout *timeout = io_kiocb_to_cmd(link); - - io_remove_next_linked(req); - timeout->head = NULL; - if (hrtimer_try_to_cancel(&io->timer) != -1) { - 
list_del(&timeout->list); - return link; - } - } - return NULL; -} - -static void io_fail_links(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *nxt, *link = req->link; - bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; - - req->link = NULL; - while (link) { - long res = -ECANCELED; - - if (link->flags & REQ_F_FAIL) - res = link->cqe.res; - - nxt = link->link; - link->link = NULL; - - trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, - req->opcode, link); - - if (ignore_cqes) - link->flags |= REQ_F_CQE_SKIP; - else - link->flags &= ~REQ_F_CQE_SKIP; - io_req_set_res(link, res, 0); - __io_req_complete_post(link); - link = nxt; - } -} - -static bool io_disarm_next(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - struct io_kiocb *link = NULL; - bool posted = false; - - if (req->flags & REQ_F_ARM_LTIMEOUT) { - link = req->link; - req->flags &= ~REQ_F_ARM_LTIMEOUT; - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { - io_remove_next_linked(req); - io_req_tw_post_queue(link, -ECANCELED, 0); - posted = true; - } - } else if (req->flags & REQ_F_LINK_TIMEOUT) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); - if (link) { - posted = true; - io_req_tw_post_queue(link, -ECANCELED, 0); - } - } - if (unlikely((req->flags & REQ_F_FAIL) && - !(req->flags & REQ_F_HARDLINK))) { - posted |= (req->link != NULL); - io_fail_links(req); - } - return posted; -} - static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -2033,7 +1823,7 @@ static void io_req_tw_post(struct io_kiocb *req, bool *locked) io_req_complete_post(req); } -static void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) +void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags) { io_req_set_res(req, res, cflags); req->io_task_work.func = io_req_tw_post; @@ -2057,7 +1847,7 
@@ static void io_req_task_submit(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, -EFAULT); } -static void io_req_task_queue_fail(struct io_kiocb *req, int ret) +void io_req_task_queue_fail(struct io_kiocb *req, int ret) { io_req_set_res(req, ret, 0); req->io_task_work.func = io_req_task_cancel; @@ -2076,7 +1866,7 @@ static void io_req_task_queue_reissue(struct io_kiocb *req) io_req_task_work_add(req); } -static void io_queue_next(struct io_kiocb *req) +void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2177,14 +1967,6 @@ static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) return nxt; } -static inline void io_put_req(struct io_kiocb *req) -{ - if (req_ref_put_and_test(req)) { - io_queue_next(req); - io_free_req(req); - } -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ @@ -2451,7 +2233,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) return false; } -static inline void io_req_task_complete(struct io_kiocb *req, bool *locked) +inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { if (*locked) { req->cqe.flags |= io_put_kbuf(req, 0); @@ -4600,334 +4382,6 @@ static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *req = data->req; - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - list_del_init(&timeout->list); - atomic_set(&req->ctx->cq_timeouts, - atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) - req_set_fail(req); - - io_req_set_res(req, -ETIME, 0); - 
req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout *timeout; - struct io_timeout_data *io; - struct io_kiocb *req = NULL; - - list_for_each_entry(timeout, &ctx->timeout_list, list) { - struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - cd->data != tmp->cqe.user_data) - continue; - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == tmp->work.cancel_seq) - continue; - tmp->work.cancel_seq = cd->seq; - } - req = tmp; - break; - } - if (!req) - return ERR_PTR(-ENOENT); - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return ERR_PTR(-EALREADY); - timeout = io_kiocb_to_cmd(req); - list_del_init(&timeout->list); - return req; -} - -static int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - spin_lock_irq(&ctx->timeout_lock); - req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); - - if (IS_ERR(req)) - return PTR_ERR(req); - io_req_task_queue_fail(req, -ECANCELED); - return 0; -} - -static clockid_t io_timeout_get_clock(struct io_timeout_data *data) -{ - switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { - case IORING_TIMEOUT_BOOTTIME: - return CLOCK_BOOTTIME; - case IORING_TIMEOUT_REALTIME: - return CLOCK_REALTIME; - default: - /* can't happen, vetted at prep time */ - WARN_ON_ONCE(1); - fallthrough; - case 0: - return CLOCK_MONOTONIC; - } -} - -static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_timeout_data *io; - struct io_timeout *timeout; - struct io_kiocb *req = NULL; - - list_for_each_entry(timeout, 
&ctx->ltimeout_list, list) { - struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); - - if (user_data == tmp->cqe.user_data) { - req = tmp; - break; - } - } - if (!req) - return -ENOENT; - - io = req->async_data; - if (hrtimer_try_to_cancel(&io->timer) == -1) - return -EALREADY; - hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); - io->timer.function = io_link_timeout_fn; - hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, - struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->timeout_lock) -{ - struct io_cancel_data cd = { .data = user_data, }; - struct io_kiocb *req = io_timeout_extract(ctx, &cd); - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_timeout_data *data; - - if (IS_ERR(req)) - return PTR_ERR(req); - - timeout->off = 0; /* noseq */ - data = req->async_data; - list_add_tail(&timeout->list, &ctx->timeout_list); - hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); - return 0; -} - -static int io_timeout_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_timeout_rem *tr = io_kiocb_to_cmd(req); - - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->buf_index || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - tr->ltimeout = false; - tr->addr = READ_ONCE(sqe->addr); - tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { - if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) - tr->ltimeout = true; - if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) - return -EINVAL; - if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) - return -EFAULT; - if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) - return -EINVAL; - } else 
if (tr->flags) { - /* timeout removal doesn't support flags */ - return -EINVAL; - } - - return 0; -} - -static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) -{ - return (flags & IORING_TIMEOUT_ABS) ? HRTIMER_MODE_ABS - : HRTIMER_MODE_REL; -} - -/* - * Remove or update an existing timeout command - */ -static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_timeout_rem *tr = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - int ret; - - if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { - struct io_cancel_data cd = { .data = tr->addr, }; - - spin_lock(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, &cd); - spin_unlock(&ctx->completion_lock); - } else { - enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - - spin_lock_irq(&ctx->timeout_lock); - if (tr->ltimeout) - ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); - else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); - } - - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int __io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - bool is_timeout_link) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_timeout_data *data; - unsigned flags; - u32 off = READ_ONCE(sqe->off); - - if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) - return -EINVAL; - if (off && is_timeout_link) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | - IORING_TIMEOUT_ETIME_SUCCESS)) - return -EINVAL; - /* more than one clock specified is invalid, obviously */ - if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) - return -EINVAL; - - INIT_LIST_HEAD(&timeout->list); - timeout->off = off; - if (unlikely(off && !req->ctx->off_timeout_used)) - req->ctx->off_timeout_used = true; - - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; 
- if (io_alloc_async_data(req)) - return -ENOMEM; - - data = req->async_data; - data->req = req; - data->flags = flags; - - if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) - return -EFAULT; - - if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) - return -EINVAL; - - INIT_LIST_HEAD(&timeout->list); - data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); - - if (is_timeout_link) { - struct io_submit_link *link = &req->ctx->submit_state.link; - - if (!link->head) - return -EINVAL; - if (link->last->opcode == IORING_OP_LINK_TIMEOUT) - return -EINVAL; - timeout->head = link->last; - link->last->flags |= REQ_F_ARM_LTIMEOUT; - } - return 0; -} - -static int io_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, false); -} - -static int io_link_timeout_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return __io_timeout_prep(req, sqe, true); -} - -static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_timeout_data *data = req->async_data; - struct list_head *entry; - u32 tail, off = timeout->off; - - spin_lock_irq(&ctx->timeout_lock); - - /* - * sqe->off holds how many events that need to occur for this - * timeout event to be satisfied. If it isn't set, then this is - * a pure timeout request, sequence isn't used. - */ - if (io_is_timeout_noseq(req)) { - entry = ctx->timeout_list.prev; - goto add; - } - - tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); - timeout->target_seq = tail + off; - - /* Update the last seq here in case io_flush_timeouts() hasn't. - * This is safe because ->completion_lock is held, and submissions - * and completions are never mixed in the same ->completion_lock section. 
- */ - ctx->cq_last_tm_flush = tail; - - /* - * Insertion sort, ensuring the first entry in the list is always - * the one we need first. - */ - list_for_each_prev(entry, &ctx->timeout_list) { - struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); - struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); - - if (io_is_timeout_noseq(nxt)) - continue; - /* nxt.seq is behind @tail, otherwise would've been completed */ - if (off >= nextt->target_seq - tail) - break; - } -add: - list_add(&timeout->list, entry); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); - return IOU_ISSUE_SKIP_COMPLETE; -} - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -4979,7 +4433,7 @@ static int io_async_cancel_one(struct io_uring_task *tctx, return ret; } -static int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -5462,84 +4916,6 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) return file; } -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_kiocb *prev = timeout->prev; - int ret = -ENOENT; - - if (prev) { - if (!(req->task->flags & PF_EXITING)) { - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = prev->cqe.user_data, - }; - - ret = io_try_cancel(req, &cd); - } - io_req_set_res(req, ret ?: -ETIME, 0); - io_req_complete_post(req); - io_put_req(prev); - } else { - io_req_set_res(req, -ETIME, 0); - io_req_complete_post(req); - } -} - -static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) -{ - struct io_timeout_data *data = container_of(timer, - struct io_timeout_data, timer); - struct io_kiocb *prev, *req = data->req; - struct io_timeout 
*timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->timeout_lock, flags); - prev = timeout->head; - timeout->head = NULL; - - /* - * We don't expect the list to be empty, that will only happen if we - * race with the completion of the linked work. - */ - if (prev) { - io_remove_next_linked(prev); - if (!req_ref_inc_not_zero(prev)) - prev = NULL; - } - list_del(&timeout->list); - timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); - - req->io_task_work.func = io_req_task_link_timeout; - io_req_task_work_add(req); - return HRTIMER_NORESTART; -} - -static void io_queue_linked_timeout(struct io_kiocb *req) -{ - struct io_timeout *timeout = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - - spin_lock_irq(&ctx->timeout_lock); - /* - * If the back reference is NULL, then our linked request finished - * before we got a chance to setup the timer - */ - if (timeout->head) { - struct io_timeout_data *data = req->async_data; - - data->timer.function = io_link_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), - data->mode); - list_add_tail(&timeout->list, &ctx->ltimeout_list); - } - spin_unlock_irq(&ctx->timeout_lock); - /* drop submission reference */ - io_put_req(req); -} - static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { @@ -8116,31 +7492,6 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_ring_ctx_free(ctx); } -/* Returns true if we found and killed one or more timeouts */ -static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct io_timeout *timeout, *tmp; - int canceled = 0; - - spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); - list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { - struct io_kiocb *req = cmd_to_io_kiocb(timeout); - - if (io_match_task(req, tsk, cancel_all)) { - 
io_kill_timeout(req, -ECANCELED); - canceled++; - } - } - spin_unlock_irq(&ctx->timeout_lock); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (canceled != 0) - io_cqring_ev_posted(ctx); - return canceled != 0; -} - static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4b46385720c585..e285e12ccbdbb1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -65,7 +65,8 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) } void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); - +void io_req_complete_post(struct io_kiocb *req); +void __io_req_complete_post(struct io_kiocb *req); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); @@ -96,5 +97,15 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); +void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); +void io_req_task_complete(struct io_kiocb *req, bool *locked); +void io_req_task_queue_fail(struct io_kiocb *req, int ret); +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); + +void io_free_req(struct io_kiocb *req); +void io_queue_next(struct io_kiocb *req); + +#define io_for_each_link(pos, head) \ + for (pos = (head); pos; pos = pos->link) #endif diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index dba72113c59d67..349524907b6bfc 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -488,4 +488,14 @@ struct io_kiocb { struct io_wq_work work; }; +struct io_cancel_data { + struct io_ring_ctx *ctx; + union { + u64 data; + struct file *file; + }; + u32 flags; + int seq; +}; + #endif diff --git a/io_uring/timeout.c b/io_uring/timeout.c new file mode 100644 index 
00000000000000..5e42bfcd683e2a --- /dev/null +++ b/io_uring/timeout.c @@ -0,0 +1,634 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "refs.h" +#include "timeout.h" + +struct io_timeout { + struct file *file; + u32 off; + u32 target_seq; + struct list_head list; + /* head of the link, used by linked timeouts only */ + struct io_kiocb *head; + /* for linked completions */ + struct io_kiocb *prev; +}; + +struct io_timeout_rem { + struct file *file; + u64 addr; + + /* timeout update */ + struct timespec64 ts; + u32 flags; + bool ltimeout; +}; + +static inline bool io_is_timeout_noseq(struct io_kiocb *req) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + + return !timeout->off; +} + +static inline void io_put_req(struct io_kiocb *req) +{ + if (req_ref_put_and_test(req)) { + io_queue_next(req); + io_free_req(req); + } +} + +static void io_kill_timeout(struct io_kiocb *req, int status) + __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) +{ + struct io_timeout_data *io = req->async_data; + + if (hrtimer_try_to_cancel(&io->timer) != -1) { + struct io_timeout *timeout = io_kiocb_to_cmd(req); + + if (status) + req_set_fail(req); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + list_del_init(&timeout->list); + io_req_tw_post_queue(req, status, 0); + } +} + +__cold void io_flush_timeouts(struct io_ring_ctx *ctx) + __must_hold(&ctx->completion_lock) +{ + u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + struct io_timeout *timeout, *tmp; + + spin_lock_irq(&ctx->timeout_lock); + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = cmd_to_io_kiocb(timeout); + u32 events_needed, events_got; + + if (io_is_timeout_noseq(req)) + break; + + /* + * Since seq can easily wrap around over time, subtract + * the last seq at which timeouts were 
flushed before comparing. + * Assuming not more than 2^31-1 events have happened since, + * these subtractions won't have wrapped, so we can check if + * target is in [last_seq, current_seq] by comparing the two. + */ + events_needed = timeout->target_seq - ctx->cq_last_tm_flush; + events_got = seq - ctx->cq_last_tm_flush; + if (events_got < events_needed) + break; + + io_kill_timeout(req, 0); + } + ctx->cq_last_tm_flush = seq; + spin_unlock_irq(&ctx->timeout_lock); +} + +static void io_fail_links(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + struct io_kiocb *nxt, *link = req->link; + bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; + + req->link = NULL; + while (link) { + long res = -ECANCELED; + + if (link->flags & REQ_F_FAIL) + res = link->cqe.res; + + nxt = link->link; + link->link = NULL; + + trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, + req->opcode, link); + + if (ignore_cqes) + link->flags |= REQ_F_CQE_SKIP; + else + link->flags &= ~REQ_F_CQE_SKIP; + io_req_set_res(link, res, 0); + __io_req_complete_post(link); + link = nxt; + } +} + +static inline void io_remove_next_linked(struct io_kiocb *req) +{ + struct io_kiocb *nxt = req->link; + + req->link = nxt->link; + nxt->link = NULL; +} + +bool io_disarm_next(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + struct io_kiocb *link = NULL; + bool posted = false; + + if (req->flags & REQ_F_ARM_LTIMEOUT) { + link = req->link; + req->flags &= ~REQ_F_ARM_LTIMEOUT; + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { + io_remove_next_linked(req); + io_req_tw_post_queue(link, -ECANCELED, 0); + posted = true; + } + } else if (req->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); + link = io_disarm_linked_timeout(req); + spin_unlock_irq(&ctx->timeout_lock); + if (link) { + posted = true; + io_req_tw_post_queue(link, -ECANCELED, 0); + } + } + if (unlikely((req->flags & REQ_F_FAIL) && + !(req->flags & 
REQ_F_HARDLINK))) { + posted |= (req->link != NULL); + io_fail_links(req); + } + return posted; +} + +struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link) + __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) +{ + struct io_timeout_data *io = link->async_data; + struct io_timeout *timeout = io_kiocb_to_cmd(link); + + io_remove_next_linked(req); + timeout->head = NULL; + if (hrtimer_try_to_cancel(&io->timer) != -1) { + list_del(&timeout->list); + return link; + } + + return NULL; +} + +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) +{ + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->timeout_lock, flags); + list_del_init(&timeout->list); + atomic_set(&req->ctx->cq_timeouts, + atomic_read(&req->ctx->cq_timeouts) + 1); + spin_unlock_irqrestore(&ctx->timeout_lock, flags); + + if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(req); + + io_req_set_res(req, -ETIME, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return HRTIMER_NORESTART; +} + +static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout *timeout; + struct io_timeout_data *io; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->timeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); + + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + cd->data != tmp->cqe.user_data) + continue; + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == tmp->work.cancel_seq) + continue; + tmp->work.cancel_seq = cd->seq; + } + req = tmp; + break; + } + if (!req) + return ERR_PTR(-ENOENT); + + io = req->async_data; + if 
(hrtimer_try_to_cancel(&io->timer) == -1) + return ERR_PTR(-EALREADY); + timeout = io_kiocb_to_cmd(req); + list_del_init(&timeout->list); + return req; +} + +int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + + spin_lock_irq(&ctx->timeout_lock); + req = io_timeout_extract(ctx, cd); + spin_unlock_irq(&ctx->timeout_lock); + + if (IS_ERR(req)) + return PTR_ERR(req); + io_req_task_queue_fail(req, -ECANCELED); + return 0; +} + +static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_kiocb *prev = timeout->prev; + int ret = -ENOENT; + + if (prev) { + if (!(req->task->flags & PF_EXITING)) { + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = prev->cqe.user_data, + }; + + ret = io_try_cancel(req, &cd); + } + io_req_set_res(req, ret ?: -ETIME, 0); + io_req_complete_post(req); + io_put_req(prev); + } else { + io_req_set_res(req, -ETIME, 0); + io_req_complete_post(req); + } +} + +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) +{ + struct io_timeout_data *data = container_of(timer, + struct io_timeout_data, timer); + struct io_kiocb *prev, *req = data->req; + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long flags; + + spin_lock_irqsave(&ctx->timeout_lock, flags); + prev = timeout->head; + timeout->head = NULL; + + /* + * We don't expect the list to be empty, that will only happen if we + * race with the completion of the linked work. 
+ */ + if (prev) { + io_remove_next_linked(prev); + if (!req_ref_inc_not_zero(prev)) + prev = NULL; + } + list_del(&timeout->list); + timeout->prev = prev; + spin_unlock_irqrestore(&ctx->timeout_lock, flags); + + req->io_task_work.func = io_req_task_link_timeout; + io_req_task_work_add(req); + return HRTIMER_NORESTART; +} + +static clockid_t io_timeout_get_clock(struct io_timeout_data *data) +{ + switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout_data *io; + struct io_timeout *timeout; + struct io_kiocb *req = NULL; + + list_for_each_entry(timeout, &ctx->ltimeout_list, list) { + struct io_kiocb *tmp = cmd_to_io_kiocb(timeout); + + if (user_data == tmp->cqe.user_data) { + req = tmp; + break; + } + } + if (!req) + return -ENOENT; + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return -EALREADY; + hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); + io->timer.function = io_link_timeout_fn; + hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + +static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_cancel_data cd = { .data = user_data, }; + struct io_kiocb *req = io_timeout_extract(ctx, &cd); + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_timeout_data *data; + + if (IS_ERR(req)) + return PTR_ERR(req); + + timeout->off = 0; /* noseq */ + data = req->async_data; + list_add_tail(&timeout->list, &ctx->timeout_list); + hrtimer_init(&data->timer, 
io_timeout_get_clock(data), mode); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + return 0; +} + +int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); + + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->buf_index || sqe->len || sqe->splice_fd_in) + return -EINVAL; + + tr->ltimeout = false; + tr->addr = READ_ONCE(sqe->addr); + tr->flags = READ_ONCE(sqe->timeout_flags); + if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { + if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) + tr->ltimeout = true; + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) + return -EINVAL; + if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) + return -EFAULT; + if (tr->ts.tv_sec < 0 || tr->ts.tv_nsec < 0) + return -EINVAL; + } else if (tr->flags) { + /* timeout removal doesn't support flags */ + return -EINVAL; + } + + return 0; +} + +static inline enum hrtimer_mode io_translate_timeout_mode(unsigned int flags) +{ + return (flags & IORING_TIMEOUT_ABS) ? 
HRTIMER_MODE_ABS + : HRTIMER_MODE_REL; +} + +/* + * Remove or update an existing timeout command + */ +int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_timeout_rem *tr = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + int ret; + + if (!(tr->flags & IORING_TIMEOUT_UPDATE)) { + struct io_cancel_data cd = { .data = tr->addr, }; + + spin_lock(&ctx->completion_lock); + ret = io_timeout_cancel(ctx, &cd); + spin_unlock(&ctx->completion_lock); + } else { + enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); + + spin_lock_irq(&ctx->timeout_lock); + if (tr->ltimeout) + ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + spin_unlock_irq(&ctx->timeout_lock); + } + + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +static int __io_timeout_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe, + bool is_timeout_link) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_timeout_data *data; + unsigned flags; + u32 off = READ_ONCE(sqe->off); + + if (sqe->buf_index || sqe->len != 1 || sqe->splice_fd_in) + return -EINVAL; + if (off && is_timeout_link) + return -EINVAL; + flags = READ_ONCE(sqe->timeout_flags); + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | + IORING_TIMEOUT_ETIME_SUCCESS)) + return -EINVAL; + /* more than one clock specified is invalid, obviously */ + if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + + INIT_LIST_HEAD(&timeout->list); + timeout->off = off; + if (unlikely(off && !req->ctx->off_timeout_used)) + req->ctx->off_timeout_used = true; + + if (WARN_ON_ONCE(req_has_async_data(req))) + return -EFAULT; + if (io_alloc_async_data(req)) + return -ENOMEM; + + data = req->async_data; + data->req = req; + data->flags = flags; + + if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) + return -EFAULT; + + if (data->ts.tv_sec < 0 || 
data->ts.tv_nsec < 0) + return -EINVAL; + + INIT_LIST_HEAD(&timeout->list); + data->mode = io_translate_timeout_mode(flags); + hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); + + if (is_timeout_link) { + struct io_submit_link *link = &req->ctx->submit_state.link; + + if (!link->head) + return -EINVAL; + if (link->last->opcode == IORING_OP_LINK_TIMEOUT) + return -EINVAL; + timeout->head = link->last; + link->last->flags |= REQ_F_ARM_LTIMEOUT; + } + return 0; +} + +int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, false); +} + +int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_timeout_prep(req, sqe, true); +} + +int io_timeout(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_timeout_data *data = req->async_data; + struct list_head *entry; + u32 tail, off = timeout->off; + + spin_lock_irq(&ctx->timeout_lock); + + /* + * sqe->off holds how many events that need to occur for this + * timeout event to be satisfied. If it isn't set, then this is + * a pure timeout request, sequence isn't used. + */ + if (io_is_timeout_noseq(req)) { + entry = ctx->timeout_list.prev; + goto add; + } + + tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + timeout->target_seq = tail + off; + + /* Update the last seq here in case io_flush_timeouts() hasn't. + * This is safe because ->completion_lock is held, and submissions + * and completions are never mixed in the same ->completion_lock section. + */ + ctx->cq_last_tm_flush = tail; + + /* + * Insertion sort, ensuring the first entry in the list is always + * the one we need first. 
+ */ + list_for_each_prev(entry, &ctx->timeout_list) { + struct io_timeout *nextt = list_entry(entry, struct io_timeout, list); + struct io_kiocb *nxt = cmd_to_io_kiocb(nextt); + + if (io_is_timeout_noseq(nxt)) + continue; + /* nxt.seq is behind @tail, otherwise would've been completed */ + if (off >= nextt->target_seq - tail) + break; + } +add: + list_add(&timeout->list, entry); + data->timer.function = io_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + spin_unlock_irq(&ctx->timeout_lock); + return IOU_ISSUE_SKIP_COMPLETE; +} + +void io_queue_linked_timeout(struct io_kiocb *req) +{ + struct io_timeout *timeout = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); + /* + * If the back reference is NULL, then our linked request finished + * before we got a chance to setup the timer + */ + if (timeout->head) { + struct io_timeout_data *data = req->async_data; + + data->timer.function = io_link_timeout_fn; + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), + data->mode); + list_add_tail(&timeout->list, &ctx->ltimeout_list); + } + spin_unlock_irq(&ctx->timeout_lock); + /* drop submission reference */ + io_put_req(req); +} + +static bool io_match_task(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) + __must_hold(&req->ctx->timeout_lock) +{ + struct io_kiocb *req; + + if (task && head->task != task) + return false; + if (cancel_all) + return true; + + io_for_each_link(req, head) { + if (req->flags & REQ_F_INFLIGHT) + return true; + } + return false; +} + +/* Returns true if we found and killed one or more timeouts */ +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all) +{ + struct io_timeout *timeout, *tmp; + int canceled = 0; + + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); + list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { + struct io_kiocb *req = 
cmd_to_io_kiocb(timeout); + + if (io_match_task(req, tsk, cancel_all)) { + io_kill_timeout(req, -ECANCELED); + canceled++; + } + } + spin_unlock_irq(&ctx->timeout_lock); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (canceled != 0) + io_cqring_ev_posted(ctx); + return canceled != 0; +} diff --git a/io_uring/timeout.h b/io_uring/timeout.h new file mode 100644 index 00000000000000..dd7cfb0d936671 --- /dev/null +++ b/io_uring/timeout.h @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 + +struct io_timeout_data { + struct io_kiocb *req; + struct hrtimer timer; + struct timespec64 ts; + enum hrtimer_mode mode; + u32 flags; +}; + +struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link); + +static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) +{ + struct io_kiocb *link = req->link; + + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) + return __io_disarm_linked_timeout(req, link); + + return NULL; +} + +__cold void io_flush_timeouts(struct io_ring_ctx *ctx); +int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all); +void io_queue_linked_timeout(struct io_kiocb *req); +bool io_disarm_next(struct io_kiocb *req); + +int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_link_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_timeout(struct io_kiocb *req, unsigned int issue_flags); +int io_timeout_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags); From 078a30a655ff8a26ede7dbafaccefe280b55a290 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 09:13:39 -0600 Subject: [PATCH 0969/1250] io_uring: move SQPOLL related handling into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 467 
+------------------------------------------- io_uring/io_uring.h | 34 ++++ io_uring/sqpoll.c | 426 ++++++++++++++++++++++++++++++++++++++++ io_uring/sqpoll.h | 29 +++ 5 files changed, 497 insertions(+), 462 deletions(-) create mode 100644 io_uring/sqpoll.c create mode 100644 io_uring/sqpoll.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 6ae4e45a15dbfc..c59a9ca74262a4 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o timeout.o + statx.o net.o msg_ring.o timeout.o \ + sqpoll.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3fc59a22d54e89..17c555aa03bc9c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "io_uring_types.h" #include "io_uring.h" #include "refs.h" +#include "sqpoll.h" #include "xattr.h" #include "nop.h" @@ -109,7 +110,6 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) @@ -214,31 +214,6 @@ struct io_buffer { __u16 bgid; }; -enum { - IO_SQ_THREAD_SHOULD_STOP = 0, - IO_SQ_THREAD_SHOULD_PARK, -}; - -struct io_sq_data { - refcount_t refs; - atomic_t park_pending; - struct mutex lock; - - /* ctx's that are using this sqd */ - struct list_head ctx_list; - - struct task_struct *thread; - struct wait_queue_head wait; - - unsigned sq_thread_idle; - int sq_cpu; - pid_t task_pid; - pid_t task_tgid; - - unsigned long state; - struct completion exited; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 @@ -402,7 +377,6 @@ static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -static void 
io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static void io_dismantle_req(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, @@ -1079,13 +1053,6 @@ static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -static inline bool io_sqring_full(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; -} - static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -1974,28 +1941,7 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx) return __io_cqring_events(ctx); } -static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - - /* make sure SQ entry isn't read before tail */ - return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; -} - -static inline bool io_run_task_work(void) -{ - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - - return false; -} - -static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) +int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { struct io_wq_work_node *pos, *start, *prev; unsigned int poll_flags = BLK_POLL_NOSLEEP; @@ -5297,7 +5243,7 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) return NULL; } -static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) +int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { unsigned int entries = io_sqring_entries(ctx); @@ -5349,173 +5295,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) return ret; } -static inline bool io_sqd_events_pending(struct io_sq_data *sqd) -{ - return READ_ONCE(sqd->state); -} - 
-static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) -{ - unsigned int to_submit; - int ret = 0; - - to_submit = io_sqring_entries(ctx); - /* if we're handling multiple rings, cap submit size for fairness */ - if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) - to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - - if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { - const struct cred *creds = NULL; - - if (ctx->sq_creds != current_cred()) - creds = override_creds(ctx->sq_creds); - - mutex_lock(&ctx->uring_lock); - if (!wq_list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, true); - - /* - * Don't submit if refs are dying, good for io_uring_register(), - * but also it is relied upon by io_ring_exit_work() - */ - if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && - !(ctx->flags & IORING_SETUP_R_DISABLED)) - ret = io_submit_sqes(ctx, to_submit); - mutex_unlock(&ctx->uring_lock); - - if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) - wake_up(&ctx->sqo_sq_wait); - if (creds) - revert_creds(creds); - } - - return ret; -} - -static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) -{ - struct io_ring_ctx *ctx; - unsigned sq_thread_idle = 0; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); - sqd->sq_thread_idle = sq_thread_idle; -} - -static bool io_sqd_handle_event(struct io_sq_data *sqd) -{ - bool did_sig = false; - struct ksignal ksig; - - if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || - signal_pending(current)) { - mutex_unlock(&sqd->lock); - if (signal_pending(current)) - did_sig = get_signal(&ksig); - cond_resched(); - mutex_lock(&sqd->lock); - } - return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); -} - -static int io_sq_thread(void *data) -{ - struct io_sq_data *sqd = data; - struct io_ring_ctx *ctx; - unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; - DEFINE_WAIT(wait); - - snprintf(buf, sizeof(buf), "iou-sqp-%d", 
sqd->task_pid); - set_task_comm(current, buf); - - if (sqd->sq_cpu != -1) - set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); - else - set_cpus_allowed_ptr(current, cpu_online_mask); - current->flags |= PF_NO_SETAFFINITY; - - audit_alloc_kernel(current); - - mutex_lock(&sqd->lock); - while (1) { - bool cap_entries, sqt_spin = false; - - if (io_sqd_events_pending(sqd) || signal_pending(current)) { - if (io_sqd_handle_event(sqd)) - break; - timeout = jiffies + sqd->sq_thread_idle; - } - - cap_entries = !list_is_singular(&sqd->ctx_list); - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - int ret = __io_sq_thread(ctx, cap_entries); - - if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) - sqt_spin = true; - } - if (io_run_task_work()) - sqt_spin = true; - - if (sqt_spin || !time_after(jiffies, timeout)) { - cond_resched(); - if (sqt_spin) - timeout = jiffies + sqd->sq_thread_idle; - continue; - } - - prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); - if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { - bool needs_sched = true; - - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - atomic_or(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - if ((ctx->flags & IORING_SETUP_IOPOLL) && - !wq_list_empty(&ctx->iopoll_list)) { - needs_sched = false; - break; - } - - /* - * Ensure the store of the wakeup flag is not - * reordered with the load of the SQ tail - */ - smp_mb__after_atomic(); - - if (io_sqring_entries(ctx)) { - needs_sched = false; - break; - } - } - - if (needs_sched) { - mutex_unlock(&sqd->lock); - schedule(); - mutex_lock(&sqd->lock); - } - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_andnot(IORING_SQ_NEED_WAKEUP, - &ctx->rings->sq_flags); - } - - finish_wait(&sqd->wait, &wait); - timeout = jiffies + sqd->sq_thread_idle; - } - - io_uring_cancel_generic(true, sqd); - sqd->thread = NULL; - list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); 
- io_run_task_work(); - mutex_unlock(&sqd->lock); - - audit_free(current); - - complete(&sqd->exited); - do_exit(0); -} - struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; @@ -5934,131 +5713,6 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return ret; } -static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - /* - * Do the dance but not conditional clear_bit() because it'd race with - * other threads incrementing park_pending and setting the bit. - */ - clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - if (atomic_dec_return(&sqd->park_pending)) - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_unlock(&sqd->lock); -} - -static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) -{ - WARN_ON_ONCE(sqd->thread == current); - - atomic_inc(&sqd->park_pending); - set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); -} - -static void io_sq_thread_stop(struct io_sq_data *sqd) -{ - WARN_ON_ONCE(sqd->thread == current); - WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); - - set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); - mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); - mutex_unlock(&sqd->lock); - wait_for_completion(&sqd->exited); -} - -static void io_put_sq_data(struct io_sq_data *sqd) -{ - if (refcount_dec_and_test(&sqd->refs)) { - WARN_ON_ONCE(atomic_read(&sqd->park_pending)); - - io_sq_thread_stop(sqd); - kfree(sqd); - } -} - -static void io_sq_thread_finish(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd) { - io_sq_thread_park(sqd); - list_del_init(&ctx->sqd_list); - io_sqd_update_thread_idle(sqd); - io_sq_thread_unpark(sqd); - - io_put_sq_data(sqd); - ctx->sq_data = NULL; - } -} - -static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) -{ - struct io_ring_ctx 
*ctx_attach; - struct io_sq_data *sqd; - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return ERR_PTR(-ENXIO); - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return ERR_PTR(-EINVAL); - } - - ctx_attach = f.file->private_data; - sqd = ctx_attach->sq_data; - if (!sqd) { - fdput(f); - return ERR_PTR(-EINVAL); - } - if (sqd->task_tgid != current->tgid) { - fdput(f); - return ERR_PTR(-EPERM); - } - - refcount_inc(&sqd->refs); - fdput(f); - return sqd; -} - -static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, - bool *attached) -{ - struct io_sq_data *sqd; - - *attached = false; - if (p->flags & IORING_SETUP_ATTACH_WQ) { - sqd = io_attach_sq_data(p); - if (!IS_ERR(sqd)) { - *attached = true; - return sqd; - } - /* fall through for EPERM case, setup new sqd/task */ - if (PTR_ERR(sqd) != -EPERM) - return sqd; - } - - sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); - if (!sqd) - return ERR_PTR(-ENOMEM); - - atomic_set(&sqd->park_pending, 0); - refcount_set(&sqd->refs, 1); - INIT_LIST_HEAD(&sqd->ctx_list); - mutex_init(&sqd->lock); - init_waitqueue_head(&sqd->wait); - init_completion(&sqd->exited); - return sqd; -} - /* * Ensure the UNIX gc is aware of our file set, so we are certain that * the io_uring can be safely unregistered on process exit, even if we have @@ -6495,8 +6149,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, return io_wq_create(concurrency, &data); } -static __cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) +__cold int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; @@ -6554,96 +6208,6 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, - struct io_uring_params *p) -{ - int ret; - - /* Retain compatibility with failing for an invalid attach attempt */ - if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | 
IORING_SETUP_SQPOLL)) == - IORING_SETUP_ATTACH_WQ) { - struct fd f; - - f = fdget(p->wq_fd); - if (!f.file) - return -ENXIO; - if (f.file->f_op != &io_uring_fops) { - fdput(f); - return -EINVAL; - } - fdput(f); - } - if (ctx->flags & IORING_SETUP_SQPOLL) { - struct task_struct *tsk; - struct io_sq_data *sqd; - bool attached; - - ret = security_uring_sqpoll(); - if (ret) - return ret; - - sqd = io_get_sq_data(p, &attached); - if (IS_ERR(sqd)) { - ret = PTR_ERR(sqd); - goto err; - } - - ctx->sq_creds = get_current_cred(); - ctx->sq_data = sqd; - ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); - if (!ctx->sq_thread_idle) - ctx->sq_thread_idle = HZ; - - io_sq_thread_park(sqd); - list_add(&ctx->sqd_list, &sqd->ctx_list); - io_sqd_update_thread_idle(sqd); - /* don't attach to a dying SQPOLL thread, would be racy */ - ret = (attached && !sqd->thread) ? -ENXIO : 0; - io_sq_thread_unpark(sqd); - - if (ret < 0) - goto err; - if (attached) - return 0; - - if (p->flags & IORING_SETUP_SQ_AFF) { - int cpu = p->sq_thread_cpu; - - ret = -EINVAL; - if (cpu >= nr_cpu_ids || !cpu_online(cpu)) - goto err_sqpoll; - sqd->sq_cpu = cpu; - } else { - sqd->sq_cpu = -1; - } - - sqd->task_pid = current->pid; - sqd->task_tgid = current->tgid; - tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); - if (IS_ERR(tsk)) { - ret = PTR_ERR(tsk); - goto err_sqpoll; - } - - sqd->thread = tsk; - ret = io_uring_alloc_task_context(tsk, ctx); - wake_up_new_task(tsk); - if (ret) - goto err; - } else if (p->flags & IORING_SETUP_SQ_AFF) { - /* Can't have SQ_AFF without SQPOLL */ - ret = -EINVAL; - goto err; - } - - return 0; -err_sqpoll: - complete(&ctx->sq_data->exited); -err: - io_sq_thread_finish(ctx); - return ret; -} - static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -7755,8 +7319,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) * Find any io_uring ctx that this task has registered or done IO on, and cancel * requests. 
@sqd should be not-null IFF it's an SQPOLL thread cancellation. */ -static __cold void io_uring_cancel_generic(bool cancel_all, - struct io_sq_data *sqd) +__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; @@ -8034,24 +7597,6 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, #endif /* !CONFIG_MMU */ -static int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) -{ - DEFINE_WAIT(wait); - - do { - if (!io_sqring_full(ctx)) - break; - prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); - - if (!io_sqring_full(ctx)) - break; - schedule(); - } while (!signal_pending(current)); - - finish_wait(&ctx->sqo_sq_wait, &wait); - return 0; -} - static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) { if (flags & IORING_ENTER_EXT_ARG) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e285e12ccbdbb1..1da8e66507a350 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -64,6 +64,34 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static inline bool io_sqring_full(struct io_ring_ctx *ctx) +{ + struct io_rings *r = ctx->rings; + + return READ_ONCE(r->sq.tail) - ctx->cached_sq_head == ctx->sq_entries; +} + +static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + + /* make sure SQ entry isn't read before tail */ + return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; +} + +static inline bool io_run_task_work(void) +{ + if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { + __set_current_state(TASK_RUNNING); + clear_notify_signal(); + if (task_work_pending(current)) + task_work_run(); + return true; + } + + return false; +} + void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); 
void __io_req_complete_post(struct io_kiocb *req); @@ -101,6 +129,12 @@ void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); +__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); +int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx); + +int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); +int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c new file mode 100644 index 00000000000000..149d5c976f1467 --- /dev/null +++ b/io_uring/sqpoll.c @@ -0,0 +1,426 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Contains the core associated with submission side polling of the SQ + * ring, offloading submissions from the application to a kernel thread. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "sqpoll.h" + +#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 + +enum { + IO_SQ_THREAD_SHOULD_STOP = 0, + IO_SQ_THREAD_SHOULD_PARK, +}; + +void io_sq_thread_unpark(struct io_sq_data *sqd) + __releases(&sqd->lock) +{ + WARN_ON_ONCE(sqd->thread == current); + + /* + * Do the dance but not conditional clear_bit() because it'd race with + * other threads incrementing park_pending and setting the bit. 
+ */ + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + if (atomic_dec_return(&sqd->park_pending)) + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + mutex_unlock(&sqd->lock); +} + +void io_sq_thread_park(struct io_sq_data *sqd) + __acquires(&sqd->lock) +{ + WARN_ON_ONCE(sqd->thread == current); + + atomic_inc(&sqd->park_pending); + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + mutex_lock(&sqd->lock); + if (sqd->thread) + wake_up_process(sqd->thread); +} + +void io_sq_thread_stop(struct io_sq_data *sqd) +{ + WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); + + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + mutex_lock(&sqd->lock); + if (sqd->thread) + wake_up_process(sqd->thread); + mutex_unlock(&sqd->lock); + wait_for_completion(&sqd->exited); +} + +void io_put_sq_data(struct io_sq_data *sqd) +{ + if (refcount_dec_and_test(&sqd->refs)) { + WARN_ON_ONCE(atomic_read(&sqd->park_pending)); + + io_sq_thread_stop(sqd); + kfree(sqd); + } +} + +static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) +{ + struct io_ring_ctx *ctx; + unsigned sq_thread_idle = 0; + + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + sq_thread_idle = max(sq_thread_idle, ctx->sq_thread_idle); + sqd->sq_thread_idle = sq_thread_idle; +} + +void io_sq_thread_finish(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd) { + io_sq_thread_park(sqd); + list_del_init(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); + io_sq_thread_unpark(sqd); + + io_put_sq_data(sqd); + ctx->sq_data = NULL; + } +} + +static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx_attach; + struct io_sq_data *sqd; + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return ERR_PTR(-ENXIO); + if (!io_is_uring_fops(f.file)) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + ctx_attach = f.file->private_data; + sqd = ctx_attach->sq_data; + if (!sqd) { + fdput(f); + return 
ERR_PTR(-EINVAL); + } + if (sqd->task_tgid != current->tgid) { + fdput(f); + return ERR_PTR(-EPERM); + } + + refcount_inc(&sqd->refs); + fdput(f); + return sqd; +} + +static struct io_sq_data *io_get_sq_data(struct io_uring_params *p, + bool *attached) +{ + struct io_sq_data *sqd; + + *attached = false; + if (p->flags & IORING_SETUP_ATTACH_WQ) { + sqd = io_attach_sq_data(p); + if (!IS_ERR(sqd)) { + *attached = true; + return sqd; + } + /* fall through for EPERM case, setup new sqd/task */ + if (PTR_ERR(sqd) != -EPERM) + return sqd; + } + + sqd = kzalloc(sizeof(*sqd), GFP_KERNEL); + if (!sqd) + return ERR_PTR(-ENOMEM); + + atomic_set(&sqd->park_pending, 0); + refcount_set(&sqd->refs, 1); + INIT_LIST_HEAD(&sqd->ctx_list); + mutex_init(&sqd->lock); + init_waitqueue_head(&sqd->wait); + init_completion(&sqd->exited); + return sqd; +} + +static inline bool io_sqd_events_pending(struct io_sq_data *sqd) +{ + return READ_ONCE(sqd->state); +} + +static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) +{ + unsigned int to_submit; + int ret = 0; + + to_submit = io_sqring_entries(ctx); + /* if we're handling multiple rings, cap submit size for fairness */ + if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) + to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; + + if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { + const struct cred *creds = NULL; + + if (ctx->sq_creds != current_cred()) + creds = override_creds(ctx->sq_creds); + + mutex_lock(&ctx->uring_lock); + if (!wq_list_empty(&ctx->iopoll_list)) + io_do_iopoll(ctx, true); + + /* + * Don't submit if refs are dying, good for io_uring_register(), + * but also it is relied upon by io_ring_exit_work() + */ + if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) && + !(ctx->flags & IORING_SETUP_R_DISABLED)) + ret = io_submit_sqes(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait)) + wake_up(&ctx->sqo_sq_wait); + if (creds) + revert_creds(creds); + 
} + + return ret; +} + +static bool io_sqd_handle_event(struct io_sq_data *sqd) +{ + bool did_sig = false; + struct ksignal ksig; + + if (test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state) || + signal_pending(current)) { + mutex_unlock(&sqd->lock); + if (signal_pending(current)) + did_sig = get_signal(&ksig); + cond_resched(); + mutex_lock(&sqd->lock); + } + return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); +} + +static int io_sq_thread(void *data) +{ + struct io_sq_data *sqd = data; + struct io_ring_ctx *ctx; + unsigned long timeout = 0; + char buf[TASK_COMM_LEN]; + DEFINE_WAIT(wait); + + snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); + set_task_comm(current, buf); + + if (sqd->sq_cpu != -1) + set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); + else + set_cpus_allowed_ptr(current, cpu_online_mask); + current->flags |= PF_NO_SETAFFINITY; + + audit_alloc_kernel(current); + + mutex_lock(&sqd->lock); + while (1) { + bool cap_entries, sqt_spin = false; + + if (io_sqd_events_pending(sqd) || signal_pending(current)) { + if (io_sqd_handle_event(sqd)) + break; + timeout = jiffies + sqd->sq_thread_idle; + } + + cap_entries = !list_is_singular(&sqd->ctx_list); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + int ret = __io_sq_thread(ctx, cap_entries); + + if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) + sqt_spin = true; + } + if (io_run_task_work()) + sqt_spin = true; + + if (sqt_spin || !time_after(jiffies, timeout)) { + cond_resched(); + if (sqt_spin) + timeout = jiffies + sqd->sq_thread_idle; + continue; + } + + prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE); + if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) { + bool needs_sched = true; + + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + atomic_or(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); + if ((ctx->flags & IORING_SETUP_IOPOLL) && + !wq_list_empty(&ctx->iopoll_list)) { + needs_sched = false; + break; + } + + /* + * Ensure the 
store of the wakeup flag is not + * reordered with the load of the SQ tail + */ + smp_mb__after_atomic(); + + if (io_sqring_entries(ctx)) { + needs_sched = false; + break; + } + } + + if (needs_sched) { + mutex_unlock(&sqd->lock); + schedule(); + mutex_lock(&sqd->lock); + } + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + atomic_andnot(IORING_SQ_NEED_WAKEUP, + &ctx->rings->sq_flags); + } + + finish_wait(&sqd->wait, &wait); + timeout = jiffies + sqd->sq_thread_idle; + } + + io_uring_cancel_generic(true, sqd); + sqd->thread = NULL; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); + io_run_task_work(); + mutex_unlock(&sqd->lock); + + audit_free(current); + + complete(&sqd->exited); + do_exit(0); +} + +int io_sqpoll_wait_sq(struct io_ring_ctx *ctx) +{ + DEFINE_WAIT(wait); + + do { + if (!io_sqring_full(ctx)) + break; + prepare_to_wait(&ctx->sqo_sq_wait, &wait, TASK_INTERRUPTIBLE); + + if (!io_sqring_full(ctx)) + break; + schedule(); + } while (!signal_pending(current)); + + finish_wait(&ctx->sqo_sq_wait, &wait); + return 0; +} + +__cold int io_sq_offload_create(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + int ret; + + /* Retain compatibility with failing for an invalid attach attempt */ + if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == + IORING_SETUP_ATTACH_WQ) { + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return -ENXIO; + if (!io_is_uring_fops(f.file)) { + fdput(f); + return -EINVAL; + } + fdput(f); + } + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct task_struct *tsk; + struct io_sq_data *sqd; + bool attached; + + ret = security_uring_sqpoll(); + if (ret) + return ret; + + sqd = io_get_sq_data(p, &attached); + if (IS_ERR(sqd)) { + ret = PTR_ERR(sqd); + goto err; + } + + ctx->sq_creds = get_current_cred(); + ctx->sq_data = sqd; + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; + + 
io_sq_thread_park(sqd); + list_add(&ctx->sqd_list, &sqd->ctx_list); + io_sqd_update_thread_idle(sqd); + /* don't attach to a dying SQPOLL thread, would be racy */ + ret = (attached && !sqd->thread) ? -ENXIO : 0; + io_sq_thread_unpark(sqd); + + if (ret < 0) + goto err; + if (attached) + return 0; + + if (p->flags & IORING_SETUP_SQ_AFF) { + int cpu = p->sq_thread_cpu; + + ret = -EINVAL; + if (cpu >= nr_cpu_ids || !cpu_online(cpu)) + goto err_sqpoll; + sqd->sq_cpu = cpu; + } else { + sqd->sq_cpu = -1; + } + + sqd->task_pid = current->pid; + sqd->task_tgid = current->tgid; + tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + goto err_sqpoll; + } + + sqd->thread = tsk; + ret = io_uring_alloc_task_context(tsk, ctx); + wake_up_new_task(tsk); + if (ret) + goto err; + } else if (p->flags & IORING_SETUP_SQ_AFF) { + /* Can't have SQ_AFF without SQPOLL */ + ret = -EINVAL; + goto err; + } + + return 0; +err_sqpoll: + complete(&ctx->sq_data->exited); +err: + io_sq_thread_finish(ctx); + return ret; +} diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h new file mode 100644 index 00000000000000..0c3fbcd1f583f6 --- /dev/null +++ b/io_uring/sqpoll.h @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: GPL-2.0 + +struct io_sq_data { + refcount_t refs; + atomic_t park_pending; + struct mutex lock; + + /* ctx's that are using this sqd */ + struct list_head ctx_list; + + struct task_struct *thread; + struct wait_queue_head wait; + + unsigned sq_thread_idle; + int sq_cpu; + pid_t task_pid; + pid_t task_tgid; + + unsigned long state; + struct completion exited; +}; + +int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p); +void io_sq_thread_finish(struct io_ring_ctx *ctx); +void io_sq_thread_stop(struct io_sq_data *sqd); +void io_sq_thread_park(struct io_sq_data *sqd); +void io_sq_thread_unpark(struct io_sq_data *sqd); +void io_put_sq_data(struct io_sq_data *sqd); +int io_sqpoll_wait_sq(struct io_ring_ctx *ctx); From 
32bba81902290c59c21a3fa2f5206c44d9156a05 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 10:28:04 -0600 Subject: [PATCH 0970/1250] io_uring: use io_is_uring_fops() consistently Convert the last spots that check for io_uring_fops to use the provided helper instead. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 17c555aa03bc9c..687900b84ce08c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2309,7 +2309,7 @@ static bool __io_file_supports_nowait(struct file *file, umode_t mode) if (S_ISREG(mode)) { if (IS_ENABLED(CONFIG_BLOCK) && io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - file->f_op != &io_uring_fops) + !io_is_uring_fops(file)) return true; return false; } @@ -4857,7 +4857,7 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); /* we don't allow fixed io_uring files */ - if (file && file->f_op == &io_uring_fops) + if (file && io_is_uring_fops(file)) io_req_track_inflight(req); return file; } @@ -5949,7 +5949,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, * handle it just fine, but there's still no point in allowing * a ring fd as it doesn't support regular read/write anyway. */ - if (file->f_op == &io_uring_fops) { + if (io_is_uring_fops(file)) { fput(file); goto fail; } @@ -5996,7 +5996,7 @@ int io_install_fixed_file(struct io_kiocb *req, struct file *file, struct io_fixed_file *file_slot; int ret; - if (file->f_op == &io_uring_fops) + if (io_is_uring_fops(file)) return -EBADF; if (!ctx->file_data) return -ENXIO; @@ -6096,7 +6096,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, * still no point in allowing a ring fd as it doesn't * support regular read/write anyway. 
*/ - if (file->f_op == &io_uring_fops) { + if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; @@ -7416,7 +7416,7 @@ static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, file = fget(fd); if (!file) { return -EBADF; - } else if (file->f_op != &io_uring_fops) { + } else if (!io_is_uring_fops(file)) { fput(file); return -EOPNOTSUPP; } @@ -7677,7 +7677,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return -EBADF; ret = -EOPNOTSUPP; - if (unlikely(f.file->f_op != &io_uring_fops)) + if (unlikely(!io_is_uring_fops(f.file))) goto out_fput; ret = -ENXIO; @@ -8852,7 +8852,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, return -EBADF; ret = -EOPNOTSUPP; - if (f.file->f_op != &io_uring_fops) + if (!io_is_uring_fops(f.file)) goto out_fput; ctx = f.file->private_data; From 787ef08a852f04c4f95a75422b3225df9f188628 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 10:40:19 -0600 Subject: [PATCH 0971/1250] io_uring: move fdinfo helpers to its own file This also means moving a bit more of the fixed file handling to the filetable side, which makes sense separately too. 
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/fdinfo.c | 191 ++++++++++++++++++++++++++++++++++ io_uring/fdinfo.h | 3 + io_uring/filetable.h | 19 ++++ io_uring/io_uring.c | 210 +------------------------------------- io_uring/io_uring_types.h | 13 +++ 6 files changed, 230 insertions(+), 208 deletions(-) create mode 100644 io_uring/fdinfo.c create mode 100644 io_uring/fdinfo.h diff --git a/io_uring/Makefile b/io_uring/Makefile index c59a9ca74262a4..ed0bf42db4ae4c 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o + sqpoll.o fdinfo.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c new file mode 100644 index 00000000000000..fcedde4b4b1e1f --- /dev/null +++ b/io_uring/fdinfo.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "sqpoll.h" +#include "fdinfo.h" + +#ifdef CONFIG_PROC_FS +static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, + const struct cred *cred) +{ + struct user_namespace *uns = seq_user_ns(m); + struct group_info *gi; + kernel_cap_t cap; + unsigned __capi; + int g; + + seq_printf(m, "%5d\n", id); + seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); + seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); + seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); + seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, 
cred->fsgid)); + seq_puts(m, "\n\tGroups:\t"); + gi = cred->group_info; + for (g = 0; g < gi->ngroups; g++) { + seq_put_decimal_ull(m, g ? " " : "", + from_kgid_munged(uns, gi->gid[g])); + } + seq_puts(m, "\n\tCapEff:\t"); + cap = cred->cap_effective; + CAP_FOR_EACH_U32(__capi) + seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); + seq_putc(m, '\n'); + return 0; +} + +static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) +{ + struct io_sq_data *sq = NULL; + struct io_overflow_cqe *ocqe; + struct io_rings *r = ctx->rings; + unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; + unsigned int sq_head = READ_ONCE(r->sq.head); + unsigned int sq_tail = READ_ONCE(r->sq.tail); + unsigned int cq_head = READ_ONCE(r->cq.head); + unsigned int cq_tail = READ_ONCE(r->cq.tail); + unsigned int cq_shift = 0; + unsigned int sq_entries, cq_entries; + bool has_lock; + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + unsigned int i; + + if (is_cqe32) + cq_shift = 1; + + /* + * we may get imprecise sqe and cqe info if uring is actively running + * since we get cached_sq_head and cached_cq_tail without uring_lock + * and sq_tail and cq_head are changed by userspace. But it's ok since + * we usually use these info when it is stuck. 
+ */ + seq_printf(m, "SqMask:\t0x%x\n", sq_mask); + seq_printf(m, "SqHead:\t%u\n", sq_head); + seq_printf(m, "SqTail:\t%u\n", sq_tail); + seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CqMask:\t0x%x\n", cq_mask); + seq_printf(m, "CqHead:\t%u\n", cq_head); + seq_printf(m, "CqTail:\t%u\n", cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); + sq_entries = min(sq_tail - sq_head, ctx->sq_entries); + for (i = 0; i < sq_entries; i++) { + unsigned int entry = i + sq_head; + unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); + struct io_uring_sqe *sqe; + + if (sq_idx > sq_mask) + continue; + sqe = &ctx->sq_sqes[sq_idx]; + seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", + sq_idx, sqe->opcode, sqe->fd, sqe->flags, + sqe->user_data); + } + seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); + cq_entries = min(cq_tail - cq_head, ctx->cq_entries); + for (i = 0; i < cq_entries; i++) { + unsigned int entry = i + cq_head; + struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; + + if (!is_cqe32) { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags); + } else { + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " + "extra1:%llu, extra2:%llu\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); + } + } + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. If we fail to get the lock, we just don't iterate any + * structures that could be going away outside the io_uring mutex. + */ + has_lock = mutex_trylock(&ctx->uring_lock); + + if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + sq = ctx->sq_data; + if (!sq->thread) + sq = NULL; + } + + seq_printf(m, "SqThread:\t%d\n", sq ? 
task_pid_nr(sq->thread) : -1); + seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); + seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); + for (i = 0; has_lock && i < ctx->nr_user_files; i++) { + struct file *f = io_file_from_index(&ctx->file_table, i); + + if (f) + seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); + else + seq_printf(m, "%5u: \n", i); + } + seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); + for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *buf = ctx->user_bufs[i]; + unsigned int len = buf->ubuf_end - buf->ubuf; + + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); + } + if (has_lock && !xa_empty(&ctx->personalities)) { + unsigned long index; + const struct cred *cred; + + seq_printf(m, "Personalities:\n"); + xa_for_each(&ctx->personalities, index, cred) + io_uring_show_cred(m, index, cred); + } + if (has_lock) + mutex_unlock(&ctx->uring_lock); + + seq_puts(m, "PollList:\n"); + spin_lock(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list = &ctx->cancel_hash[i]; + struct io_kiocb *req; + + hlist_for_each_entry(req, list, hash_node) + seq_printf(m, " op=%d, task_works=%d\n", req->opcode, + task_work_pending(req->task)); + } + + seq_puts(m, "CqOverflowList:\n"); + list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { + struct io_uring_cqe *cqe = &ocqe->cqe; + + seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", + cqe->user_data, cqe->res, cqe->flags); + + } + + spin_unlock(&ctx->completion_lock); +} + +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +{ + struct io_ring_ctx *ctx = f->private_data; + + if (percpu_ref_tryget(&ctx->refs)) { + __io_uring_show_fdinfo(ctx, m); + percpu_ref_put(&ctx->refs); + } +} +#endif diff --git a/io_uring/fdinfo.h b/io_uring/fdinfo.h new file mode 100644 index 00000000000000..6fde48c450e300 --- /dev/null +++ b/io_uring/fdinfo.h @@ -0,0 +1,3 @@ +// SPDX-License-Identifier: 
GPL-2.0 + +void io_uring_show_fdinfo(struct seq_file *m, struct file *f); diff --git a/io_uring/filetable.h b/io_uring/filetable.h index fe1ec581958dda..6e1675f406b729 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -36,6 +36,8 @@ bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); int io_file_bitmap_get(struct io_ring_ctx *ctx); +unsigned int io_file_get_flags(struct file *file); + static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) { __clear_bit(bit, table->bitmap); @@ -55,4 +57,21 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i) return &table->files[i]; } +static inline struct file *io_file_from_index(struct io_file_table *table, + int index) +{ + struct io_fixed_file *slot = io_fixed_file_slot(table, index); + + return (struct file *) (slot->file_ptr & FFS_MASK); +} + +static inline void io_fixed_file_set(struct io_fixed_file *file_slot, + struct file *file) +{ + unsigned long file_ptr = (unsigned long) file; + + file_ptr |= io_file_get_flags(file); + file_slot->file_ptr = file_ptr; +} + #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 687900b84ce08c..95c2083a0da8b3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "io_uring.h" #include "refs.h" #include "sqpoll.h" +#include "fdinfo.h" #include "xattr.h" #include "nop.h" @@ -137,21 +138,8 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_mapped_ubuf { - u64 ubuf; - u64 ubuf_end; - unsigned int nr_bvecs; - unsigned long acct_pages; - struct bio_vec bvec[]; -}; - struct io_ring_ctx; -struct io_overflow_cqe { - struct list_head list; - struct io_uring_cqe cqe; -}; - struct io_rsrc_put { struct list_head list; u64 tag; @@ -2325,7 +2313,7 @@ static bool __io_file_supports_nowait(struct file *file, umode_t mode) * any file. For now, just ensure that anything potentially problematic is done * inline. 
*/ -static unsigned int io_file_get_flags(struct file *file) +unsigned int io_file_get_flags(struct file *file) { umode_t mode = file_inode(file)->i_mode; unsigned int res = 0; @@ -4810,22 +4798,6 @@ static void io_wq_submit_work(struct io_wq_work *work) io_req_task_queue_fail(req, ret); } -static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, - int index) -{ - struct io_fixed_file *slot = io_fixed_file_slot(&ctx->file_table, index); - - return (struct file *) (slot->file_ptr & FFS_MASK); -} - -static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) -{ - unsigned long file_ptr = (unsigned long) file; - - file_ptr |= io_file_get_flags(file); - file_slot->file_ptr = file_ptr; -} - inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { @@ -5667,7 +5639,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) int i; for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(ctx, i); + struct file *file = io_file_from_index(&ctx->file_table, i); if (!file) continue; @@ -7777,182 +7749,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return ret; } -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - unsigned __capi; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, 
"\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? " " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - CAP_FOR_EACH_U32(__capi) - seq_put_hex_ll(m, NULL, cap.cap[CAP_LAST_U32 - __capi], 8); - seq_putc(m, '\n'); - return 0; -} - -static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, - struct seq_file *m) -{ - struct io_sq_data *sq = NULL; - struct io_overflow_cqe *ocqe; - struct io_rings *r = ctx->rings; - unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; - unsigned int sq_head = READ_ONCE(r->sq.head); - unsigned int sq_tail = READ_ONCE(r->sq.tail); - unsigned int cq_head = READ_ONCE(r->cq.head); - unsigned int cq_tail = READ_ONCE(r->cq.tail); - unsigned int cq_shift = 0; - unsigned int sq_entries, cq_entries; - bool has_lock; - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - unsigned int i; - - if (is_cqe32) - cq_shift = 1; - - /* - * we may get imprecise sqe and cqe info if uring is actively running - * since we get cached_sq_head and cached_cq_tail without uring_lock - * and sq_tail and cq_head are changed by userspace. But it's ok since - * we usually use these info when it is stuck. 
- */ - seq_printf(m, "SqMask:\t0x%x\n", sq_mask); - seq_printf(m, "SqHead:\t%u\n", sq_head); - seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); - seq_printf(m, "CqMask:\t0x%x\n", cq_mask); - seq_printf(m, "CqHead:\t%u\n", cq_head); - seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); - seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); - sq_entries = min(sq_tail - sq_head, ctx->sq_entries); - for (i = 0; i < sq_entries; i++) { - unsigned int entry = i + sq_head; - unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); - struct io_uring_sqe *sqe; - - if (sq_idx > sq_mask) - continue; - sqe = &ctx->sq_sqes[sq_idx]; - seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", - sq_idx, sqe->opcode, sqe->fd, sqe->flags, - sqe->user_data); - } - seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); - cq_entries = min(cq_tail - cq_head, ctx->cq_entries); - for (i = 0; i < cq_entries; i++) { - unsigned int entry = i + cq_head; - struct io_uring_cqe *cqe = &r->cqes[(entry & cq_mask) << cq_shift]; - - if (!is_cqe32) { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags); - } else { - seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x, " - "extra1:%llu, extra2:%llu\n", - entry & cq_mask, cqe->user_data, cqe->res, - cqe->flags, cqe->big_cqe[0], cqe->big_cqe[1]); - } - } - - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { - sq = ctx->sq_data; - if (!sq->thread) - sq = NULL; - } - - seq_printf(m, "SqThread:\t%d\n", sq ? 
task_pid_nr(sq->thread) : -1); - seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); - seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct file *f = io_file_from_index(ctx, i); - - if (f) - seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); - else - seq_printf(m, "%5u: \n", i); - } - seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *buf = ctx->user_bufs[i]; - unsigned int len = buf->ubuf_end - buf->ubuf; - - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len); - } - if (has_lock && !xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - - seq_puts(m, "PollList:\n"); - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list = &ctx->cancel_hash[i]; - struct io_kiocb *req; - - hlist_for_each_entry(req, list, hash_node) - seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); - } - - seq_puts(m, "CqOverflowList:\n"); - list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { - struct io_uring_cqe *cqe = &ocqe->cqe; - - seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", - cqe->user_data, cqe->res, cqe->flags); - - } - - spin_unlock(&ctx->completion_lock); -} - -static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) -{ - struct io_ring_ctx *ctx = f->private_data; - - if (percpu_ref_tryget(&ctx->refs)) { - __io_uring_show_fdinfo(ctx, m); - percpu_ref_put(&ctx->refs); - } -} -#endif - static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 
349524907b6bfc..147e1e597530a9 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -498,4 +498,17 @@ struct io_cancel_data { int seq; }; +struct io_overflow_cqe { + struct list_head list; + struct io_uring_cqe cqe; +}; + +struct io_mapped_ubuf { + u64 ubuf; + u64 ubuf_end; + unsigned int nr_bvecs; + unsigned long acct_pages; + struct bio_vec bvec[]; +}; + #endif From 91c2e67442ed6625e6a6c7c6851c962981375f51 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 11:01:04 -0600 Subject: [PATCH 0972/1250] io_uring: move io_uring_task (tctx) helpers into its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 368 +------------------------------------------- io_uring/io_uring.h | 4 + io_uring/tctx.c | 332 +++++++++++++++++++++++++++++++++++++++ io_uring/tctx.h | 55 +++++++ 5 files changed, 396 insertions(+), 365 deletions(-) create mode 100644 io_uring/tctx.c create mode 100644 io_uring/tctx.h diff --git a/io_uring/Makefile b/io_uring/Makefile index ed0bf42db4ae4c..2db085cdedad84 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o + sqpoll.o fdinfo.o tctx.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 95c2083a0da8b3..d7336e6c9f2349 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "io_uring_types.h" #include "io_uring.h" #include "refs.h" +#include "tctx.h" #include "sqpoll.h" #include "fdinfo.h" @@ -208,30 +209,6 @@ struct io_buffer { #define BGID_ARRAY 64 -/* - * Arbitrary limit, can be raised if need be - */ -#define IO_RINGFD_REG_MAX 16 - -struct io_uring_task { - /* submission side */ - int cached_refs; - struct xarray xa; - struct wait_queue_head wait; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - 
struct percpu_counter inflight; - atomic_t inflight_tracked; - atomic_t in_idle; - - spinlock_t task_lock; - struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; - struct callback_head task_work; - struct file **registered_rings; - bool task_running; -}; - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in @@ -311,12 +288,6 @@ enum { IO_CHECK_CQ_DROPPED_BIT, }; -struct io_tctx_node { - struct list_head ctx_node; - struct task_struct *task; - struct io_ring_ctx *ctx; -}; - struct io_defer_entry { struct list_head list; struct io_kiocb *req; @@ -361,7 +332,6 @@ static const struct io_op_def io_op_defs[]; #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) -static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); @@ -1677,7 +1647,7 @@ static void handle_tw_list(struct io_wq_work_node *node, } while (node); } -static void tctx_task_work(struct callback_head *cb) +void tctx_task_work(struct callback_head *cb) { bool uring_locked = false; struct io_ring_ctx *ctx = NULL; @@ -4725,7 +4695,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) +struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -4733,7 +4703,7 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) return req ? &req->work : NULL; } -static void io_wq_submit_work(struct io_wq_work *work) +void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -6089,97 +6059,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return done ? 
done : err; } -static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, - struct task_struct *task) -{ - struct io_wq_hash *hash; - struct io_wq_data data; - unsigned int concurrency; - - mutex_lock(&ctx->uring_lock); - hash = ctx->hash_map; - if (!hash) { - hash = kzalloc(sizeof(*hash), GFP_KERNEL); - if (!hash) { - mutex_unlock(&ctx->uring_lock); - return ERR_PTR(-ENOMEM); - } - refcount_set(&hash->refs, 1); - init_waitqueue_head(&hash->wait); - ctx->hash_map = hash; - } - mutex_unlock(&ctx->uring_lock); - - data.hash = hash; - data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; - - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - - return io_wq_create(concurrency, &data); -} - -__cold int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx; - int ret; - - tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); - if (unlikely(!tctx)) - return -ENOMEM; - - tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, - sizeof(struct file *), GFP_KERNEL); - if (unlikely(!tctx->registered_rings)) { - kfree(tctx); - return -ENOMEM; - } - - ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); - if (unlikely(ret)) { - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - tctx->io_wq = io_init_wq_offload(ctx, task); - if (IS_ERR(tctx->io_wq)) { - ret = PTR_ERR(tctx->io_wq); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx->registered_rings); - kfree(tctx); - return ret; - } - - xa_init(&tctx->xa); - init_waitqueue_head(&tctx->wait); - atomic_set(&tctx->in_idle, 0); - atomic_set(&tctx->inflight_tracked, 0); - task->io_uring = tctx; - spin_lock_init(&tctx->task_lock); - INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - init_task_work(&tctx->task_work, tctx_task_work); - return 0; -} - -void __io_uring_free(struct task_struct *tsk) -{ - struct io_uring_task *tctx = tsk->io_uring; - 
- WARN_ON_ONCE(!xa_empty(&tctx->xa)); - WARN_ON_ONCE(tctx->io_wq); - WARN_ON_ONCE(tctx->cached_refs); - - kfree(tctx->registered_rings); - percpu_counter_destroy(&tctx->inflight); - kfree(tctx); - tsk->io_uring = NULL; -} - static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) { @@ -7179,107 +7058,6 @@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } } -static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - int ret; - - if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current, ctx); - if (unlikely(ret)) - return ret; - - tctx = current->io_uring; - if (ctx->iowq_limits_set) { - unsigned int limits[2] = { ctx->iowq_limits[0], - ctx->iowq_limits[1], }; - - ret = io_wq_max_workers(tctx->io_wq, limits); - if (ret) - return ret; - } - } - if (!xa_load(&tctx->xa, (unsigned long)ctx)) { - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) - return -ENOMEM; - node->ctx = ctx; - node->task = current; - - ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, - node, GFP_KERNEL)); - if (ret) { - kfree(node); - return ret; - } - - mutex_lock(&ctx->uring_lock); - list_add(&node->ctx_node, &ctx->tctx_list); - mutex_unlock(&ctx->uring_lock); - } - tctx->last = ctx; - return 0; -} - -/* - * Note that this task has used io_uring. We use it for cancelation purposes. - */ -static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) -{ - struct io_uring_task *tctx = current->io_uring; - - if (likely(tctx && tctx->last == ctx)) - return 0; - return __io_uring_add_tctx_node(ctx); -} - -/* - * Remove this io_uring_file -> task mapping. 
- */ -static __cold void io_uring_del_tctx_node(unsigned long index) -{ - struct io_uring_task *tctx = current->io_uring; - struct io_tctx_node *node; - - if (!tctx) - return; - node = xa_erase(&tctx->xa, index); - if (!node) - return; - - WARN_ON_ONCE(current != node->task); - WARN_ON_ONCE(list_empty(&node->ctx_node)); - - mutex_lock(&node->ctx->uring_lock); - list_del(&node->ctx_node); - mutex_unlock(&node->ctx->uring_lock); - - if (tctx->last == node->ctx) - tctx->last = NULL; - kfree(node); -} - -static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) -{ - struct io_wq *wq = tctx->io_wq; - struct io_tctx_node *node; - unsigned long index; - - xa_for_each(&tctx->xa, index, node) { - io_uring_del_tctx_node(index); - cond_resched(); - } - if (wq) { - /* - * Must be after io_uring_del_tctx_node() (removes nodes under - * uring_lock) to avoid race with io_uring_try_cancel_iowq(). - */ - io_wq_put_and_exit(wq); - tctx->io_wq = NULL; - } -} - static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) { if (tracked) @@ -7361,144 +7139,6 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } -void io_uring_unreg_ringfd(void) -{ - struct io_uring_task *tctx = current->io_uring; - int i; - - for (i = 0; i < IO_RINGFD_REG_MAX; i++) { - if (tctx->registered_rings[i]) { - fput(tctx->registered_rings[i]); - tctx->registered_rings[i] = NULL; - } - } -} - -static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, - int start, int end) -{ - struct file *file; - int offset; - - for (offset = start; offset < end; offset++) { - offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[offset]) - continue; - - file = fget(fd); - if (!file) { - return -EBADF; - } else if (!io_is_uring_fops(file)) { - fput(file); - return -EOPNOTSUPP; - } - tctx->registered_rings[offset] = file; - return offset; - } - - return -EBUSY; -} - -/* - * Register a ring fd to avoid fdget/fdput for each 
io_uring_enter() - * invocation. User passes in an array of struct io_uring_rsrc_update - * with ->data set to the ring_fd, and ->offset given for the desired - * index. If no index is desired, application may set ->offset == -1U - * and we'll find an available index. Returns number of entries - * successfully processed, or < 0 on error if none were processed. - */ -static int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_rsrc_update reg; - struct io_uring_task *tctx; - int ret, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - - mutex_unlock(&ctx->uring_lock); - ret = io_uring_add_tctx_node(ctx); - mutex_lock(&ctx->uring_lock); - if (ret) - return ret; - - tctx = current->io_uring; - for (i = 0; i < nr_args; i++) { - int start, end; - - if (copy_from_user(&reg, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - - if (reg.resv) { - ret = -EINVAL; - break; - } - - if (reg.offset == -1U) { - start = 0; - end = IO_RINGFD_REG_MAX; - } else { - if (reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - start = reg.offset; - end = start + 1; - } - - ret = io_ring_add_registered_fd(tctx, reg.data, start, end); - if (ret < 0) - break; - - reg.offset = ret; - if (copy_to_user(&arg[i], &reg, sizeof(reg))) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - ret = -EFAULT; - break; - } - } - - return i ? 
i : ret; -} - -static int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update __user *arg = __arg; - struct io_uring_task *tctx = current->io_uring; - struct io_uring_rsrc_update reg; - int ret = 0, i; - - if (!nr_args || nr_args > IO_RINGFD_REG_MAX) - return -EINVAL; - if (!tctx) - return 0; - - for (i = 0; i < nr_args; i++) { - if (copy_from_user(®, &arg[i], sizeof(reg))) { - ret = -EFAULT; - break; - } - if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { - ret = -EINVAL; - break; - } - - reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); - if (tctx->registered_rings[reg.offset]) { - fput(tctx->registered_rings[reg.offset]); - tctx->registered_rings[reg.offset] = NULL; - } - } - - return i ? i : ret; -} - static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 1da8e66507a350..60678e88a9b92c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -128,6 +128,7 @@ void io_req_task_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); +void tctx_task_work(struct callback_head *cb); int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, @@ -136,6 +137,9 @@ int io_uring_alloc_task_context(struct task_struct *task, int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); +struct io_wq_work *io_wq_free_work(struct io_wq_work *work); +void io_wq_submit_work(struct io_wq_work *work); + void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); diff --git a/io_uring/tctx.c 
b/io_uring/tctx.c new file mode 100644 index 00000000000000..3f7e9feb6ca2bc --- /dev/null +++ b/io_uring/tctx.c @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "tctx.h" + +static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, + struct task_struct *task) +{ + struct io_wq_hash *hash; + struct io_wq_data data; + unsigned int concurrency; + + mutex_lock(&ctx->uring_lock); + hash = ctx->hash_map; + if (!hash) { + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) { + mutex_unlock(&ctx->uring_lock); + return ERR_PTR(-ENOMEM); + } + refcount_set(&hash->refs, 1); + init_waitqueue_head(&hash->wait); + ctx->hash_map = hash; + } + mutex_unlock(&ctx->uring_lock); + + data.hash = hash; + data.task = task; + data.free_work = io_wq_free_work; + data.do_work = io_wq_submit_work; + + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + + return io_wq_create(concurrency, &data); +} + +void __io_uring_free(struct task_struct *tsk) +{ + struct io_uring_task *tctx = tsk->io_uring; + + WARN_ON_ONCE(!xa_empty(&tctx->xa)); + WARN_ON_ONCE(tctx->io_wq); + WARN_ON_ONCE(tctx->cached_refs); + + kfree(tctx->registered_rings); + percpu_counter_destroy(&tctx->inflight); + kfree(tctx); + tsk->io_uring = NULL; +} + +__cold int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx; + int ret; + + tctx = kzalloc(sizeof(*tctx), GFP_KERNEL); + if (unlikely(!tctx)) + return -ENOMEM; + + tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, + sizeof(struct file *), GFP_KERNEL); + if (unlikely(!tctx->registered_rings)) { + kfree(tctx); + return -ENOMEM; + } + + ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); + if (unlikely(ret)) { + kfree(tctx->registered_rings); + kfree(tctx); + return ret; + } + + tctx->io_wq = 
io_init_wq_offload(ctx, task); + if (IS_ERR(tctx->io_wq)) { + ret = PTR_ERR(tctx->io_wq); + percpu_counter_destroy(&tctx->inflight); + kfree(tctx->registered_rings); + kfree(tctx); + return ret; + } + + xa_init(&tctx->xa); + init_waitqueue_head(&tctx->wait); + atomic_set(&tctx->in_idle, 0); + atomic_set(&tctx->inflight_tracked, 0); + task->io_uring = tctx; + spin_lock_init(&tctx->task_lock); + INIT_WQ_LIST(&tctx->task_list); + INIT_WQ_LIST(&tctx->prio_task_list); + init_task_work(&tctx->task_work, tctx_task_work); + return 0; +} + +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; + int ret; + + if (unlikely(!tctx)) { + ret = io_uring_alloc_task_context(current, ctx); + if (unlikely(ret)) + return ret; + + tctx = current->io_uring; + if (ctx->iowq_limits_set) { + unsigned int limits[2] = { ctx->iowq_limits[0], + ctx->iowq_limits[1], }; + + ret = io_wq_max_workers(tctx->io_wq, limits); + if (ret) + return ret; + } + } + if (!xa_load(&tctx->xa, (unsigned long)ctx)) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + node->ctx = ctx; + node->task = current; + + ret = xa_err(xa_store(&tctx->xa, (unsigned long)ctx, + node, GFP_KERNEL)); + if (ret) { + kfree(node); + return ret; + } + + mutex_lock(&ctx->uring_lock); + list_add(&node->ctx_node, &ctx->tctx_list); + mutex_unlock(&ctx->uring_lock); + } + tctx->last = ctx; + return 0; +} + +/* + * Remove this io_uring_file -> task mapping. 
+ */ +__cold void io_uring_del_tctx_node(unsigned long index) +{ + struct io_uring_task *tctx = current->io_uring; + struct io_tctx_node *node; + + if (!tctx) + return; + node = xa_erase(&tctx->xa, index); + if (!node) + return; + + WARN_ON_ONCE(current != node->task); + WARN_ON_ONCE(list_empty(&node->ctx_node)); + + mutex_lock(&node->ctx->uring_lock); + list_del(&node->ctx_node); + mutex_unlock(&node->ctx->uring_lock); + + if (tctx->last == node->ctx) + tctx->last = NULL; + kfree(node); +} + +__cold void io_uring_clean_tctx(struct io_uring_task *tctx) +{ + struct io_wq *wq = tctx->io_wq; + struct io_tctx_node *node; + unsigned long index; + + xa_for_each(&tctx->xa, index, node) { + io_uring_del_tctx_node(index); + cond_resched(); + } + if (wq) { + /* + * Must be after io_uring_del_tctx_node() (removes nodes under + * uring_lock) to avoid race with io_uring_try_cancel_iowq(). + */ + io_wq_put_and_exit(wq); + tctx->io_wq = NULL; + } +} + +void io_uring_unreg_ringfd(void) +{ + struct io_uring_task *tctx = current->io_uring; + int i; + + for (i = 0; i < IO_RINGFD_REG_MAX; i++) { + if (tctx->registered_rings[i]) { + fput(tctx->registered_rings[i]); + tctx->registered_rings[i] = NULL; + } + } +} + +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + for (offset = start; offset < end; offset++) { + offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[offset]) + continue; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (!io_is_uring_fops(file)) { + fput(file); + return -EOPNOTSUPP; + } + tctx->registered_rings[offset] = file; + return offset; + } + + return -EBUSY; +} + +/* + * Register a ring fd to avoid fdget/fdput for each io_uring_enter() + * invocation. User passes in an array of struct io_uring_rsrc_update + * with ->data set to the ring_fd, and ->offset given for the desired + * index. 
If no index is desired, application may set ->offset == -1U + * and we'll find an available index. Returns number of entries + * successfully processed, or < 0 on error if none were processed. + */ +int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_rsrc_update reg; + struct io_uring_task *tctx; + int ret, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + + mutex_unlock(&ctx->uring_lock); + ret = io_uring_add_tctx_node(ctx); + mutex_lock(&ctx->uring_lock); + if (ret) + return ret; + + tctx = current->io_uring; + for (i = 0; i < nr_args; i++) { + int start, end; + + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + + if (reg.resv) { + ret = -EINVAL; + break; + } + + if (reg.offset == -1U) { + start = 0; + end = IO_RINGFD_REG_MAX; + } else { + if (reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + start = reg.offset; + end = start + 1; + } + + ret = io_ring_add_registered_fd(tctx, reg.data, start, end); + if (ret < 0) + break; + + reg.offset = ret; + if (copy_to_user(&arg[i], ®, sizeof(reg))) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + ret = -EFAULT; + break; + } + } + + return i ? 
i : ret; +} + +int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update __user *arg = __arg; + struct io_uring_task *tctx = current->io_uring; + struct io_uring_rsrc_update reg; + int ret = 0, i; + + if (!nr_args || nr_args > IO_RINGFD_REG_MAX) + return -EINVAL; + if (!tctx) + return 0; + + for (i = 0; i < nr_args; i++) { + if (copy_from_user(®, &arg[i], sizeof(reg))) { + ret = -EFAULT; + break; + } + if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) { + ret = -EINVAL; + break; + } + + reg.offset = array_index_nospec(reg.offset, IO_RINGFD_REG_MAX); + if (tctx->registered_rings[reg.offset]) { + fput(tctx->registered_rings[reg.offset]); + tctx->registered_rings[reg.offset] = NULL; + } + } + + return i ? i : ret; +} diff --git a/io_uring/tctx.h b/io_uring/tctx.h new file mode 100644 index 00000000000000..f4964e40d07e0d --- /dev/null +++ b/io_uring/tctx.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + +struct io_uring_task { + /* submission side */ + int cached_refs; + struct xarray xa; + struct wait_queue_head wait; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct percpu_counter inflight; + atomic_t inflight_tracked; + atomic_t in_idle; + + spinlock_t task_lock; + struct io_wq_work_list task_list; + struct io_wq_work_list prio_task_list; + struct callback_head task_work; + struct file **registered_rings; + bool task_running; +}; + +struct io_tctx_node { + struct list_head ctx_node; + struct task_struct *task; + struct io_ring_ctx *ctx; +}; + +int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx); +void io_uring_del_tctx_node(unsigned long index); +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); +void io_uring_clean_tctx(struct io_uring_task *tctx); + +void io_uring_unreg_ringfd(void); +int io_ringfd_register(struct io_ring_ctx *ctx, void __user 
*__arg, + unsigned nr_args); +int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, + unsigned nr_args); + +/* + * Note that this task has used io_uring. We use it for cancelation purposes. + */ +static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) +{ + struct io_uring_task *tctx = current->io_uring; + + if (likely(tctx && tctx->last == ctx)) + return 0; + return __io_uring_add_tctx_node(ctx); +} From 2d44939e7582567f9e77c775bc45c4e21915e6e4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 11:48:35 -0600 Subject: [PATCH 0973/1250] io_uring: include and forward-declaration sanitation Remove some dead headers we no longer need, and get rid of the io_ring_ctx and io_uring_fops forward declarations. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d7336e6c9f2349..c1229704adfdf5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -70,11 +70,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -86,7 +84,6 @@ #include -#include "../fs/internal.h" #include "io-wq.h" #include "io_uring_types.h" @@ -139,8 +136,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_ring_ctx; - struct io_rsrc_put { struct list_head list; u64 tag; @@ -352,8 +347,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; -static const struct file_operations io_uring_fops; - const char *io_uring_get_opcode(u8 opcode) { switch ((enum io_uring_op)opcode) { @@ -457,11 +450,6 @@ const char *io_uring_get_opcode(u8 opcode) return "INVALID"; } -bool io_is_uring_fops(struct file *file) -{ - return file->f_op == &io_uring_fops; -} - struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -7402,6 +7390,11 @@ static const struct file_operations io_uring_fops = { #endif }; +bool 
io_is_uring_fops(struct file *file) +{ + return file->f_op == &io_uring_fops; +} + static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { From 624749ca3f176bd678c9059a944909d3849a5e34 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 11:57:03 -0600 Subject: [PATCH 0974/1250] io_uring: add opcode name to io_op_defs This kills the last per-op switch. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 150 +++++++++++++++----------------------------- 1 file changed, 52 insertions(+), 98 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c1229704adfdf5..3f006907c8c5f4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -315,6 +315,8 @@ struct io_op_def { /* size of async data needed, if any */ unsigned short async_size; + const char *name; + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); int (*issue)(struct io_kiocb *, unsigned int); int (*prep_async)(struct io_kiocb *); @@ -349,104 +351,8 @@ static struct kmem_cache *req_cachep; const char *io_uring_get_opcode(u8 opcode) { - switch ((enum io_uring_op)opcode) { - case IORING_OP_NOP: - return "NOP"; - case IORING_OP_READV: - return "READV"; - case IORING_OP_WRITEV: - return "WRITEV"; - case IORING_OP_FSYNC: - return "FSYNC"; - case IORING_OP_READ_FIXED: - return "READ_FIXED"; - case IORING_OP_WRITE_FIXED: - return "WRITE_FIXED"; - case IORING_OP_POLL_ADD: - return "POLL_ADD"; - case IORING_OP_POLL_REMOVE: - return "POLL_REMOVE"; - case IORING_OP_SYNC_FILE_RANGE: - return "SYNC_FILE_RANGE"; - case IORING_OP_SENDMSG: - return "SENDMSG"; - case IORING_OP_RECVMSG: - return "RECVMSG"; - case IORING_OP_TIMEOUT: - return "TIMEOUT"; - case IORING_OP_TIMEOUT_REMOVE: - return "TIMEOUT_REMOVE"; - case IORING_OP_ACCEPT: - return "ACCEPT"; - case IORING_OP_ASYNC_CANCEL: - return "ASYNC_CANCEL"; - case IORING_OP_LINK_TIMEOUT: - return "LINK_TIMEOUT"; - case IORING_OP_CONNECT: - return "CONNECT"; - case IORING_OP_FALLOCATE: - return 
"FALLOCATE"; - case IORING_OP_OPENAT: - return "OPENAT"; - case IORING_OP_CLOSE: - return "CLOSE"; - case IORING_OP_FILES_UPDATE: - return "FILES_UPDATE"; - case IORING_OP_STATX: - return "STATX"; - case IORING_OP_READ: - return "READ"; - case IORING_OP_WRITE: - return "WRITE"; - case IORING_OP_FADVISE: - return "FADVISE"; - case IORING_OP_MADVISE: - return "MADVISE"; - case IORING_OP_SEND: - return "SEND"; - case IORING_OP_RECV: - return "RECV"; - case IORING_OP_OPENAT2: - return "OPENAT2"; - case IORING_OP_EPOLL_CTL: - return "EPOLL_CTL"; - case IORING_OP_SPLICE: - return "SPLICE"; - case IORING_OP_PROVIDE_BUFFERS: - return "PROVIDE_BUFFERS"; - case IORING_OP_REMOVE_BUFFERS: - return "REMOVE_BUFFERS"; - case IORING_OP_TEE: - return "TEE"; - case IORING_OP_SHUTDOWN: - return "SHUTDOWN"; - case IORING_OP_RENAMEAT: - return "RENAMEAT"; - case IORING_OP_UNLINKAT: - return "UNLINKAT"; - case IORING_OP_MKDIRAT: - return "MKDIRAT"; - case IORING_OP_SYMLINKAT: - return "SYMLINKAT"; - case IORING_OP_LINKAT: - return "LINKAT"; - case IORING_OP_MSG_RING: - return "MSG_RING"; - case IORING_OP_FSETXATTR: - return "FSETXATTR"; - case IORING_OP_SETXATTR: - return "SETXATTR"; - case IORING_OP_FGETXATTR: - return "FGETXATTR"; - case IORING_OP_GETXATTR: - return "GETXATTR"; - case IORING_OP_SOCKET: - return "SOCKET"; - case IORING_OP_URING_CMD: - return "URING_CMD"; - case IORING_OP_LAST: - return "INVALID"; - } + if (opcode < IORING_OP_LAST) + return io_op_defs[opcode].name; return "INVALID"; } @@ -8307,6 +8213,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, + .name = "NOP", .prep = io_nop_prep, .issue = io_nop, }, @@ -8320,6 +8227,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "READV", .prep = io_prep_rw, .issue = io_read, .prep_async = io_readv_prep_async, @@ -8335,6 +8243,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, 
.iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "WRITEV", .prep = io_prep_rw, .issue = io_write, .prep_async = io_writev_prep_async, @@ -8343,6 +8252,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_FSYNC] = { .needs_file = 1, .audit_skip = 1, + .name = "FSYNC", .prep = io_fsync_prep, .issue = io_fsync, }, @@ -8355,6 +8265,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "READ_FIXED", .prep = io_prep_rw, .issue = io_read, }, @@ -8368,6 +8279,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "WRITE_FIXED", .prep = io_prep_rw, .issue = io_write, }, @@ -8375,17 +8287,20 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "POLL_ADD", .prep = io_poll_add_prep, .issue = io_poll_add, }, [IORING_OP_POLL_REMOVE] = { .audit_skip = 1, + .name = "POLL_REMOVE", .prep = io_poll_remove_prep, .issue = io_poll_remove, }, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, .audit_skip = 1, + .name = "SYNC_FILE_RANGE", .prep = io_sfr_prep, .issue = io_sync_file_range, }, @@ -8394,6 +8309,7 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, + .name = "SENDMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, @@ -8410,6 +8326,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, + .name = "RECVMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, @@ -8423,12 +8340,14 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_TIMEOUT] = { .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), + .name = "TIMEOUT", .prep = io_timeout_prep, .issue = io_timeout, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ 
.audit_skip = 1, + .name = "TIMEOUT_REMOVE", .prep = io_timeout_remove_prep, .issue = io_timeout_remove, }, @@ -8438,6 +8357,7 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ + .name = "ACCEPT", #if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, @@ -8447,12 +8367,14 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, + .name = "ASYNC_CANCEL", .prep = io_async_cancel_prep, .issue = io_async_cancel, }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), + .name = "LINK_TIMEOUT", .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -8460,6 +8382,7 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .name = "CONNECT", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_connect), .prep = io_connect_prep, @@ -8471,26 +8394,31 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .name = "FALLOCATE", .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { + .name = "OPENAT", .prep = io_openat_prep, .issue = io_openat, .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { + .name = "CLOSE", .prep = io_close_prep, .issue = io_close, }, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, + .name = "FILES_UPDATE", .prep = io_files_update_prep, .issue = io_files_update, }, [IORING_OP_STATX] = { .audit_skip = 1, + .name = "STATX", .prep = io_statx_prep, .issue = io_statx, .cleanup = io_statx_cleanup, @@ -8505,6 +8433,7 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "READ", .prep = io_prep_rw, .issue = io_read, }, @@ -8518,16 +8447,19 @@ static const struct io_op_def io_op_defs[] = { .ioprio = 1, .iopoll = 1, .async_size = sizeof(struct io_async_rw), + .name = "WRITE", .prep = io_prep_rw, 
.issue = io_write, }, [IORING_OP_FADVISE] = { .needs_file = 1, .audit_skip = 1, + .name = "FADVISE", .prep = io_fadvise_prep, .issue = io_fadvise, }, [IORING_OP_MADVISE] = { + .name = "MADVISE", .prep = io_madvise_prep, .issue = io_madvise, }, @@ -8537,6 +8469,7 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, + .name = "SEND", #if defined(CONFIG_NET) .prep = io_sendmsg_prep, .issue = io_send, @@ -8551,6 +8484,7 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .audit_skip = 1, .ioprio = 1, + .name = "RECV", #if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, @@ -8559,6 +8493,7 @@ static const struct io_op_def io_op_defs[] = { #endif }, [IORING_OP_OPENAT2] = { + .name = "OPENAT2", .prep = io_openat2_prep, .issue = io_openat2, .cleanup = io_open_cleanup, @@ -8566,6 +8501,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "EPOLL", #if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, @@ -8578,18 +8514,21 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "SPLICE", .prep = io_splice_prep, .issue = io_splice, }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, + .name = "PROVIDE_BUFFERS", .prep = io_provide_buffers_prep, .issue = io_provide_buffers, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, + .name = "REMOVE_BUFFERS", .prep = io_remove_buffers_prep, .issue = io_remove_buffers, }, @@ -8598,11 +8537,13 @@ static const struct io_op_def io_op_defs[] = { .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, + .name = "TEE", .prep = io_tee_prep, .issue = io_tee, }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, + .name = "SHUTDOWN", #if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, @@ -8611,26 +8552,31 @@ static const struct io_op_def io_op_defs[] = { #endif }, 
[IORING_OP_RENAMEAT] = { + .name = "RENAMEAT", .prep = io_renameat_prep, .issue = io_renameat, .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { + .name = "UNLINKAT", .prep = io_unlinkat_prep, .issue = io_unlinkat, .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { + .name = "MKDIRAT", .prep = io_mkdirat_prep, .issue = io_mkdirat, .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { + .name = "SYMLINKAT", .prep = io_symlinkat_prep, .issue = io_symlinkat, .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { + .name = "LINKAT", .prep = io_linkat_prep, .issue = io_linkat, .cleanup = io_link_cleanup, @@ -8638,33 +8584,39 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MSG_RING] = { .needs_file = 1, .iopoll = 1, + .name = "MSG_RING", .prep = io_msg_ring_prep, .issue = io_msg_ring, }, [IORING_OP_FSETXATTR] = { .needs_file = 1, + .name = "FSETXATTR", .prep = io_fsetxattr_prep, .issue = io_fsetxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { + .name = "SETXATTR", .prep = io_setxattr_prep, .issue = io_setxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, + .name = "FGETXATTR", .prep = io_fgetxattr_prep, .issue = io_fgetxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { + .name = "GETXATTR", .prep = io_getxattr_prep, .issue = io_getxattr, .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .audit_skip = 1, + .name = "SOCKET", #if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, @@ -8675,6 +8627,7 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, + .name = "URING_CMD", .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, @@ -8752,6 +8705,7 @@ static int __init io_uring_init(void) BUG_ON(!io_op_defs[i].prep); if (io_op_defs[i].prep != io_eopnotsupp_prep) BUG_ON(!io_op_defs[i].issue); + WARN_ON_ONCE(!io_op_defs[i].name); } req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | 
SLAB_PANIC | From e18862b552beab95d6546ece61ef25ddfab7cab6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 20:31:09 -0600 Subject: [PATCH 0975/1250] io_uring: move poll handling into its own file Add a io_poll_issue() rather than export the general task_work locking and io_issue_sqe(), and put the io_op_defs definition and structure into a separate header file so that poll can use it. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 842 +------------------------------------------- io_uring/io_uring.h | 32 ++ io_uring/opdef.h | 40 +++ io_uring/poll.c | 760 +++++++++++++++++++++++++++++++++++++++ io_uring/poll.h | 30 ++ 6 files changed, 879 insertions(+), 827 deletions(-) create mode 100644 io_uring/opdef.h create mode 100644 io_uring/poll.c create mode 100644 io_uring/poll.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 2db085cdedad84..eb1b07a9951639 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o + sqpoll.o fdinfo.o tctx.o poll.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3f006907c8c5f4..b4ecfa7238555f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -88,6 +88,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "opdef.h" #include "refs.h" #include "tctx.h" #include "sqpoll.h" @@ -106,6 +107,7 @@ #include "net.h" #include "msg_ring.h" #include "timeout.h" +#include "poll.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -208,22 +210,6 @@ struct io_buffer { * First field must be the file pointer in all the * iocb unions! 
See also 'struct kiocb' in */ -struct io_poll { - struct file *file; - struct wait_queue_head *head; - __poll_t events; - struct wait_queue_entry wait; -}; - -struct io_poll_update { - struct file *file; - u64 old_user_data; - u64 new_user_data; - __poll_t events; - bool update_events; - bool update_user_data; -}; - struct io_cancel { struct file *file; u64 addr; @@ -268,11 +254,6 @@ struct io_async_rw { struct wait_page_queue wpq; }; -struct async_poll { - struct io_poll poll; - struct io_poll *double_poll; -}; - enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, @@ -289,42 +270,6 @@ struct io_defer_entry { u32 seq; }; -struct io_op_def { - /* needs req->file assigned */ - unsigned needs_file : 1; - /* should block plug */ - unsigned plug : 1; - /* hash wq insertion if file is a regular file */ - unsigned hash_reg_file : 1; - /* unbound wq insertion if file is a non-regular file */ - unsigned unbound_nonreg_file : 1; - /* set if opcode supports polled "wait" */ - unsigned pollin : 1; - unsigned pollout : 1; - unsigned poll_exclusive : 1; - /* op supports buffer selection */ - unsigned buffer_select : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; - /* skip auditing */ - unsigned audit_skip : 1; - /* supports ioprio */ - unsigned ioprio : 1; - /* supports iopoll */ - unsigned iopoll : 1; - /* size of async data needed, if any */ - unsigned short async_size; - - const char *name; - - int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); - int (*issue)(struct io_kiocb *, unsigned int); - int (*prep_async)(struct io_kiocb *); - void (*cleanup)(struct io_kiocb *); -}; - -static const struct io_op_def io_op_defs[]; - /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) @@ -529,32 +474,12 @@ static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, return 
xa_load(&ctx->io_bl_xa, bgid); } -static void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; struct io_buffer *buf; - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - - /* - * READV uses fields in `struct io_rw` (len/addr) to stash the selected - * buffer data. However if that buffer is recycled the original request - * data stored in addr is lost. Therefore forbid recycling for now. - */ - if (req->opcode == IORING_OP_READV) - return; - /* * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear * the flag and hence ensure that bl->head doesn't get incremented. @@ -599,8 +524,8 @@ static bool io_match_linked(struct io_kiocb *head) * As io_match_task() but protected against racing with linked timeouts. * User must not hold timeout_lock. 
*/ -static bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, - bool cancel_all) +bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all) { bool matched; @@ -1310,7 +1235,7 @@ inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) io_req_complete_post(req); } -static void io_req_complete_failed(struct io_kiocb *req, s32 res) +void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); io_req_set_res(req, res, io_put_kbuf(req, IO_URING_F_UNLOCKED)); @@ -1656,7 +1581,7 @@ static void io_req_task_cancel(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, req->cqe.res); } -static void io_req_task_submit(struct io_kiocb *req, bool *locked) +void io_req_task_submit(struct io_kiocb *req, bool *locked) { io_tw_lock(req->ctx, locked); /* req->task == current here, checking PF_EXITING is safe */ @@ -3437,749 +3362,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -struct io_poll_table { - struct poll_table_struct pt; - struct io_kiocb *req; - int nr_entries; - int error; -}; - -#define IO_POLL_CANCEL_FLAG BIT(31) -#define IO_POLL_REF_MASK GENMASK(30, 0) - -/* - * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can - * bump it and acquire ownership. It's disallowed to modify requests while not - * owning it, that prevents from races for enqueueing task_work's and b/w - * arming poll and wakeups. 
- */ -static inline bool io_poll_get_ownership(struct io_kiocb *req) -{ - return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); -} - -static void io_poll_mark_cancelled(struct io_kiocb *req) -{ - atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); -} - -static struct io_poll *io_poll_get_double(struct io_kiocb *req) -{ - /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ - if (req->opcode == IORING_OP_POLL_ADD) - return req->async_data; - return req->apoll->double_poll; -} - -static struct io_poll *io_poll_get_single(struct io_kiocb *req) -{ - if (req->opcode == IORING_OP_POLL_ADD) - return io_kiocb_to_cmd(req); - return &req->apoll->poll; -} - -static void io_poll_req_insert(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; - - list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); -} - -static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, - wait_queue_func_t wake_func) -{ - poll->head = NULL; -#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) - /* mask in events that we always want/need */ - poll->events = events | IO_POLL_UNMASK; - INIT_LIST_HEAD(&poll->wait.entry); - init_waitqueue_func_entry(&poll->wait, wake_func); -} - -static inline void io_poll_remove_entry(struct io_poll *poll) -{ - struct wait_queue_head *head = smp_load_acquire(&poll->head); - - if (head) { - spin_lock_irq(&head->lock); - list_del_init(&poll->wait.entry); - poll->head = NULL; - spin_unlock_irq(&head->lock); - } -} - -static void io_poll_remove_entries(struct io_kiocb *req) -{ - /* - * Nothing to do if neither of those flags are set. Avoid dipping - * into the poll/apoll/double cachelines if we can. - */ - if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) - return; - - /* - * While we hold the waitqueue lock and the waitqueue is nonempty, - * wake_up_pollfree() will wait for us. 
However, taking the waitqueue - * lock in the first place can race with the waitqueue being freed. - * - * We solve this as eventpoll does: by taking advantage of the fact that - * all users of wake_up_pollfree() will RCU-delay the actual free. If - * we enter rcu_read_lock() and see that the pointer to the queue is - * non-NULL, we can then lock it without the memory being freed out from - * under us. - * - * Keep holding rcu_read_lock() as long as we hold the queue lock, in - * case the caller deletes the entry from the queue, leaving it empty. - * In that case, only RCU prevents the queue memory from being freed. - */ - rcu_read_lock(); - if (req->flags & REQ_F_SINGLE_POLL) - io_poll_remove_entry(io_poll_get_single(req)); - if (req->flags & REQ_F_DOUBLE_POLL) - io_poll_remove_entry(io_poll_get_double(req)); - rcu_read_unlock(); -} - -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags); -/* - * All poll tw should go through this. Checks for poll events, manages - * references, does rewait, etc. - * - * Returns a negative error on failure. >0 when no action require, which is - * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->cqe.res. 
- */ -static int io_poll_check_events(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int v, ret; - - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) - return -ECANCELED; - - do { - v = atomic_read(&req->poll_refs); - - /* tw handler should be the owner, and so have some references */ - if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) - return 0; - if (v & IO_POLL_CANCEL_FLAG) - return -ECANCELED; - - if (!req->cqe.res) { - struct poll_table_struct pt = { ._key = req->apoll_events }; - req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; - } - - if ((unlikely(!req->cqe.res))) - continue; - if (req->apoll_events & EPOLLONESHOT) - return 0; - - /* multishot, just fill a CQE and proceed */ - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - __poll_t mask = mangle_poll(req->cqe.res & - req->apoll_events); - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - continue; - } - return -ECANCELED; - } - - io_tw_lock(req->ctx, locked); - if (unlikely(req->task->flags & PF_EXITING)) - return -EFAULT; - ret = io_issue_sqe(req, - IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - if (ret) - return ret; - - /* - * Release all references, retry if someone tried to restart - * task_work while we were executing it. 
- */ - } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); - - return 1; -} - -static void io_poll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - if (!ret) { - struct io_poll *poll = io_kiocb_to_cmd(req); - - req->cqe.res = mangle_poll(req->cqe.res & poll->events); - } else { - req->cqe.res = ret; - req_set_fail(req); - } - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - req->cqe.flags = 0; - __io_req_complete_post(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - -static void io_apoll_task_func(struct io_kiocb *req, bool *locked) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = io_poll_check_events(req, locked); - if (ret > 0) - return; - - io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - spin_unlock(&ctx->completion_lock); - - if (!ret) - io_req_task_submit(req, locked); - else - io_req_complete_failed(req, ret); -} - -static void __io_poll_execute(struct io_kiocb *req, int mask, - __poll_t __maybe_unused events) -{ - io_req_set_res(req, mask, 0); - /* - * This is useful for poll that is armed on behalf of another - * request, and where the wakeup path could be on a different - * CPU. We want to avoid pulling in req->apoll->events for that - * case. 
- */ - if (req->opcode == IORING_OP_POLL_ADD) - req->io_task_work.func = io_poll_task_func; - else - req->io_task_work.func = io_apoll_task_func; - - trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); - io_req_task_work_add(req); -} - -static inline void io_poll_execute(struct io_kiocb *req, int res, - __poll_t events) -{ - if (io_poll_get_ownership(req)) - __io_poll_execute(req, res, events); -} - -static void io_poll_cancel_req(struct io_kiocb *req) -{ - io_poll_mark_cancelled(req); - /* kick tw, which should complete the request */ - io_poll_execute(req, 0, 0); -} - -#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) -#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) -#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) - -static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, - void *key) -{ - struct io_kiocb *req = wqe_to_req(wait); - struct io_poll *poll = container_of(wait, struct io_poll, wait); - __poll_t mask = key_to_poll(key); - - if (unlikely(mask & POLLFREE)) { - io_poll_mark_cancelled(req); - /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0, poll->events); - - /* - * If the waitqueue is being freed early but someone is already - * holds ownership over it, we have to tear down the request as - * best we can. That means immediately removing the request from - * its waitqueue and preventing all further accesses to the - * waitqueue via the request. - */ - list_del_init(&poll->wait.entry); - - /* - * Careful: this *must* be the last step, since as soon - * as req->head is NULL'ed out, the request can be - * completed and freed, since aio_poll_complete_work() - * will no longer need to take the waitqueue lock. 
- */ - smp_store_release(&poll->head, NULL); - return 1; - } - - /* for instances that support it check for an event match first */ - if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) - return 0; - - if (io_poll_get_ownership(req)) { - /* optional, saves extra locking for removal in tw handler */ - if (mask && poll->events & EPOLLONESHOT) { - list_del_init(&poll->wait.entry); - poll->head = NULL; - if (wqe_is_double(wait)) - req->flags &= ~REQ_F_DOUBLE_POLL; - else - req->flags &= ~REQ_F_SINGLE_POLL; - } - __io_poll_execute(req, mask, poll->events); - } - return 1; -} - -static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, - struct wait_queue_head *head, - struct io_poll **poll_ptr) -{ - struct io_kiocb *req = pt->req; - unsigned long wqe_private = (unsigned long) req; - - /* - * The file being polled uses multiple waitqueues for poll handling - * (e.g. one for read, one for write). Setup a separate io_poll - * if this happens. - */ - if (unlikely(pt->nr_entries)) { - struct io_poll *first = poll; - - /* double add on the same waitqueue head, ignore */ - if (first->head == head) - return; - /* already have a 2nd entry, fail a third attempt */ - if (*poll_ptr) { - if ((*poll_ptr)->head == head) - return; - pt->error = -EINVAL; - return; - } - - poll = kmalloc(sizeof(*poll), GFP_ATOMIC); - if (!poll) { - pt->error = -ENOMEM; - return; - } - /* mark as double wq entry */ - wqe_private |= 1; - req->flags |= REQ_F_DOUBLE_POLL; - io_init_poll_iocb(poll, first->events, first->wait.func); - *poll_ptr = poll; - if (req->opcode == IORING_OP_POLL_ADD) - req->flags |= REQ_F_ASYNC_DATA; - } - - req->flags |= REQ_F_SINGLE_POLL; - pt->nr_entries++; - poll->head = head; - poll->wait.private = (void *) wqe_private; - - if (poll->events & EPOLLEXCLUSIVE) - add_wait_queue_exclusive(head, &poll->wait); - else - add_wait_queue(head, &poll->wait); -} - -static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, - struct 
poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct io_poll *poll = io_kiocb_to_cmd(pt->req); - - __io_queue_proc(poll, pt, head, - (struct io_poll **) &pt->req->async_data); -} - -static int __io_arm_poll_handler(struct io_kiocb *req, - struct io_poll *poll, - struct io_poll_table *ipt, __poll_t mask) -{ - struct io_ring_ctx *ctx = req->ctx; - int v; - - INIT_HLIST_NODE(&req->hash_node); - req->work.cancel_seq = atomic_read(&ctx->cancel_seq); - io_init_poll_iocb(poll, mask, io_poll_wake); - poll->file = req->file; - - req->apoll_events = poll->events; - - ipt->pt._key = mask; - ipt->req = req; - ipt->error = 0; - ipt->nr_entries = 0; - - /* - * Take the ownership to delay any tw execution up until we're done - * with poll arming. see io_poll_get_ownership(). - */ - atomic_set(&req->poll_refs, 1); - mask = vfs_poll(req->file, &ipt->pt) & poll->events; - - if (mask && (poll->events & EPOLLONESHOT)) { - io_poll_remove_entries(req); - /* no one else has access to the req, forget about the ref */ - return mask; - } - if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { - io_poll_remove_entries(req); - if (!ipt->error) - ipt->error = -EINVAL; - return 0; - } - - spin_lock(&ctx->completion_lock); - io_poll_req_insert(req); - spin_unlock(&ctx->completion_lock); - - if (mask) { - /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) { - poll->events |= EPOLLONESHOT; - req->apoll_events |= EPOLLONESHOT; - ipt->error = 0; - } - __io_poll_execute(req, mask, poll->events); - return 0; - } - - /* - * Release ownership. If someone tried to queue a tw while it was - * locked, kick it off for them. 
- */ - v = atomic_dec_return(&req->poll_refs); - if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0, poll->events); - return 0; -} - -static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, - struct poll_table_struct *p) -{ - struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); - struct async_poll *apoll = pt->req->apoll; - - __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); -} - -enum { - IO_APOLL_OK, - IO_APOLL_ABORTED, - IO_APOLL_READY -}; - -static int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; - struct async_poll *apoll; - struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR; - int ret; - - if (!def->pollin && !def->pollout) - return IO_APOLL_ABORTED; - if (!file_can_poll(req->file)) - return IO_APOLL_ABORTED; - if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) - return IO_APOLL_ABORTED; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) - mask |= EPOLLONESHOT; - - if (def->pollin) { - mask |= EPOLLIN | EPOLLRDNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if (req->flags & REQ_F_CLEAR_POLLIN) - mask &= ~EPOLLIN; - } else { - mask |= EPOLLOUT | EPOLLWRNORM; - } - if (def->poll_exclusive) - mask |= EPOLLEXCLUSIVE; - if (req->flags & REQ_F_POLLED) { - apoll = req->apoll; - kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del_init(&apoll->poll.wait.entry); - } else { - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) - return IO_APOLL_ABORTED; - } - apoll->double_poll = NULL; - req->apoll = apoll; - req->flags |= REQ_F_POLLED; - ipt.pt._qproc = io_async_queue_proc; - - io_kbuf_recycle(req, issue_flags); - - ret = __io_arm_poll_handler(req, &apoll->poll, 
&ipt, mask); - if (ret || ipt.error) - return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - - trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, - mask, apoll->poll.events); - return IO_APOLL_OK; -} - -/* - * Returns true if we found and killed one or more poll requests - */ -static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, - struct task_struct *tsk, bool cancel_all) -{ - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - int i; - - spin_lock(&ctx->completion_lock); - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) { - hlist_del_init(&req->hash_node); - io_poll_cancel_req(req); - found = true; - } - } - } - spin_unlock(&ctx->completion_lock); - return found; -} - -static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct hlist_head *list; - struct io_kiocb *req; - - list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; - hlist_for_each_entry(req, list, hash_node) { - if (cd->data != req->cqe.user_data) - continue; - if (poll_only && req->opcode != IORING_OP_POLL_ADD) - continue; - if (cd->flags & IORING_ASYNC_CANCEL_ALL) { - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = cd->seq; - } - return req; - } - return NULL; -} - -static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - int i; - - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; - - list = &ctx->cancel_hash[i]; - hlist_for_each_entry(req, list, hash_node) { - if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && - req->file != cd->file) - continue; - if (cd->seq == req->work.cancel_seq) - continue; - req->work.cancel_seq = 
cd->seq; - return req; - } - } - return NULL; -} - -static bool io_poll_disarm(struct io_kiocb *req) - __must_hold(&ctx->completion_lock) -{ - if (!io_poll_get_ownership(req)) - return false; - io_poll_remove_entries(req); - hash_del(&req->hash_node); - return true; -} - -static int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) -{ - struct io_kiocb *req; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd); - else - req = io_poll_find(ctx, false, cd); - if (!req) - return -ENOENT; - io_poll_cancel_req(req); - return 0; -} - -static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, - unsigned int flags) -{ - u32 events; - - events = READ_ONCE(sqe->poll32_events); -#ifdef __BIG_ENDIAN - events = swahw32(events); -#endif - if (!(flags & IORING_POLL_ADD_MULTI)) - events |= EPOLLONESHOT; - return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); -} - -static int io_poll_remove_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_poll_update *upd = io_kiocb_to_cmd(req); - u32 flags; - - if (sqe->buf_index || sqe->splice_fd_in) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | - IORING_POLL_ADD_MULTI)) - return -EINVAL; - /* meaningless without update */ - if (flags == IORING_POLL_ADD_MULTI) - return -EINVAL; - - upd->old_user_data = READ_ONCE(sqe->addr); - upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; - upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; - - upd->new_user_data = READ_ONCE(sqe->off); - if (!upd->update_user_data && upd->new_user_data) - return -EINVAL; - if (upd->update_events) - upd->events = io_poll_parse_events(sqe, flags); - else if (sqe->poll32_events) - return -EINVAL; - - return 0; -} - -static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_poll *poll = 
io_kiocb_to_cmd(req); - u32 flags; - - if (sqe->buf_index || sqe->off || sqe->addr) - return -EINVAL; - flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) - return -EINVAL; - if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) - return -EINVAL; - - io_req_set_refcount(req); - poll->events = io_poll_parse_events(sqe, flags); - return 0; -} - -static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_poll *poll = io_kiocb_to_cmd(req); - struct io_poll_table ipt; - int ret; - - ipt.pt._qproc = io_poll_queue_proc; - - ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); - if (ret) { - io_req_set_res(req, ret, 0); - return IOU_OK; - } - if (ipt.error) { - req_set_fail(req); - return ipt.error; - } - - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_poll_update *poll_update = io_kiocb_to_cmd(req); - struct io_cancel_data cd = { .data = poll_update->old_user_data, }; - struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *preq; - int ret2, ret = 0; - bool locked; - - spin_lock(&ctx->completion_lock); - preq = io_poll_find(ctx, true, &cd); - if (!preq || !io_poll_disarm(preq)) { - spin_unlock(&ctx->completion_lock); - ret = preq ? 
-EALREADY : -ENOENT; - goto out; - } - spin_unlock(&ctx->completion_lock); - - if (poll_update->update_events || poll_update->update_user_data) { - /* only mask one event flags, keep behavior flags */ - if (poll_update->update_events) { - struct io_poll *poll = io_kiocb_to_cmd(preq); - - poll->events &= ~0xffff; - poll->events |= poll_update->events & 0xffff; - poll->events |= IO_POLL_UNMASK; - } - if (poll_update->update_user_data) - preq->cqe.user_data = poll_update->new_user_data; - - ret2 = io_poll_add(preq, issue_flags); - /* successfully updated, don't complete poll request */ - if (!ret2 || ret2 == -EIOCBQUEUED) - goto out; - } - - req_set_fail(preq); - io_req_set_res(preq, -ECANCELED, 0); - locked = !(issue_flags & IO_URING_F_UNLOCKED); - io_req_task_complete(preq, &locked); -out: - if (ret < 0) { - req_set_fail(req); - return ret; - } - /* complete update request, we're done with it */ - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -4589,6 +3771,14 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } +int io_poll_issue(struct io_kiocb *req, bool *locked) +{ + io_tw_lock(req->ctx, locked); + if (unlikely(req->task->flags & PF_EXITING)) + return -EFAULT; + return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); +} + struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -8209,7 +7399,7 @@ static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) return -ECANCELED; } -static const struct io_op_def io_op_defs[] = { +const struct io_op_def io_op_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 60678e88a9b92c..1ceac4ea62bf3b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -92,6 +92,7 @@ static inline 
bool io_run_task_work(void) return false; } +void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); @@ -109,6 +110,32 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); +static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +{ + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return; + /* + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. + */ + if ((req->flags & REQ_F_BUFFER_SELECTED) && + (req->flags & REQ_F_PARTIAL_IO)) + return; + + /* + * READV uses fields in `struct io_rw` (len/addr) to stash the selected + * buffer data. However if that buffer is recycled the original request + * data stored in addr is lost. Therefore forbid recycling for now. 
+ */ + if (req->opcode == IORING_OP_READV) + return; + + __io_kbuf_recycle(req, issue_flags); +} + struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -128,12 +155,14 @@ void io_req_task_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); +void io_req_task_submit(struct io_kiocb *req, bool *locked); void tctx_task_work(struct callback_head *cb); int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); +int io_poll_issue(struct io_kiocb *req, bool *locked); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); @@ -143,6 +172,9 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); +bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all); + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) diff --git a/io_uring/opdef.h b/io_uring/opdef.h new file mode 100644 index 00000000000000..4578adcdba8a41 --- /dev/null +++ b/io_uring/opdef.h @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_OP_DEF_H +#define IOU_OP_DEF_H + +struct io_op_def { + /* needs req->file assigned */ + unsigned needs_file : 1; + /* should block plug */ + unsigned plug : 1; + /* hash wq insertion if file is a regular file */ + unsigned hash_reg_file : 1; + /* unbound wq insertion if file is a non-regular file */ + unsigned unbound_nonreg_file : 1; + /* set if opcode supports polled "wait" */ + unsigned pollin : 1; + unsigned pollout : 1; + 
unsigned poll_exclusive : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; + /* skip auditing */ + unsigned audit_skip : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; + /* size of async data needed, if any */ + unsigned short async_size; + + const char *name; + + int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); + int (*issue)(struct io_kiocb *, unsigned int); + int (*prep_async)(struct io_kiocb *); + void (*cleanup)(struct io_kiocb *); +}; + +extern const struct io_op_def io_op_defs[]; +#endif diff --git a/io_uring/poll.c b/io_uring/poll.c new file mode 100644 index 00000000000000..c3e4fcb0a7ba7b --- /dev/null +++ b/io_uring/poll.c @@ -0,0 +1,760 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "refs.h" +#include "opdef.h" +#include "poll.h" + +struct io_poll_update { + struct file *file; + u64 old_user_data; + u64 new_user_data; + __poll_t events; + bool update_events; + bool update_user_data; +}; + +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int nr_entries; + int error; +}; + +#define IO_POLL_CANCEL_FLAG BIT(31) +#define IO_POLL_REF_MASK GENMASK(30, 0) + +/* + * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can + * bump it and acquire ownership. It's disallowed to modify requests while not + * owning it, that prevents from races for enqueueing task_work's and b/w + * arming poll and wakeups. 
+ */ +static inline bool io_poll_get_ownership(struct io_kiocb *req) +{ + return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK); +} + +static void io_poll_mark_cancelled(struct io_kiocb *req) +{ + atomic_or(IO_POLL_CANCEL_FLAG, &req->poll_refs); +} + +static struct io_poll *io_poll_get_double(struct io_kiocb *req) +{ + /* pure poll stashes this in ->async_data, poll driven retry elsewhere */ + if (req->opcode == IORING_OP_POLL_ADD) + return req->async_data; + return req->apoll->double_poll; +} + +static struct io_poll *io_poll_get_single(struct io_kiocb *req) +{ + if (req->opcode == IORING_OP_POLL_ADD) + return io_kiocb_to_cmd(req); + return &req->apoll->poll; +} + +static void io_poll_req_insert(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); +} + +static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, + wait_queue_func_t wake_func) +{ + poll->head = NULL; +#define IO_POLL_UNMASK (EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP) + /* mask in events that we always want/need */ + poll->events = events | IO_POLL_UNMASK; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, wake_func); +} + +static inline void io_poll_remove_entry(struct io_poll *poll) +{ + struct wait_queue_head *head = smp_load_acquire(&poll->head); + + if (head) { + spin_lock_irq(&head->lock); + list_del_init(&poll->wait.entry); + poll->head = NULL; + spin_unlock_irq(&head->lock); + } +} + +static void io_poll_remove_entries(struct io_kiocb *req) +{ + /* + * Nothing to do if neither of those flags are set. Avoid dipping + * into the poll/apoll/double cachelines if we can. + */ + if (!(req->flags & (REQ_F_SINGLE_POLL | REQ_F_DOUBLE_POLL))) + return; + + /* + * While we hold the waitqueue lock and the waitqueue is nonempty, + * wake_up_pollfree() will wait for us. 
However, taking the waitqueue + * lock in the first place can race with the waitqueue being freed. + * + * We solve this as eventpoll does: by taking advantage of the fact that + * all users of wake_up_pollfree() will RCU-delay the actual free. If + * we enter rcu_read_lock() and see that the pointer to the queue is + * non-NULL, we can then lock it without the memory being freed out from + * under us. + * + * Keep holding rcu_read_lock() as long as we hold the queue lock, in + * case the caller deletes the entry from the queue, leaving it empty. + * In that case, only RCU prevents the queue memory from being freed. + */ + rcu_read_lock(); + if (req->flags & REQ_F_SINGLE_POLL) + io_poll_remove_entry(io_poll_get_single(req)); + if (req->flags & REQ_F_DOUBLE_POLL) + io_poll_remove_entry(io_poll_get_double(req)); + rcu_read_unlock(); +} + +/* + * All poll tw should go through this. Checks for poll events, manages + * references, does rewait, etc. + * + * Returns a negative error on failure. >0 when no action require, which is + * either spurious wakeup or multishot CQE is served. 0 when it's done with + * the request, then the mask is stored in req->cqe.res. 
+ */ +static int io_poll_check_events(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int v, ret; + + /* req->task == current here, checking PF_EXITING is safe */ + if (unlikely(req->task->flags & PF_EXITING)) + return -ECANCELED; + + do { + v = atomic_read(&req->poll_refs); + + /* tw handler should be the owner, and so have some references */ + if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) + return 0; + if (v & IO_POLL_CANCEL_FLAG) + return -ECANCELED; + + if (!req->cqe.res) { + struct poll_table_struct pt = { ._key = req->apoll_events }; + req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; + } + + if ((unlikely(!req->cqe.res))) + continue; + if (req->apoll_events & EPOLLONESHOT) + return 0; + + /* multishot, just fill a CQE and proceed */ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + __poll_t mask = mangle_poll(req->cqe.res & + req->apoll_events); + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) { + io_cqring_ev_posted(ctx); + continue; + } + return -ECANCELED; + } + + ret = io_poll_issue(req, locked); + if (ret) + return ret; + + /* + * Release all references, retry if someone tried to restart + * task_work while we were executing it. 
+ */ + } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); + + return 1; +} + +static void io_poll_task_func(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_poll_check_events(req, locked); + if (ret > 0) + return; + + if (!ret) { + struct io_poll *poll = io_kiocb_to_cmd(req); + + req->cqe.res = mangle_poll(req->cqe.res & poll->events); + } else { + req->cqe.res = ret; + req_set_fail(req); + } + + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + req->cqe.flags = 0; + __io_req_complete_post(req); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + +static void io_apoll_task_func(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_poll_check_events(req, locked); + if (ret > 0) + return; + + io_poll_remove_entries(req); + spin_lock(&ctx->completion_lock); + hash_del(&req->hash_node); + spin_unlock(&ctx->completion_lock); + + if (!ret) + io_req_task_submit(req, locked); + else + io_req_complete_failed(req, ret); +} + +static void __io_poll_execute(struct io_kiocb *req, int mask, + __poll_t __maybe_unused events) +{ + io_req_set_res(req, mask, 0); + /* + * This is useful for poll that is armed on behalf of another + * request, and where the wakeup path could be on a different + * CPU. We want to avoid pulling in req->apoll->events for that + * case. 
+ */ + if (req->opcode == IORING_OP_POLL_ADD) + req->io_task_work.func = io_poll_task_func; + else + req->io_task_work.func = io_apoll_task_func; + + trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); + io_req_task_work_add(req); +} + +static inline void io_poll_execute(struct io_kiocb *req, int res, + __poll_t events) +{ + if (io_poll_get_ownership(req)) + __io_poll_execute(req, res, events); +} + +static void io_poll_cancel_req(struct io_kiocb *req) +{ + io_poll_mark_cancelled(req); + /* kick tw, which should complete the request */ + io_poll_execute(req, 0, 0); +} + +#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) +#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) +#define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_kiocb *req = wqe_to_req(wait); + struct io_poll *poll = container_of(wait, struct io_poll, wait); + __poll_t mask = key_to_poll(key); + + if (unlikely(mask & POLLFREE)) { + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0, poll->events); + + /* + * If the waitqueue is being freed early but someone is already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request. + */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. 
+ */ + smp_store_release(&poll->head, NULL); + return 1; + } + + /* for instances that support it check for an event match first */ + if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) + return 0; + + if (io_poll_get_ownership(req)) { + /* optional, saves extra locking for removal in tw handler */ + if (mask && poll->events & EPOLLONESHOT) { + list_del_init(&poll->wait.entry); + poll->head = NULL; + if (wqe_is_double(wait)) + req->flags &= ~REQ_F_DOUBLE_POLL; + else + req->flags &= ~REQ_F_SINGLE_POLL; + } + __io_poll_execute(req, mask, poll->events); + } + return 1; +} + +static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, + struct wait_queue_head *head, + struct io_poll **poll_ptr) +{ + struct io_kiocb *req = pt->req; + unsigned long wqe_private = (unsigned long) req; + + /* + * The file being polled uses multiple waitqueues for poll handling + * (e.g. one for read, one for write). Setup a separate io_poll + * if this happens. + */ + if (unlikely(pt->nr_entries)) { + struct io_poll *first = poll; + + /* double add on the same waitqueue head, ignore */ + if (first->head == head) + return; + /* already have a 2nd entry, fail a third attempt */ + if (*poll_ptr) { + if ((*poll_ptr)->head == head) + return; + pt->error = -EINVAL; + return; + } + + poll = kmalloc(sizeof(*poll), GFP_ATOMIC); + if (!poll) { + pt->error = -ENOMEM; + return; + } + /* mark as double wq entry */ + wqe_private |= 1; + req->flags |= REQ_F_DOUBLE_POLL; + io_init_poll_iocb(poll, first->events, first->wait.func); + *poll_ptr = poll; + if (req->opcode == IORING_OP_POLL_ADD) + req->flags |= REQ_F_ASYNC_DATA; + } + + req->flags |= REQ_F_SINGLE_POLL; + pt->nr_entries++; + poll->head = head; + poll->wait.private = (void *) wqe_private; + + if (poll->events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(head, &poll->wait); + else + add_wait_queue(head, &poll->wait); +} + +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, + struct 
poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct io_poll *poll = io_kiocb_to_cmd(pt->req); + + __io_queue_proc(poll, pt, head, + (struct io_poll **) &pt->req->async_data); +} + +static int __io_arm_poll_handler(struct io_kiocb *req, + struct io_poll *poll, + struct io_poll_table *ipt, __poll_t mask) +{ + struct io_ring_ctx *ctx = req->ctx; + int v; + + INIT_HLIST_NODE(&req->hash_node); + req->work.cancel_seq = atomic_read(&ctx->cancel_seq); + io_init_poll_iocb(poll, mask, io_poll_wake); + poll->file = req->file; + + req->apoll_events = poll->events; + + ipt->pt._key = mask; + ipt->req = req; + ipt->error = 0; + ipt->nr_entries = 0; + + /* + * Take the ownership to delay any tw execution up until we're done + * with poll arming. see io_poll_get_ownership(). + */ + atomic_set(&req->poll_refs, 1); + mask = vfs_poll(req->file, &ipt->pt) & poll->events; + + if (mask && (poll->events & EPOLLONESHOT)) { + io_poll_remove_entries(req); + /* no one else has access to the req, forget about the ref */ + return mask; + } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { + io_poll_remove_entries(req); + if (!ipt->error) + ipt->error = -EINVAL; + return 0; + } + + spin_lock(&ctx->completion_lock); + io_poll_req_insert(req); + spin_unlock(&ctx->completion_lock); + + if (mask) { + /* can't multishot if failed, just queue the event we've got */ + if (unlikely(ipt->error || !ipt->nr_entries)) { + poll->events |= EPOLLONESHOT; + req->apoll_events |= EPOLLONESHOT; + ipt->error = 0; + } + __io_poll_execute(req, mask, poll->events); + return 0; + } + + /* + * Release ownership. If someone tried to queue a tw while it was + * locked, kick it off for them. 
+ */ + v = atomic_dec_return(&req->poll_refs); + if (unlikely(v & IO_POLL_REF_MASK)) + __io_poll_execute(req, 0, poll->events); + return 0; +} + +static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + struct async_poll *apoll = pt->req->apoll; + + __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); +} + +int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) +{ + const struct io_op_def *def = &io_op_defs[req->opcode]; + struct io_ring_ctx *ctx = req->ctx; + struct async_poll *apoll; + struct io_poll_table ipt; + __poll_t mask = POLLPRI | POLLERR; + int ret; + + if (!def->pollin && !def->pollout) + return IO_APOLL_ABORTED; + if (!file_can_poll(req->file)) + return IO_APOLL_ABORTED; + if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) + return IO_APOLL_ABORTED; + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) + mask |= EPOLLONESHOT; + + if (def->pollin) { + mask |= EPOLLIN | EPOLLRDNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if (req->flags & REQ_F_CLEAR_POLLIN) + mask &= ~EPOLLIN; + } else { + mask |= EPOLLOUT | EPOLLWRNORM; + } + if (def->poll_exclusive) + mask |= EPOLLEXCLUSIVE; + if (req->flags & REQ_F_POLLED) { + apoll = req->apoll; + kfree(apoll->double_poll); + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del_init(&apoll->poll.wait.entry); + } else { + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return IO_APOLL_ABORTED; + } + apoll->double_poll = NULL; + req->apoll = apoll; + req->flags |= REQ_F_POLLED; + ipt.pt._qproc = io_async_queue_proc; + + io_kbuf_recycle(req, issue_flags); + + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); + if (ret || ipt.error) + return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; + + trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, + mask, apoll->poll.events); + return IO_APOLL_OK; +} + +/* + * Returns true if we found and killed one or more poll requests + */ +__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool found = false; + int i; + + spin_lock(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (io_match_task_safe(req, tsk, cancel_all)) { + hlist_del_init(&req->hash_node); + io_poll_cancel_req(req); + found = true; + } + } + } + spin_unlock(&ctx->completion_lock); + return found; +} + +static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct hlist_head *list; + struct io_kiocb *req; + + list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (cd->data != req->cqe.user_data) + continue; + if (poll_only && req->opcode != IORING_OP_POLL_ADD) + continue; + if (cd->flags & IORING_ASYNC_CANCEL_ALL) { + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + } + return req; + } + return NULL; +} + +static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, + struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + int i; + + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry(req, list, hash_node) { + if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && + req->file != cd->file) + continue; + if (cd->seq == req->work.cancel_seq) + continue; + req->work.cancel_seq = cd->seq; + return req; + } + } + return NULL; +} + +static bool 
io_poll_disarm(struct io_kiocb *req) + __must_hold(&ctx->completion_lock) +{ + if (!io_poll_get_ownership(req)) + return false; + io_poll_remove_entries(req); + hash_del(&req->hash_node); + return true; +} + +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) + __must_hold(&ctx->completion_lock) +{ + struct io_kiocb *req; + + if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) + req = io_poll_file_find(ctx, cd); + else + req = io_poll_find(ctx, false, cd); + if (!req) + return -ENOENT; + io_poll_cancel_req(req); + return 0; +} + +static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, + unsigned int flags) +{ + u32 events; + + events = READ_ONCE(sqe->poll32_events); +#ifdef __BIG_ENDIAN + events = swahw32(events); +#endif + if (!(flags & IORING_POLL_ADD_MULTI)) + events |= EPOLLONESHOT; + return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); +} + +int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_poll_update *upd = io_kiocb_to_cmd(req); + u32 flags; + + if (sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | + IORING_POLL_ADD_MULTI)) + return -EINVAL; + /* meaningless without update */ + if (flags == IORING_POLL_ADD_MULTI) + return -EINVAL; + + upd->old_user_data = READ_ONCE(sqe->addr); + upd->update_events = flags & IORING_POLL_UPDATE_EVENTS; + upd->update_user_data = flags & IORING_POLL_UPDATE_USER_DATA; + + upd->new_user_data = READ_ONCE(sqe->off); + if (!upd->update_user_data && upd->new_user_data) + return -EINVAL; + if (upd->update_events) + upd->events = io_poll_parse_events(sqe, flags); + else if (sqe->poll32_events) + return -EINVAL; + + return 0; +} + +int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_poll *poll = io_kiocb_to_cmd(req); + u32 flags; + + if (sqe->buf_index || sqe->off || sqe->addr) + 
return -EINVAL; + flags = READ_ONCE(sqe->len); + if (flags & ~IORING_POLL_ADD_MULTI) + return -EINVAL; + if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) + return -EINVAL; + + io_req_set_refcount(req); + poll->events = io_poll_parse_events(sqe, flags); + return 0; +} + +int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_poll *poll = io_kiocb_to_cmd(req); + struct io_poll_table ipt; + int ret; + + ipt.pt._qproc = io_poll_queue_proc; + + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); + if (ret) { + io_req_set_res(req, ret, 0); + return IOU_OK; + } + if (ipt.error) { + req_set_fail(req); + return ipt.error; + } + + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_poll_update *poll_update = io_kiocb_to_cmd(req); + struct io_cancel_data cd = { .data = poll_update->old_user_data, }; + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *preq; + int ret2, ret = 0; + bool locked; + + spin_lock(&ctx->completion_lock); + preq = io_poll_find(ctx, true, &cd); + if (!preq || !io_poll_disarm(preq)) { + spin_unlock(&ctx->completion_lock); + ret = preq ? 
-EALREADY : -ENOENT; + goto out; + } + spin_unlock(&ctx->completion_lock); + + if (poll_update->update_events || poll_update->update_user_data) { + /* only replace the event mask bits, keep behavior flags */ + if (poll_update->update_events) { + struct io_poll *poll = io_kiocb_to_cmd(preq); + + poll->events &= ~0xffff; + poll->events |= poll_update->events & 0xffff; + poll->events |= IO_POLL_UNMASK; + } + if (poll_update->update_user_data) + preq->cqe.user_data = poll_update->new_user_data; + + ret2 = io_poll_add(preq, issue_flags); + /* successfully updated, don't complete poll request */ + if (!ret2 || ret2 == -EIOCBQUEUED) + goto out; + } + + req_set_fail(preq); + io_req_set_res(preq, -ECANCELED, 0); + locked = !(issue_flags & IO_URING_F_UNLOCKED); + io_req_task_complete(preq, &locked); +out: + if (ret < 0) { + req_set_fail(req); + return ret; + } + /* complete update request, we're done with it */ + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/poll.h b/io_uring/poll.h new file mode 100644 index 00000000000000..cc75c1567a84ae --- /dev/null +++ b/io_uring/poll.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 + +enum { + IO_APOLL_OK, + IO_APOLL_ABORTED, + IO_APOLL_READY +}; + +struct io_poll { + struct file *file; + struct wait_queue_head *head; + __poll_t events; + struct wait_queue_entry wait; +}; + +struct async_poll { + struct io_poll poll; + struct io_poll *double_poll; +}; + +int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); + +int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); + +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); +int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); +bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all); From 
04d70db6e5af4254b630ffaf7529e8e8cfbfa2d3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 25 May 2022 20:36:47 -0600 Subject: [PATCH 0976/1250] io_uring: move cancelation into its own file This also helps cleanup the io_uring.h cancel parts, as we can make things static in the cancel.c file, mostly. Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/cancel.c | 194 ++++++++++++++++++++++++++++++++++++++++++++ io_uring/cancel.h | 6 ++ io_uring/io_uring.c | 177 +--------------------------------------- io_uring/io_uring.h | 1 - io_uring/timeout.c | 1 + 6 files changed, 204 insertions(+), 178 deletions(-) create mode 100644 io_uring/cancel.c create mode 100644 io_uring/cancel.h diff --git a/io_uring/Makefile b/io_uring/Makefile index eb1b07a9951639..cfd61e6b7759f7 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -6,5 +6,6 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ sync.o advise.o filetable.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o poll.o + sqpoll.o fdinfo.o tctx.o poll.o \ + cancel.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/cancel.c b/io_uring/cancel.c new file mode 100644 index 00000000000000..83cceb52d82d64 --- /dev/null +++ b/io_uring/cancel.c @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "tctx.h" +#include "poll.h" +#include "timeout.h" +#include "cancel.h" + +struct io_cancel { + struct file *file; + u64 addr; + u32 flags; + s32 fd; +}; + +#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ + IORING_ASYNC_CANCEL_ANY) + +static bool io_cancel_cb(struct io_wq_work *work, void *data) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_cancel_data *cd = data; + + if (req->ctx != cd->ctx) + return false; + if (cd->flags & 
IORING_ASYNC_CANCEL_ANY) { + ; + } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { + if (req->file != cd->file) + return false; + } else { + if (req->cqe.user_data != cd->data) + return false; + } + if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { + if (cd->seq == req->work.cancel_seq) + return false; + req->work.cancel_seq = cd->seq; + } + return true; +} + +static int io_async_cancel_one(struct io_uring_task *tctx, + struct io_cancel_data *cd) +{ + enum io_wq_cancel cancel_ret; + int ret = 0; + bool all; + + if (!tctx || !tctx->io_wq) + return -ENOENT; + + all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); + switch (cancel_ret) { + case IO_WQ_CANCEL_OK: + ret = 0; + break; + case IO_WQ_CANCEL_RUNNING: + ret = -EALREADY; + break; + case IO_WQ_CANCEL_NOTFOUND: + ret = -ENOENT; + break; + } + + return ret; +} + +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); + + ret = io_async_cancel_one(req->task->io_uring, cd); + /* + * Fall-through even for -EALREADY, as we may have poll armed + * that need unarming. 
+ */ + if (!ret) + return 0; + + spin_lock(&ctx->completion_lock); + ret = io_poll_cancel(ctx, cd); + if (ret != -ENOENT) + goto out; + if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) + ret = io_timeout_cancel(ctx, cd); +out: + spin_unlock(&ctx->completion_lock); + return ret; +} + + +int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_cancel *cancel = io_kiocb_to_cmd(req); + + if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sqe->off || sqe->len || sqe->splice_fd_in) + return -EINVAL; + + cancel->addr = READ_ONCE(sqe->addr); + cancel->flags = READ_ONCE(sqe->cancel_flags); + if (cancel->flags & ~CANCEL_FLAGS) + return -EINVAL; + if (cancel->flags & IORING_ASYNC_CANCEL_FD) { + if (cancel->flags & IORING_ASYNC_CANCEL_ANY) + return -EINVAL; + cancel->fd = READ_ONCE(sqe->fd); + } + + return 0; +} + +static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, + unsigned int issue_flags) +{ + bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); + struct io_ring_ctx *ctx = cd->ctx; + struct io_tctx_node *node; + int ret, nr = 0; + + do { + ret = io_try_cancel(req, cd); + if (ret == -ENOENT) + break; + if (!all) + return ret; + nr++; + } while (1); + + /* slow path, try all io-wq's */ + io_ring_submit_lock(ctx, issue_flags); + ret = -ENOENT; + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + ret = io_async_cancel_one(tctx, cd); + if (ret != -ENOENT) { + if (!all) + break; + nr++; + } + } + io_ring_submit_unlock(ctx, issue_flags); + return all ? 
nr : ret; +} + +int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_cancel *cancel = io_kiocb_to_cmd(req); + struct io_cancel_data cd = { + .ctx = req->ctx, + .data = cancel->addr, + .flags = cancel->flags, + .seq = atomic_inc_return(&req->ctx->cancel_seq), + }; + int ret; + + if (cd.flags & IORING_ASYNC_CANCEL_FD) { + if (req->flags & REQ_F_FIXED_FILE) + req->file = io_file_get_fixed(req, cancel->fd, + issue_flags); + else + req->file = io_file_get_normal(req, cancel->fd); + if (!req->file) { + ret = -EBADF; + goto done; + } + cd.file = req->file; + } + + ret = __io_async_cancel(&cd, req, issue_flags); +done: + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h new file mode 100644 index 00000000000000..4f35d86963253f --- /dev/null +++ b/io_uring/cancel.h @@ -0,0 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 + +int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); + +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b4ecfa7238555f..fb4f3ffa58c886 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -108,6 +108,7 @@ #include "msg_ring.h" #include "timeout.h" #include "poll.h" +#include "cancel.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -210,13 +211,6 @@ struct io_buffer { * First field must be the file pointer in all the * iocb unions! 
See also 'struct kiocb' in */ -struct io_cancel { - struct file *file; - u64 addr; - u32 flags; - s32 fd; -}; - struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -3362,175 +3356,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -static bool io_cancel_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - struct io_cancel_data *cd = data; - - if (req->ctx != cd->ctx) - return false; - if (cd->flags & IORING_ASYNC_CANCEL_ANY) { - ; - } else if (cd->flags & IORING_ASYNC_CANCEL_FD) { - if (req->file != cd->file) - return false; - } else { - if (req->cqe.user_data != cd->data) - return false; - } - if (cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY)) { - if (cd->seq == req->work.cancel_seq) - return false; - req->work.cancel_seq = cd->seq; - } - return true; -} - -static int io_async_cancel_one(struct io_uring_task *tctx, - struct io_cancel_data *cd) -{ - enum io_wq_cancel cancel_ret; - int ret = 0; - bool all; - - if (!tctx || !tctx->io_wq) - return -ENOENT; - - all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, cd, all); - switch (cancel_ret) { - case IO_WQ_CANCEL_OK: - ret = 0; - break; - case IO_WQ_CANCEL_RUNNING: - ret = -EALREADY; - break; - case IO_WQ_CANCEL_NOTFOUND: - ret = -ENOENT; - break; - } - - return ret; -} - -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) -{ - struct io_ring_ctx *ctx = req->ctx; - int ret; - - WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); - - ret = io_async_cancel_one(req->task->io_uring, cd); - /* - * Fall-through even for -EALREADY, as we may have poll armed - * that need unarming. 
- */ - if (!ret) - return 0; - - spin_lock(&ctx->completion_lock); - ret = io_poll_cancel(ctx, cd); - if (ret != -ENOENT) - goto out; - if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) - ret = io_timeout_cancel(ctx, cd); -out: - spin_unlock(&ctx->completion_lock); - return ret; -} - -#define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY) - -static int io_async_cancel_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_cancel *cancel = io_kiocb_to_cmd(req); - - if (unlikely(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; - if (sqe->off || sqe->len || sqe->splice_fd_in) - return -EINVAL; - - cancel->addr = READ_ONCE(sqe->addr); - cancel->flags = READ_ONCE(sqe->cancel_flags); - if (cancel->flags & ~CANCEL_FLAGS) - return -EINVAL; - if (cancel->flags & IORING_ASYNC_CANCEL_FD) { - if (cancel->flags & IORING_ASYNC_CANCEL_ANY) - return -EINVAL; - cancel->fd = READ_ONCE(sqe->fd); - } - - return 0; -} - -static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, - unsigned int issue_flags) -{ - bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); - struct io_ring_ctx *ctx = cd->ctx; - struct io_tctx_node *node; - int ret, nr = 0; - - do { - ret = io_try_cancel(req, cd); - if (ret == -ENOENT) - break; - if (!all) - return ret; - nr++; - } while (1); - - /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, issue_flags); - ret = -ENOENT; - list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - ret = io_async_cancel_one(tctx, cd); - if (ret != -ENOENT) { - if (!all) - break; - nr++; - } - } - io_ring_submit_unlock(ctx, issue_flags); - return all ? 
nr : ret; -} - -static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_cancel *cancel = io_kiocb_to_cmd(req); - struct io_cancel_data cd = { - .ctx = req->ctx, - .data = cancel->addr, - .flags = cancel->flags, - .seq = atomic_inc_return(&req->ctx->cancel_seq), - }; - int ret; - - if (cd.flags & IORING_ASYNC_CANCEL_FD) { - if (req->flags & REQ_F_FIXED_FILE) - req->file = io_file_get_fixed(req, cancel->fd, - issue_flags); - else - req->file = io_file_get_normal(req, cancel->fd); - if (!req->file) { - ret = -EBADF; - goto done; - } - cd.file = req->file; - } - - ret = __io_async_cancel(&cd, req, issue_flags); -done: - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 1ceac4ea62bf3b..a78e3c5ab109b8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -157,7 +157,6 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, bool *locked); void tctx_task_work(struct callback_head *cb); -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 5e42bfcd683e2a..69cca42d6835b4 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -11,6 +11,7 @@ #include "io_uring_types.h" #include "io_uring.h" #include "refs.h" +#include "cancel.h" #include "timeout.h" struct io_timeout { From 4c29b398a7bb6baf3c2bba7d0e90b93139098bcf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 07:07:23 -0600 Subject: [PATCH 0977/1250] io_uring: split provided buffers handling into its own file Move both the opcodes 
related to it, and the internals code dealing with it. Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 602 +------------------------------------------- io_uring/io_uring.h | 36 +-- io_uring/kbuf.c | 524 ++++++++++++++++++++++++++++++++++++++ io_uring/kbuf.h | 142 +++++++++++ io_uring/net.c | 1 + io_uring/poll.c | 1 + 7 files changed, 672 insertions(+), 636 deletions(-) create mode 100644 io_uring/kbuf.c create mode 100644 io_uring/kbuf.h diff --git a/io_uring/Makefile b/io_uring/Makefile index cfd61e6b7759f7..b85418b64e8241 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o + cancel.o kbuf.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fb4f3ffa58c886..e395167999edfb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -93,6 +93,7 @@ #include "tctx.h" #include "sqpoll.h" #include "fdinfo.h" +#include "kbuf.h" #include "xattr.h" #include "nop.h" @@ -171,42 +172,10 @@ struct io_rsrc_data { bool quiesce; }; -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) -struct io_buffer_list { - /* - * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, - * then these are classic provided buffers and ->buf_list is used. 
- */ - union { - struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; - }; - __u16 bgid; - - /* below is for ring provided buffers */ - __u16 buf_nr_pages; - __u16 nr_entries; - __u16 head; - __u16 mask; -}; - -struct io_buffer { - struct list_head list; - __u64 addr; - __u32 len; - __u16 bid; - __u16 bgid; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -#define BGID_ARRAY 64 - /* * First field must be the file pointer in all the * iocb unions! See also 'struct kiocb' in @@ -226,15 +195,6 @@ struct io_rsrc_update { u32 offset; }; -struct io_provide_buf { - struct file *file; - __u64 addr; - __u32 len; - __u32 bgid; - __u16 nbufs; - __u16 bid; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -399,110 +359,6 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, } } -static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) -{ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) - req->buf_list->head++; - req->flags &= ~REQ_F_BUFFER_RING; - } else { - list_add(&req->kbuf->list, list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - } - - return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); -} - -static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) -{ - lockdep_assert_held(&req->ctx->completion_lock); - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - return __io_put_kbuf(req, &req->ctx->io_buffers_comp); -} - -inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) -{ - unsigned int cflags; - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; - - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. 
- * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); - } - - return cflags; -} - -static struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - if (ctx->io_bl && bgid < BGID_ARRAY) - return &ctx->io_bl[bgid]; - - return xa_load(&ctx->io_bl_xa, bgid); -} - -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - struct io_buffer *buf; - - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. 
- */ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } - return; - } - - io_ring_submit_lock(ctx, issue_flags); - - buf = req->kbuf; - bl = io_buffer_get_list(ctx, buf->bgid); - list_add(&buf->list, &bl->buf_list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; - - io_ring_submit_unlock(ctx, issue_flags); -} - static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -2296,96 +2152,6 @@ static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, return __io_import_fixed(req, rw, iter, req->imu); } -static int io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) -{ - bl->bgid = bgid; - if (bgid < BGID_ARRAY) - return 0; - - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); -} - -static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl) -{ - if (!list_empty(&bl->buf_list)) { - struct io_buffer *kbuf; - - kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&kbuf->list); - if (*len > kbuf->len) - *len = kbuf->len; - req->flags |= REQ_F_BUFFER_SELECTED; - req->kbuf = kbuf; - req->buf_index = kbuf->bid; - return u64_to_user_ptr(kbuf->addr); - } - return NULL; -} - -static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, - struct io_buffer_list *bl, - unsigned int issue_flags) -{ - struct io_uring_buf_ring *br = bl->buf_ring; - struct io_uring_buf *buf; - __u16 head = bl->head; - - if (unlikely(smp_load_acquire(&br->tail) == head)) - return NULL; - - head &= bl->mask; - if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = 
page_address(bl->buf_pages[index]); - buf += off; - } - if (*len > buf->len) - *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; - req->buf_list = bl; - req->buf_index = buf->bid; - - if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { - /* - * If we came in unlocked, we have no choice but to consume the - * buffer here. This does mean it'll be pinned until the IO - * completes. But coming in unlocked means we're in io-wq - * context, hence there should be no further retry. For the - * locked case, the caller must ensure to call the commit when - * the transfer completes (or if we get -EAGAIN and must poll - * or retry). - */ - req->buf_list = NULL; - bl->head++; - } - return u64_to_user_ptr(buf->addr); -} - -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - void __user *ret = NULL; - - io_ring_submit_lock(req->ctx, issue_flags); - - bl = io_buffer_get_list(ctx, req->buf_index); - if (likely(bl)) { - if (bl->buf_nr_pages) - ret = io_ring_buffer_select(req, len, bl, issue_flags); - else - ret = io_provided_buffer_select(req, len, bl); - } - io_ring_submit_unlock(req->ctx, issue_flags); - return ret; -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) @@ -3098,258 +2864,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, return ret; } -static int io_remove_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - u64 tmp; - - if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || - sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -EINVAL; - - memset(p, 0, sizeof(*p)); - p->nbufs = tmp; - p->bgid = READ_ONCE(sqe->buf_group); - return 0; -} - -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct 
io_buffer_list *bl, unsigned nbufs) -{ - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->buf_nr_pages) { - int j; - - i = bl->buf_ring->tail - bl->head; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - return i; - } - - /* the head kbuf is the list itself */ - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - - nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_del(&nxt->list); - if (++i == nbufs) - return i; - cond_resched(); - } - i++; - - return i; -} - -static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!bl->buf_nr_pages) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - if (ret < 0) - req_set_fail(req); - - /* complete before unlock, IOPOLL may need the lock */ - io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_provide_buffers_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - unsigned long size, tmp_check; - struct io_provide_buf *p = io_kiocb_to_cmd(req); - u64 tmp; - - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - tmp = READ_ONCE(sqe->fd); - if (!tmp || tmp > USHRT_MAX) - return -E2BIG; - p->nbufs = tmp; - p->addr = READ_ONCE(sqe->addr); - p->len = READ_ONCE(sqe->len); - - if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, - &size)) - return -EOVERFLOW; - if (check_add_overflow((unsigned 
long)p->addr, size, &tmp_check)) - return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; - if (!access_ok(u64_to_user_ptr(p->addr), size)) - return -EFAULT; - - p->bgid = READ_ONCE(sqe->buf_group); - tmp = READ_ONCE(sqe->off); - if (tmp > USHRT_MAX) - return -E2BIG; - p->bid = tmp; - return 0; -} - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *buf; - struct page *page; - int bufs_in_page; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. - */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * page worth of buffer entries and add those to our freelist. 
- */ - page = alloc_page(GFP_KERNEL_ACCOUNT); - if (!page) - return -ENOMEM; - - list_add(&page->lru, &ctx->io_buffers_pages); - - buf = page_address(page); - bufs_in_page = PAGE_SIZE / sizeof(*buf); - while (bufs_in_page) { - list_add_tail(&buf->list, &ctx->io_buffers_cache); - buf++; - bufs_in_page--; - } - - return 0; -} - -static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, - struct io_buffer_list *bl) -{ - struct io_buffer *buf; - u64 addr = pbuf->addr; - int i, bid = pbuf->bid; - - for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) - break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); - buf->addr = addr; - buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); - buf->bid = bid; - buf->bgid = pbuf->bgid; - addr += pbuf->len; - bid++; - cond_resched(); - } - - return i ? 0 : -ENOMEM; -} - -static __cold int io_init_bl_list(struct io_ring_ctx *ctx) -{ - int i; - - ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), - GFP_KERNEL); - if (!ctx->io_bl) - return -ENOMEM; - - for (i = 0; i < BGID_ARRAY; i++) { - INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); - ctx->io_bl[i].bgid = i; - } - - return 0; -} - -static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { - ret = io_init_bl_list(ctx); - if (ret) - goto err; - } - - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) { - ret = -ENOMEM; - goto err; - } - INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); - if (ret) { - kfree(bl); - goto err; - } - } - /* can't add buffers via this command for a mapped buffer 
ring */ - if (bl->buf_nr_pages) { - ret = -EINVAL; - goto err; - } - - ret = io_add_buffers(ctx, p, bl); -err: - if (ret < 0) - req_set_fail(req); - /* complete before unlock, IOPOLL may need the lock */ - io_req_set_res(req, ret, 0); - __io_req_complete(req, issue_flags); - io_ring_submit_unlock(ctx, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { @@ -5218,8 +4732,7 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static struct page **io_pin_pages(unsigned long ubuf, unsigned long len, - int *npages) +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct vm_area_struct **vmas = NULL; @@ -5543,33 +5056,6 @@ static int io_eventfd_unregister(struct io_ring_ctx *ctx) return -ENXIO; } -static void io_destroy_buffers(struct io_ring_ctx *ctx) -{ - struct io_buffer_list *bl; - unsigned long index; - int i; - - for (i = 0; i < BGID_ARRAY; i++) { - if (!ctx->io_bl) - break; - __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); - } - - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - __io_remove_buffers(ctx, bl, -1U); - kfree(bl); - } - - while (!list_empty(&ctx->io_buffers_pages)) { - struct page *page; - - page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); - list_del_init(&page->lru); - __free_page(page); - } -} - static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; @@ -6953,89 +6439,6 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, return ret; } -static int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_ring *br; - struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; - struct page **pages; - int nr_pages; - - if (copy_from_user(&reg, arg, sizeof(reg))) - return 
-EFAULT; - - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - if (!is_power_of_2(reg.ring_entries)) - return -EINVAL; - - /* cannot disambiguate full vs empty due to head/tail size */ - if (reg.ring_entries >= 65536) - return -EINVAL; - - if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { - int ret = io_init_bl_list(ctx); - if (ret) - return ret; - } - - bl = io_buffer_get_list(ctx, reg.bgid); - if (bl) { - /* if mapped buffer ring OR classic exists, don't allow */ - if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) - return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; - } - - pages = io_pin_pages(reg.ring_addr, - struct_size(br, bufs, reg.ring_entries), - &nr_pages); - if (IS_ERR(pages)) { - kfree(free_bl); - return PTR_ERR(pages); - } - - br = page_address(pages[0]); - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->nr_entries = reg.ring_entries; - bl->buf_ring = br; - bl->mask = reg.ring_entries - 1; - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; -} - -static int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) -{ - struct io_uring_buf_reg reg; - struct io_buffer_list *bl; - - if (copy_from_user(&reg, arg, sizeof(reg))) - return -EFAULT; - if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - - bl = io_buffer_get_list(ctx, reg.bgid); - if (!bl) - return -ENOENT; - if (!bl->buf_nr_pages) - return -EINVAL; - - __io_remove_buffers(ctx, bl, -1U); - if (bl->bgid >= BGID_ARRAY) { - xa_erase(&ctx->io_bl_xa, bl->bgid); - kfree(bl); - } - return 0; -} - static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -7701,7 +7104,6 @@ static int __init io_uring_init(void) /* ->buf_index is u16 */ BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - 
BUILD_BUG_ON(BGID_ARRAY * sizeof(struct io_buffer_list) > PAGE_SIZE); BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index a78e3c5ab109b8..172defdcfdbe40 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -99,42 +99,8 @@ void __io_req_complete_post(struct io_kiocb *req); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); -void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags); -unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); -static inline bool io_do_buffer_select(struct io_kiocb *req) -{ - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return false; - return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); -} - -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); -static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) -{ - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - - /* - * READV uses fields in `struct io_rw` (len/addr) to stash the selected - * buffer data. However if that buffer is recycled the original request - * data stored in addr is lost. Therefore forbid recycling for now. 
- */ - if (req->opcode == IORING_OP_READV) - return; - - __io_kbuf_recycle(req, issue_flags); -} +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c new file mode 100644 index 00000000000000..bc58890d932b2f --- /dev/null +++ b/io_uring/kbuf.c @@ -0,0 +1,524 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "opdef.h" +#include "kbuf.h" + +#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) + +#define BGID_ARRAY 64 + +struct io_provide_buf { + struct file *file; + __u64 addr; + __u32 len; + __u32 bgid; + __u16 nbufs; + __u16 bid; +}; + +static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, + unsigned int bgid) +{ + if (ctx->io_bl && bgid < BGID_ARRAY) + return &ctx->io_bl[bgid]; + + return xa_load(&ctx->io_bl_xa, bgid); +} + +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + struct io_buffer *buf; + + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. 
+ */ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) { + if (req->flags & REQ_F_PARTIAL_IO) { + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + } + return; + } + + io_ring_submit_lock(ctx, issue_flags); + + buf = req->kbuf; + bl = io_buffer_get_list(ctx, buf->bgid); + list_add(&buf->list, &bl->buf_list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + req->buf_index = buf->bgid; + + io_ring_submit_unlock(ctx, issue_flags); +} + +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) +{ + bl->bgid = bgid; + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); +} + +static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl) +{ + if (!list_empty(&bl->buf_list)) { + struct io_buffer *kbuf; + + kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); + list_del(&kbuf->list); + if (*len > kbuf->len) + *len = kbuf->len; + req->flags |= REQ_F_BUFFER_SELECTED; + req->kbuf = kbuf; + req->buf_index = kbuf->bid; + return u64_to_user_ptr(kbuf->addr); + } + return NULL; +} + +static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + unsigned int issue_flags) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct io_uring_buf *buf; + __u16 head = bl->head; + + if (unlikely(smp_load_acquire(&br->tail) == head)) + return NULL; + + head &= bl->mask; + if (head < IO_BUFFER_LIST_BUF_PER_PAGE) { + buf = &br->bufs[head]; + } else { + int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); + int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; + buf = page_address(bl->buf_pages[index]); + buf += off; + } + if (*len > buf->len) + *len = buf->len; + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + req->buf_index = buf->bid; + + if (issue_flags & IO_URING_F_UNLOCKED || 
!file_can_poll(req->file)) { + /* + * If we came in unlocked, we have no choice but to consume the + * buffer here. This does mean it'll be pinned until the IO + * completes. But coming in unlocked means we're in io-wq + * context, hence there should be no further retry. For the + * locked case, the caller must ensure to call the commit when + * the transfer completes (or if we get -EAGAIN and must poll + * or retry). + */ + req->buf_list = NULL; + bl->head++; + } + return u64_to_user_ptr(buf->addr); +} + +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + void __user *ret = NULL; + + io_ring_submit_lock(req->ctx, issue_flags); + + bl = io_buffer_get_list(ctx, req->buf_index); + if (likely(bl)) { + if (bl->buf_nr_pages) + ret = io_ring_buffer_select(req, len, bl, issue_flags); + else + ret = io_provided_buffer_select(req, len, bl); + } + io_ring_submit_unlock(req->ctx, issue_flags); + return ret; +} + +static __cold int io_init_bl_list(struct io_ring_ctx *ctx) +{ + int i; + + ctx->io_bl = kcalloc(BGID_ARRAY, sizeof(struct io_buffer_list), + GFP_KERNEL); + if (!ctx->io_bl) + return -ENOMEM; + + for (i = 0; i < BGID_ARRAY; i++) { + INIT_LIST_HEAD(&ctx->io_bl[i].buf_list); + ctx->io_bl[i].bgid = i; + } + + return 0; +} + +static int __io_remove_buffers(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned nbufs) +{ + unsigned i = 0; + + /* shouldn't happen */ + if (!nbufs) + return 0; + + if (bl->buf_nr_pages) { + int j; + + i = bl->buf_ring->tail - bl->head; + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + kvfree(bl->buf_pages); + bl->buf_pages = NULL; + bl->buf_nr_pages = 0; + /* make sure it's seen as empty */ + INIT_LIST_HEAD(&bl->buf_list); + return i; + } + + /* the head kbuf is the list itself */ + while (!list_empty(&bl->buf_list)) { + struct io_buffer *nxt; + + nxt = list_first_entry(&bl->buf_list, struct 
io_buffer, list); + list_del(&nxt->list); + if (++i == nbufs) + return i; + cond_resched(); + } + i++; + + return i; +} + +void io_destroy_buffers(struct io_ring_ctx *ctx) +{ + struct io_buffer_list *bl; + unsigned long index; + int i; + + for (i = 0; i < BGID_ARRAY; i++) { + if (!ctx->io_bl) + break; + __io_remove_buffers(ctx, &ctx->io_bl[i], -1U); + } + + xa_for_each(&ctx->io_bl_xa, index, bl) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + __io_remove_buffers(ctx, bl, -1U); + kfree(bl); + } + + while (!list_empty(&ctx->io_buffers_pages)) { + struct page *page; + + page = list_first_entry(&ctx->io_buffers_pages, struct page, lru); + list_del_init(&page->lru); + __free_page(page); + } +} + +int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req); + u64 tmp; + + if (sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + sqe->splice_fd_in) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -EINVAL; + + memset(p, 0, sizeof(*p)); + p->nbufs = tmp; + p->bgid = READ_ONCE(sqe->buf_group); + return 0; +} + +int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + + ret = -ENOENT; + bl = io_buffer_get_list(ctx, p->bgid); + if (bl) { + ret = -EINVAL; + /* can't use provide/remove buffers command on mapped buffers */ + if (!bl->buf_nr_pages) + ret = __io_remove_buffers(ctx, bl, p->nbufs); + } + if (ret < 0) + req_set_fail(req); + + /* complete before unlock, IOPOLL may need the lock */ + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); + io_ring_submit_unlock(ctx, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned long size, tmp_check; + struct io_provide_buf 
*p = io_kiocb_to_cmd(req); + u64 tmp; + + if (sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + tmp = READ_ONCE(sqe->fd); + if (!tmp || tmp > USHRT_MAX) + return -E2BIG; + p->nbufs = tmp; + p->addr = READ_ONCE(sqe->addr); + p->len = READ_ONCE(sqe->len); + + if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, + &size)) + return -EOVERFLOW; + if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) + return -EOVERFLOW; + + size = (unsigned long)p->len * p->nbufs; + if (!access_ok(u64_to_user_ptr(p->addr), size)) + return -EFAULT; + + p->bgid = READ_ONCE(sqe->buf_group); + tmp = READ_ONCE(sqe->off); + if (tmp > USHRT_MAX) + return -E2BIG; + p->bid = tmp; + return 0; +} + +static int io_refill_buffer_cache(struct io_ring_ctx *ctx) +{ + struct io_buffer *buf; + struct page *page; + int bufs_in_page; + + /* + * Completions that don't happen inline (eg not under uring_lock) will + * add to ->io_buffers_comp. If we don't have any free buffers, check + * the completion list and splice those entries first. + */ + if (!list_empty_careful(&ctx->io_buffers_comp)) { + spin_lock(&ctx->completion_lock); + if (!list_empty(&ctx->io_buffers_comp)) { + list_splice_init(&ctx->io_buffers_comp, + &ctx->io_buffers_cache); + spin_unlock(&ctx->completion_lock); + return 0; + } + spin_unlock(&ctx->completion_lock); + } + + /* + * No free buffers and no completion entries either. Allocate a new + * page worth of buffer entries and add those to our freelist. 
+ */ + page = alloc_page(GFP_KERNEL_ACCOUNT); + if (!page) + return -ENOMEM; + + list_add(&page->lru, &ctx->io_buffers_pages); + + buf = page_address(page); + bufs_in_page = PAGE_SIZE / sizeof(*buf); + while (bufs_in_page) { + list_add_tail(&buf->list, &ctx->io_buffers_cache); + buf++; + bufs_in_page--; + } + + return 0; +} + +static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, + struct io_buffer_list *bl) +{ + struct io_buffer *buf; + u64 addr = pbuf->addr; + int i, bid = pbuf->bid; + + for (i = 0; i < pbuf->nbufs; i++) { + if (list_empty(&ctx->io_buffers_cache) && + io_refill_buffer_cache(ctx)) + break; + buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, + list); + list_move_tail(&buf->list, &bl->buf_list); + buf->addr = addr; + buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); + buf->bid = bid; + buf->bgid = pbuf->bgid; + addr += pbuf->len; + bid++; + cond_resched(); + } + + return i ? 0 : -ENOMEM; +} + +int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + + if (unlikely(p->bgid < BGID_ARRAY && !ctx->io_bl)) { + ret = io_init_bl_list(ctx); + if (ret) + goto err; + } + + bl = io_buffer_get_list(ctx, p->bgid); + if (unlikely(!bl)) { + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) { + ret = -ENOMEM; + goto err; + } + INIT_LIST_HEAD(&bl->buf_list); + ret = io_buffer_add_list(ctx, bl, p->bgid); + if (ret) { + kfree(bl); + goto err; + } + } + /* can't add buffers via this command for a mapped buffer ring */ + if (bl->buf_nr_pages) { + ret = -EINVAL; + goto err; + } + + ret = io_add_buffers(ctx, p, bl); +err: + if (ret < 0) + req_set_fail(req); + /* complete before unlock, IOPOLL may need the lock */ + io_req_set_res(req, ret, 0); + __io_req_complete(req, issue_flags); + io_ring_submit_unlock(ctx, issue_flags); + return 
IOU_ISSUE_SKIP_COMPLETE; +} + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_ring *br; + struct io_uring_buf_reg reg; + struct io_buffer_list *bl, *free_bl = NULL; + struct page **pages; + int nr_pages; + + if (copy_from_user(&reg, arg, sizeof(reg))) + return -EFAULT; + + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + if (!reg.ring_addr) + return -EFAULT; + if (reg.ring_addr & ~PAGE_MASK) + return -EINVAL; + if (!is_power_of_2(reg.ring_entries)) + return -EINVAL; + + /* cannot disambiguate full vs empty due to head/tail size */ + if (reg.ring_entries >= 65536) + return -EINVAL; + + if (unlikely(reg.bgid < BGID_ARRAY && !ctx->io_bl)) { + int ret = io_init_bl_list(ctx); + if (ret) + return ret; + } + + bl = io_buffer_get_list(ctx, reg.bgid); + if (bl) { + /* if mapped buffer ring OR classic exists, don't allow */ + if (bl->buf_nr_pages || !list_empty(&bl->buf_list)) + return -EEXIST; + } else { + free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return -ENOMEM; + } + + pages = io_pin_pages(reg.ring_addr, + struct_size(br, bufs, reg.ring_entries), + &nr_pages); + if (IS_ERR(pages)) { + kfree(free_bl); + return PTR_ERR(pages); + } + + br = page_address(pages[0]); + bl->buf_pages = pages; + bl->buf_nr_pages = nr_pages; + bl->nr_entries = reg.ring_entries; + bl->buf_ring = br; + bl->mask = reg.ring_entries - 1; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +} + +int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_reg reg; + struct io_buffer_list *bl; + + if (copy_from_user(&reg, arg, sizeof(reg))) + return -EFAULT; + if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, reg.bgid); + if (!bl) + return -ENOENT; + if (!bl->buf_nr_pages) + return -EINVAL; + + __io_remove_buffers(ctx, bl, -1U); + if (bl->bgid >= BGID_ARRAY) { + xa_erase(&ctx->io_bl_xa, bl->bgid); + kfree(bl); + } + return 
0; +} diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h new file mode 100644 index 00000000000000..9da3a933ef40e1 --- /dev/null +++ b/io_uring/kbuf.h @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_KBUF_H +#define IOU_KBUF_H + +#include + +struct io_buffer_list { + /* + * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, + * then these are classic provided buffers and ->buf_list is used. + */ + union { + struct list_head buf_list; + struct { + struct page **buf_pages; + struct io_uring_buf_ring *buf_ring; + }; + }; + __u16 bgid; + + /* below is for ring provided buffers */ + __u16 buf_nr_pages; + __u16 nr_entries; + __u16 head; + __u16 mask; +}; + +struct io_buffer { + struct list_head list; + __u64 addr; + __u32 len; + __u16 bid; + __u16 bgid; +}; + +void __user *io_buffer_select(struct io_kiocb *req, size_t *len, + unsigned int issue_flags); +void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); +void io_destroy_buffers(struct io_ring_ctx *ctx); + +int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags); + +int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); + +int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); + +static inline bool io_do_buffer_select(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return false; + return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); +} + +static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +{ + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return; + /* + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. 
For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. + */ + if ((req->flags & REQ_F_BUFFER_SELECTED) && + (req->flags & REQ_F_PARTIAL_IO)) + return; + + /* + * READV uses fields in `struct io_rw` (len/addr) to stash the selected + * buffer data. However if that buffer is recycled the original request + * data stored in addr is lost. Therefore forbid recycling for now. + */ + if (req->opcode == IORING_OP_READV) + return; + + __io_kbuf_recycle(req, issue_flags); +} + +static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) +{ + if (req->flags & REQ_F_BUFFER_RING) { + if (req->buf_list) + req->buf_list->head++; + req->flags &= ~REQ_F_BUFFER_RING; + } else { + list_add(&req->kbuf->list, list); + req->flags &= ~REQ_F_BUFFER_SELECTED; + } + + return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); +} + +static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) +{ + lockdep_assert_held(&req->ctx->completion_lock); + + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return 0; + return __io_put_kbuf(req, &req->ctx->io_buffers_comp); +} + +static inline unsigned int io_put_kbuf(struct io_kiocb *req, + unsigned issue_flags) +{ + unsigned int cflags; + + if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) + return 0; + + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. 
+ */ + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + lockdep_assert_held(&req->ctx->uring_lock); + + cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); + } + + return cflags; +} +#endif diff --git a/io_uring/net.c b/io_uring/net.c index 2434548d0c1fb3..fe1fe920b9291f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -12,6 +12,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "kbuf.h" #include "net.h" #if defined(CONFIG_NET) diff --git a/io_uring/poll.c b/io_uring/poll.c index c3e4fcb0a7ba7b..b80f7fa261232d 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -17,6 +17,7 @@ #include "io_uring.h" #include "refs.h" #include "opdef.h" +#include "kbuf.h" #include "poll.h" struct io_poll_update { From 5df88c7728811fe56707e53399acfb5d38d4a572 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 07:12:45 -0600 Subject: [PATCH 0978/1250] io_uring: move rsrc related data, core, and commands Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 1644 +++--------------------------------------- io_uring/io_uring.h | 6 +- io_uring/openclose.c | 1 + io_uring/rsrc.c | 1320 +++++++++++++++++++++++++++++++++ io_uring/rsrc.h | 155 ++++ 6 files changed, 1595 insertions(+), 1533 deletions(-) create mode 100644 io_uring/rsrc.c create mode 100644 io_uring/rsrc.h diff --git a/io_uring/Makefile b/io_uring/Makefile index b85418b64e8241..360a83039c2a4f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o + cancel.o kbuf.o rsrc.o 
obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e395167999edfb..0c47c919887f52 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -68,7 +68,6 @@ #include #include #include -#include #include #include #include @@ -94,6 +93,7 @@ #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" +#include "rsrc.h" #include "xattr.h" #include "nop.h" @@ -114,17 +114,9 @@ #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) -/* only define max */ -#define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) - -#define IORING_MAX_REG_BUFFERS (1U << 14) - #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) @@ -140,38 +132,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) -struct io_rsrc_put { - struct list_head list; - u64 tag; - union { - void *rsrc; - struct file *file; - struct io_mapped_ubuf *buf; - }; -}; - -struct io_rsrc_node { - struct percpu_ref refs; - struct list_head node; - struct list_head rsrc_list; - struct io_rsrc_data *rsrc_data; - struct llist_node llist; - bool done; -}; - -typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); - -struct io_rsrc_data { - struct io_ring_ctx *ctx; - - u64 **tags; - unsigned int nr; - rsrc_put_fn *do_put; - atomic_t refs; - struct completion done; - bool quiesce; -}; - #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 @@ -188,13 +148,6 @@ struct io_rw { rwf_t flags; }; -struct io_rsrc_update { - struct file *file; - u64 arg; - u32 nr_args; - u32 offset; -}; - struct io_rw_state { struct iov_iter iter; struct iov_iter_state iter_state; @@ -208,11 +161,6 @@ struct io_async_rw { struct 
wait_page_queue wpq; }; -enum { - IORING_RSRC_FILE = 0, - IORING_RSRC_BUFFER = 1, -}; - enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, @@ -233,12 +181,8 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all); static void io_dismantle_req(struct io_kiocb *req); -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - unsigned nr_args); static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); -static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -268,22 +212,6 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); -#if defined(CONFIG_UNIX) -static inline bool io_file_need_scm(struct file *filp) -{ -#if defined(IO_URING_SCM_ALL) - return true; -#else - return !!unix_get_socket(filp); -#endif -} -#else -static inline bool io_file_need_scm(struct file *filp) -{ - return false; -} -#endif - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { @@ -298,67 +226,6 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) __io_submit_flush_completions(ctx); } -#define IO_RSRC_REF_BATCH 100 - -static void io_rsrc_put_node(struct io_rsrc_node *node, int nr) -{ - percpu_ref_put_many(&node->refs, nr); -} - -static inline void io_req_put_rsrc_locked(struct io_kiocb *req, - struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_rsrc_node *node = req->rsrc_node; - - if (node) { - if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; - else - io_rsrc_put_node(node, 1); - } -} - -static inline void io_req_put_rsrc(struct io_kiocb *req) -{ - if (req->rsrc_node) - io_rsrc_put_node(req->rsrc_node, 1); -} - -static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - if 
(ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; - } -} - -static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); -} - -static inline void io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned int issue_flags) -{ - if (!req->rsrc_node) { - req->rsrc_node = ctx->rsrc_node; - - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - lockdep_assert_held(&ctx->uring_lock); - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); - } else { - percpu_ref_get(&req->rsrc_node->refs); - } - } -} - static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -2870,92 +2737,6 @@ static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, return -EOPNOTSUPP; } -static int io_files_update_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req); - - if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) - return -EINVAL; - if (sqe->rw_flags || sqe->splice_fd_in) - return -EINVAL; - - up->offset = READ_ONCE(sqe->off); - up->nr_args = READ_ONCE(sqe->len); - if (!up->nr_args) - return -EINVAL; - up->arg = READ_ONCE(sqe->addr); - return 0; -} - -static int io_files_update_with_index_alloc(struct io_kiocb *req, - unsigned int issue_flags) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req); - __s32 __user *fds = u64_to_user_ptr(up->arg); - unsigned int done; - struct file *file; - int ret, fd; - - if (!req->ctx->file_data) - return -ENXIO; - - for (done = 0; done < up->nr_args; done++) { - if (copy_from_user(&fd, &fds[done], sizeof(fd))) { - ret = -EFAULT; - break; - } - - file = fget(fd); - if (!file) { - ret = -EBADF; - break; - } - ret = io_fixed_fd_install(req, issue_flags, file, - IORING_FILE_INDEX_ALLOC); - if 
(ret < 0) - break; - if (copy_to_user(&fds[done], &ret, sizeof(ret))) { - __io_close_fixed(req, issue_flags, ret); - ret = -EFAULT; - break; - } - } - - if (done) - return done; - return ret; -} - -static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rsrc_update *up = io_kiocb_to_cmd(req); - struct io_ring_ctx *ctx = req->ctx; - struct io_uring_rsrc_update2 up2; - int ret; - - up2.offset = up->offset; - up2.data = up->arg; - up2.nr = 0; - up2.tags = 0; - up2.resv = 0; - up2.resv2 = 0; - - if (up->offset == IORING_FILE_INDEX_ALLOC) { - ret = io_files_update_with_index_alloc(req, issue_flags); - } else { - io_ring_submit_lock(ctx, issue_flags); - ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, - &up2, up->nr_args); - io_ring_submit_unlock(ctx, issue_flags); - } - - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - static int io_req_prep_async(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -3696,7 +3477,7 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, return -1; } -static int io_run_task_work_sig(void) +int io_run_task_work_sig(void) { if (io_run_task_work()) return 1; @@ -3798,1265 +3579,164 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -static void io_free_page_table(void **table, size_t size) +int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) + __must_hold(&req->ctx->uring_lock) { - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + struct io_ring_ctx *ctx = req->ctx; + bool needs_switch = false; + struct io_fixed_file *file_slot; + int ret; - for (i = 0; i < nr_tables; i++) - kfree(table[i]); - kfree(table); -} + if (io_is_uring_fops(file)) + return -EBADF; + if (!ctx->file_data) + return -ENXIO; + if (slot_index >= ctx->nr_user_files) + return -EINVAL; -static __cold void **io_alloc_page_table(size_t size) -{ - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - size_t init_size = size; - void **table; + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); - if (!table) - return NULL; + if (file_slot->file_ptr) { + struct file *old_file; - for (i = 0; i < nr_tables; i++) { - unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto err; - table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); - if (!table[i]) { - io_free_page_table(table, init_size); - return NULL; - } - size -= this_size; + old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + ctx->rsrc_node, old_file); + if (ret) + goto err; + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, slot_index); + needs_switch = true; } - return table; -} -static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) -{ - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); + ret = io_scm_file_account(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); + } +err: + if 
(needs_switch) + io_rsrc_node_switch(ctx, ctx->file_data); + if (ret) + fput(file); + return ret; } -static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) +static void io_mem_free(void *ptr) { - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - unsigned long flags; - bool first_add = false; - unsigned long delay = HZ; - - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); - node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; + struct page *page; - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (!node->done) - break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); + if (!ptr) + return; - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); + page = virt_to_head_page(ptr); + if (put_page_testzero(page)) + free_compound_page(page); } -static struct io_rsrc_node *io_rsrc_node_alloc(void) +static void *io_mem_alloc(size_t size) { - struct io_rsrc_node *ref_node; - - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; + return (void *) __get_free_pages(gfp, get_order(size)); } -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill) - __must_hold(&ctx->uring_lock) +static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { - WARN_ON_ONCE(!ctx->rsrc_backup_node); 
- WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + struct io_rings *rings; + size_t off, sq_array_size; - io_rsrc_refs_drop(ctx); + off = struct_size(rings, cqes, cq_entries); + if (off == SIZE_MAX) + return SIZE_MAX; + if (ctx->flags & IORING_SETUP_CQE32) { + if (check_shl_overflow(off, 1, &off)) + return SIZE_MAX; + } - if (data_to_kill) { - struct io_rsrc_node *rsrc_node = ctx->rsrc_node; +#ifdef CONFIG_SMP + off = ALIGN(off, SMP_CACHE_BYTES); + if (off == 0) + return SIZE_MAX; +#endif - rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); - list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); + if (sq_offset) + *sq_offset = off; - atomic_inc(&data_to_kill->refs); - percpu_ref_kill(&rsrc_node->refs); - ctx->rsrc_node = NULL; - } + sq_array_size = array_size(sizeof(u32), sq_entries); + if (sq_array_size == SIZE_MAX) + return SIZE_MAX; - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; - } -} + if (check_add_overflow(off, sq_array_size, &off)) + return SIZE_MAX; -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) -{ - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(); - return ctx->rsrc_backup_node ? 
0 : -ENOMEM; + return off; } -static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx) +static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int eventfd_async) { - int ret; - - /* As we may drop ->uring_lock, other task may have started quiesce */ - if (data->quiesce) - return -ENXIO; - - data->quiesce = true; - do { - ret = io_rsrc_node_switch_start(ctx); - if (ret) - break; - io_rsrc_node_switch(ctx, data); + struct io_ev_fd *ev_fd; + __s32 __user *fds = arg; + int fd; - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) - break; - mutex_unlock(&ctx->uring_lock); - flush_delayed_work(&ctx->rsrc_put_work); - ret = wait_for_completion_interruptible(&data->done); - if (!ret) { - mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) > 0) { - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } else { - break; - } - } + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) + return -EBUSY; - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); - reinit_completion(&data->done); + if (copy_from_user(&fd, fds, sizeof(*fds))) + return -EFAULT; - ret = io_run_task_work_sig(); - mutex_lock(&ctx->uring_lock); - } while (ret >= 0); - data->quiesce = false; + ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); + if (!ev_fd) + return -ENOMEM; - return ret; + ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); + if (IS_ERR(ev_fd->cq_ev_fd)) { + int ret = PTR_ERR(ev_fd->cq_ev_fd); + kfree(ev_fd); + return ret; + } + ev_fd->eventfd_async = eventfd_async; + ctx->has_evfd = true; + rcu_assign_pointer(ctx->io_ev_fd, ev_fd); + return 0; } -static u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +static void io_eventfd_put(struct rcu_head *rcu) { - unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; 
- unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; + struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - return &data->tags[table_idx][off]; + eventfd_ctx_put(ev_fd->cq_ev_fd); + kfree(ev_fd); } -static void io_rsrc_data_free(struct io_rsrc_data *data) +static int io_eventfd_unregister(struct io_ring_ctx *ctx) { - size_t size = data->nr * sizeof(data->tags[0][0]); + struct io_ev_fd *ev_fd; + + ev_fd = rcu_dereference_protected(ctx->io_ev_fd, + lockdep_is_held(&ctx->uring_lock)); + if (ev_fd) { + ctx->has_evfd = false; + rcu_assign_pointer(ctx->io_ev_fd, NULL); + call_rcu(&ev_fd->rcu, io_eventfd_put); + return 0; + } - if (data->tags) - io_free_page_table((void **)data->tags, size); - kfree(data); + return -ENXIO; } -static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, - u64 __user *utags, unsigned nr, - struct io_rsrc_data **pdata) -{ - struct io_rsrc_data *data; - int ret = -ENOMEM; - unsigned i; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); - if (!data->tags) { - kfree(data); - return -ENOMEM; - } - - data->nr = nr; - data->ctx = ctx; - data->do_put = do_put; - if (utags) { - ret = -EFAULT; - for (i = 0; i < nr; i++) { - u64 *tag_slot = io_get_tag_slot(data, i); - - if (copy_from_user(tag_slot, &utags[i], - sizeof(*tag_slot))) - goto fail; - } - } - - atomic_set(&data->refs, 1); - init_completion(&data->done); - *pdata = data; - return 0; -fail: - io_rsrc_data_free(data); - return ret; -} - -static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ -#if !defined(IO_URING_SCM_ALL) - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(&ctx->file_table, i); - - if (!file) - continue; - if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) - continue; - io_file_bitmap_clear(&ctx->file_table, i); - fput(file); - } -#endif - -#if defined(CONFIG_UNIX) 
- if (ctx->ring_sock) { - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) - kfree_skb(skb); - } -#endif - io_free_file_tables(&ctx->file_table); - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; -} - -static int io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ - unsigned nr = ctx->nr_user_files; - int ret; - - if (!ctx->file_data) - return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. - */ - ctx->nr_user_files = 0; - ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); - ctx->nr_user_files = nr; - if (!ret) - __io_sqe_files_unregister(ctx); - return ret; -} - -/* - * Ensure the UNIX gc is aware of our file set, so we are certain that - * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. We account only files that can hold other - * files because otherwise they can't form a loop and so are not interesting - * for GC. - */ -static int io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sk = ctx->ring_sock->sk; - struct sk_buff_head *head = &sk->sk_receive_queue; - struct scm_fp_list *fpl; - struct sk_buff *skb; - - if (likely(!io_file_need_scm(file))) - return 0; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. 
- */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) - __skb_unlink(skb, head); - else - skb = NULL; - spin_unlock_irq(&head->lock); - - if (!skb) { - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } - - fpl->user = get_uid(current_user()); - fpl->max = SCM_MAX_FD; - fpl->count = 0; - - UNIXCB(skb).fp = fpl; - skb->sk = sk; - skb->destructor = unix_destruct_scm; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - } - - fpl = UNIXCB(skb).fp; - fpl->fp[fpl->count++] = get_file(file); - unix_inflight(fpl->user, file); - skb_queue_head(head, skb); - fput(file); -#endif - return 0; -} - -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - struct file *file = prsrc->file; -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - if (!io_file_need_scm(file)) { - fput(file); - return; - } - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. 
- */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#else - fput(file); -#endif -} - -static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) -{ - struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; - struct io_rsrc_put *prsrc, *tmp; - - list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { - list_del(&prsrc->list); - - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); - - spin_lock(&ctx->completion_lock); - io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); - } - - rsrc_data->do_put(ctx, prsrc); - kfree(prsrc); - } - - io_rsrc_node_destroy(ref_node); - if (atomic_dec_and_test(&rsrc_data->refs)) - complete(&rsrc_data->done); -} - -static void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - 
__io_rsrc_put_work(ref_node); - node = next; - } -} - -static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args, u64 __user *tags) -{ - __s32 __user *fds = (__s32 __user *) arg; - struct file *file; - int fd, ret; - unsigned i; - - if (ctx->file_data) - return -EBUSY; - if (!nr_args) - return -EINVAL; - if (nr_args > IORING_MAX_FIXED_FILES) - return -EMFILE; - if (nr_args > rlimit(RLIMIT_NOFILE)) - return -EMFILE; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, - &ctx->file_data); - if (ret) - return ret; - - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - return -ENOMEM; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct io_fixed_file *file_slot; - - if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { - ret = -EFAULT; - goto fail; - } - /* allow sparse sets */ - if (!fds || fd == -1) { - ret = -EINVAL; - if (unlikely(*io_get_tag_slot(ctx->file_data, i))) - goto fail; - continue; - } - - file = fget(fd); - ret = -EBADF; - if (unlikely(!file)) - goto fail; - - /* - * Don't allow io_uring instances to be registered. If UNIX - * isn't enabled, then this causes a reference cycle and this - * instance can never get freed. If UNIX is enabled we'll - * handle it just fine, but there's still no point in allowing - * a ring fd as it doesn't support regular read/write anyway. 
- */ - if (io_is_uring_fops(file)) { - fput(file); - goto fail; - } - ret = io_scm_file_account(ctx, file); - if (ret) { - fput(file); - goto fail; - } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - - io_rsrc_node_switch(ctx, NULL); - return 0; -fail: - __io_sqe_files_unregister(ctx); - return ret; -} - -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) -{ - u64 *tag_slot = io_get_tag_slot(data, idx); - struct io_rsrc_put *prsrc; - - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; - - prsrc->tag = *tag_slot; - *tag_slot = 0; - prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); - return 0; -} - -int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) - __must_hold(&req->ctx->uring_lock) -{ - struct io_ring_ctx *ctx = req->ctx; - bool needs_switch = false; - struct io_fixed_file *file_slot; - int ret; - - if (io_is_uring_fops(file)) - return -EBADF; - if (!ctx->file_data) - return -ENXIO; - if (slot_index >= ctx->nr_user_files) - return -EINVAL; - - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - - if (file_slot->file_ptr) { - struct file *old_file; - - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); - if (ret) - goto err; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; - } - - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, 
ctx->file_data); - if (ret) - fput(file); - return ret; -} - -static int __io_sqe_files_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - __s32 __user *fds = u64_to_user_ptr(up->data); - struct io_rsrc_data *data = ctx->file_data; - struct io_fixed_file *file_slot; - struct file *file; - int fd, i, err = 0; - unsigned int done; - bool needs_switch = false; - - if (!ctx->file_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_files) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - u64 tag = 0; - - if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || - copy_from_user(&fd, &fds[done], sizeof(fd))) { - err = -EFAULT; - break; - } - if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { - err = -EINVAL; - break; - } - if (fd == IORING_REGISTER_FILES_SKIP) - continue; - - i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); - - if (file_slot->file_ptr) { - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, file); - if (err) - break; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, i); - needs_switch = true; - } - if (fd != -1) { - file = fget(fd); - if (!file) { - err = -EBADF; - break; - } - /* - * Don't allow io_uring instances to be registered. If - * UNIX isn't enabled, then this causes a reference - * cycle and this instance can never get freed. If UNIX - * is enabled we'll handle it just fine, but there's - * still no point in allowing a ring fd as it doesn't - * support regular read/write anyway. 
- */ - if (io_is_uring_fops(file)) { - fput(file); - err = -EBADF; - break; - } - err = io_scm_file_account(ctx, file); - if (err) { - fput(file); - break; - } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, i); - } - } - - if (needs_switch) - io_rsrc_node_switch(ctx, data); - return done ? done : err; -} - -static inline void __io_unaccount_mem(struct user_struct *user, - unsigned long nr_pages) -{ - atomic_long_sub(nr_pages, &user->locked_vm); -} - -static inline int __io_account_mem(struct user_struct *user, - unsigned long nr_pages) -{ - unsigned long page_limit, cur_pages, new_pages; - - /* Don't allow more pages than we can safely lock */ - page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - do { - cur_pages = atomic_long_read(&user->locked_vm); - new_pages = cur_pages + nr_pages; - if (new_pages > page_limit) - return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); - - return 0; -} - -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - if (ctx->user) - __io_unaccount_mem(ctx->user, nr_pages); - - if (ctx->mm_account) - atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); -} - -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) -{ - int ret; - - if (ctx->user) { - ret = __io_account_mem(ctx->user, nr_pages); - if (ret) - return ret; - } - - if (ctx->mm_account) - atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); - - return 0; -} - -static void io_mem_free(void *ptr) -{ - struct page *page; - - if (!ptr) - return; - - page = virt_to_head_page(ptr); - if (put_page_testzero(page)) - free_compound_page(page); -} - -static void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - - return (void *) __get_free_pages(gfp, get_order(size)); -} - -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int 
cq_entries, size_t *sq_offset) -{ - struct io_rings *rings; - size_t off, sq_array_size; - - off = struct_size(rings, cqes, cq_entries); - if (off == SIZE_MAX) - return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { - if (check_shl_overflow(off, 1, &off)) - return SIZE_MAX; - } - -#ifdef CONFIG_SMP - off = ALIGN(off, SMP_CACHE_BYTES); - if (off == 0) - return SIZE_MAX; -#endif - - if (sq_offset) - *sq_offset = off; - - sq_array_size = array_size(sizeof(u32), sq_entries); - if (sq_array_size == SIZE_MAX) - return SIZE_MAX; - - if (check_add_overflow(off, sq_array_size, &off)) - return SIZE_MAX; - - return off; -} - -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) -{ - struct io_mapped_ubuf *imu = *slot; - unsigned int i; - - if (imu != ctx->dummy_ubuf) { - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); - } - *slot = NULL; -} - -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - io_buffer_unmap(ctx, &prsrc->buf); - prsrc->buf = NULL; -} - -static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned int i; - - for (i = 0; i < ctx->nr_user_bufs; i++) - io_buffer_unmap(ctx, &ctx->user_bufs[i]); - kfree(ctx->user_bufs); - io_rsrc_data_free(ctx->buf_data); - ctx->user_bufs = NULL; - ctx->buf_data = NULL; - ctx->nr_user_bufs = 0; -} - -static int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned nr = ctx->nr_user_bufs; - int ret; - - if (!ctx->buf_data) - return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. 
- */ - ctx->nr_user_bufs = 0; - ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); - ctx->nr_user_bufs = nr; - if (!ret) - __io_sqe_buffers_unregister(ctx); - return ret; -} - -static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, - void __user *arg, unsigned index) -{ - struct iovec __user *src; - -#ifdef CONFIG_COMPAT - if (ctx->compat) { - struct compat_iovec __user *ciovs; - struct compat_iovec ciov; - - ciovs = (struct compat_iovec __user *) arg; - if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) - return -EFAULT; - - dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); - dst->iov_len = ciov.iov_len; - return 0; - } -#endif - src = (struct iovec __user *) arg; - if (copy_from_user(dst, &src[index], sizeof(*dst))) - return -EFAULT; - return 0; -} - -/* - * Not super efficient, but this is just a registration time. And we do cache - * the last compound head, so generally we'll only do a full search if we don't - * match that one. - * - * We check if the given compound head page has already been accounted, to - * avoid double accounting it. This allows us to account the full size of the - * page, not just the constituent pages of a huge page. 
- */ -static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct page *hpage) -{ - int i, j; - - /* check current page array */ - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) - continue; - if (compound_head(pages[i]) == hpage) - return true; - } - - /* check previously registered pages */ - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = ctx->user_bufs[i]; - - for (j = 0; j < imu->nr_bvecs; j++) { - if (!PageCompound(imu->bvec[j].bv_page)) - continue; - if (compound_head(imu->bvec[j].bv_page) == hpage) - return true; - } - } - - return false; -} - -static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, - int nr_pages, struct io_mapped_ubuf *imu, - struct page **last_hpage) -{ - int i, ret; - - imu->acct_pages = 0; - for (i = 0; i < nr_pages; i++) { - if (!PageCompound(pages[i])) { - imu->acct_pages++; - } else { - struct page *hpage; - - hpage = compound_head(pages[i]); - if (hpage == *last_hpage) - continue; - *last_hpage = hpage; - if (headpage_already_acct(ctx, pages, i, hpage)) - continue; - imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; - } - } - - if (!imu->acct_pages) - return 0; - - ret = io_account_mem(ctx, imu->acct_pages); - if (ret) - imu->acct_pages = 0; - return ret; -} - -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) -{ - unsigned long start, end, nr_pages; - struct vm_area_struct **vmas = NULL; - struct page **pages = NULL; - int i, pret, ret = -ENOMEM; - - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; - - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - goto done; - - vmas = kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - goto done; - - ret = 0; - mmap_read_lock(current->mm); - pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages, vmas); - if (pret 
== nr_pages) { - /* don't support file backed memory */ - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma_is_shmem(vma)) - continue; - if (vma->vm_file && - !is_file_hugepages(vma->vm_file)) { - ret = -EOPNOTSUPP; - break; - } - } - *npages = nr_pages; - } else { - ret = pret < 0 ? pret : -EFAULT; - } - mmap_read_unlock(current->mm); - if (ret) { - /* - * if we did partial map, or found file backed vmas, - * release any pages we did get - */ - if (pret > 0) - unpin_user_pages(pages, pret); - goto done; - } - ret = 0; -done: - kvfree(vmas); - if (ret < 0) { - kvfree(pages); - pages = ERR_PTR(ret); - } - return pages; -} - -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) -{ - struct io_mapped_ubuf *imu = NULL; - struct page **pages = NULL; - unsigned long off; - size_t size; - int ret, nr_pages, i; - - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; - return 0; - } - - *pimu = NULL; - ret = -ENOMEM; - - pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, - &nr_pages); - if (IS_ERR(pages)) { - ret = PTR_ERR(pages); - pages = NULL; - goto done; - } - - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); - if (!imu) - goto done; - - ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); - goto done; - } - - off = (unsigned long) iov->iov_base & ~PAGE_MASK; - size = iov->iov_len; - for (i = 0; i < nr_pages; i++) { - size_t vec_len; - - vec_len = min_t(size_t, size, PAGE_SIZE - off); - imu->bvec[i].bv_page = pages[i]; - imu->bvec[i].bv_len = vec_len; - imu->bvec[i].bv_offset = off; - off = 0; - size -= vec_len; - } - /* store original address for later verification */ - imu->ubuf = (unsigned long) iov->iov_base; - imu->ubuf_end = imu->ubuf + iov->iov_len; - imu->nr_bvecs = nr_pages; - *pimu = imu; - ret = 0; -done: - if (ret) - kvfree(imu); - kvfree(pages); - 
return ret; -} - -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) -{ - ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); - return ctx->user_bufs ? 0 : -ENOMEM; -} - -static int io_buffer_validate(struct iovec *iov) -{ - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); - - /* - * Don't impose further limits on the size and buffer - * constraints here, we'll -EINVAL later when IO is - * submitted if they are wrong. - */ - if (!iov->iov_base) - return iov->iov_len ? -EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; -} - -static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int nr_args, u64 __user *tags) -{ - struct page *last_hpage = NULL; - struct io_rsrc_data *data; - int i, ret; - struct iovec iov; - - if (ctx->user_bufs) - return -EBUSY; - if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) - return -EINVAL; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - return ret; - ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); - if (ret) - return ret; - ret = io_buffers_map_alloc(ctx, nr_args); - if (ret) { - io_rsrc_data_free(data); - return ret; - } - - for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { - if (arg) { - ret = io_copy_iov(ctx, &iov, arg, i); - if (ret) - break; - ret = io_buffer_validate(&iov); - if (ret) - break; - } else { - memset(&iov, 0, sizeof(iov)); - } - - if (!iov.iov_base && *io_get_tag_slot(data, i)) { - ret = -EINVAL; - break; - } - - ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], - &last_hpage); - if (ret) - break; - } - - WARN_ON_ONCE(ctx->buf_data); - - ctx->buf_data = data; - if (ret) - __io_sqe_buffers_unregister(ctx); - else - io_rsrc_node_switch(ctx, NULL); - return ret; -} - -static int 
__io_sqe_buffers_update(struct io_ring_ctx *ctx, - struct io_uring_rsrc_update2 *up, - unsigned int nr_args) -{ - u64 __user *tags = u64_to_user_ptr(up->tags); - struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); - struct page *last_hpage = NULL; - bool needs_switch = false; - __u32 done; - int i, err; - - if (!ctx->buf_data) - return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_bufs) - return -EINVAL; - - for (done = 0; done < nr_args; done++) { - struct io_mapped_ubuf *imu; - int offset = up->offset + done; - u64 tag = 0; - - err = io_copy_iov(ctx, &iov, iovs, done); - if (err) - break; - if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { - err = -EFAULT; - break; - } - err = io_buffer_validate(&iov); - if (err) - break; - if (!iov.iov_base && tag) { - err = -EINVAL; - break; - } - err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); - if (err) - break; - - i = array_index_nospec(offset, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != ctx->dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->rsrc_node, ctx->user_bufs[i]); - if (unlikely(err)) { - io_buffer_unmap(ctx, &imu); - break; - } - ctx->user_bufs[i] = NULL; - needs_switch = true; - } - - ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, offset) = tag; - } - - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->buf_data); - return done ? 
done : err; -} - -static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, - unsigned int eventfd_async) -{ - struct io_ev_fd *ev_fd; - __s32 __user *fds = arg; - int fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) - return -EBUSY; - - if (copy_from_user(&fd, fds, sizeof(*fds))) - return -EFAULT; - - ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); - if (!ev_fd) - return -ENOMEM; - - ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); - if (IS_ERR(ev_fd->cq_ev_fd)) { - int ret = PTR_ERR(ev_fd->cq_ev_fd); - kfree(ev_fd); - return ret; - } - ev_fd->eventfd_async = eventfd_async; - ctx->has_evfd = true; - rcu_assign_pointer(ctx->io_ev_fd, ev_fd); - return 0; -} - -static void io_eventfd_put(struct rcu_head *rcu) -{ - struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); - - eventfd_ctx_put(ev_fd->cq_ev_fd); - kfree(ev_fd); -} - -static int io_eventfd_unregister(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = rcu_dereference_protected(ctx->io_ev_fd, - lockdep_is_held(&ctx->uring_lock)); - if (ev_fd) { - ctx->has_evfd = false; - rcu_assign_pointer(ctx->io_ev_fd, NULL); - call_rcu(&ev_fd->rcu, io_eventfd_put); - return 0; - } - - return -ENXIO; -} - -static void io_req_caches_free(struct io_ring_ctx *ctx) +static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; int nr = 0; @@ -5078,12 +3758,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_wait_rsrc_data(struct io_rsrc_data *data) -{ - if (data && !atomic_dec_and_test(&data->refs)) - wait_for_completion(&data->done); -} - static void io_flush_apoll_cache(struct io_ring_ctx *ctx) { struct async_poll *apoll; @@ -6228,89 +4902,6 @@ static int io_register_enable_rings(struct io_ring_ctx *ctx) return 0; } -static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, - struct io_uring_rsrc_update2 *up, - 
unsigned nr_args) -{ - __u32 tmp; - int err; - - if (check_add_overflow(up->offset, nr_args, &tmp)) - return -EOVERFLOW; - err = io_rsrc_node_switch_start(ctx); - if (err) - return err; - - switch (type) { - case IORING_RSRC_FILE: - return __io_sqe_files_update(ctx, up, nr_args); - case IORING_RSRC_BUFFER: - return __io_sqe_buffers_update(ctx, up, nr_args); - } - return -EINVAL; -} - -static int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned nr_args) -{ - struct io_uring_rsrc_update2 up; - - if (!nr_args) - return -EINVAL; - memset(&up, 0, sizeof(up)); - if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) - return -EFAULT; - if (up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); -} - -static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, - unsigned size, unsigned type) -{ - struct io_uring_rsrc_update2 up; - - if (size != sizeof(up)) - return -EINVAL; - if (copy_from_user(&up, arg, sizeof(up))) - return -EFAULT; - if (!up.nr || up.resv || up.resv2) - return -EINVAL; - return __io_register_rsrc_update(ctx, type, &up, up.nr); -} - -static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, - unsigned int size, unsigned int type) -{ - struct io_uring_rsrc_register rr; - - /* keep it extendible */ - if (size != sizeof(rr)) - return -EINVAL; - - memset(&rr, 0, sizeof(rr)); - if (copy_from_user(&rr, arg, size)) - return -EFAULT; - if (!rr.nr || rr.resv2) - return -EINVAL; - if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) - return -EINVAL; - - switch (type) { - case IORING_RSRC_FILE: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, u64_to_user_ptr(rr.tags)); - case IORING_RSRC_BUFFER: - if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) - break; - return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), - rr.nr, 
u64_to_user_ptr(rr.tags)); - } - return -EINVAL; -} - static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, unsigned len) { @@ -7103,7 +5694,6 @@ static int __init io_uring_init(void) sizeof(struct io_uring_rsrc_update2)); /* ->buf_index is u16 */ - BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0); BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 172defdcfdbe40..090c17deba9db2 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -92,6 +92,7 @@ static inline bool io_run_task_work(void) return false; } +int io_run_task_work_sig(void); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); @@ -110,11 +111,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); -int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc); -void io_rsrc_node_switch(struct io_ring_ctx *ctx, - struct io_rsrc_data *data_to_kill); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); diff --git a/io_uring/openclose.c b/io_uring/openclose.c index fa35bd56a33086..1cbf3903097053 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -14,6 +14,7 @@ #include "io_uring_types.h" #include "io_uring.h" +#include "rsrc.h" #include "openclose.h" struct io_open { diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c new file mode 100644 index 00000000000000..8c40b20659d40a --- /dev/null +++ b/io_uring/rsrc.c @@ -0,0 +1,1320 @@ +// SPDX-License-Identifier: GPL-2.0 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "openclose.h" +#include "rsrc.h" + +struct io_rsrc_update { + struct file *file; + u64 arg; + u32 nr_args; + u32 offset; +}; + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage); + +#define IO_RSRC_REF_BATCH 100 + +/* only define max */ +#define IORING_MAX_FIXED_FILES (1U << 20) +#define IORING_MAX_REG_BUFFERS (1U << 14) + +void io_rsrc_refs_drop(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + if (ctx->rsrc_cached_refs) { + io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); + ctx->rsrc_cached_refs = 0; + } +} + +static inline void __io_unaccount_mem(struct user_struct *user, + unsigned long nr_pages) +{ + atomic_long_sub(nr_pages, &user->locked_vm); +} + +static inline int __io_account_mem(struct user_struct *user, + unsigned long nr_pages) +{ + unsigned long page_limit, cur_pages, new_pages; + + /* Don't allow more pages than we can safely lock */ + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + return 0; +} + +static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + if (ctx->user) + __io_unaccount_mem(ctx->user, nr_pages); + + if (ctx->mm_account) + atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); +} + +static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +{ + int ret; + + if (ctx->user) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + + if (ctx->mm_account) + atomic64_add(nr_pages, &ctx->mm_account->pinned_vm); + + return 0; +} + +static int io_copy_iov(struct 
io_ring_ctx *ctx, struct iovec *dst, + void __user *arg, unsigned index) +{ + struct iovec __user *src; + +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct compat_iovec __user *ciovs; + struct compat_iovec ciov; + + ciovs = (struct compat_iovec __user *) arg; + if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) + return -EFAULT; + + dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base); + dst->iov_len = ciov.iov_len; + return 0; + } +#endif + src = (struct iovec __user *) arg; + if (copy_from_user(dst, &src[index], sizeof(*dst))) + return -EFAULT; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ + unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + + /* + * Don't impose further limits on the size and buffer + * constraints here, we'll -EINVAL later when IO is + * submitted if they are wrong. + */ + if (!iov->iov_base) + return iov->iov_len ? -EFAULT : 0; + if (!iov->iov_len) + return -EFAULT; + + /* arbitrary limit, but we need something */ + if (iov->iov_len > SZ_1G) + return -EFAULT; + + if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) + return -EOVERFLOW; + + return 0; +} + +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) +{ + struct io_mapped_ubuf *imu = *slot; + unsigned int i; + + if (imu != ctx->dummy_ubuf) { + for (i = 0; i < imu->nr_bvecs; i++) + unpin_user_page(imu->bvec[i].bv_page); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + kvfree(imu); + } + *slot = NULL; +} + +void io_rsrc_refs_refill(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; + percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); +} + +static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) +{ + struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; + struct io_ring_ctx *ctx = rsrc_data->ctx; + struct io_rsrc_put *prsrc, *tmp; + + list_for_each_entry_safe(prsrc, tmp, &ref_node->rsrc_list, list) { + 
list_del(&prsrc->list); + + if (prsrc->tag) { + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_lock(&ctx->uring_lock); + + spin_lock(&ctx->completion_lock); + io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + + if (ctx->flags & IORING_SETUP_IOPOLL) + mutex_unlock(&ctx->uring_lock); + } + + rsrc_data->do_put(ctx, prsrc); + kfree(prsrc); + } + + io_rsrc_node_destroy(ref_node); + if (atomic_dec_and_test(&rsrc_data->refs)) + complete(&rsrc_data->done); +} + +void io_rsrc_put_work(struct work_struct *work) +{ + struct io_ring_ctx *ctx; + struct llist_node *node; + + ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); + node = llist_del_all(&ctx->rsrc_put_llist); + + while (node) { + struct io_rsrc_node *ref_node; + struct llist_node *next = node->next; + + ref_node = llist_entry(node, struct io_rsrc_node, llist); + __io_rsrc_put_work(ref_node); + node = next; + } +} + +void io_wait_rsrc_data(struct io_rsrc_data *data) +{ + if (data && !atomic_dec_and_test(&data->refs)) + wait_for_completion(&data->done); +} + +void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) +{ + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); +} + +static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) +{ + struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); + struct io_ring_ctx *ctx = node->rsrc_data->ctx; + unsigned long flags; + bool first_add = false; + unsigned long delay = HZ; + + spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); + node->done = true; + + /* if we are mid-quiesce then do not delay */ + if (node->rsrc_data->quiesce) + delay = 0; + + while (!list_empty(&ctx->rsrc_ref_list)) { + node = list_first_entry(&ctx->rsrc_ref_list, + struct io_rsrc_node, node); + /* recycle ref nodes in order */ + if (!node->done) + break; + list_del(&node->node); + first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); + } + spin_unlock_irqrestore(&ctx->rsrc_ref_lock, 
flags); + + if (first_add) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); +} + +static struct io_rsrc_node *io_rsrc_node_alloc(void) +{ + struct io_rsrc_node *ref_node; + + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + + if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return NULL; + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->rsrc_list); + ref_node->done = false; + return ref_node; +} + +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill) + __must_hold(&ctx->uring_lock) +{ + WARN_ON_ONCE(!ctx->rsrc_backup_node); + WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + + io_rsrc_refs_drop(ctx); + + if (data_to_kill) { + struct io_rsrc_node *rsrc_node = ctx->rsrc_node; + + rsrc_node->rsrc_data = data_to_kill; + spin_lock_irq(&ctx->rsrc_ref_lock); + list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); + spin_unlock_irq(&ctx->rsrc_ref_lock); + + atomic_inc(&data_to_kill->refs); + percpu_ref_kill(&rsrc_node->refs); + ctx->rsrc_node = NULL; + } + + if (!ctx->rsrc_node) { + ctx->rsrc_node = ctx->rsrc_backup_node; + ctx->rsrc_backup_node = NULL; + } +} + +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) +{ + if (ctx->rsrc_backup_node) + return 0; + ctx->rsrc_backup_node = io_rsrc_node_alloc(); + return ctx->rsrc_backup_node ? 
0 : -ENOMEM; +} + +__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, + struct io_ring_ctx *ctx) +{ + int ret; + + /* As we may drop ->uring_lock, other task may have started quiesce */ + if (data->quiesce) + return -ENXIO; + + data->quiesce = true; + do { + ret = io_rsrc_node_switch_start(ctx); + if (ret) + break; + io_rsrc_node_switch(ctx, data); + + /* kill initial ref, already quiesced if zero */ + if (atomic_dec_and_test(&data->refs)) + break; + mutex_unlock(&ctx->uring_lock); + flush_delayed_work(&ctx->rsrc_put_work); + ret = wait_for_completion_interruptible(&data->done); + if (!ret) { + mutex_lock(&ctx->uring_lock); + if (atomic_read(&data->refs) > 0) { + /* + * it has been revived by another thread while + * we were unlocked + */ + mutex_unlock(&ctx->uring_lock); + } else { + break; + } + } + + atomic_inc(&data->refs); + /* wait for all works potentially completing data->done */ + flush_delayed_work(&ctx->rsrc_put_work); + reinit_completion(&data->done); + + ret = io_run_task_work_sig(); + mutex_lock(&ctx->uring_lock); + } while (ret >= 0); + data->quiesce = false; + + return ret; +} + +static void io_free_page_table(void **table, size_t size) +{ + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + + for (i = 0; i < nr_tables; i++) + kfree(table[i]); + kfree(table); +} + +static void io_rsrc_data_free(struct io_rsrc_data *data) +{ + size_t size = data->nr * sizeof(data->tags[0][0]); + + if (data->tags) + io_free_page_table((void **)data->tags, size); + kfree(data); +} + +static __cold void **io_alloc_page_table(size_t size) +{ + unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); + size_t init_size = size; + void **table; + + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); + if (!table) + return NULL; + + for (i = 0; i < nr_tables; i++) { + unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + + table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); + if (!table[i]) { + io_free_page_table(table, init_size); + return 
NULL; + } + size -= this_size; + } + return table; +} + +__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, + rsrc_put_fn *do_put, u64 __user *utags, + unsigned nr, struct io_rsrc_data **pdata) +{ + struct io_rsrc_data *data; + int ret = -ENOMEM; + unsigned i; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); + if (!data->tags) { + kfree(data); + return -ENOMEM; + } + + data->nr = nr; + data->ctx = ctx; + data->do_put = do_put; + if (utags) { + ret = -EFAULT; + for (i = 0; i < nr; i++) { + u64 *tag_slot = io_get_tag_slot(data, i); + + if (copy_from_user(tag_slot, &utags[i], + sizeof(*tag_slot))) + goto fail; + } + } + + atomic_set(&data->refs, 1); + init_completion(&data->done); + *pdata = data; + return 0; +fail: + io_rsrc_data_free(data); + return ret; +} + +static int __io_sqe_files_update(struct io_ring_ctx *ctx, + struct io_uring_rsrc_update2 *up, + unsigned nr_args) +{ + u64 __user *tags = u64_to_user_ptr(up->tags); + __s32 __user *fds = u64_to_user_ptr(up->data); + struct io_rsrc_data *data = ctx->file_data; + struct io_fixed_file *file_slot; + struct file *file; + int fd, i, err = 0; + unsigned int done; + bool needs_switch = false; + + if (!ctx->file_data) + return -ENXIO; + if (up->offset + nr_args > ctx->nr_user_files) + return -EINVAL; + + for (done = 0; done < nr_args; done++) { + u64 tag = 0; + + if ((tags && copy_from_user(&tag, &tags[done], sizeof(tag))) || + copy_from_user(&fd, &fds[done], sizeof(fd))) { + err = -EFAULT; + break; + } + if ((fd == IORING_REGISTER_FILES_SKIP || fd == -1) && tag) { + err = -EINVAL; + break; + } + if (fd == IORING_REGISTER_FILES_SKIP) + continue; + + i = array_index_nospec(up->offset + done, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, i); + + if (file_slot->file_ptr) { + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + err = io_queue_rsrc_removal(data, i, ctx->rsrc_node, 
file); + if (err) + break; + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, i); + needs_switch = true; + } + if (fd != -1) { + file = fget(fd); + if (!file) { + err = -EBADF; + break; + } + /* + * Don't allow io_uring instances to be registered. If + * UNIX isn't enabled, then this causes a reference + * cycle and this instance can never get freed. If UNIX + * is enabled we'll handle it just fine, but there's + * still no point in allowing a ring fd as it doesn't + * support regular read/write anyway. + */ + if (io_is_uring_fops(file)) { + fput(file); + err = -EBADF; + break; + } + err = io_scm_file_account(ctx, file); + if (err) { + fput(file); + break; + } + *io_get_tag_slot(data, i) = tag; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); + } + } + + if (needs_switch) + io_rsrc_node_switch(ctx, data); + return done ? done : err; +} + +static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, + struct io_uring_rsrc_update2 *up, + unsigned int nr_args) +{ + u64 __user *tags = u64_to_user_ptr(up->tags); + struct iovec iov, __user *iovs = u64_to_user_ptr(up->data); + struct page *last_hpage = NULL; + bool needs_switch = false; + __u32 done; + int i, err; + + if (!ctx->buf_data) + return -ENXIO; + if (up->offset + nr_args > ctx->nr_user_bufs) + return -EINVAL; + + for (done = 0; done < nr_args; done++) { + struct io_mapped_ubuf *imu; + int offset = up->offset + done; + u64 tag = 0; + + err = io_copy_iov(ctx, &iov, iovs, done); + if (err) + break; + if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) { + err = -EFAULT; + break; + } + err = io_buffer_validate(&iov); + if (err) + break; + if (!iov.iov_base && tag) { + err = -EINVAL; + break; + } + err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); + if (err) + break; + + i = array_index_nospec(offset, ctx->nr_user_bufs); + if (ctx->user_bufs[i] != ctx->dummy_ubuf) { + err = io_queue_rsrc_removal(ctx->buf_data, i, + ctx->rsrc_node, ctx->user_bufs[i]); + 
if (unlikely(err)) { + io_buffer_unmap(ctx, &imu); + break; + } + ctx->user_bufs[i] = NULL; + needs_switch = true; + } + + ctx->user_bufs[i] = imu; + *io_get_tag_slot(ctx->buf_data, offset) = tag; + } + + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->buf_data); + return done ? done : err; +} + +static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, + struct io_uring_rsrc_update2 *up, + unsigned nr_args) +{ + __u32 tmp; + int err; + + if (check_add_overflow(up->offset, nr_args, &tmp)) + return -EOVERFLOW; + err = io_rsrc_node_switch_start(ctx); + if (err) + return err; + + switch (type) { + case IORING_RSRC_FILE: + return __io_sqe_files_update(ctx, up, nr_args); + case IORING_RSRC_BUFFER: + return __io_sqe_buffers_update(ctx, up, nr_args); + } + return -EINVAL; +} + +int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct io_uring_rsrc_update2 up; + + if (!nr_args) + return -EINVAL; + memset(&up, 0, sizeof(up)); + if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update))) + return -EFAULT; + if (up.resv || up.resv2) + return -EINVAL; + return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args); +} + +int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned size, unsigned type) +{ + struct io_uring_rsrc_update2 up; + + if (size != sizeof(up)) + return -EINVAL; + if (copy_from_user(&up, arg, sizeof(up))) + return -EFAULT; + if (!up.nr || up.resv || up.resv2) + return -EINVAL; + return __io_register_rsrc_update(ctx, type, &up, up.nr); +} + +__cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, + unsigned int size, unsigned int type) +{ + struct io_uring_rsrc_register rr; + + /* keep it extendible */ + if (size != sizeof(rr)) + return -EINVAL; + + memset(&rr, 0, sizeof(rr)); + if (copy_from_user(&rr, arg, size)) + return -EFAULT; + if (!rr.nr || rr.resv2) + return -EINVAL; + if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE) + return -EINVAL; 
+ + switch (type) { + case IORING_RSRC_FILE: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; + return io_sqe_files_register(ctx, u64_to_user_ptr(rr.data), + rr.nr, u64_to_user_ptr(rr.tags)); + case IORING_RSRC_BUFFER: + if (rr.flags & IORING_RSRC_REGISTER_SPARSE && rr.data) + break; + return io_sqe_buffers_register(ctx, u64_to_user_ptr(rr.data), + rr.nr, u64_to_user_ptr(rr.tags)); + } + return -EINVAL; +} + +int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + + if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) + return -EINVAL; + if (sqe->rw_flags || sqe->splice_fd_in) + return -EINVAL; + + up->offset = READ_ONCE(sqe->off); + up->nr_args = READ_ONCE(sqe->len); + if (!up->nr_args) + return -EINVAL; + up->arg = READ_ONCE(sqe->addr); + return 0; +} + +static int io_files_update_with_index_alloc(struct io_kiocb *req, + unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + __s32 __user *fds = u64_to_user_ptr(up->arg); + unsigned int done; + struct file *file; + int ret, fd; + + if (!req->ctx->file_data) + return -ENXIO; + + for (done = 0; done < up->nr_args; done++) { + if (copy_from_user(&fd, &fds[done], sizeof(fd))) { + ret = -EFAULT; + break; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + break; + } + ret = io_fixed_fd_install(req, issue_flags, file, + IORING_FILE_INDEX_ALLOC); + if (ret < 0) + break; + if (copy_to_user(&fds[done], &ret, sizeof(ret))) { + __io_close_fixed(req, issue_flags, ret); + ret = -EFAULT; + break; + } + } + + if (done) + return done; + return ret; +} + +int io_files_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + struct io_uring_rsrc_update2 up2; + int ret; + + up2.offset = up->offset; + up2.data = up->arg; + up2.nr = 0; + up2.tags = 0; + up2.resv = 0; + up2.resv2 = 0; + + if 
(up->offset == IORING_FILE_INDEX_ALLOC) { + ret = io_files_update_with_index_alloc(req, issue_flags); + } else { + io_ring_submit_lock(ctx, issue_flags); + ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, + &up2, up->nr_args); + io_ring_submit_unlock(ctx, issue_flags); + } + + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc) +{ + u64 *tag_slot = io_get_tag_slot(data, idx); + struct io_rsrc_put *prsrc; + + prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); + if (!prsrc) + return -ENOMEM; + + prsrc->tag = *tag_slot; + *tag_slot = 0; + prsrc->rsrc = rsrc; + list_add(&prsrc->list, &node->rsrc_list); + return 0; +} + +void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ +#if !defined(IO_URING_SCM_ALL) + int i; + + for (i = 0; i < ctx->nr_user_files; i++) { + struct file *file = io_file_from_index(&ctx->file_table, i); + + if (!file) + continue; + if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM) + continue; + io_file_bitmap_clear(&ctx->file_table, i); + fput(file); + } +#endif + +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#endif + io_free_file_tables(&ctx->file_table); + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + ctx->nr_user_files = 0; +} + +int io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ + unsigned nr = ctx->nr_user_files; + int ret; + + if (!ctx->file_data) + return -ENXIO; + + /* + * Quiesce may unlock ->uring_lock, and while it's not held + * prevent new requests using the table. 
+ */ + ctx->nr_user_files = 0; + ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); + ctx->nr_user_files = nr; + if (!ret) + __io_sqe_files_unregister(ctx); + return ret; +} + +/* + * Ensure the UNIX gc is aware of our file set, so we are certain that + * the io_uring can be safely unregistered on process exit, even if we have + * loops in the file referencing. We account only files that can hold other + * files because otherwise they can't form a loop and so are not interesting + * for GC. + */ +int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) +{ +#if defined(CONFIG_UNIX) + struct sock *sk = ctx->ring_sock->sk; + struct sk_buff_head *head = &sk->sk_receive_queue; + struct scm_fp_list *fpl; + struct sk_buff *skb; + + if (likely(!io_file_need_scm(file))) + return 0; + + /* + * See if we can merge this file into an existing skb SCM_RIGHTS + * file set. If there's no room, fall back to allocating a new skb + * and filling it in. + */ + spin_lock_irq(&head->lock); + skb = skb_peek(head); + if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) + __skb_unlink(skb, head); + else + skb = NULL; + spin_unlock_irq(&head->lock); + + if (!skb) { + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } + + fpl->user = get_uid(current_user()); + fpl->max = SCM_MAX_FD; + fpl->count = 0; + + UNIXCB(skb).fp = fpl; + skb->sk = sk; + skb->destructor = unix_destruct_scm; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + } + + fpl = UNIXCB(skb).fp; + fpl->fp[fpl->count++] = get_file(file); + unix_inflight(fpl->user, file); + skb_queue_head(head, skb); + fput(file); +#endif + return 0; +} + +static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + struct file *file = prsrc->file; +#if defined(CONFIG_UNIX) + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff_head list, *head = &sock->sk_receive_queue; + struct sk_buff *skb; + int i; + + if 
(!io_file_need_scm(file)) { + fput(file); + return; + } + + __skb_queue_head_init(&list); + + /* + * Find the skb that holds this file in its SCM_RIGHTS. When found, + * remove this entry and rearrange the file array. + */ + skb = skb_dequeue(head); + while (skb) { + struct scm_fp_list *fp; + + fp = UNIXCB(skb).fp; + for (i = 0; i < fp->count; i++) { + int left; + + if (fp->fp[i] != file) + continue; + + unix_notinflight(fp->user, fp->fp[i]); + left = fp->count - 1 - i; + if (left) { + memmove(&fp->fp[i], &fp->fp[i + 1], + left * sizeof(struct file *)); + } + fp->count--; + if (!fp->count) { + kfree_skb(skb); + skb = NULL; + } else { + __skb_queue_tail(&list, skb); + } + fput(file); + file = NULL; + break; + } + + if (!file) + break; + + __skb_queue_tail(&list, skb); + + skb = skb_dequeue(head); + } + + if (skb_peek(&list)) { + spin_lock_irq(&head->lock); + while ((skb = __skb_dequeue(&list)) != NULL) + __skb_queue_tail(head, skb); + spin_unlock_irq(&head->lock); + } +#else + fput(file); +#endif +} + +int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args, u64 __user *tags) +{ + __s32 __user *fds = (__s32 __user *) arg; + struct file *file; + int fd, ret; + unsigned i; + + if (ctx->file_data) + return -EBUSY; + if (!nr_args) + return -EINVAL; + if (nr_args > IORING_MAX_FIXED_FILES) + return -EMFILE; + if (nr_args > rlimit(RLIMIT_NOFILE)) + return -EMFILE; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + ret = io_rsrc_data_alloc(ctx, io_rsrc_file_put, tags, nr_args, + &ctx->file_data); + if (ret) + return ret; + + if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { + io_rsrc_data_free(ctx->file_data); + ctx->file_data = NULL; + return -ENOMEM; + } + + for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { + struct io_fixed_file *file_slot; + + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { + ret = -EFAULT; + goto fail; + } + /* allow sparse sets */ + if (!fds || fd == -1) { + ret = -EINVAL; + if 
(unlikely(*io_get_tag_slot(ctx->file_data, i))) + goto fail; + continue; + } + + file = fget(fd); + ret = -EBADF; + if (unlikely(!file)) + goto fail; + + /* + * Don't allow io_uring instances to be registered. If UNIX + * isn't enabled, then this causes a reference cycle and this + * instance can never get freed. If UNIX is enabled we'll + * handle it just fine, but there's still no point in allowing + * a ring fd as it doesn't support regular read/write anyway. + */ + if (io_is_uring_fops(file)) { + fput(file); + goto fail; + } + ret = io_scm_file_account(ctx, file); + if (ret) { + fput(file); + goto fail; + } + file_slot = io_fixed_file_slot(&ctx->file_table, i); + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, i); + } + + io_rsrc_node_switch(ctx, NULL); + return 0; +fail: + __io_sqe_files_unregister(ctx); + return ret; +} + +static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) +{ + io_buffer_unmap(ctx, &prsrc->buf); + prsrc->buf = NULL; +} + +void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) +{ + unsigned int i; + + for (i = 0; i < ctx->nr_user_bufs; i++) + io_buffer_unmap(ctx, &ctx->user_bufs[i]); + kfree(ctx->user_bufs); + io_rsrc_data_free(ctx->buf_data); + ctx->user_bufs = NULL; + ctx->buf_data = NULL; + ctx->nr_user_bufs = 0; +} + +int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) +{ + unsigned nr = ctx->nr_user_bufs; + int ret; + + if (!ctx->buf_data) + return -ENXIO; + + /* + * Quiesce may unlock ->uring_lock, and while it's not held + * prevent new requests using the table. + */ + ctx->nr_user_bufs = 0; + ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); + ctx->nr_user_bufs = nr; + if (!ret) + __io_sqe_buffers_unregister(ctx); + return ret; +} + +/* + * Not super efficient, but this is just a registration time. And we do cache + * the last compound head, so generally we'll only do a full search if we don't + * match that one. 
+ * + * We check if the given compound head page has already been accounted, to + * avoid double accounting it. This allows us to account the full size of the + * page, not just the constituent pages of a huge page. + */ +static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct page *hpage) +{ + int i, j; + + /* check current page array */ + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) + continue; + if (compound_head(pages[i]) == hpage) + return true; + } + + /* check previously registered pages */ + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *imu = ctx->user_bufs[i]; + + for (j = 0; j < imu->nr_bvecs; j++) { + if (!PageCompound(imu->bvec[j].bv_page)) + continue; + if (compound_head(imu->bvec[j].bv_page) == hpage) + return true; + } + } + + return false; +} + +static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, + int nr_pages, struct io_mapped_ubuf *imu, + struct page **last_hpage) +{ + int i, ret; + + imu->acct_pages = 0; + for (i = 0; i < nr_pages; i++) { + if (!PageCompound(pages[i])) { + imu->acct_pages++; + } else { + struct page *hpage; + + hpage = compound_head(pages[i]); + if (hpage == *last_hpage) + continue; + *last_hpage = hpage; + if (headpage_already_acct(ctx, pages, i, hpage)) + continue; + imu->acct_pages += page_size(hpage) >> PAGE_SHIFT; + } + } + + if (!imu->acct_pages) + return 0; + + ret = io_account_mem(ctx, imu->acct_pages); + if (ret) + imu->acct_pages = 0; + return ret; +} + +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) +{ + unsigned long start, end, nr_pages; + struct vm_area_struct **vmas = NULL; + struct page **pages = NULL; + int i, pret, ret = -ENOMEM; + + end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ubuf >> PAGE_SHIFT; + nr_pages = end - start; + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto done; + + vmas = 
kvmalloc_array(nr_pages, sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas) + goto done; + + ret = 0; + mmap_read_lock(current->mm); + pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); + if (pret == nr_pages) { + /* don't support file backed memory */ + for (i = 0; i < nr_pages; i++) { + struct vm_area_struct *vma = vmas[i]; + + if (vma_is_shmem(vma)) + continue; + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { + ret = -EOPNOTSUPP; + break; + } + } + *npages = nr_pages; + } else { + ret = pret < 0 ? pret : -EFAULT; + } + mmap_read_unlock(current->mm); + if (ret) { + /* + * if we did partial map, or found file backed vmas, + * release any pages we did get + */ + if (pret > 0) + unpin_user_pages(pages, pret); + goto done; + } + ret = 0; +done: + kvfree(vmas); + if (ret < 0) { + kvfree(pages); + pages = ERR_PTR(ret); + } + return pages; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, + struct io_mapped_ubuf **pimu, + struct page **last_hpage) +{ + struct io_mapped_ubuf *imu = NULL; + struct page **pages = NULL; + unsigned long off; + size_t size; + int ret, nr_pages, i; + + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } + + *pimu = NULL; + ret = -ENOMEM; + + pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, + &nr_pages); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + pages = NULL; + goto done; + } + + imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + if (!imu) + goto done; + + ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); + if (ret) { + unpin_user_pages(pages, nr_pages); + goto done; + } + + off = (unsigned long) iov->iov_base & ~PAGE_MASK; + size = iov->iov_len; + for (i = 0; i < nr_pages; i++) { + size_t vec_len; + + vec_len = min_t(size_t, size, PAGE_SIZE - off); + imu->bvec[i].bv_page = pages[i]; + imu->bvec[i].bv_len = vec_len; + imu->bvec[i].bv_offset = off; + off = 0; + size -= vec_len; + } + /* store 
original address for later verification */ + imu->ubuf = (unsigned long) iov->iov_base; + imu->ubuf_end = imu->ubuf + iov->iov_len; + imu->nr_bvecs = nr_pages; + *pimu = imu; + ret = 0; +done: + if (ret) + kvfree(imu); + kvfree(pages); + return ret; +} + +static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) +{ + ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); + return ctx->user_bufs ? 0 : -ENOMEM; +} + +int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args, u64 __user *tags) +{ + struct page *last_hpage = NULL; + struct io_rsrc_data *data; + int i, ret; + struct iovec iov; + + BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + + if (ctx->user_bufs) + return -EBUSY; + if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) + return -EINVAL; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + ret = io_rsrc_data_alloc(ctx, io_rsrc_buf_put, tags, nr_args, &data); + if (ret) + return ret; + ret = io_buffers_map_alloc(ctx, nr_args); + if (ret) { + io_rsrc_data_free(data); + return ret; + } + + for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { + if (arg) { + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + ret = io_buffer_validate(&iov); + if (ret) + break; + } else { + memset(&iov, 0, sizeof(iov)); + } + + if (!iov.iov_base && *io_get_tag_slot(data, i)) { + ret = -EINVAL; + break; + } + + ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], + &last_hpage); + if (ret) + break; + } + + WARN_ON_ONCE(ctx->buf_data); + + ctx->buf_data = data; + if (ret) + __io_sqe_buffers_unregister(ctx); + else + io_rsrc_node_switch(ctx, NULL); + return ret; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h new file mode 100644 index 00000000000000..872c86312cbc2d --- /dev/null +++ b/io_uring/rsrc.h @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_RSRC_H +#define IOU_RSRC_H + +#include + +#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) +#define 
IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) +#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) + +enum { + IORING_RSRC_FILE = 0, + IORING_RSRC_BUFFER = 1, +}; + +struct io_rsrc_put { + struct list_head list; + u64 tag; + union { + void *rsrc; + struct file *file; + struct io_mapped_ubuf *buf; + }; +}; + +typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); + +struct io_rsrc_data { + struct io_ring_ctx *ctx; + + u64 **tags; + unsigned int nr; + rsrc_put_fn *do_put; + atomic_t refs; + struct completion done; + bool quiesce; +}; + +struct io_rsrc_node { + struct percpu_ref refs; + struct list_head node; + struct list_head rsrc_list; + struct io_rsrc_data *rsrc_data; + struct llist_node llist; + bool done; +}; + +void io_rsrc_put_work(struct work_struct *work); +void io_rsrc_refs_refill(struct io_ring_ctx *ctx); +void io_wait_rsrc_data(struct io_rsrc_data *data); +void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); +void io_rsrc_refs_drop(struct io_ring_ctx *ctx); +int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); +int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc); +void io_rsrc_node_switch(struct io_ring_ctx *ctx, + struct io_rsrc_data *data_to_kill); + + +void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); +int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); +int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned int nr_args, u64 __user *tags); +void __io_sqe_files_unregister(struct io_ring_ctx *ctx); +int io_sqe_files_unregister(struct io_ring_ctx *ctx); +int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args, u64 __user *tags); + +int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file); + +#if defined(CONFIG_UNIX) +static inline bool io_file_need_scm(struct file *filp) +{ +#if defined(IO_URING_SCM_ALL) + return true; +#else + return !!unix_get_socket(filp); +#endif +} 
+#else +static inline bool io_file_need_scm(struct file *filp) +{ + return false; +} +#endif + +static inline int io_scm_file_account(struct io_ring_ctx *ctx, + struct file *file) +{ + if (likely(!io_file_need_scm(file))) + return 0; + return __io_scm_file_account(ctx, file); +} + +int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args); +int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, + unsigned size, unsigned type); +int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, + unsigned int size, unsigned int type); + +static inline void io_rsrc_put_node(struct io_rsrc_node *node, int nr) +{ + percpu_ref_put_many(&node->refs, nr); +} + +static inline void io_req_put_rsrc(struct io_kiocb *req) +{ + if (req->rsrc_node) + io_rsrc_put_node(req->rsrc_node, 1); +} + +static inline void io_req_put_rsrc_locked(struct io_kiocb *req, + struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + struct io_rsrc_node *node = req->rsrc_node; + + if (node) { + if (node == ctx->rsrc_node) + ctx->rsrc_cached_refs++; + else + io_rsrc_put_node(node, 1); + } +} + +static inline void io_req_set_rsrc_node(struct io_kiocb *req, + struct io_ring_ctx *ctx, + unsigned int issue_flags) +{ + if (!req->rsrc_node) { + req->rsrc_node = ctx->rsrc_node; + + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + lockdep_assert_held(&ctx->uring_lock); + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); + } else { + percpu_ref_get(&req->rsrc_node->refs); + } + } +} + +static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +{ + unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; + unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; + + return &data->tags[table_idx][off]; +} + +int io_files_update(struct io_kiocb *req, unsigned int issue_flags); +int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +#endif From 
f2f1bae84af5170bd9a392d4fee78d2d317b195a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 May 2022 09:44:31 -0600 Subject: [PATCH 0979/1250] io_uring: move remaining file table manipulation to filetable.c Signed-off-by: Jens Axboe --- io_uring/filetable.c | 87 +++++++++++++++++++++++++++++++++++++++++++- io_uring/filetable.h | 5 ++- io_uring/io_uring.c | 82 ----------------------------------------- io_uring/io_uring.h | 4 -- 4 files changed, 90 insertions(+), 88 deletions(-) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 560629a93c04b0..e449ceb9a848e6 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -4,14 +4,17 @@ #include #include #include +#include #include #include #include "io_uring_types.h" #include "io_uring.h" +#include "rsrc.h" +#include "filetable.h" -int io_file_bitmap_get(struct io_ring_ctx *ctx) +static int io_file_bitmap_get(struct io_ring_ctx *ctx) { struct io_file_table *table = &ctx->file_table; unsigned long nr = ctx->nr_user_files; @@ -55,3 +58,85 @@ void io_free_file_tables(struct io_file_table *table) table->files = NULL; table->bitmap = NULL; } + +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) + __must_hold(&req->ctx->uring_lock) +{ + struct io_ring_ctx *ctx = req->ctx; + bool needs_switch = false; + struct io_fixed_file *file_slot; + int ret; + + if (io_is_uring_fops(file)) + return -EBADF; + if (!ctx->file_data) + return -ENXIO; + if (slot_index >= ctx->nr_user_files) + return -EINVAL; + + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + + if (file_slot->file_ptr) { + struct file *old_file; + + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto err; + + old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + ctx->rsrc_node, old_file); + if (ret) + goto err; + file_slot->file_ptr = 0; + 
io_file_bitmap_clear(&ctx->file_table, slot_index); + needs_switch = true; + } + + ret = io_scm_file_account(ctx, file); + if (!ret) { + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); + } +err: + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->file_data); + if (ret) + fput(file); + return ret; +} + +/* + * Note when io_fixed_fd_install() returns error value, it will ensure + * fput() is called correspondingly. + */ +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; + struct io_ring_ctx *ctx = req->ctx; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + + if (alloc_slot) { + ret = io_file_bitmap_get(ctx); + if (unlikely(ret < 0)) + goto err; + file_slot = ret; + } else { + file_slot--; + } + + ret = io_install_fixed_file(req, file, issue_flags, file_slot); + if (!ret && alloc_slot) + ret = file_slot; +err: + io_ring_submit_unlock(ctx, issue_flags); + if (unlikely(ret < 0)) + fput(file); + return ret; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 6e1675f406b729..c404360f709053 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -3,6 +3,7 @@ #define IOU_FILE_TABLE_H struct io_ring_ctx; +struct io_kiocb; /* * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 @@ -34,7 +35,9 @@ struct io_file_table { bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); -int io_file_bitmap_get(struct io_ring_ctx *ctx); + +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot); unsigned int io_file_get_flags(struct file *file); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0c47c919887f52..c0f1f79933ac2a 100644 --- a/io_uring/io_uring.c +++ 
b/io_uring/io_uring.c @@ -2699,38 +2699,6 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) return ret; } -/* - * Note when io_fixed_fd_install() returns error value, it will ensure - * fput() is called correspondingly. - */ -int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) -{ - bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; - struct io_ring_ctx *ctx = req->ctx; - int ret; - - io_ring_submit_lock(ctx, issue_flags); - - if (alloc_slot) { - ret = io_file_bitmap_get(ctx); - if (unlikely(ret < 0)) - goto err; - file_slot = ret; - } else { - file_slot--; - } - - ret = io_install_fixed_file(req, file, issue_flags, file_slot); - if (!ret && alloc_slot) - ret = file_slot; -err: - io_ring_submit_unlock(ctx, issue_flags); - if (unlikely(ret < 0)) - fput(file); - return ret; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { @@ -3579,56 +3547,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) - __must_hold(&req->ctx->uring_lock) -{ - struct io_ring_ctx *ctx = req->ctx; - bool needs_switch = false; - struct io_fixed_file *file_slot; - int ret; - - if (io_is_uring_fops(file)) - return -EBADF; - if (!ctx->file_data) - return -ENXIO; - if (slot_index >= ctx->nr_user_files) - return -EINVAL; - - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - - if (file_slot->file_ptr) { - struct file *old_file; - - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto err; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - ctx->rsrc_node, old_file); - if (ret) - goto err; - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); - needs_switch = true; - } - - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } -err: - if (needs_switch) - io_rsrc_node_switch(ctx, ctx->file_data); - if (ret) - fput(file); - return ret; -} - static void io_mem_free(void *ptr) { struct page *page; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 090c17deba9db2..71afb46070e364 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -106,10 +106,6 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot); -int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index); bool io_is_uring_fops(struct file 
*file); bool io_alloc_async_data(struct io_kiocb *req); From b687df8cda59fa5843b5f9c366c0c7cb63672cf6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 07:27:03 -0600 Subject: [PATCH 0980/1250] io_uring: move read/write related opcodes to its own file Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 1239 +------------------------------------------ io_uring/io_uring.h | 131 +++++ io_uring/rw.c | 1099 ++++++++++++++++++++++++++++++++++++++ io_uring/rw.h | 23 + 5 files changed, 1263 insertions(+), 1231 deletions(-) create mode 100644 io_uring/rw.c create mode 100644 io_uring/rw.h diff --git a/io_uring/Makefile b/io_uring/Makefile index 360a83039c2a4f..d70deed65a0bb2 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o + cancel.o kbuf.o rsrc.o rw.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c0f1f79933ac2a..0af61a6c29cfee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -57,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -67,13 +65,10 @@ #include #include #include -#include #include #include #include -#include #include -#include #include #include #include @@ -110,6 +105,7 @@ #include "timeout.h" #include "poll.h" #include "cancel.h" +#include "rw.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -136,31 +132,6 @@ #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -/* - * First field must be the file pointer in all the - * iocb unions! 
See also 'struct kiocb' in - */ -struct io_rw { - /* NOTE: kiocb has the file as the first member, so don't do it here */ - struct kiocb kiocb; - u64 addr; - u32 len; - rwf_t flags; -}; - -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; -}; - -struct io_async_rw { - struct io_rw_state s; - const struct iovec *free_iovec; - size_t bytes_done; - struct wait_page_queue wpq; -}; - enum { IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_DROPPED_BIT, @@ -184,9 +155,7 @@ static void io_dismantle_req(struct io_kiocb *req); static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); -static void io_req_task_queue(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); -static int io_req_prep_async(struct io_kiocb *req); static void io_eventfd_signal(struct io_ring_ctx *ctx); @@ -393,11 +362,6 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -static inline bool io_req_ffs_set(struct io_kiocb *req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -489,7 +453,7 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -static void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +void io_queue_iowq(struct io_kiocb *req, bool *dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -532,7 +496,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used || ctx->drain_active) { spin_lock(&ctx->completion_lock); @@ -547,60 +511,6 @@ static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -static inline 
unsigned int __io_cqring_events(struct io_ring_ctx *ctx) -{ - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - -/* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ -static noinline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; - unsigned int free, queued, len; - - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; - - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; - /* we need a contiguous range, limit based on the current array offset */ - len = min(free, ctx->cq_entries - off); - if (!len) - return NULL; - - ctx->cached_cq_tail++; - ctx->cqe_cached = &rings->cqes[off]; - ctx->cqe_sentinel = ctx->cqe_cached + len; - ctx->cqe_cached++; - return &rings->cqes[off << shift]; -} - -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) -{ - if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { - struct io_uring_cqe *cqe = ctx->cqe_cached; - - if (ctx->flags & IORING_SETUP_CQE32) { - unsigned int off = ctx->cqe_cached - ctx->rings->cqes; - - cqe += off; - } - - ctx->cached_cq_tail++; - ctx->cqe_cached++; - return cqe; - } - - return __io_get_cqe(ctx); -} - static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; @@ -628,17 +538,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) rcu_read_unlock(); } -static inline void io_cqring_wake(struct io_ring_ctx *ctx) -{ - /* - * wake_up_all() may seem excessive, but io_wake_function() and - * io_should_wake() handle the termination of the loop and only - * wake as many waiters as we need to. 
- */ - if (wq_has_sleeper(&ctx->cq_wait)) - wake_up_all(&ctx->cq_wait); -} - /* * This should only get called when at least one event has been posted. * Some applications rely on the eventfd notification count only changing @@ -655,16 +554,6 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } -static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) -{ - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - - if (ctx->flags & IORING_SETUP_SQPOLL) - io_cqring_wake(ctx); -} - /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -775,9 +664,8 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, - u64 extra2) +bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags, u64 extra1, u64 extra2) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -814,59 +702,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - struct io_uring_cqe *cqe; - - if (!(ctx->flags & IORING_SETUP_CQE32)) { - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. 
- */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(*cqe)); - return true; - } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - 0, 0); - } else { - u64 extra1 = 0, extra2 = 0; - - if (req->flags & REQ_F_CQE32_INIT) { - extra1 = req->extra1; - extra2 = req->extra2; - } - - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, extra1, extra2); - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); - return true; - } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - extra1, extra2); - } -} - bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { @@ -1269,7 +1104,7 @@ void io_req_task_work_add(struct io_kiocb *req) __io_req_task_work_add(req, tctx, &tctx->task_list); } -static void io_req_task_prio_work_add(struct io_kiocb *req) +void io_req_task_prio_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; @@ -1315,18 +1150,12 @@ void io_req_task_queue_fail(struct io_kiocb *req, int ret) io_req_task_work_add(req); } -static void io_req_task_queue(struct io_kiocb *req) +void io_req_task_queue(struct io_kiocb *req) { req->io_task_work.func = io_req_task_submit; io_req_task_work_add(req); } -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -1335,8 +1164,7 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } -static void io_free_batch_list(struct io_ring_ctx *ctx, - struct io_wq_work_node *node) 
+void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { struct task_struct *task = NULL; @@ -1435,76 +1263,6 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx) return __io_cqring_events(ctx); } -int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) -{ - struct io_wq_work_node *pos, *start, *prev; - unsigned int poll_flags = BLK_POLL_NOSLEEP; - DEFINE_IO_COMP_BATCH(iob); - int nr_events = 0; - - /* - * Only spin for completions if we don't have multiple devices hanging - * off our complete list. - */ - if (ctx->poll_multi_queue || force_nonspin) - poll_flags |= BLK_POLL_ONESHOT; - - wq_list_for_each(pos, start, &ctx->iopoll_list) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct io_rw *rw = io_kiocb_to_cmd(req); - int ret; - - /* - * Move completed and retryable entries to our local lists. - * If we find a request that requires polling, break out - * and complete those lists first, if we have entries there. - */ - if (READ_ONCE(req->iopoll_completed)) - break; - - ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); - if (unlikely(ret < 0)) - return ret; - else if (ret) - poll_flags |= BLK_POLL_ONESHOT; - - /* iopoll may have completed current req */ - if (!rq_list_empty(iob.req_list) || - READ_ONCE(req->iopoll_completed)) - break; - } - - if (!rq_list_empty(iob.req_list)) - iob.complete(&iob); - else if (!pos) - return 0; - - prev = start; - wq_list_for_each_resume(pos, prev) { - struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - - /* order with io_complete_rw_iopoll(), e.g. ->result updates */ - if (!smp_load_acquire(&req->iopoll_completed)) - break; - nr_events++; - if (unlikely(req->flags & REQ_F_CQE_SKIP)) - continue; - - req->cqe.flags = io_put_kbuf(req, 0); - __io_fill_cqe_req(req->ctx, req); - } - - if (unlikely(!nr_events)) - return 0; - - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); - pos = start ? 
start->next : ctx->iopoll_list.first; - wq_list_cut(&ctx->iopoll_list, prev, start); - io_free_batch_list(ctx, pos); - return nr_events; -} - /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. @@ -1589,90 +1347,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } - -static void kiocb_end_write(struct io_kiocb *req) -{ - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (req->flags & REQ_F_ISREG) { - struct super_block *sb = file_inode(req->file)->i_sb; - - __sb_writers_acquired(sb, SB_FREEZE_WRITE); - sb_end_write(sb); - } -} - -#ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - if (!req_has_async_data(req)) - return !io_req_prep_async(req); - iov_iter_restore(&io->s.iter, &io->s.iter_state); - return true; -} - -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - umode_t mode = file_inode(req->file)->i_mode; - struct io_ring_ctx *ctx = req->ctx; - - if (!S_ISBLK(mode) && !S_ISREG(mode)) - return false; - if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && - !(ctx->flags & IORING_SETUP_IOPOLL))) - return false; - /* - * If ref is dying, we might be running poll reap from the exit work. - * Don't attempt to reissue from that path, just let it fail with - * -EAGAIN. - */ - if (percpu_ref_is_dying(&ctx->refs)) - return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). 
- */ - if (!same_thread_group(req->task, current) || !in_task()) - return false; - return true; -} -#else -static bool io_resubmit_prep(struct io_kiocb *req) -{ - return false; -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ - return false; -} -#endif - -static bool __io_complete_rw_common(struct io_kiocb *req, long res) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - - if (rw->kiocb.ki_flags & IOCB_WRITE) { - kiocb_end_write(req); - fsnotify_modify(req->file); - } else { - fsnotify_access(req->file); - } - if (unlikely(res != req->cqe.res)) { - if ((res == -EAGAIN || res == -EOPNOTSUPP) && - io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return true; - } - req_set_fail(req); - req->cqe.res = res; - } - return false; -} - inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { if (*locked) { @@ -1685,46 +1359,6 @@ inline void io_req_task_complete(struct io_kiocb *req, bool *locked) } } -static void __io_complete_rw(struct io_kiocb *req, long res, - unsigned int issue_flags) -{ - if (__io_complete_rw_common(req, res)) - return; - io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); - __io_req_complete(req, issue_flags); -} - -static void io_complete_rw(struct kiocb *kiocb, long res) -{ - struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); - struct io_kiocb *req = cmd_to_io_kiocb(rw); - - if (__io_complete_rw_common(req, res)) - return; - io_req_set_res(req, res, 0); - req->io_task_work.func = io_req_task_complete; - io_req_task_prio_work_add(req); -} - -static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) -{ - struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); - struct io_kiocb *req = cmd_to_io_kiocb(rw); - - if (kiocb->ki_flags & IOCB_WRITE) - kiocb_end_write(req); - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; - return; - } - req->cqe.res = res; - } - - /* order 
with io_iopoll_complete() checking ->iopoll_completed */ - smp_store_release(&req->iopoll_completed, 1); -} - /* * After the iocb has been issued, it's safe to be found on the poll list. * Adding the kiocb to the list AFTER submission ensures that we don't @@ -1833,426 +1467,6 @@ unsigned int io_file_get_flags(struct file *file) return res; } -static inline bool io_file_supports_nowait(struct io_kiocb *req) -{ - return req->flags & REQ_F_SUPPORT_NOWAIT; -} - -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - unsigned ioprio; - int ret; - - rw->kiocb.ki_pos = READ_ONCE(sqe->off); - /* used for fixed read/write too - just read unconditionally */ - req->buf_index = READ_ONCE(sqe->buf_index); - - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) { - struct io_ring_ctx *ctx = req->ctx; - u16 index; - - if (unlikely(req->buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); - req->imu = ctx->user_bufs[index]; - io_req_set_rsrc_node(req, ctx, 0); - } - - ioprio = READ_ONCE(sqe->ioprio); - if (ioprio) { - ret = ioprio_check_cap(ioprio); - if (ret) - return ret; - - rw->kiocb.ki_ioprio = ioprio; - } else { - rw->kiocb.ki_ioprio = get_current_ioprio(); - } - - rw->addr = READ_ONCE(sqe->addr); - rw->len = READ_ONCE(sqe->len); - rw->flags = READ_ONCE(sqe->rw_flags); - return 0; -} - -static void io_readv_writev_cleanup(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); -} - -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) -{ - switch (ret) { - case -EIOCBQUEUED: - break; - case -ERESTARTSYS: - case -ERESTARTNOINTR: - case -ERESTARTNOHAND: - case -ERESTART_RESTARTBLOCK: - /* - * We can't just restart the syscall, since previously - * submitted sqes may already be in progress. Just fail this - * IO with EINTR. 
- */ - ret = -EINTR; - fallthrough; - default: - kiocb->ki_complete(kiocb, ret); - } -} - -static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - - if (rw->kiocb.ki_pos != -1) - return &rw->kiocb.ki_pos; - - if (!(req->file->f_mode & FMODE_STREAM)) { - req->flags |= REQ_F_CUR_POS; - rw->kiocb.ki_pos = req->file->f_pos; - return &rw->kiocb.ki_pos; - } - - rw->kiocb.ki_pos = 0; - return NULL; -} - -static void kiocb_done(struct io_kiocb *req, ssize_t ret, - unsigned int issue_flags) -{ - struct io_async_rw *io = req->async_data; - struct io_rw *rw = io_kiocb_to_cmd(req); - - /* add previously done IO, if any */ - if (req_has_async_data(req) && io->bytes_done > 0) { - if (ret < 0) - ret = io->bytes_done; - else - ret += io->bytes_done; - } - - if (req->flags & REQ_F_CUR_POS) - req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) - __io_complete_rw(req, ret, issue_flags); - else - io_rw_done(&rw->kiocb, ret); - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); - else - io_req_task_queue_fail(req, ret); - } -} - -static int __io_import_fixed(struct io_kiocb *req, int ddir, - struct iov_iter *iter, struct io_mapped_ubuf *imu) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - size_t len = rw->len; - u64 buf_end, buf_addr = rw->addr; - size_t offset; - - if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) - return -EFAULT; - /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) - return -EFAULT; - - /* - * May not be a start of buffer, set size appropriately - * and advance us to the beginning. 
- */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. - */ - const struct bio_vec *bvec = imu->bvec; - - if (offset <= bvec->bv_len) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; - - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); - - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; - } - } - - return 0; -} - -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - unsigned int issue_flags) -{ - if (WARN_ON_ONCE(!req->imu)) - return -EFAULT; - return __io_import_fixed(req, rw, iter, req->imu); -} - -#ifdef CONFIG_COMPAT -static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct compat_iovec __user *uiov; - compat_ssize_t clen; - void __user *buf; - size_t len; - - uiov = u64_to_user_ptr(rw->addr); - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - - len = clen; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - iov[0].iov_base = buf; - rw->len = 
iov[0].iov_len = (compat_size_t) len; - return 0; -} -#endif - -static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct iovec __user *uiov = u64_to_user_ptr(rw->addr); - void __user *buf; - ssize_t len; - - if (copy_from_user(iov, uiov, sizeof(*uiov))) - return -EFAULT; - - len = iov[0].iov_len; - if (len < 0) - return -EINVAL; - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - iov[0].iov_base = buf; - rw->len = iov[0].iov_len = len; - return 0; -} - -static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - iov[0].iov_base = u64_to_user_ptr(rw->addr); - iov[0].iov_len = rw->len; - return 0; - } - if (rw->len != 1) - return -EINVAL; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - return io_compat_import(req, iov, issue_flags); -#endif - - return __io_iov_buffer_select(req, iov, issue_flags); -} - -static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, ddir, iter, issue_flags); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - ret = import_single_range(ddir, buf, sqe_len, 
s->fast_iov, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_iov_buffer_select(req, iovec, issue_flags); - if (ret) - return ERR_PTR(ret); - iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); - return NULL; - } - - ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = __io_import_iovec(rw, req, s, issue_flags); - if (unlikely(IS_ERR(*iovec))) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - -static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) -{ - return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; -} - -/* - * For files that don't have ->read_iter() and ->write_iter(), handle them - * by looping over ->read() or ->write() manually. - */ -static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) -{ - struct kiocb *kiocb = &rw->kiocb; - struct file *file = kiocb->ki_filp; - ssize_t ret = 0; - loff_t *ppos; - - /* - * Don't support polled IO through this interface, and we can't - * support non-blocking either. For the latter, this just causes - * the kiocb to be handled from an async context. 
- */ - if (kiocb->ki_flags & IOCB_HIPRI) - return -EOPNOTSUPP; - if ((kiocb->ki_flags & IOCB_NOWAIT) && - !(kiocb->ki_filp->f_flags & O_NONBLOCK)) - return -EAGAIN; - - ppos = io_kiocb_ppos(kiocb); - - while (iov_iter_count(iter)) { - struct iovec iovec; - ssize_t nr; - - if (!iov_iter_is_bvec(iter)) { - iovec = iov_iter_iovec(iter); - } else { - iovec.iov_base = u64_to_user_ptr(rw->addr); - iovec.iov_len = rw->len; - } - - if (ddir == READ) { - nr = file->f_op->read(file, iovec.iov_base, - iovec.iov_len, ppos); - } else { - nr = file->f_op->write(file, iovec.iov_base, - iovec.iov_len, ppos); - } - - if (nr < 0) { - if (!ret) - ret = nr; - break; - } - ret += nr; - if (!iov_iter_is_bvec(iter)) { - iov_iter_advance(iter, nr); - } else { - rw->addr += nr; - rw->len -= nr; - if (!rw->len) - break; - } - if (nr != iovec.iov_len) - break; - } - - return ret; -} - -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *io = req->async_data; - - memcpy(&io->s.iter, iter, sizeof(*iter)); - io->free_iovec = iovec; - io->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - io->s.iter.iov = io->s.fast_iov; - if (iter->iov != fast_iov) { - iov_off = iter->iov - fast_iov; - io->s.iter.iov += iov_off; - } - if (io->s.fast_iov != fast_iov) - memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); @@ -2264,448 +1478,13 @@ bool io_alloc_async_data(struct io_kiocb *req) return true; } -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_op_defs[req->opcode].prep_async) - return 0; - if 
(!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - iorw->bytes_done = 0; - iorw->free_iovec = iov; - if (iov) - req->flags |= REQ_F_NEED_CLEANUP; - return 0; -} - -static int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, READ); -} - -static int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, WRITE); -} - -/* - * This is our waitqueue callback handler, registered through __folio_lock_async() - * when we initially tried to do the IO with the iocb armed our waitqueue. - * This gets called when the page is unlocked, and we generally expect that to - * happen when the page IO is completed and the page is now uptodate. This will - * queue a task_work based retry of the operation, attempting to copy the data - * again. If the latter fails because the page was NOT uptodate, then we will - * do a thread based blocking retry of the operation. That's the unexpected - * slow path. 
- */ -static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, - int sync, void *arg) -{ - struct wait_page_queue *wpq; - struct io_kiocb *req = wait->private; - struct io_rw *rw = io_kiocb_to_cmd(req); - struct wait_page_key *key = arg; - - wpq = container_of(wait, struct wait_page_queue, wait); - - if (!wake_page_match(wpq, key)) - return 0; - - rw->kiocb.ki_flags &= ~IOCB_WAITQ; - list_del_init(&wait->entry); - io_req_task_queue(req); - return 1; -} - -/* - * This controls whether a given IO request should be armed for async page - * based retry. If we return false here, the request is handed to the async - * worker threads for retry. If we're doing buffered reads on a regular file, - * we prepare a private wait_page_queue entry and retry the operation. This - * will either succeed because the page is now uptodate and unlocked, or it - * will register a callback when the page is unlocked at IO completion. Through - * that callback, io_uring uses task_work to setup a retry of the operation. - * That retry will attempt the buffered read again. The retry will generally - * succeed, or in rare cases where it fails, we then fall back to using the - * async worker threads for a blocking retry. 
- */ -static bool io_rw_should_retry(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - struct wait_page_queue *wait = &io->wpq; - struct io_rw *rw = io_kiocb_to_cmd(req); - struct kiocb *kiocb = &rw->kiocb; - - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) - return false; - - /* Only for buffered IO */ - if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) - return false; - - /* - * just use poll if we can, and don't attempt if the fs doesn't - * support callback based unlocks - */ - if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) - return false; - - wait->wait.func = io_async_buf_func; - wait->wait.private = req; - wait->wait.flags = 0; - INIT_LIST_HEAD(&wait->wait.entry); - kiocb->ki_flags |= IOCB_WAITQ; - kiocb->ki_flags &= ~IOCB_NOWAIT; - kiocb->ki_waitq = wait; - return true; -} - -static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) -{ - struct file *file = rw->kiocb.ki_filp; - - if (likely(file->f_op->read_iter)) - return call_read_iter(file, &rw->kiocb, iter); - else if (file->f_op->read) - return loop_rw_iter(READ, rw, iter); - else - return -EINVAL; -} - -static bool need_read_all(struct io_kiocb *req) -{ - return req->flags & REQ_F_ISREG || - S_ISBLK(file_inode(req->file)->i_mode); -} - -static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct kiocb *kiocb = &rw->kiocb; - struct io_ring_ctx *ctx = req->ctx; - struct file *file = req->file; - int ret; - - if (unlikely(!file || !(file->f_mode & mode))) - return -EBADF; - - if (!io_req_ffs_set(req)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; - - kiocb->ki_flags = iocb_flags(file); - ret = kiocb_set_rw_flags(kiocb, rw->flags); - if (unlikely(ret)) - return ret; - - /* - * If the file is marked O_NONBLOCK, still allow retry for it if it - * supports async. 
Otherwise it's impossible to use O_NONBLOCK files - * reliably. If not, or it IOCB_NOWAIT is set, don't retry. - */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || - ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) - req->flags |= REQ_F_NOWAIT; - - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) - return -EOPNOTSUPP; - - kiocb->private = NULL; - kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; - kiocb->ki_complete = io_complete_rw_iopoll; - req->iopoll_completed = 0; - } else { - if (kiocb->ki_flags & IOCB_HIPRI) - return -EINVAL; - kiocb->ki_complete = io_complete_rw; - } - - return 0; -} - -static int io_read(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *io; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - io = req->async_data; - s = &io->s; - - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (io_do_buffer_select(req)) { - ret = io_import_iovec(READ, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. It's cheap enough that we don't - * need to make this conditional. 
- */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - - ret = io_iter_do_read(rw, &s->iter); - - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; - /* if we can poll, just do that */ - if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) - return -EAGAIN; - /* IOPOLL retry should happen for io-wq threads */ - if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) - goto done; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (req->flags & REQ_F_NOWAIT) - goto done; - ret = 0; - } else if (ret == -EIOCBQUEUED) { - goto out_free; - } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { - /* read all, failed, already did sync or don't want to retry */ - goto done; - } - - /* - * Don't depend on the iter state matching what was consumed, or being - * untouched in case of error. Restore it and we'll advance it - * manually if we need to. - */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - if (ret2) - return ret2; - - iovec = NULL; - io = req->async_data; - s = &io->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. 
- */ - - do { - /* - * We end up here because of a partial read, either from - * above or inside this loop. Advance the iter by the bytes - * that were consumed. - */ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) - break; - io->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); - - /* if we can retry, do so with the callbacks armed */ - if (!io_rw_should_retry(req)) { - kiocb->ki_flags &= ~IOCB_WAITQ; - return -EAGAIN; - } - - /* - * Now retry read with the IOCB_WAITQ parts set in the iocb. If - * we get -EIOCBQUEUED, then we'll get a notification when the - * desired page gets unlocked. We can also get a partial read - * here, and if we do, then just retry at the new offset. - */ - ret = io_iter_do_read(rw, &s->iter); - if (ret == -EIOCBQUEUED) - return IOU_ISSUE_SKIP_COMPLETE; - /* we got some bytes, but not all. retry. */ - kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); - } while (ret > 0); -done: - kiocb_done(req, ret, issue_flags); -out_free: - /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); - return IOU_ISSUE_SKIP_COMPLETE; -} - -static int io_write(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; - struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - ssize_t ret, ret2; - loff_t *ppos; - - if (!req_has_async_data(req)) { - ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *io = req->async_data; - - s = &io->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } - ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); - return ret; - } - req->cqe.res = iov_iter_count(&s->iter); - - if (force_nonblock) { - /* If the file doesn't support async, just async punt */ - if 
(unlikely(!io_file_supports_nowait(req))) - goto copy_iov; - - /* file path doesn't support NOWAIT for non-direct_IO */ - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; - - kiocb->ki_flags |= IOCB_NOWAIT; - } else { - /* Ensure we clear previously set non-block flag */ - kiocb->ki_flags &= ~IOCB_NOWAIT; - } - - ppos = io_kiocb_update_pos(req); - - ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) - goto out_free; - - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * io_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (req->flags & REQ_F_ISREG) { - sb_start_write(file_inode(req->file)->i_sb); - __sb_writers_release(file_inode(req->file)->i_sb, - SB_FREEZE_WRITE); - } - kiocb->ki_flags |= IOCB_WRITE; - - if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); - else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &s->iter); - else - ret2 = -EINVAL; - - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - - /* - * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just - * retry them without IOCB_NOWAIT. 
- */ - if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) - ret2 = -EAGAIN; - /* no retry on NONBLOCK nor RWF_NOWAIT */ - if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) - goto done; - if (!force_nonblock || ret2 != -EAGAIN) { - /* IOPOLL retry should happen for io-wq threads */ - if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; -done: - kiocb_done(req, ret2, issue_flags); - ret = IOU_ISSUE_SKIP_COMPLETE; - } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - return ret ?: -EAGAIN; - } -out_free: - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); - return ret; -} - static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { return -EOPNOTSUPP; } -static int io_req_prep_async(struct io_kiocb *req) +int io_req_prep_async(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 71afb46070e364..22e6e52c42d261 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,11 +5,125 @@ #include #include "io_uring_types.h" +#ifndef CREATE_TRACE_POINTS +#include +#endif + enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, }; +bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags, u64 extra1, u64 extra2); + +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +static inline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; + unsigned int free, queued, len; + + if 
(ctx->flags & IORING_SETUP_CQE32) + shift = 1; + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) + return NULL; + + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + len; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; +} + +static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +{ + if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { + struct io_uring_cqe *cqe = ctx->cqe_cached; + + if (ctx->flags & IORING_SETUP_CQE32) { + unsigned int off = ctx->cqe_cached - ctx->rings->cqes; + + cqe += off; + } + + ctx->cached_cq_tail++; + ctx->cqe_cached++; + return cqe; + } + + return __io_get_cqe(ctx); +} + +static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_uring_cqe *cqe; + + if (!(ctx->flags & IORING_SETUP_CQE32)) { + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, 0, 0); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(*cqe)); + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + 0, 0); + } else { + u64 extra1 = 0, extra2 = 0; + + if (req->flags & REQ_F_CQE32_INIT) { + extra1 = req->extra1; + extra2 = req->extra2; + } + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, extra1, extra2); + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. 
+ */ + cqe = io_get_cqe(ctx); + if (likely(cqe)) { + memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); + return true; + } + + return io_cqring_event_overflow(ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + extra1, extra2); + } +} + static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; @@ -64,6 +178,17 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static inline void io_cqring_wake(struct io_ring_ctx *ctx) +{ + /* + * wake_up_all() may seem excessive, but io_wake_function() and + * io_should_wake() handle the termination of the loop and only + * wake as many waiters as we need to. + */ + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); +} + static inline bool io_sqring_full(struct io_ring_ctx *ctx) { struct io_rings *r = ctx->rings; @@ -100,6 +225,7 @@ void __io_req_complete_post(struct io_kiocb *req); bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); +void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); @@ -110,7 +236,10 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); +void io_req_task_prio_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); +void io_req_task_queue(struct io_kiocb *req); +void io_queue_iowq(struct io_kiocb *req, bool *dont_use); void io_req_task_complete(struct io_kiocb *req, bool *locked); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, bool *locked); @@ -122,6 +251,8 @@ int io_uring_alloc_task_context(struct 
task_struct *task, int io_poll_issue(struct io_kiocb *req, bool *locked); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); +void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); +int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); void io_wq_submit_work(struct io_wq_work *work); diff --git a/io_uring/rw.c b/io_uring/rw.c new file mode 100644 index 00000000000000..f0b60199ee2279 --- /dev/null +++ b/io_uring/rw.c @@ -0,0 +1,1099 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "opdef.h" +#include "kbuf.h" +#include "rsrc.h" +#include "rw.h" + +struct io_rw { + /* NOTE: kiocb has the file as the first member, so don't do it here */ + struct kiocb kiocb; + u64 addr; + u32 len; + rwf_t flags; +}; + +static inline bool io_file_supports_nowait(struct io_kiocb *req) +{ + return req->flags & REQ_F_SUPPORT_NOWAIT; +} + +int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + unsigned ioprio; + int ret; + + rw->kiocb.ki_pos = READ_ONCE(sqe->off); + /* used for fixed read/write too - just read unconditionally */ + req->buf_index = READ_ONCE(sqe->buf_index); + + if (req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED) { + struct io_ring_ctx *ctx = req->ctx; + u16 index; + + if (unlikely(req->buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); + req->imu = ctx->user_bufs[index]; + io_req_set_rsrc_node(req, ctx, 0); + } + + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + return ret; + + rw->kiocb.ki_ioprio = ioprio; + } else { + rw->kiocb.ki_ioprio 
= get_current_ioprio(); + } + + rw->addr = READ_ONCE(sqe->addr); + rw->len = READ_ONCE(sqe->len); + rw->flags = READ_ONCE(sqe->rw_flags); + return 0; +} + +void io_readv_writev_cleanup(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + kfree(io->free_iovec); +} + +static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +{ + switch (ret) { + case -EIOCBQUEUED: + break; + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * We can't just restart the syscall, since previously + * submitted sqes may already be in progress. Just fail this + * IO with EINTR. + */ + ret = -EINTR; + fallthrough; + default: + kiocb->ki_complete(kiocb, ret); + } +} + +static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (rw->kiocb.ki_pos != -1) + return &rw->kiocb.ki_pos; + + if (!(req->file->f_mode & FMODE_STREAM)) { + req->flags |= REQ_F_CUR_POS; + rw->kiocb.ki_pos = req->file->f_pos; + return &rw->kiocb.ki_pos; + } + + rw->kiocb.ki_pos = 0; + return NULL; +} + +static void io_req_task_queue_reissue(struct io_kiocb *req) +{ + req->io_task_work.func = io_queue_iowq; + io_req_task_work_add(req); +} + +#ifdef CONFIG_BLOCK +static bool io_resubmit_prep(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + if (!req_has_async_data(req)) + return !io_req_prep_async(req); + iov_iter_restore(&io->s.iter, &io->s.iter_state); + return true; +} + +static bool io_rw_should_reissue(struct io_kiocb *req) +{ + umode_t mode = file_inode(req->file)->i_mode; + struct io_ring_ctx *ctx = req->ctx; + + if (!S_ISBLK(mode) && !S_ISREG(mode)) + return false; + if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() && + !(ctx->flags & IORING_SETUP_IOPOLL))) + return false; + /* + * If ref is dying, we might be running poll reap from the exit work. + * Don't attempt to reissue from that path, just let it fail with + * -EAGAIN. 
+ */ + if (percpu_ref_is_dying(&ctx->refs)) + return false; + /* + * Play it safe and assume not safe to re-import and reissue if we're + * not in the original thread group (or in task context). + */ + if (!same_thread_group(req->task, current) || !in_task()) + return false; + return true; +} +#else +static bool io_resubmit_prep(struct io_kiocb *req) +{ + return false; +} +static bool io_rw_should_reissue(struct io_kiocb *req) +{ + return false; +} +#endif + +static void kiocb_end_write(struct io_kiocb *req) +{ + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (req->flags & REQ_F_ISREG) { + struct super_block *sb = file_inode(req->file)->i_sb; + + __sb_writers_acquired(sb, SB_FREEZE_WRITE); + sb_end_write(sb); + } +} + +static bool __io_complete_rw_common(struct io_kiocb *req, long res) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (rw->kiocb.ki_flags & IOCB_WRITE) { + kiocb_end_write(req); + fsnotify_modify(req->file); + } else { + fsnotify_access(req->file); + } + if (unlikely(res != req->cqe.res)) { + if ((res == -EAGAIN || res == -EOPNOTSUPP) && + io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; + return true; + } + req_set_fail(req); + req->cqe.res = res; + } + return false; +} + +static void __io_complete_rw(struct io_kiocb *req, long res, + unsigned int issue_flags) +{ + if (__io_complete_rw_common(req, res)) + return; + io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); + __io_req_complete(req, issue_flags); +} + +static void io_complete_rw(struct kiocb *kiocb, long res) +{ + struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); + + if (__io_complete_rw_common(req, res)) + return; + io_req_set_res(req, res, 0); + req->io_task_work.func = io_req_task_complete; + io_req_task_prio_work_add(req); +} + +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) +{ + struct io_rw *rw = container_of(kiocb, struct 
io_rw, kiocb); + struct io_kiocb *req = cmd_to_io_kiocb(rw); + + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); + if (unlikely(res != req->cqe.res)) { + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; + return; + } + req->cqe.res = res; + } + + /* order with io_iopoll_complete() checking ->iopoll_completed */ + smp_store_release(&req->iopoll_completed, 1); +} + +static void kiocb_done(struct io_kiocb *req, ssize_t ret, + unsigned int issue_flags) +{ + struct io_async_rw *io = req->async_data; + struct io_rw *rw = io_kiocb_to_cmd(req); + + /* add previously done IO, if any */ + if (req_has_async_data(req) && io->bytes_done > 0) { + if (ret < 0) + ret = io->bytes_done; + else + ret += io->bytes_done; + } + + if (req->flags & REQ_F_CUR_POS) + req->file->f_pos = rw->kiocb.ki_pos; + if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) + __io_complete_rw(req, ret, issue_flags); + else + io_rw_done(&rw->kiocb, ret); + + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + if (io_resubmit_prep(req)) + io_req_task_queue_reissue(req); + else + io_req_task_queue_fail(req, ret); + } +} + +static int __io_import_fixed(struct io_kiocb *req, int ddir, + struct iov_iter *iter, struct io_mapped_ubuf *imu) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + size_t len = rw->len; + u64 buf_end, buf_addr = rw->addr; + size_t offset; + + if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) + return -EFAULT; + /* not inside the mapped region */ + if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) + return -EFAULT; + + /* + * May not be a start of buffer, set size appropriately + * and advance us to the beginning. 
+ */ + offset = buf_addr - imu->ubuf; + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + + if (offset) { + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. We can cheat a bit here, because + * we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are PAGE_SIZE in size, except potentially the + * first and last bvec + * + * So just find our index, and adjust the iterator afterwards. + * If the offset is within the first bvec (or the whole first + * bvec, just use iov_iter_advance(). This makes it easier + * since we can just skip the first segment, which may not + * be PAGE_SIZE aligned. + */ + const struct bio_vec *bvec = imu->bvec; + + if (offset <= bvec->bv_len) { + iov_iter_advance(iter, offset); + } else { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> PAGE_SHIFT); + + iter->bvec = bvec + seg_skip; + iter->nr_segs -= seg_skip; + iter->count -= bvec->bv_len + offset; + iter->iov_offset = offset & ~PAGE_MASK; + } + } + + return 0; +} + +static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, + unsigned int issue_flags) +{ + if (WARN_ON_ONCE(!req->imu)) + return -EFAULT; + return __io_import_fixed(req, rw, iter, req->imu); +} + +#ifdef CONFIG_COMPAT +static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct compat_iovec __user *uiov; + compat_ssize_t clen; + void __user *buf; + size_t len; + + uiov = u64_to_user_ptr(rw->addr); + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + + len = clen; + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + iov[0].iov_base = buf; + rw->len = 
iov[0].iov_len = (compat_size_t) len; + return 0; +} +#endif + +static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct iovec __user *uiov = u64_to_user_ptr(rw->addr); + void __user *buf; + ssize_t len; + + if (copy_from_user(iov, uiov, sizeof(*uiov))) + return -EFAULT; + + len = iov[0].iov_len; + if (len < 0) + return -EINVAL; + buf = io_buffer_select(req, &len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + iov[0].iov_base = buf; + rw->len = iov[0].iov_len = len; + return 0; +} + +static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + iov[0].iov_base = u64_to_user_ptr(rw->addr); + iov[0].iov_len = rw->len; + return 0; + } + if (rw->len != 1) + return -EINVAL; + +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + return io_compat_import(req, iov, issue_flags); +#endif + + return __io_iov_buffer_select(req, iov, issue_flags); +} + +static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, + struct io_rw_state *s, + unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct iov_iter *iter = &s->iter; + u8 opcode = req->opcode; + struct iovec *iovec; + void __user *buf; + size_t sqe_len; + ssize_t ret; + + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { + ret = io_import_fixed(req, ddir, iter, issue_flags); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; + + if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return ERR_PTR(-ENOBUFS); + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + + ret = import_single_range(ddir, buf, sqe_len, 
s->fast_iov, iter); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + iovec = s->fast_iov; + if (req->flags & REQ_F_BUFFER_SELECT) { + ret = io_iov_buffer_select(req, iovec, issue_flags); + if (ret) + return ERR_PTR(ret); + iov_iter_init(iter, ddir, iovec, 1, iovec->iov_len); + return NULL; + } + + ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, + req->ctx->compat); + if (unlikely(ret < 0)) + return ERR_PTR(ret); + return iovec; +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct io_rw_state *s, + unsigned int issue_flags) +{ + *iovec = __io_import_iovec(rw, req, s, issue_flags); + if (unlikely(IS_ERR(*iovec))) + return PTR_ERR(*iovec); + + iov_iter_save_state(&s->iter, &s->iter_state); + return 0; +} + +static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) +{ + return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; +} + +/* + * For files that don't have ->read_iter() and ->write_iter(), handle them + * by looping over ->read() or ->write() manually. + */ +static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) +{ + struct kiocb *kiocb = &rw->kiocb; + struct file *file = kiocb->ki_filp; + ssize_t ret = 0; + loff_t *ppos; + + /* + * Don't support polled IO through this interface, and we can't + * support non-blocking either. For the latter, this just causes + * the kiocb to be handled from an async context. 
+ */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EOPNOTSUPP; + if ((kiocb->ki_flags & IOCB_NOWAIT) && + !(kiocb->ki_filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + + ppos = io_kiocb_ppos(kiocb); + + while (iov_iter_count(iter)) { + struct iovec iovec; + ssize_t nr; + + if (!iov_iter_is_bvec(iter)) { + iovec = iov_iter_iovec(iter); + } else { + iovec.iov_base = u64_to_user_ptr(rw->addr); + iovec.iov_len = rw->len; + } + + if (ddir == READ) { + nr = file->f_op->read(file, iovec.iov_base, + iovec.iov_len, ppos); + } else { + nr = file->f_op->write(file, iovec.iov_base, + iovec.iov_len, ppos); + } + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (!iov_iter_is_bvec(iter)) { + iov_iter_advance(iter, nr); + } else { + rw->addr += nr; + rw->len -= nr; + if (!rw->len) + break; + } + if (nr != iovec.iov_len) + break; + } + + return ret; +} + +static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, + const struct iovec *fast_iov, struct iov_iter *iter) +{ + struct io_async_rw *io = req->async_data; + + memcpy(&io->s.iter, iter, sizeof(*iter)); + io->free_iovec = iovec; + io->bytes_done = 0; + /* can only be fixed buffers, no need to do anything */ + if (iov_iter_is_bvec(iter)) + return; + if (!iovec) { + unsigned iov_off = 0; + + io->s.iter.iov = io->s.fast_iov; + if (iter->iov != fast_iov) { + iov_off = iter->iov - fast_iov; + io->s.iter.iov += iov_off; + } + if (io->s.fast_iov != fast_iov) + memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, + sizeof(struct iovec) * iter->nr_segs); + } else { + req->flags |= REQ_F_NEED_CLEANUP; + } +} + +static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, + struct io_rw_state *s, bool force) +{ + if (!force && !io_op_defs[req->opcode].prep_async) + return 0; + if (!req_has_async_data(req)) { + struct io_async_rw *iorw; + + if (io_alloc_async_data(req)) { + kfree(iovec); + return -ENOMEM; + } + + io_req_map_rw(req, iovec, s->fast_iov, &s->iter); + iorw = 
req->async_data; + /* we've copied and mapped the iter, ensure state is saved */ + iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); + } + return 0; +} + +static inline int io_rw_prep_async(struct io_kiocb *req, int rw) +{ + struct io_async_rw *iorw = req->async_data; + struct iovec *iov; + int ret; + + /* submission path, ->uring_lock should already be taken */ + ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); + if (unlikely(ret < 0)) + return ret; + + iorw->bytes_done = 0; + iorw->free_iovec = iov; + if (iov) + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +int io_readv_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, READ); +} + +int io_writev_prep_async(struct io_kiocb *req) +{ + return io_rw_prep_async(req, WRITE); +} + +/* + * This is our waitqueue callback handler, registered through __folio_lock_async() + * when we initially tried to do the IO with the iocb armed our waitqueue. + * This gets called when the page is unlocked, and we generally expect that to + * happen when the page IO is completed and the page is now uptodate. This will + * queue a task_work based retry of the operation, attempting to copy the data + * again. If the latter fails because the page was NOT uptodate, then we will + * do a thread based blocking retry of the operation. That's the unexpected + * slow path. + */ +static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, + int sync, void *arg) +{ + struct wait_page_queue *wpq; + struct io_kiocb *req = wait->private; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct wait_page_key *key = arg; + + wpq = container_of(wait, struct wait_page_queue, wait); + + if (!wake_page_match(wpq, key)) + return 0; + + rw->kiocb.ki_flags &= ~IOCB_WAITQ; + list_del_init(&wait->entry); + io_req_task_queue(req); + return 1; +} + +/* + * This controls whether a given IO request should be armed for async page + * based retry. 
If we return false here, the request is handed to the async + * worker threads for retry. If we're doing buffered reads on a regular file, + * we prepare a private wait_page_queue entry and retry the operation. This + * will either succeed because the page is now uptodate and unlocked, or it + * will register a callback when the page is unlocked at IO completion. Through + * that callback, io_uring uses task_work to setup a retry of the operation. + * That retry will attempt the buffered read again. The retry will generally + * succeed, or in rare cases where it fails, we then fall back to using the + * async worker threads for a blocking retry. + */ +static bool io_rw_should_retry(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + struct wait_page_queue *wait = &io->wpq; + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; + + /* never retry for NOWAIT, we just complete with -EAGAIN */ + if (req->flags & REQ_F_NOWAIT) + return false; + + /* Only for buffered IO */ + if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) + return false; + + /* + * just use poll if we can, and don't attempt if the fs doesn't + * support callback based unlocks + */ + if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC)) + return false; + + wait->wait.func = io_async_buf_func; + wait->wait.private = req; + wait->wait.flags = 0; + INIT_LIST_HEAD(&wait->wait.entry); + kiocb->ki_flags |= IOCB_WAITQ; + kiocb->ki_flags &= ~IOCB_NOWAIT; + kiocb->ki_waitq = wait; + return true; +} + +static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) +{ + struct file *file = rw->kiocb.ki_filp; + + if (likely(file->f_op->read_iter)) + return call_read_iter(file, &rw->kiocb, iter); + else if (file->f_op->read) + return loop_rw_iter(READ, rw, iter); + else + return -EINVAL; +} + +static bool need_read_all(struct io_kiocb *req) +{ + return req->flags & REQ_F_ISREG || + S_ISBLK(file_inode(req->file)->i_mode); +} + +static inline 
bool io_req_ffs_set(struct io_kiocb *req) +{ + return req->flags & REQ_F_FIXED_FILE; +} + +static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct kiocb *kiocb = &rw->kiocb; + struct io_ring_ctx *ctx = req->ctx; + struct file *file = req->file; + int ret; + + if (unlikely(!file || !(file->f_mode & mode))) + return -EBADF; + + if (!io_req_ffs_set(req)) + req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + + kiocb->ki_flags = iocb_flags(file); + ret = kiocb_set_rw_flags(kiocb, rw->flags); + if (unlikely(ret)) + return ret; + + /* + * If the file is marked O_NONBLOCK, still allow retry for it if it + * supports async. Otherwise it's impossible to use O_NONBLOCK files + * reliably. If not, or it IOCB_NOWAIT is set, don't retry. + */ + if ((kiocb->ki_flags & IOCB_NOWAIT) || + ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) + req->flags |= REQ_F_NOWAIT; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) + return -EOPNOTSUPP; + + kiocb->private = NULL; + kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; + kiocb->ki_complete = io_complete_rw_iopoll; + req->iopoll_completed = 0; + } else { + if (kiocb->ki_flags & IOCB_HIPRI) + return -EINVAL; + kiocb->ki_complete = io_complete_rw; + } + + return 0; +} + +int io_read(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; + struct kiocb *kiocb = &rw->kiocb; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_async_rw *io; + ssize_t ret, ret2; + loff_t *ppos; + + if (!req_has_async_data(req)) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } else { + io = req->async_data; + s = &io->s; + + /* + * Safe and required to re-import if we're using provided + * buffers, as we dropped the selected one before retry. 
+ */ + if (io_do_buffer_select(req)) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } + + /* + * We come here from an earlier attempt, restore our state to + * match in case it doesn't. It's cheap enough that we don't + * need to make this conditional. + */ + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; + } + ret = io_rw_init_file(req, FMODE_READ); + if (unlikely(ret)) { + kfree(iovec); + return ret; + } + req->cqe.res = iov_iter_count(&s->iter); + + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) { + ret = io_setup_async_rw(req, iovec, s, true); + return ret ?: -EAGAIN; + } + kiocb->ki_flags |= IOCB_NOWAIT; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; + } + + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); + if (unlikely(ret)) { + kfree(iovec); + return ret; + } + + ret = io_iter_do_read(rw, &s->iter); + + if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { + req->flags &= ~REQ_F_REISSUE; + /* if we can poll, just do that */ + if (req->opcode == IORING_OP_READ && file_can_poll(req->file)) + return -EAGAIN; + /* IOPOLL retry should happen for io-wq threads */ + if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) + goto done; + /* no retry on NONBLOCK nor RWF_NOWAIT */ + if (req->flags & REQ_F_NOWAIT) + goto done; + ret = 0; + } else if (ret == -EIOCBQUEUED) { + goto out_free; + } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || + (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { + /* read all, failed, already did sync or don't want to retry */ + goto done; + } + + /* + * Don't depend on the iter state matching what was consumed, or being + * untouched in case of error. Restore it and we'll advance it + * manually if we need to. 
+ */ + iov_iter_restore(&s->iter, &s->iter_state); + + ret2 = io_setup_async_rw(req, iovec, s, true); + if (ret2) + return ret2; + + iovec = NULL; + io = req->async_data; + s = &io->s; + /* + * Now use our persistent iterator and state, if we aren't already. + * We've restored and mapped the iter to match. + */ + + do { + /* + * We end up here because of a partial read, either from + * above or inside this loop. Advance the iter by the bytes + * that were consumed. + */ + iov_iter_advance(&s->iter, ret); + if (!iov_iter_count(&s->iter)) + break; + io->bytes_done += ret; + iov_iter_save_state(&s->iter, &s->iter_state); + + /* if we can retry, do so with the callbacks armed */ + if (!io_rw_should_retry(req)) { + kiocb->ki_flags &= ~IOCB_WAITQ; + return -EAGAIN; + } + + /* + * Now retry read with the IOCB_WAITQ parts set in the iocb. If + * we get -EIOCBQUEUED, then we'll get a notification when the + * desired page gets unlocked. We can also get a partial read + * here, and if we do, then just retry at the new offset. + */ + ret = io_iter_do_read(rw, &s->iter); + if (ret == -EIOCBQUEUED) + return IOU_ISSUE_SKIP_COMPLETE; + /* we got some bytes, but not all. retry. 
*/ + kiocb->ki_flags &= ~IOCB_WAITQ; + iov_iter_restore(&s->iter, &s->iter_state); + } while (ret > 0); +done: + kiocb_done(req, ret, issue_flags); +out_free: + /* it's faster to check here then delegate to kfree */ + if (iovec) + kfree(iovec); + return IOU_ISSUE_SKIP_COMPLETE; +} + +int io_write(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req); + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; + struct kiocb *kiocb = &rw->kiocb; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + ssize_t ret, ret2; + loff_t *ppos; + + if (!req_has_async_data(req)) { + ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) + return ret; + } else { + struct io_async_rw *io = req->async_data; + + s = &io->s; + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; + } + ret = io_rw_init_file(req, FMODE_WRITE); + if (unlikely(ret)) { + kfree(iovec); + return ret; + } + req->cqe.res = iov_iter_count(&s->iter); + + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) + goto copy_iov; + + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) + goto copy_iov; + + kiocb->ki_flags |= IOCB_NOWAIT; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; + } + + ppos = io_kiocb_update_pos(req); + + ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); + if (unlikely(ret)) + goto out_free; + + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. 
+ */ + if (req->flags & REQ_F_ISREG) { + sb_start_write(file_inode(req->file)->i_sb); + __sb_writers_release(file_inode(req->file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; + + if (likely(req->file->f_op->write_iter)) + ret2 = call_write_iter(req->file, kiocb, &s->iter); + else if (req->file->f_op->write) + ret2 = loop_rw_iter(WRITE, rw, &s->iter); + else + ret2 = -EINVAL; + + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + ret2 = -EAGAIN; + } + + /* + * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just + * retry them without IOCB_NOWAIT. + */ + if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT)) + ret2 = -EAGAIN; + /* no retry on NONBLOCK nor RWF_NOWAIT */ + if (ret2 == -EAGAIN && (req->flags & REQ_F_NOWAIT)) + goto done; + if (!force_nonblock || ret2 != -EAGAIN) { + /* IOPOLL retry should happen for io-wq threads */ + if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) + goto copy_iov; +done: + kiocb_done(req, ret2, issue_flags); + ret = IOU_ISSUE_SKIP_COMPLETE; + } else { +copy_iov: + iov_iter_restore(&s->iter, &s->iter_state); + ret = io_setup_async_rw(req, iovec, s, false); + return ret ?: -EAGAIN; + } +out_free: + /* it's reportedly faster than delegating the null check to kfree() */ + if (iovec) + kfree(iovec); + return ret; +} + +static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active || + ctx->has_evfd)) + __io_commit_cqring_flush(ctx); + + if (ctx->flags & IORING_SETUP_SQPOLL) + io_cqring_wake(ctx); +} + +int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) +{ + struct io_wq_work_node *pos, *start, *prev; + unsigned int poll_flags = BLK_POLL_NOSLEEP; + DEFINE_IO_COMP_BATCH(iob); + int nr_events = 0; + + /* + * Only spin for completions if we don't have multiple devices hanging + * off our complete list. 
+ */ + if (ctx->poll_multi_queue || force_nonspin) + poll_flags |= BLK_POLL_ONESHOT; + + wq_list_for_each(pos, start, &ctx->iopoll_list) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + struct io_rw *rw = io_kiocb_to_cmd(req); + int ret; + + /* + * Move completed and retryable entries to our local lists. + * If we find a request that requires polling, break out + * and complete those lists first, if we have entries there. + */ + if (READ_ONCE(req->iopoll_completed)) + break; + + ret = rw->kiocb.ki_filp->f_op->iopoll(&rw->kiocb, &iob, poll_flags); + if (unlikely(ret < 0)) + return ret; + else if (ret) + poll_flags |= BLK_POLL_ONESHOT; + + /* iopoll may have completed current req */ + if (!rq_list_empty(iob.req_list) || + READ_ONCE(req->iopoll_completed)) + break; + } + + if (!rq_list_empty(iob.req_list)) + iob.complete(&iob); + else if (!pos) + return 0; + + prev = start; + wq_list_for_each_resume(pos, prev) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + + /* order with io_complete_rw_iopoll(), e.g. ->result updates */ + if (!smp_load_acquire(&req->iopoll_completed)) + break; + nr_events++; + if (unlikely(req->flags & REQ_F_CQE_SKIP)) + continue; + + req->cqe.flags = io_put_kbuf(req, 0); + __io_fill_cqe_req(req->ctx, req); + } + + if (unlikely(!nr_events)) + return 0; + + io_commit_cqring(ctx); + io_cqring_ev_posted_iopoll(ctx); + pos = start ? 
start->next : ctx->iopoll_list.first; + wq_list_cut(&ctx->iopoll_list, prev, start); + io_free_batch_list(ctx, pos); + return nr_events; +} diff --git a/io_uring/rw.h b/io_uring/rw.h new file mode 100644 index 00000000000000..0204c3fcafa517 --- /dev/null +++ b/io_uring/rw.h @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +struct io_rw_state { + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov[UIO_FASTIOV]; +}; + +struct io_async_rw { + struct io_rw_state s; + const struct iovec *free_iovec; + size_t bytes_done; + struct wait_page_queue wpq; +}; + +int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_read(struct io_kiocb *req, unsigned int issue_flags); +int io_readv_prep_async(struct io_kiocb *req); +int io_write(struct io_kiocb *req, unsigned int issue_flags); +int io_writev_prep_async(struct io_kiocb *req); +void io_readv_writev_cleanup(struct io_kiocb *req); From 0cf185c2ccf2a9638ab77d0a3e91ca3357b5450d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Jun 2022 16:27:42 -0600 Subject: [PATCH 0981/1250] io_uring: move opcode table to opdef.c We already have the declarations in opdef.h, move the rest into its own file rather than in the main io_uring.c file. 
Signed-off-by: Jens Axboe --- io_uring/Makefile | 2 +- io_uring/io_uring.c | 469 +----------------------------------- io_uring/io_uring_types.h | 2 + io_uring/opdef.c | 495 ++++++++++++++++++++++++++++++++++++++ io_uring/opdef.h | 2 + 5 files changed, 501 insertions(+), 469 deletions(-) create mode 100644 io_uring/opdef.c diff --git a/io_uring/Makefile b/io_uring/Makefile index d70deed65a0bb2..466639c289be7f 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o + cancel.o kbuf.o rsrc.o rw.o opdef.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0af61a6c29cfee..c703190986270b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -90,22 +90,8 @@ #include "kbuf.h" #include "rsrc.h" -#include "xattr.h" -#include "nop.h" -#include "fs.h" -#include "splice.h" -#include "sync.h" -#include "advise.h" -#include "openclose.h" -#include "uring_cmd.h" -#include "epoll.h" -#include "statx.h" -#include "net.h" -#include "msg_ring.h" #include "timeout.h" #include "poll.h" -#include "cancel.h" -#include "rw.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -161,13 +147,6 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; -const char *io_uring_get_opcode(u8 opcode) -{ - if (opcode < IORING_OP_LAST) - return io_op_defs[opcode].name; - return "INVALID"; -} - struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -1478,12 +1457,6 @@ bool io_alloc_async_data(struct io_kiocb *req) return true; } -static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, - const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - int io_req_prep_async(struct io_kiocb *req) { const struct io_op_def *def = 
&io_op_defs[req->opcode]; @@ -3909,442 +3882,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, return ret; } -static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) -{ - WARN_ON_ONCE(1); - return -ECANCELED; -} - -const struct io_op_def io_op_defs[] = { - [IORING_OP_NOP] = { - .audit_skip = 1, - .iopoll = 1, - .name = "NOP", - .prep = io_nop_prep, - .issue = io_nop, - }, - [IORING_OP_READV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READV", - .prep = io_prep_rw, - .issue = io_read, - .prep_async = io_readv_prep_async, - .cleanup = io_readv_writev_cleanup, - }, - [IORING_OP_WRITEV] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITEV", - .prep = io_prep_rw, - .issue = io_write, - .prep_async = io_writev_prep_async, - .cleanup = io_readv_writev_cleanup, - }, - [IORING_OP_FSYNC] = { - .needs_file = 1, - .audit_skip = 1, - .name = "FSYNC", - .prep = io_fsync_prep, - .issue = io_fsync, - }, - [IORING_OP_READ_FIXED] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ_FIXED", - .prep = io_prep_rw, - .issue = io_read, - }, - [IORING_OP_WRITE_FIXED] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE_FIXED", - .prep = io_prep_rw, - .issue = io_write, - }, - [IORING_OP_POLL_ADD] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "POLL_ADD", - .prep = io_poll_add_prep, - .issue = io_poll_add, - 
}, - [IORING_OP_POLL_REMOVE] = { - .audit_skip = 1, - .name = "POLL_REMOVE", - .prep = io_poll_remove_prep, - .issue = io_poll_remove, - }, - [IORING_OP_SYNC_FILE_RANGE] = { - .needs_file = 1, - .audit_skip = 1, - .name = "SYNC_FILE_RANGE", - .prep = io_sfr_prep, - .issue = io_sync_file_range, - }, - [IORING_OP_SENDMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .ioprio = 1, - .name = "SENDMSG", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep = io_sendmsg_prep, - .issue = io_sendmsg, - .prep_async = io_sendmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_RECVMSG] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .ioprio = 1, - .name = "RECVMSG", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), - .prep = io_recvmsg_prep, - .issue = io_recvmsg, - .prep_async = io_recvmsg_prep_async, - .cleanup = io_sendmsg_recvmsg_cleanup, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "TIMEOUT", - .prep = io_timeout_prep, - .issue = io_timeout, - }, - [IORING_OP_TIMEOUT_REMOVE] = { - /* used by timeout updates' prep() */ - .audit_skip = 1, - .name = "TIMEOUT_REMOVE", - .prep = io_timeout_remove_prep, - .issue = io_timeout_remove, - }, - [IORING_OP_ACCEPT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .poll_exclusive = 1, - .ioprio = 1, /* used for flags */ - .name = "ACCEPT", -#if defined(CONFIG_NET) - .prep = io_accept_prep, - .issue = io_accept, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_ASYNC_CANCEL] = { - .audit_skip = 1, - .name = "ASYNC_CANCEL", - .prep = io_async_cancel_prep, - .issue = io_async_cancel, - }, - [IORING_OP_LINK_TIMEOUT] = { - .audit_skip = 1, - .async_size = sizeof(struct io_timeout_data), - .name = "LINK_TIMEOUT", - .prep = 
io_link_timeout_prep, - .issue = io_no_issue, - }, - [IORING_OP_CONNECT] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .name = "CONNECT", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), - .prep = io_connect_prep, - .issue = io_connect, - .prep_async = io_connect_prep_async, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_FALLOCATE] = { - .needs_file = 1, - .name = "FALLOCATE", - .prep = io_fallocate_prep, - .issue = io_fallocate, - }, - [IORING_OP_OPENAT] = { - .name = "OPENAT", - .prep = io_openat_prep, - .issue = io_openat, - .cleanup = io_open_cleanup, - }, - [IORING_OP_CLOSE] = { - .name = "CLOSE", - .prep = io_close_prep, - .issue = io_close, - }, - [IORING_OP_FILES_UPDATE] = { - .audit_skip = 1, - .iopoll = 1, - .name = "FILES_UPDATE", - .prep = io_files_update_prep, - .issue = io_files_update, - }, - [IORING_OP_STATX] = { - .audit_skip = 1, - .name = "STATX", - .prep = io_statx_prep, - .issue = io_statx, - .cleanup = io_statx_cleanup, - }, - [IORING_OP_READ] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "READ", - .prep = io_prep_rw, - .issue = io_read, - }, - [IORING_OP_WRITE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .plug = 1, - .audit_skip = 1, - .ioprio = 1, - .iopoll = 1, - .async_size = sizeof(struct io_async_rw), - .name = "WRITE", - .prep = io_prep_rw, - .issue = io_write, - }, - [IORING_OP_FADVISE] = { - .needs_file = 1, - .audit_skip = 1, - .name = "FADVISE", - .prep = io_fadvise_prep, - .issue = io_fadvise, - }, - [IORING_OP_MADVISE] = { - .name = "MADVISE", - .prep = io_madvise_prep, - .issue = io_madvise, - }, - [IORING_OP_SEND] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollout = 1, - .audit_skip = 1, - .ioprio = 1, - .name = "SEND", -#if defined(CONFIG_NET) - .prep = 
io_sendmsg_prep, - .issue = io_send, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_RECV] = { - .needs_file = 1, - .unbound_nonreg_file = 1, - .pollin = 1, - .buffer_select = 1, - .audit_skip = 1, - .ioprio = 1, - .name = "RECV", -#if defined(CONFIG_NET) - .prep = io_recvmsg_prep, - .issue = io_recv, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_OPENAT2] = { - .name = "OPENAT2", - .prep = io_openat2_prep, - .issue = io_openat2, - .cleanup = io_open_cleanup, - }, - [IORING_OP_EPOLL_CTL] = { - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "EPOLL", -#if defined(CONFIG_EPOLL) - .prep = io_epoll_ctl_prep, - .issue = io_epoll_ctl, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_SPLICE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "SPLICE", - .prep = io_splice_prep, - .issue = io_splice, - }, - [IORING_OP_PROVIDE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - .name = "PROVIDE_BUFFERS", - .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, - }, - [IORING_OP_REMOVE_BUFFERS] = { - .audit_skip = 1, - .iopoll = 1, - .name = "REMOVE_BUFFERS", - .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, - }, - [IORING_OP_TEE] = { - .needs_file = 1, - .hash_reg_file = 1, - .unbound_nonreg_file = 1, - .audit_skip = 1, - .name = "TEE", - .prep = io_tee_prep, - .issue = io_tee, - }, - [IORING_OP_SHUTDOWN] = { - .needs_file = 1, - .name = "SHUTDOWN", -#if defined(CONFIG_NET) - .prep = io_shutdown_prep, - .issue = io_shutdown, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_RENAMEAT] = { - .name = "RENAMEAT", - .prep = io_renameat_prep, - .issue = io_renameat, - .cleanup = io_renameat_cleanup, - }, - [IORING_OP_UNLINKAT] = { - .name = "UNLINKAT", - .prep = io_unlinkat_prep, - .issue = io_unlinkat, - .cleanup = io_unlinkat_cleanup, - }, - [IORING_OP_MKDIRAT] = { - .name = "MKDIRAT", - .prep = io_mkdirat_prep, - .issue = io_mkdirat, - .cleanup 
= io_mkdirat_cleanup, - }, - [IORING_OP_SYMLINKAT] = { - .name = "SYMLINKAT", - .prep = io_symlinkat_prep, - .issue = io_symlinkat, - .cleanup = io_link_cleanup, - }, - [IORING_OP_LINKAT] = { - .name = "LINKAT", - .prep = io_linkat_prep, - .issue = io_linkat, - .cleanup = io_link_cleanup, - }, - [IORING_OP_MSG_RING] = { - .needs_file = 1, - .iopoll = 1, - .name = "MSG_RING", - .prep = io_msg_ring_prep, - .issue = io_msg_ring, - }, - [IORING_OP_FSETXATTR] = { - .needs_file = 1, - .name = "FSETXATTR", - .prep = io_fsetxattr_prep, - .issue = io_fsetxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_SETXATTR] = { - .name = "SETXATTR", - .prep = io_setxattr_prep, - .issue = io_setxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_FGETXATTR] = { - .needs_file = 1, - .name = "FGETXATTR", - .prep = io_fgetxattr_prep, - .issue = io_fgetxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_GETXATTR] = { - .name = "GETXATTR", - .prep = io_getxattr_prep, - .issue = io_getxattr, - .cleanup = io_xattr_cleanup, - }, - [IORING_OP_SOCKET] = { - .audit_skip = 1, - .name = "SOCKET", -#if defined(CONFIG_NET) - .prep = io_socket_prep, - .issue = io_socket, -#else - .prep = io_eopnotsupp_prep, -#endif - }, - [IORING_OP_URING_CMD] = { - .needs_file = 1, - .plug = 1, - .name = "URING_CMD", - .async_size = uring_cmd_pdu_size(1), - .prep = io_uring_cmd_prep, - .issue = io_uring_cmd, - .prep_async = io_uring_cmd_prep_async, - }, -}; - static int __init io_uring_init(void) { - int i; - #define __BUILD_BUG_VERIFY_ELEMENT(stype, eoffset, etype, ename) do { \ BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \ BUILD_BUG_ON(sizeof(etype) != sizeof_field(stype, ename)); \ @@ -4400,17 +3939,11 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); - BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); BUILD_BUG_ON(sizeof(atomic_t) != 
sizeof(u32)); - for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { - BUG_ON(!io_op_defs[i].prep); - if (io_op_defs[i].prep != io_eopnotsupp_prep) - BUG_ON(!io_op_defs[i].issue); - WARN_ON_ONCE(!io_op_defs[i].name); - } + io_uring_optable_init(); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 147e1e597530a9..fb87af5fd8e751 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -3,6 +3,8 @@ #include #include +#include +#include #include "io-wq.h" #include "filetable.h" diff --git a/io_uring/opdef.c b/io_uring/opdef.c new file mode 100644 index 00000000000000..d687d33f9c0c03 --- /dev/null +++ b/io_uring/opdef.c @@ -0,0 +1,495 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * io_uring opcode handling table + */ +#include +#include +#include +#include +#include + +#include "io_uring_types.h" +#include "io_uring.h" +#include "opdef.h" +#include "refs.h" +#include "tctx.h" +#include "sqpoll.h" +#include "fdinfo.h" +#include "kbuf.h" +#include "rsrc.h" + +#include "xattr.h" +#include "nop.h" +#include "fs.h" +#include "splice.h" +#include "sync.h" +#include "advise.h" +#include "openclose.h" +#include "uring_cmd.h" +#include "epoll.h" +#include "statx.h" +#include "net.h" +#include "msg_ring.h" +#include "timeout.h" +#include "poll.h" +#include "cancel.h" +#include "rw.h" + +static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) +{ + WARN_ON_ONCE(1); + return -ECANCELED; +} + +static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, + const struct io_uring_sqe *sqe) +{ + return -EOPNOTSUPP; +} + +const struct io_op_def io_op_defs[] = { + [IORING_OP_NOP] = { + .audit_skip = 1, + .iopoll = 1, + .name = "NOP", + .prep = io_nop_prep, + .issue = io_nop, + }, + [IORING_OP_READV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + 
.async_size = sizeof(struct io_async_rw), + .name = "READV", + .prep = io_prep_rw, + .issue = io_read, + .prep_async = io_readv_prep_async, + .cleanup = io_readv_writev_cleanup, + }, + [IORING_OP_WRITEV] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "WRITEV", + .prep = io_prep_rw, + .issue = io_write, + .prep_async = io_writev_prep_async, + .cleanup = io_readv_writev_cleanup, + }, + [IORING_OP_FSYNC] = { + .needs_file = 1, + .audit_skip = 1, + .name = "FSYNC", + .prep = io_fsync_prep, + .issue = io_fsync, + }, + [IORING_OP_READ_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "READ_FIXED", + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "WRITE_FIXED", + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_POLL_ADD] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "POLL_ADD", + .prep = io_poll_add_prep, + .issue = io_poll_add, + }, + [IORING_OP_POLL_REMOVE] = { + .audit_skip = 1, + .name = "POLL_REMOVE", + .prep = io_poll_remove_prep, + .issue = io_poll_remove, + }, + [IORING_OP_SYNC_FILE_RANGE] = { + .needs_file = 1, + .audit_skip = 1, + .name = "SYNC_FILE_RANGE", + .prep = io_sfr_prep, + .issue = io_sync_file_range, + }, + [IORING_OP_SENDMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .ioprio = 1, + .name = "SENDMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep = io_sendmsg_prep, + .issue = io_sendmsg, + .prep_async = io_sendmsg_prep_async, + 
.cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RECVMSG] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .ioprio = 1, + .name = "RECVMSG", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep = io_recvmsg_prep, + .issue = io_recvmsg, + .prep_async = io_recvmsg_prep_async, + .cleanup = io_sendmsg_recvmsg_cleanup, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .name = "TIMEOUT", + .prep = io_timeout_prep, + .issue = io_timeout, + }, + [IORING_OP_TIMEOUT_REMOVE] = { + /* used by timeout updates' prep() */ + .audit_skip = 1, + .name = "TIMEOUT_REMOVE", + .prep = io_timeout_remove_prep, + .issue = io_timeout_remove, + }, + [IORING_OP_ACCEPT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .poll_exclusive = 1, + .ioprio = 1, /* used for flags */ + .name = "ACCEPT", +#if defined(CONFIG_NET) + .prep = io_accept_prep, + .issue = io_accept, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_ASYNC_CANCEL] = { + .audit_skip = 1, + .name = "ASYNC_CANCEL", + .prep = io_async_cancel_prep, + .issue = io_async_cancel, + }, + [IORING_OP_LINK_TIMEOUT] = { + .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), + .name = "LINK_TIMEOUT", + .prep = io_link_timeout_prep, + .issue = io_no_issue, + }, + [IORING_OP_CONNECT] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .name = "CONNECT", +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_connect), + .prep = io_connect_prep, + .issue = io_connect, + .prep_async = io_connect_prep_async, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_FALLOCATE] = { + .needs_file = 1, + .name = "FALLOCATE", + .prep = io_fallocate_prep, + .issue = io_fallocate, + }, + [IORING_OP_OPENAT] = { + .name = "OPENAT", + .prep = io_openat_prep, + .issue = 
io_openat, + .cleanup = io_open_cleanup, + }, + [IORING_OP_CLOSE] = { + .name = "CLOSE", + .prep = io_close_prep, + .issue = io_close, + }, + [IORING_OP_FILES_UPDATE] = { + .audit_skip = 1, + .iopoll = 1, + .name = "FILES_UPDATE", + .prep = io_files_update_prep, + .issue = io_files_update, + }, + [IORING_OP_STATX] = { + .audit_skip = 1, + .name = "STATX", + .prep = io_statx_prep, + .issue = io_statx, + .cleanup = io_statx_cleanup, + }, + [IORING_OP_READ] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "READ", + .prep = io_prep_rw, + .issue = io_read, + }, + [IORING_OP_WRITE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .async_size = sizeof(struct io_async_rw), + .name = "WRITE", + .prep = io_prep_rw, + .issue = io_write, + }, + [IORING_OP_FADVISE] = { + .needs_file = 1, + .audit_skip = 1, + .name = "FADVISE", + .prep = io_fadvise_prep, + .issue = io_fadvise, + }, + [IORING_OP_MADVISE] = { + .name = "MADVISE", + .prep = io_madvise_prep, + .issue = io_madvise, + }, + [IORING_OP_SEND] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, + .name = "SEND", +#if defined(CONFIG_NET) + .prep = io_sendmsg_prep, + .issue = io_send, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RECV] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .buffer_select = 1, + .audit_skip = 1, + .ioprio = 1, + .name = "RECV", +#if defined(CONFIG_NET) + .prep = io_recvmsg_prep, + .issue = io_recv, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_OPENAT2] = { + .name = "OPENAT2", + .prep = io_openat2_prep, + .issue = io_openat2, + .cleanup = io_open_cleanup, + }, + [IORING_OP_EPOLL_CTL] = { + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = 
"EPOLL", +#if defined(CONFIG_EPOLL) + .prep = io_epoll_ctl_prep, + .issue = io_epoll_ctl, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_SPLICE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "SPLICE", + .prep = io_splice_prep, + .issue = io_splice, + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .name = "PROVIDE_BUFFERS", + .prep = io_provide_buffers_prep, + .issue = io_provide_buffers, + }, + [IORING_OP_REMOVE_BUFFERS] = { + .audit_skip = 1, + .iopoll = 1, + .name = "REMOVE_BUFFERS", + .prep = io_remove_buffers_prep, + .issue = io_remove_buffers, + }, + [IORING_OP_TEE] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .audit_skip = 1, + .name = "TEE", + .prep = io_tee_prep, + .issue = io_tee, + }, + [IORING_OP_SHUTDOWN] = { + .needs_file = 1, + .name = "SHUTDOWN", +#if defined(CONFIG_NET) + .prep = io_shutdown_prep, + .issue = io_shutdown, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_RENAMEAT] = { + .name = "RENAMEAT", + .prep = io_renameat_prep, + .issue = io_renameat, + .cleanup = io_renameat_cleanup, + }, + [IORING_OP_UNLINKAT] = { + .name = "UNLINKAT", + .prep = io_unlinkat_prep, + .issue = io_unlinkat, + .cleanup = io_unlinkat_cleanup, + }, + [IORING_OP_MKDIRAT] = { + .name = "MKDIRAT", + .prep = io_mkdirat_prep, + .issue = io_mkdirat, + .cleanup = io_mkdirat_cleanup, + }, + [IORING_OP_SYMLINKAT] = { + .name = "SYMLINKAT", + .prep = io_symlinkat_prep, + .issue = io_symlinkat, + .cleanup = io_link_cleanup, + }, + [IORING_OP_LINKAT] = { + .name = "LINKAT", + .prep = io_linkat_prep, + .issue = io_linkat, + .cleanup = io_link_cleanup, + }, + [IORING_OP_MSG_RING] = { + .needs_file = 1, + .iopoll = 1, + .name = "MSG_RING", + .prep = io_msg_ring_prep, + .issue = io_msg_ring, + }, + [IORING_OP_FSETXATTR] = { + .needs_file = 1, + .name = "FSETXATTR", + .prep = io_fsetxattr_prep, + .issue = io_fsetxattr, + .cleanup = 
io_xattr_cleanup, + }, + [IORING_OP_SETXATTR] = { + .name = "SETXATTR", + .prep = io_setxattr_prep, + .issue = io_setxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_FGETXATTR] = { + .needs_file = 1, + .name = "FGETXATTR", + .prep = io_fgetxattr_prep, + .issue = io_fgetxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_GETXATTR] = { + .name = "GETXATTR", + .prep = io_getxattr_prep, + .issue = io_getxattr, + .cleanup = io_xattr_cleanup, + }, + [IORING_OP_SOCKET] = { + .audit_skip = 1, + .name = "SOCKET", +#if defined(CONFIG_NET) + .prep = io_socket_prep, + .issue = io_socket, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_URING_CMD] = { + .needs_file = 1, + .plug = 1, + .name = "URING_CMD", + .async_size = uring_cmd_pdu_size(1), + .prep = io_uring_cmd_prep, + .issue = io_uring_cmd, + .prep_async = io_uring_cmd_prep_async, + }, +}; + +const char *io_uring_get_opcode(u8 opcode) +{ + if (opcode < IORING_OP_LAST) + return io_op_defs[opcode].name; + return "INVALID"; +} + +void __init io_uring_optable_init(void) +{ + int i; + + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); + + for (i = 0; i < ARRAY_SIZE(io_op_defs); i++) { + BUG_ON(!io_op_defs[i].prep); + if (io_op_defs[i].prep != io_eopnotsupp_prep) + BUG_ON(!io_op_defs[i].issue); + WARN_ON_ONCE(!io_op_defs[i].name); + } +} diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 4578adcdba8a41..ece8ed4f96c434 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -37,4 +37,6 @@ struct io_op_def { }; extern const struct io_op_def io_op_defs[]; + +void io_uring_optable_init(void); #endif From d0eac5e5d2956569fb32a7c72d94f538df4c8672 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 27 May 2022 10:55:07 -0600 Subject: [PATCH 0982/1250] io_uring: add support for level triggered poll By default, the POLL_ADD command does edge triggered poll - if we get a non-zero mask on the initial poll attempt, we complete the request successfully. 
Support level triggered by always waiting for a notification, regardless of whether or not the initial mask matches the file state. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 3 +++ io_uring/poll.c | 15 ++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0ad3da28d2fce8..4927bb69387a29 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -229,10 +229,13 @@ enum io_uring_op { * * IORING_POLL_UPDATE Update existing poll request, matching * sqe->addr as the old user_data field. + * + * IORING_POLL_LEVEL Level triggered poll. */ #define IORING_POLL_ADD_MULTI (1U << 0) #define IORING_POLL_UPDATE_EVENTS (1U << 1) #define IORING_POLL_UPDATE_USER_DATA (1U << 2) +#define IORING_POLL_ADD_LEVEL (1U << 3) /* * ASYNC_CANCEL flags. diff --git a/io_uring/poll.c b/io_uring/poll.c index b80f7fa261232d..558dc170468ad8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -423,11 +423,13 @@ static int __io_arm_poll_handler(struct io_kiocb *req, atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; - if (mask && (poll->events & EPOLLONESHOT)) { + if (mask && + ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { io_poll_remove_entries(req); /* no one else has access to the req, forget about the ref */ return mask; } + if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); if (!ipt->error) @@ -439,7 +441,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, io_poll_req_insert(req); spin_unlock(&ctx->completion_lock); - if (mask) { + if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ if (unlikely(ipt->error || !ipt->nr_entries)) { poll->events |= EPOLLONESHOT; @@ -475,7 +477,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; 
struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR; + __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; if (!def->pollin && !def->pollout) @@ -638,7 +640,10 @@ static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, #endif if (!(flags & IORING_POLL_ADD_MULTI)) events |= EPOLLONESHOT; - return demangle_poll(events) | (events & (EPOLLEXCLUSIVE|EPOLLONESHOT)); + if (!(flags & IORING_POLL_ADD_LEVEL)) + events |= EPOLLET; + return demangle_poll(events) | + (events & (EPOLLEXCLUSIVE|EPOLLONESHOT|EPOLLET)); } int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -679,7 +684,7 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sqe->buf_index || sqe->off || sqe->addr) return -EINVAL; flags = READ_ONCE(sqe->len); - if (flags & ~IORING_POLL_ADD_MULTI) + if (flags & ~(IORING_POLL_ADD_MULTI|IORING_POLL_ADD_LEVEL)) return -EINVAL; if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) return -EINVAL; From 85573c4bd94217e3583ed28d2f1ca4dbc7c3be0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jun 2022 12:36:42 -0600 Subject: [PATCH 0983/1250] io_uring: deprecate epoll_ctl support As far as we know, nobody ever adopted the epoll_ctl management via io_uring. Deprecate it now with a warning, and plan on removing it in a later kernel version. 
When we do remove it, we can revert the following commits as well: 39220e8d4a2a ("eventpoll: support non-blocking do_epoll_ctl() calls") 58e41a44c488 ("eventpoll: abstract out epoll_ctl() handler") Suggested-by: Linus Torvalds Link: https://lore.kernel.org/io-uring/CAHk-=wiTyisXBgKnVHAGYCNvkmjk=50agS2Uk6nr+n3ssLZg2w@mail.gmail.com/ Signed-off-by: Jens Axboe --- io_uring/epoll.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/io_uring/epoll.c b/io_uring/epoll.c index acbb32498127ad..10853e8ed07887 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -26,6 +26,10 @@ int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_epoll *epoll = io_kiocb_to_cmd(req); + pr_warn_once("%s: epoll_ctl support in io_uring is deprecated and will " + "be removed in a future Linux kernel version.\n", + current->comm); + if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; From 7961705bf3d57437e11713d00fc0f6ab3ae5cfbf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:47 +0100 Subject: [PATCH 0984/1250] io_uring: make reg buf init consistent The default (i.e. empty) state of register buffer is dummy_ubuf, so set it to dummy on init instead of NULL. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c5456aecf03d9627fbd6e65e100e2b5293a6151e.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 8c40b20659d40a..214ff0dfa6a48e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -567,7 +567,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, io_buffer_unmap(ctx, &imu); break; } - ctx->user_bufs[i] = NULL; + ctx->user_bufs[i] = ctx->dummy_ubuf; needs_switch = true; } @@ -1203,14 +1203,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size_t size; int ret, nr_pages, i; - if (!iov->iov_base) { - *pimu = ctx->dummy_ubuf; + *pimu = ctx->dummy_ubuf; + if (!iov->iov_base) return 0; - } - *pimu = NULL; ret = -ENOMEM; - pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, &nr_pages); if (IS_ERR(pages)) { From 5925ae6bf9287f1b7e41b314cc78533c3eca003f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:48 +0100 Subject: [PATCH 0985/1250] io_uring: move defer_list to slow data draining is slow path, move defer_list to the end where slow data lives inside the context. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e16379391ca72b490afdd24e8944baab849b4a7b.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring_types.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index fb87af5fd8e751..ee58273569702d 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -162,7 +162,6 @@ struct io_ring_ctx { struct io_uring_sqe *sq_sqes; unsigned cached_sq_head; unsigned sq_entries; - struct list_head defer_list; /* * Fixed resources fast path, should be accessed only under @@ -274,8 +273,12 @@ struct io_ring_ctx { struct work_struct exit_work; struct list_head tctx_list; struct completion ref_comp; + + /* io-wq management, e.g. thread count */ u32 iowq_limits[2]; bool iowq_limits_set; + + struct list_head defer_list; }; }; From 0c991fc904d3b179665970e53114520f690291ec Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:49 +0100 Subject: [PATCH 0986/1250] io_uring: better caching for ctx timeout fields Following timeout fields access patterns, move all of them into a separate cache line inside ctx, so they don't interfere with normal completion caching, especially since timeout removals and completion are separated and the latter is done via tw.
It also sheds some bytes from io_ring_ctx, 1216B -> 1152B Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4b163793072840de53b3cb66e0c2995e7226ff78.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring_types.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index ee58273569702d..4fc42055dbdd1c 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -181,8 +181,6 @@ struct io_ring_ctx { struct xarray io_bl_xa; struct list_head io_buffers_cache; - struct list_head timeout_list; - struct list_head ltimeout_list; struct list_head cq_overflow_list; struct list_head apoll_cache; struct xarray personalities; @@ -215,15 +213,11 @@ struct io_ring_ctx { struct io_ev_fd __rcu *io_ev_fd; struct wait_queue_head cq_wait; unsigned cq_extra; - atomic_t cq_timeouts; - unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; struct { spinlock_t completion_lock; - spinlock_t timeout_lock; - /* * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -255,6 +249,15 @@ struct io_ring_ctx { struct list_head io_buffers_pages; }; + /* timeouts */ + struct { + spinlock_t timeout_lock; + atomic_t cq_timeouts; + struct list_head timeout_list; + struct list_head ltimeout_list; + unsigned cq_last_tm_flush; + } ____cacheline_aligned_in_smp; + /* Keep this last, we don't need it for the fast path */ struct { #if defined(CONFIG_UNIX) From 451137f3d089037f58922f6f940d41d850d22567 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:50 +0100 Subject: [PATCH 0987/1250] io_uring: refactor ctx slow data placement Shove all slow path data at the end of ctx and get rid of extra indention. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bcaf200298dd469af20787650550efc66d89bef2.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring_types.h | 81 +++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 4fc42055dbdd1c..ef1cf86e893264 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -185,7 +185,6 @@ struct io_ring_ctx { struct list_head apoll_cache; struct xarray personalities; u32 pers_next; - unsigned sq_thread_idle; } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ @@ -232,23 +231,6 @@ struct io_ring_ctx { struct list_head io_buffers_comp; } ____cacheline_aligned_in_smp; - struct io_restriction restrictions; - - /* slow path rsrc auxilary data, used by update/register */ - struct { - struct io_rsrc_node *rsrc_backup_node; - struct io_mapped_ubuf *dummy_ubuf; - struct io_rsrc_data *file_data; - struct io_rsrc_data *buf_data; - - struct delayed_work rsrc_put_work; - struct llist_head rsrc_put_llist; - struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; - - struct list_head io_buffers_pages; - }; - /* timeouts */ struct { spinlock_t timeout_lock; @@ -259,30 +241,45 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* Keep this last, we don't need it for the fast path */ - struct { - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif - /* hashed buffered write serialization */ - struct io_wq_hash *hash_map; - - /* Only used for accounting purposes */ - struct user_struct *user; - struct mm_struct *mm_account; - - /* ctx exit and cancelation */ - struct llist_head fallback_llist; - struct delayed_work fallback_work; - struct work_struct exit_work; - struct list_head tctx_list; - struct completion ref_comp; - - /* io-wq management, e.g. 
thread count */ - u32 iowq_limits[2]; - bool iowq_limits_set; - - struct list_head defer_list; - }; + + struct io_restriction restrictions; + + /* slow path rsrc auxilary data, used by update/register */ + struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; + struct io_rsrc_data *file_data; + struct io_rsrc_data *buf_data; + + struct delayed_work rsrc_put_work; + struct llist_head rsrc_put_llist; + struct list_head rsrc_ref_list; + spinlock_t rsrc_ref_lock; + + struct list_head io_buffers_pages; + + #if defined(CONFIG_UNIX) + struct socket *ring_sock; + #endif + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + + /* Only used for accounting purposes */ + struct user_struct *user; + struct mm_struct *mm_account; + + /* ctx exit and cancelation */ + struct llist_head fallback_llist; + struct delayed_work fallback_work; + struct work_struct exit_work; + struct list_head tctx_list; + struct completion ref_comp; + + /* io-wq management, e.g. thread count */ + u32 iowq_limits[2]; + bool iowq_limits_set; + + struct list_head defer_list; + unsigned sq_thread_idle; }; enum { From da664786422dea7caded76e0561c4e45c44af7b0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:51 +0100 Subject: [PATCH 0988/1250] io_uring: move small helpers to headers There is a bunch of inline helpers that will be useful not only to the core of io_uring, move them to headers. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/22df99c83723e44cba7e945e8519e64e3642c064.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 ----------------- io_uring/io_uring.h | 22 ++++++++++++++++++++++ 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c703190986270b..fd3ad784865287 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -160,14 +160,6 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); -static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) -{ - if (!*locked) { - mutex_lock(&ctx->uring_lock); - *locked = true; - } -} - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs)) @@ -423,15 +415,6 @@ static void io_prep_async_link(struct io_kiocb *req) } } -static inline void io_req_add_compl_list(struct io_kiocb *req) -{ - struct io_submit_state *state = &req->ctx->submit_state; - - if (!(req->flags & REQ_F_CQE_SKIP)) - state->flush_cqes = true; - wq_list_add_tail(&req->comp_list, &state->compl_reqs); -} - void io_queue_iowq(struct io_kiocb *req, bool *dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 22e6e52c42d261..6744ce111e3812 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -217,6 +217,28 @@ static inline bool io_run_task_work(void) return false; } +static inline void io_req_complete_state(struct io_kiocb *req) +{ + req->flags |= REQ_F_COMPLETE_INLINE; +} + +static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +{ + if (!*locked) { + mutex_lock(&ctx->uring_lock); + *locked = true; + } +} + +static inline void io_req_add_compl_list(struct io_kiocb *req) +{ + struct io_submit_state *state = &req->ctx->submit_state; + + if (!(req->flags & REQ_F_CQE_SKIP)) + state->flush_cqes = true; + 
wq_list_add_tail(&req->comp_list, &state->compl_reqs); +} + int io_run_task_work_sig(void); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); From 41b3a2aeabf2ba39ba89f92cf7c487edcf53906e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:52 +0100 Subject: [PATCH 0989/1250] io_uring: explain io_wq_work::cancel_seq placement Add a comment on why we keep ->cancel_seq in struct io_wq_work instead of struct io_kiocb despite it being needed only by io_uring and not by io-wq. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/988e87eec9dc700b5dae933df3aefef303502f6c.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io-wq.h | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index ba6eee76d028f6..3f54ee2a8eebd3 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) struct io_wq_work { struct io_wq_work_node list; unsigned flags; + /* place it here instead of io_kiocb as it fills padding and saves 4B */ int cancel_seq; }; From cd784f262e3a5495b89a4e265d0feec39789dc9f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:53 +0100 Subject: [PATCH 0990/1250] io_uring: inline ->registered_rings There can be only 16 registered rings, no need to allocate an array for them separately but store it in tctx.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/495f0b953c87994dd9e13de2134019054fa5830d.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/tctx.c | 10 ---------- io_uring/tctx.h | 3 ++- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 3f7e9feb6ca2bc..5a5d4f908529a2 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -53,7 +53,6 @@ void __io_uring_free(struct task_struct *tsk) WARN_ON_ONCE(tctx->io_wq); WARN_ON_ONCE(tctx->cached_refs); - kfree(tctx->registered_rings); percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; @@ -69,16 +68,8 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, if (unlikely(!tctx)) return -ENOMEM; - tctx->registered_rings = kcalloc(IO_RINGFD_REG_MAX, - sizeof(struct file *), GFP_KERNEL); - if (unlikely(!tctx->registered_rings)) { - kfree(tctx); - return -ENOMEM; - } - ret = percpu_counter_init(&tctx->inflight, 0, GFP_KERNEL); if (unlikely(ret)) { - kfree(tctx->registered_rings); kfree(tctx); return ret; } @@ -87,7 +78,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, if (IS_ERR(tctx->io_wq)) { ret = PTR_ERR(tctx->io_wq); percpu_counter_destroy(&tctx->inflight); - kfree(tctx->registered_rings); kfree(tctx); return ret; } diff --git a/io_uring/tctx.h b/io_uring/tctx.h index f4964e40d07e0d..7684713e950f4f 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -20,8 +20,9 @@ struct io_uring_task { struct io_wq_work_list task_list; struct io_wq_work_list prio_task_list; struct callback_head task_work; - struct file **registered_rings; bool task_running; + + struct file *registered_rings[IO_RINGFD_REG_MAX]; }; struct io_tctx_node { From 0b96e6d1a621ab746038108b7edd26ee7fb49605 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:54 +0100 Subject: [PATCH 0991/1250] io_uring: never defer-complete multi-apoll Luckily, nobody completes multi-apoll requests outside the polling
functions, but don't set IO_URING_F_COMPLETE_DEFER in any case as there is nobody who is catching REQ_F_COMPLETE_INLINE, and so will leak requests if used. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a65ed3f5effd9321ee06e6edea294a03be3e15a0.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fd3ad784865287..8a8d8b323519a0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1599,7 +1599,7 @@ int io_poll_issue(struct io_kiocb *req, bool *locked) io_tw_lock(req->ctx, locked); if (unlikely(req->task->flags & PF_EXITING)) return -EFAULT; - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + return io_issue_sqe(req, IO_URING_F_NONBLOCK); } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) From dbc3d4d3b143ae2f186bff3d8273277254998529 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:55 +0100 Subject: [PATCH 0992/1250] io_uring: remove check_cq checking from hot paths All ctx->check_cq events are slow path, don't test every single flag one by one in the hot path, but add a common guarding if. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dff026585cea7ff3a172a7c83894a3b0111bbf6a.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8a8d8b323519a0..a4c1746d0691d2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1259,24 +1259,25 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) int ret = 0; unsigned long check_cq; + check_cq = READ_ONCE(ctx->check_cq); + if (unlikely(check_cq)) { + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + __io_cqring_overflow_flush(ctx, false); + /* + * Similarly do not spin if we have not informed the user of any + * dropped CQE. + */ + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) + return -EBADR; + } /* * Don't enter poll loop if we already have events pending. * If we do, we can potentially be spinning for commands that * already triggered a CQE (eg in error). */ - check_cq = READ_ONCE(ctx->check_cq); - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - __io_cqring_overflow_flush(ctx, false); if (io_cqring_events(ctx)) return 0; - /* - * Similarly do not spin if we have not informed the user of any - * dropped CQE. 
- */ - if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) - return -EBADR; - do { /* * If a submit got punted to a workqueue, we can have the @@ -2203,12 +2204,15 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, ret = io_run_task_work_sig(); if (ret || io_should_wake(iowq)) return ret; + check_cq = READ_ONCE(ctx->check_cq); - /* let the caller flush overflows, retry */ - if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - return 1; - if (unlikely(check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT))) - return -EBADR; + if (unlikely(check_cq)) { + /* let the caller flush overflows, retry */ + if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) + return 1; + if (check_cq & BIT(IO_CHECK_CQ_DROPPED_BIT)) + return -EBADR; + } if (!schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS)) return -ETIME; return 1; From f14b69621ecf22accf4ed102a6a5803ab4ce6dc9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 15 Jun 2022 17:33:56 +0100 Subject: [PATCH 0993/1250] io_uring: don't set REQ_F_COMPLETE_INLINE in tw io_req_task_complete() enqueues requests for state completion itself, no need for REQ_F_COMPLETE_INLINE, which only serves the purpose of not bloating the kernel.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/aca80f71464ad02c06f1311d998a2d6ee0b31573.1655310733.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a4c1746d0691d2..4adfc4ebf8c199 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1314,7 +1314,6 @@ inline void io_req_task_complete(struct io_kiocb *req, bool *locked) { if (*locked) { req->cqe.flags |= io_put_kbuf(req, 0); - req->flags |= REQ_F_COMPLETE_INLINE; io_req_add_compl_list(req); } else { req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED); From 2fbf9edf1125e5b34118beb5948f85d9e607144e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Jun 2022 16:28:17 -0600 Subject: [PATCH 0994/1250] io_uring: remove unused IO_REQ_CACHE_SIZE defined Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4adfc4ebf8c199..72640aa55abcde 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -115,7 +115,6 @@ #define IO_TCTX_REFS_CACHE_NR (1U << 10) #define IO_COMPL_BATCH 32 -#define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 enum { From 384a319025402df922269a6d643d59a73e89ed09 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:21:57 +0100 Subject: [PATCH 0995/1250] io_uring: rw: delegate sync completions to core io_uring io_issue_sqe() from the io_uring core knows how to complete requests based on the returned error code, we can delegate io_read()/io_write() completion to it. Make kiocb_done() to return the right completion code and propagate it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/32ef005b45d23bf6b5e6837740dc0331bb051bd4.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/rw.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index f0b60199ee2279..e5ca23d0783e2c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -207,15 +207,6 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) return false; } -static void __io_complete_rw(struct io_kiocb *req, long res, - unsigned int issue_flags) -{ - if (__io_complete_rw_common(req, res)) - return; - io_req_set_res(req, req->cqe.res, io_put_kbuf(req, issue_flags)); - __io_req_complete(req, issue_flags); -} - static void io_complete_rw(struct kiocb *kiocb, long res) { struct io_rw *rw = container_of(kiocb, struct io_rw, kiocb); @@ -247,7 +238,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) smp_store_release(&req->iopoll_completed, 1); } -static void kiocb_done(struct io_kiocb *req, ssize_t ret, +static int kiocb_done(struct io_kiocb *req, ssize_t ret, unsigned int issue_flags) { struct io_async_rw *io = req->async_data; @@ -263,10 +254,15 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) - __io_complete_rw(req, ret, issue_flags); - else + if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { + if (!__io_complete_rw_common(req, ret)) { + io_req_set_res(req, req->cqe.res, + io_put_kbuf(req, issue_flags)); + return IOU_OK; + } + } else { io_rw_done(&rw->kiocb, ret); + } if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; @@ -275,6 +271,7 @@ static void kiocb_done(struct io_kiocb *req, ssize_t ret, else io_req_task_queue_fail(req, ret); } + return IOU_ISSUE_SKIP_COMPLETE; } static int __io_import_fixed(struct io_kiocb 
*req, int ddir, @@ -847,7 +844,9 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) goto done; ret = 0; } else if (ret == -EIOCBQUEUED) { - goto out_free; + if (iovec) + kfree(iovec); + return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ @@ -905,12 +904,10 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) iov_iter_restore(&s->iter, &s->iter_state); } while (ret > 0); done: - kiocb_done(req, ret, issue_flags); -out_free: /* it's faster to check here then delegate to kfree */ if (iovec) kfree(iovec); - return IOU_ISSUE_SKIP_COMPLETE; + return kiocb_done(req, ret, issue_flags); } int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -960,8 +957,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) - goto out_free; + if (unlikely(ret)) { + kfree(iovec); + return ret; + } /* * Open-code file_start_write here to grab freeze protection, @@ -1003,15 +1002,13 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; done: - kiocb_done(req, ret2, issue_flags); - ret = IOU_ISSUE_SKIP_COMPLETE; + ret = kiocb_done(req, ret2, issue_flags); } else { copy_iov: iov_iter_restore(&s->iter, &s->iter_state); ret = io_setup_async_rw(req, iovec, s, false); return ret ?: -EAGAIN; } -out_free: /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) kfree(iovec); From e6eda5e30552c639daa0a0517d16959c2e71ab99 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:21:58 +0100 Subject: [PATCH 0996/1250] io_uring: kill REQ_F_COMPLETE_INLINE REQ_F_COMPLETE_INLINE is only needed to delay queueing into the completion list to io_queue_sqe() as 
__io_req_complete() is inlined and we don't want to bloat the kernel. As now we complete in a more centralised fashion in io_issue_sqe() we can get rid of the flag and queue to the list directly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/600ba20a9338b8a39b249b23d3d177803613dde4.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 +++++++----------- io_uring/io_uring.h | 5 ----- io_uring/io_uring_types.h | 3 --- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 72640aa55abcde..541c109a9273bf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -742,10 +742,7 @@ void io_req_complete_post(struct io_kiocb *req) inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) { - if (issue_flags & IO_URING_F_COMPLETE_DEFER) - req->flags |= REQ_F_COMPLETE_INLINE; - else - io_req_complete_post(req); + io_req_complete_post(req); } void io_req_complete_failed(struct io_kiocb *req, s32 res) @@ -1581,9 +1578,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (creds) revert_creds(creds); - if (ret == IOU_OK) - __io_req_complete(req, issue_flags); - else if (ret != IOU_ISSUE_SKIP_COMPLETE) + if (ret == IOU_OK) { + if (issue_flags & IO_URING_F_COMPLETE_DEFER) + io_req_add_compl_list(req); + else + io_req_complete_post(req); + } else if (ret != IOU_ISSUE_SKIP_COMPLETE) return ret; /* If the op doesn't have a file, we're not polling for it */ @@ -1749,10 +1749,6 @@ static inline void io_queue_sqe(struct io_kiocb *req) ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); - if (req->flags & REQ_F_COMPLETE_INLINE) { - io_req_add_compl_list(req); - return; - } /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6744ce111e3812..3f06fbae0ee9e5 100644 --- 
a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -217,11 +217,6 @@ static inline bool io_run_task_work(void) return false; } -static inline void io_req_complete_state(struct io_kiocb *req) -{ - req->flags |= REQ_F_COMPLETE_INLINE; -} - static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) { if (!*locked) { diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index ef1cf86e893264..4576ea8cad2e6c 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -301,7 +301,6 @@ enum { REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_BUFFER_RING_BIT, - REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, @@ -356,8 +355,6 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* buffer selected from ring, needs commit */ REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT), - /* completion is deferred through io_comp_state */ - REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), /* supports async reads/writes */ From acb01604de22c13a44b037d7a0499902afcf4a06 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:21:59 +0100 Subject: [PATCH 0997/1250] io_uring: refactor io_req_task_complete() Clean up io_req_task_complete() and deduplicate io_put_kbuf() calls. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ae3148ac7eb5cce3e06895cde306e9e959d6f6ae.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 541c109a9273bf..957a5bc1b528c9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1306,15 +1306,19 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } -inline void io_req_task_complete(struct io_kiocb *req, bool *locked) + +void io_req_task_complete(struct io_kiocb *req, bool *locked) { - if (*locked) { - req->cqe.flags |= io_put_kbuf(req, 0); + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { + unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + + req->cqe.flags |= io_put_kbuf(req, issue_flags); + } + + if (*locked) io_req_add_compl_list(req); - } else { - req->cqe.flags |= io_put_kbuf(req, IO_URING_F_UNLOCKED); + else io_req_complete_post(req); - } } /* From c721c61142214e14a879c57ee78f792c7d99f381 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:00 +0100 Subject: [PATCH 0998/1250] io_uring: don't inline io_put_kbuf io_put_kbuf() is huge, don't bloat the kernel with inlining. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2e21ccf0be471ffa654032914b9430813cae53f8.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 33 +++++++++++++++++++++++++++++++++ io_uring/kbuf.h | 38 ++++++-------------------------------- 2 files changed, 39 insertions(+), 32 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index bc58890d932b2f..b9c7f6e87cc9a3 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -83,6 +83,39 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } +unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) +{ + unsigned int cflags; + + /* + * We can add this buffer back to two lists: + * + * 1) The io_buffers_cache list. This one is protected by the + * ctx->uring_lock. If we already hold this lock, add back to this + * list as we can grab it from issue as well. + * 2) The io_buffers_comp list. This one is protected by the + * ctx->completion_lock. + * + * We migrate buffers from the comp_list to the issue cache list + * when we need one. 
+ */ + if (req->flags & REQ_F_BUFFER_RING) { + /* no buffers to recycle for this case */ + cflags = __io_put_kbuf_list(req, NULL); + } else if (issue_flags & IO_URING_F_UNLOCKED) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock(&ctx->completion_lock); + cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp); + spin_unlock(&ctx->completion_lock); + } else { + lockdep_assert_held(&req->ctx->uring_lock); + + cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache); + } + return cflags; +} + static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl) { diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 9da3a933ef40e1..304e7139d83562 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -47,6 +47,8 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); + static inline bool io_do_buffer_select(struct io_kiocb *req) { if (!(req->flags & REQ_F_BUFFER_SELECT)) @@ -79,7 +81,8 @@ static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) __io_kbuf_recycle(req, issue_flags); } -static unsigned int __io_put_kbuf(struct io_kiocb *req, struct list_head *list) +static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, + struct list_head *list) { if (req->flags & REQ_F_BUFFER_RING) { if (req->buf_list) @@ -99,44 +102,15 @@ static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; - return __io_put_kbuf(req, &req->ctx->io_buffers_comp); + return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); } static inline unsigned int io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { - unsigned int cflags; if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) return 0; - - /* 
- * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (req->flags & REQ_F_BUFFER_RING) { - /* no buffers to recycle for this case */ - cflags = __io_put_kbuf(req, NULL); - } else if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - cflags = __io_put_kbuf(req, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - cflags = __io_put_kbuf(req, &req->ctx->io_buffers_cache); - } - - return cflags; + return __io_put_kbuf(req, issue_flags); } #endif From 448841bce406180687512e0a9c8f5185e27ad553 Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 16 Jun 2022 10:22:01 +0100 Subject: [PATCH 0999/1250] io_uring: poll: remove unnecessary req->ref set We now don't need to set req->refcount for poll requests since the reworked poll code ensures no request release race. 
Signed-off-by: Hao Xu Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ec6fee45705890bdb968b0c175519242753c0215.1655371007.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 558dc170468ad8..fdb6b1101ffcb7 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -689,7 +689,6 @@ int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if ((flags & IORING_POLL_ADD_MULTI) && (req->flags & REQ_F_CQE_SKIP)) return -EINVAL; - io_req_set_refcount(req); poll->events = io_poll_parse_events(sqe, flags); return 0; } From 70c18eb9532f48d466c9a73fd189f5e2383ab47c Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 16 Jun 2022 10:22:02 +0100 Subject: [PATCH 1000/1250] io_uring: switch cancel_hash to use per entry spinlock Add a new io_hash_bucket structure so that each bucket in cancel_hash has separate spinlock. Use per entry lock for cancel_hash, this removes some completion lock invocation and removes contention between different cancel_hash entries.
Signed-off-by: Hao Xu Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/05d1e135b0c8bce9d1441e6346776589e5783e26.1655371007.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/cancel.c | 14 ++++++- io_uring/cancel.h | 6 +++ io_uring/fdinfo.c | 9 +++-- io_uring/io_uring.c | 9 +++-- io_uring/io_uring_types.h | 2 +- io_uring/poll.c | 80 ++++++++++++++++++++++++--------------- 6 files changed, 80 insertions(+), 40 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 83cceb52d82d64..6f2888388a40ed 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -93,14 +93,14 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) if (!ret) return 0; - spin_lock(&ctx->completion_lock); ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) goto out; + spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); -out: spin_unlock(&ctx->completion_lock); +out: return ret; } @@ -192,3 +192,13 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +void init_hash_table(struct io_hash_bucket *hash_table, unsigned size) +{ + unsigned int i; + + for (i = 0; i < size; i++) { + spin_lock_init(&hash_table[i].lock); + INIT_HLIST_HEAD(&hash_table[i].list); + } +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 4f35d86963253f..556a7dcf160e47 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -4,3 +4,9 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); +void init_hash_table(struct io_hash_bucket *hash_table, unsigned size); + +struct io_hash_bucket { + spinlock_t lock; + struct hlist_head list; +} ____cacheline_aligned_in_smp; diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index fcedde4b4b1e1f..f941c73f550259 100644 --- a/io_uring/fdinfo.c 
+++ b/io_uring/fdinfo.c @@ -13,6 +13,7 @@ #include "io_uring.h" #include "sqpoll.h" #include "fdinfo.h" +#include "cancel.h" #ifdef CONFIG_PROC_FS static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, @@ -157,17 +158,19 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, mutex_unlock(&ctx->uring_lock); seq_puts(m, "PollList:\n"); - spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list = &ctx->cancel_hash[i]; + struct io_hash_bucket *hb = &ctx->cancel_hash[i]; struct io_kiocb *req; - hlist_for_each_entry(req, list, hash_node) + spin_lock(&hb->lock); + hlist_for_each_entry(req, &hb->list, hash_node) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, task_work_pending(req->task)); + spin_unlock(&hb->lock); } seq_puts(m, "CqOverflowList:\n"); + spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { struct io_uring_cqe *cqe = &ocqe->cqe; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 957a5bc1b528c9..ac6946e3f174b5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -89,6 +89,7 @@ #include "fdinfo.h" #include "kbuf.h" #include "rsrc.h" +#include "cancel.h" #include "timeout.h" #include "poll.h" @@ -260,11 +261,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) if (hash_bits <= 0) hash_bits = 1; ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), - GFP_KERNEL); + ctx->cancel_hash = + kmalloc((1U << hash_bits) * sizeof(struct io_hash_bucket), + GFP_KERNEL); if (!ctx->cancel_hash) goto err; - __hash_init(ctx->cancel_hash, 1U << hash_bits); + + init_hash_table(ctx->cancel_hash, 1U << hash_bits); ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 4576ea8cad2e6c..1f8db2dd7af752 100644 --- a/io_uring/io_uring_types.h +++ 
b/io_uring/io_uring_types.h @@ -224,7 +224,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct io_wq_work_list iopoll_list; - struct hlist_head *cancel_hash; + struct io_hash_bucket *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; diff --git a/io_uring/poll.c b/io_uring/poll.c index fdb6b1101ffcb7..1511a26b412d17 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -19,6 +19,7 @@ #include "opdef.h" #include "kbuf.h" #include "poll.h" +#include "cancel.h" struct io_poll_update { struct file *file; @@ -73,10 +74,22 @@ static struct io_poll *io_poll_get_single(struct io_kiocb *req) static void io_poll_req_insert(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct hlist_head *list; + u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); + struct io_hash_bucket *hb = &ctx->cancel_hash[index]; - list = &ctx->cancel_hash[hash_long(req->cqe.user_data, ctx->cancel_hash_bits)]; - hlist_add_head(&req->hash_node, list); + spin_lock(&hb->lock); + hlist_add_head(&req->hash_node, &hb->list); + spin_unlock(&hb->lock); +} + +static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); + spinlock_t *lock = &ctx->cancel_hash[index].lock; + + spin_lock(lock); + hash_del(&req->hash_node); + spin_unlock(lock); } static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, @@ -220,8 +233,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_poll_remove_entries(req); + io_poll_req_delete(req, ctx); spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); req->cqe.flags = 0; __io_req_complete_post(req); io_commit_cqring(ctx); @@ -231,7 +244,6 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) static void io_apoll_task_func(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; int ret; ret = io_poll_check_events(req, locked); @@ -239,9 
+251,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) return; io_poll_remove_entries(req); - spin_lock(&ctx->completion_lock); - hash_del(&req->hash_node); - spin_unlock(&ctx->completion_lock); + io_poll_req_delete(req, req->ctx); if (!ret) io_req_task_submit(req, locked); @@ -437,9 +447,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 0; } - spin_lock(&ctx->completion_lock); io_poll_req_insert(req); - spin_unlock(&ctx->completion_lock); if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ @@ -540,32 +548,31 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool found = false; int i; - spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; + struct io_hash_bucket *hb = &ctx->cancel_hash[i]; - list = &ctx->cancel_hash[i]; - hlist_for_each_entry_safe(req, tmp, list, hash_node) { + spin_lock(&hb->lock); + hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { if (io_match_task_safe(req, tsk, cancel_all)) { hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } } + spin_unlock(&hb->lock); } - spin_unlock(&ctx->completion_lock); return found; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) { - struct hlist_head *list; struct io_kiocb *req; + u32 index = hash_long(cd->data, ctx->cancel_hash_bits); + struct io_hash_bucket *hb = &ctx->cancel_hash[index]; - list = &ctx->cancel_hash[hash_long(cd->data, ctx->cancel_hash_bits)]; - hlist_for_each_entry(req, list, hash_node) { + spin_lock(&hb->lock); + hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) continue; if (poll_only && req->opcode != IORING_OP_POLL_ADD) @@ -577,21 +584,21 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, } return req; } + 
spin_unlock(&hb->lock); return NULL; } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) { struct io_kiocb *req; int i; for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct hlist_head *list; + struct io_hash_bucket *hb = &ctx->cancel_hash[i]; - list = &ctx->cancel_hash[i]; - hlist_for_each_entry(req, list, hash_node) { + spin_lock(&hb->lock); + hlist_for_each_entry(req, &hb->list, hash_node) { if (!(cd->flags & IORING_ASYNC_CANCEL_ANY) && req->file != cd->file) continue; @@ -600,12 +607,12 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, req->work.cancel_seq = cd->seq; return req; } + spin_unlock(&hb->lock); } return NULL; } static bool io_poll_disarm(struct io_kiocb *req) - __must_hold(&ctx->completion_lock) { if (!io_poll_get_ownership(req)) return false; @@ -615,17 +622,23 @@ static bool io_poll_disarm(struct io_kiocb *req) } int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) - __must_hold(&ctx->completion_lock) { struct io_kiocb *req; + u32 index; + spinlock_t *lock; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) req = io_poll_file_find(ctx, cd); else req = io_poll_find(ctx, false, cd); - if (!req) + if (!req) { return -ENOENT; + } else { + index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); + lock = &ctx->cancel_hash[index].lock; + } io_poll_cancel_req(req); + spin_unlock(lock); return 0; } @@ -719,18 +732,23 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_poll_update *poll_update = io_kiocb_to_cmd(req); struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; + u32 index = hash_long(cd.data, ctx->cancel_hash_bits); + spinlock_t *lock = &ctx->cancel_hash[index].lock; struct io_kiocb *preq; int ret2, ret = 0; bool locked; - spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, true, &cd); - if (!preq || 
!io_poll_disarm(preq)) { - spin_unlock(&ctx->completion_lock); - ret = preq ? -EALREADY : -ENOENT; + if (!preq) { + ret = -ENOENT; + goto out; + } + ret2 = io_poll_disarm(preq); + spin_unlock(lock); + if (!ret2) { + ret = -EALREADY; goto out; } - spin_unlock(&ctx->completion_lock); if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ From d777ab41d8c0b8c3d12ff28c7b6fc83c709cc3de Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:03 +0100 Subject: [PATCH 1001/1250] io_uring: pass poll_find lock back Instead of using implicit knowledge of what is locked or not after io_poll_find() and co returns, pass back a pointer to the locked bucket if any. If set the user must to unlock the spinlock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/dae1dc5749aa34367812ecf62f82fd3f053aae44.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/poll.c | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 1511a26b412d17..5b2c6ce26a5547 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -565,12 +565,15 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd) + struct io_cancel_data *cd, + struct io_hash_bucket **out_bucket) { struct io_kiocb *req; u32 index = hash_long(cd->data, ctx->cancel_hash_bits); struct io_hash_bucket *hb = &ctx->cancel_hash[index]; + *out_bucket = NULL; + spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) @@ -582,6 +585,7 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, continue; req->work.cancel_seq = cd->seq; } + *out_bucket = hb; return req; } spin_unlock(&hb->lock); @@ -589,11 +593,14 @@ 
static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd) + struct io_cancel_data *cd, + struct io_hash_bucket **out_bucket) { struct io_kiocb *req; int i; + *out_bucket = NULL; + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_hash[i]; @@ -605,6 +612,7 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, if (cd->seq == req->work.cancel_seq) continue; req->work.cancel_seq = cd->seq; + *out_bucket = hb; return req; } spin_unlock(&hb->lock); @@ -623,23 +631,19 @@ static bool io_poll_disarm(struct io_kiocb *req) int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { + struct io_hash_bucket *bucket; struct io_kiocb *req; - u32 index; - spinlock_t *lock; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd); + req = io_poll_file_find(ctx, cd, &bucket); else - req = io_poll_find(ctx, false, cd); - if (!req) { - return -ENOENT; - } else { - index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); - lock = &ctx->cancel_hash[index].lock; - } - io_poll_cancel_req(req); - spin_unlock(lock); - return 0; + req = io_poll_find(ctx, false, cd, &bucket); + + if (req) + io_poll_cancel_req(req); + if (bucket) + spin_unlock(&bucket->lock); + return req ? 
0 : -ENOENT; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -732,19 +736,21 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_poll_update *poll_update = io_kiocb_to_cmd(req); struct io_cancel_data cd = { .data = poll_update->old_user_data, }; struct io_ring_ctx *ctx = req->ctx; - u32 index = hash_long(cd.data, ctx->cancel_hash_bits); - spinlock_t *lock = &ctx->cancel_hash[index].lock; + struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; bool locked; - preq = io_poll_find(ctx, true, &cd); + preq = io_poll_find(ctx, true, &cd, &bucket); + if (preq) + ret2 = io_poll_disarm(preq); + if (bucket) + spin_unlock(&bucket->lock); + if (!preq) { ret = -ENOENT; goto out; } - ret2 = io_poll_disarm(preq); - spin_unlock(lock); if (!ret2) { ret = -EALREADY; goto out; From 9b0dae591a5bd35a85da81ef0d92b86e44aa3894 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:04 +0100 Subject: [PATCH 1002/1250] io_uring: clean up io_try_cancel Get rid of an unnecessary extra goto in io_try_cancel() and simplify the function. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/48cf5417b43a8386c6c364dba1ad9b4c7382d158.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/cancel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 6f2888388a40ed..a253e2ad22ebd0 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -95,12 +95,12 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) ret = io_poll_cancel(ctx, cd); if (ret != -ENOENT) - goto out; + return ret; + spin_lock(&ctx->completion_lock); if (!(cd->flags & IORING_ASYNC_CANCEL_FD)) ret = io_timeout_cancel(ctx, cd); spin_unlock(&ctx->completion_lock); -out: return ret; } From 3c510d815b09f881bb0cfab530b00071f2aac152 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:05 +0100 Subject: [PATCH 1003/1250] io_uring: limit the number of cancellation buckets Don't allocate too many hash/cancellation buckets: clamp the table size to 8 bits, i.e. 256 * 64B = 16KB. We don't usually have too many requests, and 256 buckets should be enough, especially since we do hash search only in the cancellation path. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b9620c8072ba61a2d50eba894b89bd93a94a9abd.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ac6946e3f174b5..aafdf1330ec67d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -254,12 +254,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) /* * Use 5 bits less than the max cq entries, that should give us around - * 32 entries per hash list if totally full and uniformly spread. 
+ * 32 entries per hash list if totally full and uniformly spread, but + * don't keep too many buckets to not overconsume memory. */ - hash_bits = ilog2(p->cq_entries); - hash_bits -= 5; - if (hash_bits <= 0) - hash_bits = 1; + hash_bits = ilog2(p->cq_entries) - 5; + hash_bits = clamp(hash_bits, 1, 8); + ctx->cancel_hash_bits = hash_bits; ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct io_hash_bucket), From 9b9a0c5ada24b10d1aa98b882408c0b62b193c44 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:06 +0100 Subject: [PATCH 1004/1250] io_uring: clean up io_ring_ctx_alloc Add a variable for the number of hash buckets in io_ring_ctx_alloc(), makes it more readable. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/993926ed0d614ba9a76b2a85bebae2babcb13983.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index aafdf1330ec67d..85a479594b05aa 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -244,6 +244,8 @@ static __cold void io_fallback_req_func(struct work_struct *work) static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; + unsigned hash_buckets; + size_t hash_size; int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -259,15 +261,15 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) */ hash_bits = ilog2(p->cq_entries) - 5; hash_bits = clamp(hash_bits, 1, 8); + hash_buckets = 1U << hash_bits; + hash_size = hash_buckets * sizeof(struct io_hash_bucket); ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = - kmalloc((1U << hash_bits) * sizeof(struct io_hash_bucket), - GFP_KERNEL); + ctx->cancel_hash = kmalloc(hash_size, GFP_KERNEL); if (!ctx->cancel_hash) goto err; - init_hash_table(ctx->cancel_hash, 1U << hash_bits); + 
init_hash_table(ctx->cancel_hash, hash_buckets); ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) From 389a427d10fc3ea0014e45ff06e6d84f8add625a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:07 +0100 Subject: [PATCH 1005/1250] io_uring: use state completion infra for poll reqs Use io_req_task_complete() for poll request completions, so it can utilise state completions and save lots of unnecessary locking. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ced94cb5a728d8e386c640d052fd3da3f5d6891a.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/poll.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 5b2c6ce26a5547..7f2ebb56f7dbf9 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -234,12 +234,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) io_poll_remove_entries(req); io_poll_req_delete(req, ctx); - spin_lock(&ctx->completion_lock); - req->cqe.flags = 0; - __io_req_complete_post(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_req_set_res(req, req->cqe.res, 0); + io_req_task_complete(req, locked); } static void io_apoll_task_func(struct io_kiocb *req, bool *locked) From 31d0a0e9c9c99376dd12ccc4cfe2d4ffd17ced95 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:08 +0100 Subject: [PATCH 1006/1250] io_uring: add IORING_SETUP_SINGLE_ISSUER Add a new IORING_SETUP_SINGLE_ISSUER flag and the userspace visible part of it, i.e. put limitations of submitters. Also, don't allow it together with IOPOLL as we're not going to put it to good use. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4bcc41ee467fdf04c8aab8baf6ce3ba21858c3d4.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 ++++- io_uring/io_uring.c | 7 +++++-- io_uring/io_uring_types.h | 1 + io_uring/tctx.c | 27 ++++++++++++++++++++++++--- io_uring/tctx.h | 4 ++-- 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4927bb69387a29..d7ae81b10893e9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -140,9 +140,12 @@ enum { * IORING_SQ_TASKRUN in the sq ring flags. Not valid with COOP_TASKRUN. */ #define IORING_SETUP_TASKRUN_FLAG (1U << 9) - #define IORING_SETUP_SQE128 (1U << 10) /* SQEs are 128 byte */ #define IORING_SETUP_CQE32 (1U << 11) /* CQEs are 32 byte */ +/* + * Only one task is allowed to submit requests + */ +#define IORING_SETUP_SINGLE_ISSUER (1U << 12) enum io_uring_op { IORING_OP_NOP, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 85a479594b05aa..06772139b7dad8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2457,6 +2457,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_destroy_buffers(ctx); if (ctx->sq_creds) put_cred(ctx->sq_creds); + if (ctx->submitter_task) + put_task_struct(ctx->submitter_task); /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) @@ -3189,7 +3191,7 @@ static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) if (fd < 0) return fd; - ret = io_uring_add_tctx_node(ctx); + ret = __io_uring_add_tctx_node(ctx, false); if (ret) { put_unused_fd(fd); return ret; @@ -3409,7 +3411,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ | IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_SQE128 | 
IORING_SETUP_CQE32)) + IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | + IORING_SETUP_SINGLE_ISSUER)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 1f8db2dd7af752..8b00243abf65fa 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -243,6 +243,7 @@ struct io_ring_ctx { /* Keep this last, we don't need it for the fast path */ struct io_restriction restrictions; + struct task_struct *submitter_task; /* slow path rsrc auxilary data, used by update/register */ struct io_rsrc_node *rsrc_backup_node; diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 5a5d4f908529a2..a819da8fc85cd2 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -94,12 +94,32 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, return 0; } -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) +static int io_register_submitter(struct io_ring_ctx *ctx) +{ + int ret = 0; + + mutex_lock(&ctx->uring_lock); + if (!ctx->submitter_task) + ctx->submitter_task = get_task_struct(current); + else if (ctx->submitter_task != current) + ret = -EEXIST; + mutex_unlock(&ctx->uring_lock); + + return ret; +} + +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; int ret; + if ((ctx->flags & IORING_SETUP_SINGLE_ISSUER) && submitter) { + ret = io_register_submitter(ctx); + if (ret) + return ret; + } + if (unlikely(!tctx)) { ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) @@ -133,7 +153,8 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) list_add(&node->ctx_node, &ctx->tctx_list); mutex_unlock(&ctx->uring_lock); } - tctx->last = ctx; + if (submitter) + tctx->last = ctx; return 0; } @@ -241,7 +262,7 @@ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, return -EINVAL; mutex_unlock(&ctx->uring_lock); - ret = io_uring_add_tctx_node(ctx); + ret = 
__io_uring_add_tctx_node(ctx, false); mutex_lock(&ctx->uring_lock); if (ret) return ret; diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 7684713e950f4f..dde82ce4d8e29b 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -34,7 +34,7 @@ struct io_tctx_node { int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); void io_uring_del_tctx_node(unsigned long index); -int __io_uring_add_tctx_node(struct io_ring_ctx *ctx); +int __io_uring_add_tctx_node(struct io_ring_ctx *ctx, bool submitter); void io_uring_clean_tctx(struct io_uring_task *tctx); void io_uring_unreg_ringfd(void); @@ -52,5 +52,5 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) if (likely(tctx && tctx->last == ctx)) return 0; - return __io_uring_add_tctx_node(ctx); + return __io_uring_add_tctx_node(ctx, true); } From 05b696820f1ba13446e36cc17b8af85b500a6c3c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:09 +0100 Subject: [PATCH 1007/1250] io_uring: pass hash table into poll_find In preparation for having multiple cancellation hash tables, pass a table pointer into io_poll_find() and other poll cancel functions. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a31c88502463dce09254240fa037352927d7ecc3.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/poll.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 7f2ebb56f7dbf9..96199c999fe60c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -562,11 +562,12 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, + struct io_hash_bucket hash_table[], struct io_hash_bucket **out_bucket) { struct io_kiocb *req; u32 index = hash_long(cd->data, ctx->cancel_hash_bits); - struct io_hash_bucket *hb = &ctx->cancel_hash[index]; + struct io_hash_bucket *hb = &hash_table[index]; *out_bucket = NULL; @@ -590,6 +591,7 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + struct io_hash_bucket hash_table[], struct io_hash_bucket **out_bucket) { struct io_kiocb *req; @@ -598,7 +600,7 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, *out_bucket = NULL; for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &ctx->cancel_hash[i]; + struct io_hash_bucket *hb = &hash_table[i]; spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { @@ -625,15 +627,16 @@ static bool io_poll_disarm(struct io_kiocb *req) return true; } -int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) +static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + struct io_hash_bucket hash_table[]) { struct io_hash_bucket *bucket; struct io_kiocb *req; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd, &bucket); + req = io_poll_file_find(ctx, 
cd, ctx->cancel_hash, &bucket); else - req = io_poll_find(ctx, false, cd, &bucket); + req = io_poll_find(ctx, false, cd, ctx->cancel_hash, &bucket); if (req) io_poll_cancel_req(req); @@ -642,6 +645,11 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return req ? 0 : -ENOENT; } +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) +{ + return __io_poll_cancel(ctx, cd, ctx->cancel_hash); +} + static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, unsigned int flags) { @@ -737,7 +745,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) int ret2, ret = 0; bool locked; - preq = io_poll_find(ctx, true, &cd, &bucket); + preq = io_poll_find(ctx, true, &cd, ctx->cancel_hash, &bucket); if (preq) ret2 = io_poll_disarm(preq); if (bucket) From 394462e7adc84e0fe8744e84976e251668f52090 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:10 +0100 Subject: [PATCH 1008/1250] io_uring: introduce a struct for hash table Instead of passing around a pointer to hash buckets, add a bit of type safety and wrap it into a structure. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d65bc3faba537ec2aca9eabf334394936d44bd28.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/cancel.c | 6 +++--- io_uring/cancel.h | 7 +------ io_uring/fdinfo.c | 4 ++-- io_uring/io_uring.c | 29 ++++++++++++++++------------ io_uring/io_uring_types.h | 13 +++++++++++-- io_uring/poll.c | 40 +++++++++++++++++++++------------------ 6 files changed, 56 insertions(+), 43 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index a253e2ad22ebd0..f28f0a7d127240 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -193,12 +193,12 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -void init_hash_table(struct io_hash_bucket *hash_table, unsigned size) +void init_hash_table(struct io_hash_table *table, unsigned size) { unsigned int i; for (i = 0; i < size; i++) { - spin_lock_init(&hash_table[i].lock); - INIT_HLIST_HEAD(&hash_table[i].list); + spin_lock_init(&table->hbs[i].lock); + INIT_HLIST_HEAD(&table->hbs[i].list); } } diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 556a7dcf160e47..fd4cb1a2595de5 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -4,9 +4,4 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); -void init_hash_table(struct io_hash_bucket *hash_table, unsigned size); - -struct io_hash_bucket { - spinlock_t lock; - struct hlist_head list; -} ____cacheline_aligned_in_smp; +void init_hash_table(struct io_hash_table *table, unsigned size); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index f941c73f550259..344e7d90d55756 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -158,8 +158,8 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, mutex_unlock(&ctx->uring_lock); seq_puts(m, "PollList:\n"); - 
for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &ctx->cancel_hash[i]; + for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { + struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; struct io_kiocb *req; spin_lock(&hb->lock); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 06772139b7dad8..0b3851a0db2ba1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -241,11 +241,23 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_put(&ctx->refs); } +static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) +{ + unsigned hash_buckets = 1U << bits; + size_t hash_size = hash_buckets * sizeof(table->hbs[0]); + + table->hbs = kmalloc(hash_size, GFP_KERNEL); + if (!table->hbs) + return -ENOMEM; + + table->hash_bits = bits; + init_hash_table(table, hash_buckets); + return 0; +} + static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; - unsigned hash_buckets; - size_t hash_size; int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -261,16 +273,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) */ hash_bits = ilog2(p->cq_entries) - 5; hash_bits = clamp(hash_bits, 1, 8); - hash_buckets = 1U << hash_bits; - hash_size = hash_buckets * sizeof(struct io_hash_bucket); - - ctx->cancel_hash_bits = hash_bits; - ctx->cancel_hash = kmalloc(hash_size, GFP_KERNEL); - if (!ctx->cancel_hash) + if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; - init_hash_table(ctx->cancel_hash, hash_buckets); - ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) goto err; @@ -311,7 +316,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; err: kfree(ctx->dummy_ubuf); - kfree(ctx->cancel_hash); + kfree(ctx->cancel_table.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); @@ -2487,7 +2492,7 @@ static __cold void 
io_ring_ctx_free(struct io_ring_ctx *ctx) io_req_caches_free(ctx); if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); - kfree(ctx->cancel_hash); + kfree(ctx->cancel_table.hbs); kfree(ctx->dummy_ubuf); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index 8b00243abf65fa..d3b9bde9c702c6 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -9,6 +9,16 @@ #include "io-wq.h" #include "filetable.h" +struct io_hash_bucket { + spinlock_t lock; + struct hlist_head list; +} ____cacheline_aligned_in_smp; + +struct io_hash_table { + struct io_hash_bucket *hbs; + unsigned hash_bits; +}; + struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; @@ -224,8 +234,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct io_wq_work_list iopoll_list; - struct io_hash_bucket *cancel_hash; - unsigned cancel_hash_bits; + struct io_hash_table cancel_table; bool poll_multi_queue; struct list_head io_buffers_comp; diff --git a/io_uring/poll.c b/io_uring/poll.c index 96199c999fe60c..ea6466388ed9ef 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -73,9 +73,9 @@ static struct io_poll *io_poll_get_single(struct io_kiocb *req) static void io_poll_req_insert(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; - u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); - struct io_hash_bucket *hb = &ctx->cancel_hash[index]; + struct io_hash_table *table = &req->ctx->cancel_table; + u32 index = hash_long(req->cqe.user_data, table->hash_bits); + struct io_hash_bucket *hb = &table->hbs[index]; spin_lock(&hb->lock); hlist_add_head(&req->hash_node, &hb->list); @@ -84,8 +84,9 @@ static void io_poll_req_insert(struct io_kiocb *req) static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) { - u32 index = hash_long(req->cqe.user_data, ctx->cancel_hash_bits); - spinlock_t *lock = 
&ctx->cancel_hash[index].lock; + struct io_hash_table *table = &req->ctx->cancel_table; + u32 index = hash_long(req->cqe.user_data, table->hash_bits); + spinlock_t *lock = &table->hbs[index].lock; spin_lock(lock); hash_del(&req->hash_node); @@ -539,13 +540,15 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) { + struct io_hash_table *table = &ctx->cancel_table; + unsigned nr_buckets = 1U << table->hash_bits; struct hlist_node *tmp; struct io_kiocb *req; bool found = false; int i; - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &ctx->cancel_hash[i]; + for (i = 0; i < nr_buckets; i++) { + struct io_hash_bucket *hb = &table->hbs[i]; spin_lock(&hb->lock); hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { @@ -562,12 +565,12 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, - struct io_hash_bucket hash_table[], + struct io_hash_table *table, struct io_hash_bucket **out_bucket) { struct io_kiocb *req; - u32 index = hash_long(cd->data, ctx->cancel_hash_bits); - struct io_hash_bucket *hb = &hash_table[index]; + u32 index = hash_long(cd->data, table->hash_bits); + struct io_hash_bucket *hb = &table->hbs[index]; *out_bucket = NULL; @@ -591,16 +594,17 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, struct io_cancel_data *cd, - struct io_hash_bucket hash_table[], + struct io_hash_table *table, struct io_hash_bucket **out_bucket) { + unsigned nr_buckets = 1U << table->hash_bits; struct io_kiocb *req; int i; *out_bucket = NULL; - for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { - struct io_hash_bucket *hb = &hash_table[i]; + for (i = 0; i < nr_buckets; i++) { + struct 
io_hash_bucket *hb = &table->hbs[i]; spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { @@ -628,15 +632,15 @@ static bool io_poll_disarm(struct io_kiocb *req) } static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, - struct io_hash_bucket hash_table[]) + struct io_hash_table *table) { struct io_hash_bucket *bucket; struct io_kiocb *req; if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd, ctx->cancel_hash, &bucket); + req = io_poll_file_find(ctx, cd, table, &bucket); else - req = io_poll_find(ctx, false, cd, ctx->cancel_hash, &bucket); + req = io_poll_find(ctx, false, cd, table, &bucket); if (req) io_poll_cancel_req(req); @@ -647,7 +651,7 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { - return __io_poll_cancel(ctx, cd, ctx->cancel_hash); + return __io_poll_cancel(ctx, cd, &ctx->cancel_table); } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -745,7 +749,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) int ret2, ret = 0; bool locked; - preq = io_poll_find(ctx, true, &cd, ctx->cancel_hash, &bucket); + preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); if (preq) ret2 = io_poll_disarm(preq); if (bucket) From be52a96c7c43de11cf3b31e267f09e0d68249f44 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:11 +0100 Subject: [PATCH 1009/1250] io_uring: propagate locking state to poll cancel Poll cancellation will soon need to grab ->uring_lock internally, so pass the locking state, i.e. issue_flags, into the cancellation functions. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b86781d047727c07163443b57551a3fa57c7c5e1.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/cancel.c | 7 ++++--- io_uring/cancel.h | 3 ++- io_uring/poll.c | 3 ++- io_uring/poll.h | 3 ++- io_uring/timeout.c | 3 ++- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index f28f0a7d127240..f07bfd27c98ac2 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -78,7 +78,8 @@ static int io_async_cancel_one(struct io_uring_task *tctx, return ret; } -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, + unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; int ret; @@ -93,7 +94,7 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd) if (!ret) return 0; - ret = io_poll_cancel(ctx, cd); + ret = io_poll_cancel(ctx, cd, issue_flags); if (ret != -ENOENT) return ret; @@ -136,7 +137,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, int ret, nr = 0; do { - ret = io_try_cancel(req, cd); + ret = io_try_cancel(req, cd, issue_flags); if (ret == -ENOENT) break; if (!all) diff --git a/io_uring/cancel.h b/io_uring/cancel.h index fd4cb1a2595de5..8dd259dc383e2c 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -3,5 +3,6 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd); +int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, + unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); diff --git a/io_uring/poll.c b/io_uring/poll.c index ea6466388ed9ef..c4edf8794538c8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -649,7 +649,8 @@ static int __io_poll_cancel(struct 
io_ring_ctx *ctx, struct io_cancel_data *cd, return req ? 0 : -ENOENT; } -int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned issue_flags) { return __io_poll_cancel(ctx, cd, &ctx->cancel_table); } diff --git a/io_uring/poll.h b/io_uring/poll.h index cc75c1567a84ae..fa3e19790281b2 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -24,7 +24,8 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); -int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); +int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned issue_flags); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 69cca42d6835b4..526fc8b2e3b65d 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -262,6 +262,7 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) { + unsigned issue_flags = *locked ? 
0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -273,7 +274,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) .data = prev->cqe.user_data, }; - ret = io_try_cancel(req, &cd); + ret = io_try_cancel(req, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_complete_post(req); From 067d18f8798141d742a759d11fdaf43f205165db Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 10:22:12 +0100 Subject: [PATCH 1010/1250] io_uring: mutex locked poll hashing Currently we do two extra spin lock/unlock pairs to add a poll/apoll request to the cancellation hash table and remove it from there. On the submission side we often already hold ->uring_lock and tw completion is likely to hold it as well. Add a second cancellation hash table protected by ->uring_lock. Because the mutex must then be held on the completion side, which is a latency concern, use the new table only in the following cases: 1) IORING_SETUP_SINGLE_ISSUER: only one task grabs uring_lock, so there is little to no contention and so the main tw handler will almost always end up grabbing it before calling callbacks. 2) IORING_SETUP_SQPOLL: same as with single issuer, only one task is a major user of ->uring_lock. 3) apoll: we normally grab the lock on the completion side anyway to execute the request, so it's free. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1bbad9c78c454b7b92f100bbf46730a37df7194f.1655371007.git.asml.silence@gmail.com Reviewed-by: Hao Xu Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 ++- io_uring/io_uring_types.h | 4 ++ io_uring/poll.c | 117 +++++++++++++++++++++++++++++++------- 3 files changed, 108 insertions(+), 22 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0b3851a0db2ba1..eeda1673179544 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -275,6 +275,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; + if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) + goto err; ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); if (!ctx->dummy_ubuf) @@ -317,6 +319,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) err: kfree(ctx->dummy_ubuf); kfree(ctx->cancel_table.hbs); + kfree(ctx->cancel_table_locked.hbs); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); kfree(ctx); @@ -2493,6 +2496,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_table.hbs); + kfree(ctx->cancel_table_locked.hbs); kfree(ctx->dummy_ubuf); kfree(ctx->io_bl); xa_destroy(&ctx->io_bl_xa); @@ -2654,12 +2658,13 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) __io_cqring_overflow_flush(ctx, true); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); + if (ctx->rings) + io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); /* failed during ring init, it couldn't have issued any requests */ if (ctx->rings) { io_kill_timeouts(ctx, NULL, true); - io_poll_remove_all(ctx, NULL, true); /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); } @@ -2784,7 +2789,9 
@@ static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, } ret |= io_cancel_defer_files(ctx, task, cancel_all); + mutex_lock(&ctx->uring_lock); ret |= io_poll_remove_all(ctx, task, cancel_all); + mutex_unlock(&ctx->uring_lock); ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work(); diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h index d3b9bde9c702c6..65ac7cdaaa73f4 100644 --- a/io_uring/io_uring_types.h +++ b/io_uring/io_uring_types.h @@ -191,6 +191,7 @@ struct io_ring_ctx { struct xarray io_bl_xa; struct list_head io_buffers_cache; + struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; struct list_head apoll_cache; struct xarray personalities; @@ -323,6 +324,7 @@ enum { REQ_F_CQE32_INIT_BIT, REQ_F_APOLL_MULTISHOT_BIT, REQ_F_CLEAR_POLLIN_BIT, + REQ_F_HASH_LOCKED_BIT, /* keep async read/write and isreg together and in order */ REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, @@ -393,6 +395,8 @@ enum { REQ_F_CQE32_INIT = BIT(REQ_F_CQE32_INIT_BIT), /* recvmsg special flag, clear EPOLLIN */ REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT), + /* hashed into ->cancel_hash_locked, protected by ->uring_lock */ + REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); diff --git a/io_uring/poll.c b/io_uring/poll.c index c4edf8794538c8..9ae2982aef7c6c 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -93,6 +93,32 @@ static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) spin_unlock(lock); } +static void io_poll_req_insert_locked(struct io_kiocb *req) +{ + struct io_hash_table *table = &req->ctx->cancel_table_locked; + u32 index = hash_long(req->cqe.user_data, table->hash_bits); + + hlist_add_head(&req->hash_node, &table->hbs[index].list); +} + +static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (req->flags & REQ_F_HASH_LOCKED) { + /* 
+ * ->cancel_table_locked is protected by ->uring_lock in + * contrast to per bucket spinlocks. Likely, tctx_task_work() + * already grabbed the mutex for us, but there is a chance it + * failed. + */ + io_tw_lock(ctx, locked); + hash_del(&req->hash_node); + } else { + io_poll_req_delete(req, ctx); + } +} + static void io_init_poll_iocb(struct io_poll *poll, __poll_t events, wait_queue_func_t wake_func) { @@ -217,7 +243,6 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) static void io_poll_task_func(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; int ret; ret = io_poll_check_events(req, locked); @@ -234,7 +259,8 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_poll_remove_entries(req); - io_poll_req_delete(req, ctx); + io_poll_tw_hash_eject(req, locked); + io_req_set_res(req, req->cqe.res, 0); io_req_task_complete(req, locked); } @@ -248,7 +274,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) return; io_poll_remove_entries(req); - io_poll_req_delete(req, req->ctx); + io_poll_tw_hash_eject(req, locked); if (!ret) io_req_task_submit(req, locked); @@ -444,7 +470,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 0; } - io_poll_req_insert(req); + if (req->flags & REQ_F_HASH_LOCKED) + io_poll_req_insert_locked(req); + else + io_poll_req_insert(req); if (mask && (poll->events & EPOLLET)) { /* can't multishot if failed, just queue the event we've got */ @@ -485,6 +514,15 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; + /* + * apoll requests already grab the mutex to complete in the tw handler, + * so removal from the mutex-backed hash is free, use it by default. 
+ */ + if (issue_flags & IO_URING_F_UNLOCKED) + req->flags &= ~REQ_F_HASH_LOCKED; + else + req->flags |= REQ_F_HASH_LOCKED; + if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; if (!file_can_poll(req->file)) @@ -534,13 +572,10 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_OK; } -/* - * Returns true if we found and killed one or more poll requests - */ -__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) +static __cold bool io_poll_remove_all_table(struct task_struct *tsk, + struct io_hash_table *table, + bool cancel_all) { - struct io_hash_table *table = &ctx->cancel_table; unsigned nr_buckets = 1U << table->hash_bits; struct hlist_node *tmp; struct io_kiocb *req; @@ -563,6 +598,17 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, return found; } +/* + * Returns true if we found and killed one or more poll requests + */ +__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, + bool cancel_all) + __must_hold(&ctx->uring_lock) +{ + return io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all) | + io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); +} + static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, struct io_cancel_data *cd, struct io_hash_table *table, @@ -622,13 +668,15 @@ static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, return NULL; } -static bool io_poll_disarm(struct io_kiocb *req) +static int io_poll_disarm(struct io_kiocb *req) { + if (!req) + return -ENOENT; if (!io_poll_get_ownership(req)) - return false; + return -EALREADY; io_poll_remove_entries(req); hash_del(&req->hash_node); - return true; + return 0; } static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, @@ -652,7 +700,16 @@ static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, int io_poll_cancel(struct io_ring_ctx *ctx, 
struct io_cancel_data *cd, unsigned issue_flags) { - return __io_poll_cancel(ctx, cd, &ctx->cancel_table); + int ret; + + ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table); + if (ret != -ENOENT) + return ret; + + io_ring_submit_lock(ctx, issue_flags); + ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked); + io_ring_submit_unlock(ctx, issue_flags); + return ret; } static __poll_t io_poll_parse_events(const struct io_uring_sqe *sqe, @@ -727,6 +784,16 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; + /* + * If sqpoll or single issuer, there is no contention for ->uring_lock + * and we'll end up holding it in tw handlers anyway. + */ + if (!(issue_flags & IO_URING_F_UNLOCKED) && + (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER))) + req->flags |= REQ_F_HASH_LOCKED; + else + req->flags &= ~REQ_F_HASH_LOCKED; + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); if (ret) { io_req_set_res(req, ret, 0); @@ -751,20 +818,28 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) bool locked; preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); - if (preq) - ret2 = io_poll_disarm(preq); + ret2 = io_poll_disarm(preq); if (bucket) spin_unlock(&bucket->lock); - - if (!preq) { - ret = -ENOENT; + if (!ret2) + goto found; + if (ret2 != -ENOENT) { + ret = ret2; goto out; } - if (!ret2) { - ret = -EALREADY; + + io_ring_submit_lock(ctx, issue_flags); + preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket); + ret2 = io_poll_disarm(preq); + if (bucket) + spin_unlock(&bucket->lock); + io_ring_submit_unlock(ctx, issue_flags); + if (ret2) { + ret = ret2; goto out; } +found: if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ if (poll_update->update_events) { From ba15af9a627056fa844c9c1a3cab2662f822f56a Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Fri, 17 Jun 2022 13:04:29 +0800 Subject: 
[PATCH 1011/1250] io_uring: kbuf: add comments for some tricky code Add comments to explain why it is always under uring lock when incrementing head in __io_kbuf_recycle. And rectify one comment about kbuf consuming in iowq case. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220617050429.94293-1-hao.xu@linux.dev Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index b9c7f6e87cc9a3..59e4fafeb28ce4 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -52,6 +52,13 @@ void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) if (req->flags & REQ_F_BUFFER_RING) { if (req->buf_list) { if (req->flags & REQ_F_PARTIAL_IO) { + /* + * If we end up here, then the io_uring_lock has + * been kept held since we retrieved the buffer. + * For the io-wq case, we already cleared + * req->buf_list when the buffer was retrieved, + * hence it cannot be set here for that case. + */ req->buf_list->head++; req->buf_list = NULL; } else { @@ -163,12 +170,13 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) { /* * If we came in unlocked, we have no choice but to consume the - * buffer here. This does mean it'll be pinned until the IO - * completes. But coming in unlocked means we're in io-wq - * context, hence there should be no further retry. For the - * locked case, the caller must ensure to call the commit when - * the transfer completes (or if we get -EAGAIN and must poll - * or retry). + * buffer here, otherwise nothing ensures that the buffer won't + * get used by others. This does mean it'll be pinned until the + * IO completes, coming in unlocked means we're being called from + * io-wq context and there may be further retries in async hybrid + * mode.
For the locked case, the caller must call commit when + * the transfer completes (or if we get -EAGAIN and must poll of + * retry). */ req->buf_list = NULL; bl->head++; From 2a3cbdd805efa4f928a50366ade829c7670e7ad9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:00 +0100 Subject: [PATCH 1012/1250] io_uring: don't expose io_fill_cqe_aux() Deduplicate some code and add a helper for filling an aux CQE, locking and notification. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b7c6557c8f9dc5c4cfb01292116c682a0ff61081.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++++++++++-- io_uring/io_uring.h | 3 +-- io_uring/msg_ring.c | 11 +---------- io_uring/net.c | 20 +++++--------------- io_uring/poll.c | 24 ++++++++---------------- io_uring/rsrc.c | 14 +++++--------- 6 files changed, 36 insertions(+), 54 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eeda1673179544..8c1b0e0ce5bb8a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -676,8 +676,8 @@ bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, return true; } -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags) +static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -704,6 +704,20 @@ bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); } +bool io_post_aux_cqe(struct io_ring_ctx *ctx, + u64 user_data, s32 res, u32 cflags) +{ + bool filled; + + spin_lock(&ctx->completion_lock); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + if (filled) + io_cqring_ev_posted(ctx); + return filled; +} + static void __io_req_complete_put(struct io_kiocb *req) { /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 
3f06fbae0ee9e5..18754fb790255e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -239,8 +239,7 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); -bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_cqring_ev_posted(struct io_ring_ctx *ctx); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 3b89f9a0a0b459..7c3c5f3ab06b57 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -34,7 +34,6 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req); struct io_ring_ctx *target_ctx; - bool filled; int ret; ret = -EBADFD; @@ -43,16 +42,8 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) ret = -EOVERFLOW; target_ctx = req->file->private_data; - - spin_lock(&target_ctx->completion_lock); - filled = io_fill_cqe_aux(target_ctx, msg->user_data, msg->len, 0); - io_commit_cqring(target_ctx); - spin_unlock(&target_ctx->completion_lock); - - if (filled) { - io_cqring_ev_posted(target_ctx); + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) ret = 0; - } done: if (ret < 0) diff --git a/io_uring/net.c b/io_uring/net.c index fe1fe920b9291f..35d0183fe75817 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -644,22 +644,12 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } - if (ret >= 0) { - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, ret, - IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - goto retry; - } - ret = -ECANCELED; 
- } - return ret; + if (ret < 0) + return ret; + if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE)) + goto retry; + return -ECANCELED; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/poll.c b/io_uring/poll.c index 9ae2982aef7c6c..e0c181fe6264c0 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -214,23 +214,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - bool filled; - - spin_lock(&ctx->completion_lock); - filled = io_fill_cqe_aux(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - if (filled) { - io_cqring_ev_posted(ctx); - continue; - } - return -ECANCELED; - } - ret = io_poll_issue(req, locked); - if (ret) - return ret; + if (!io_post_aux_cqe(ctx, req->cqe.user_data, + mask, IORING_CQE_F_MORE)) + return -ECANCELED; + } else { + ret = io_poll_issue(req, locked); + if (ret) + return ret; + } /* * Release all references, retry if someone tried to restart diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 214ff0dfa6a48e..7fed3105152a95 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -174,17 +174,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) list_del(&prsrc->list); if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) + if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); - - spin_lock(&ctx->completion_lock); - io_fill_cqe_aux(ctx, prsrc->tag, 0, 0); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - - if (ctx->flags & IORING_SETUP_IOPOLL) + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); mutex_unlock(&ctx->uring_lock); + } else { + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + } } rsrc_data->do_put(ctx, prsrc); From 53c0172a17c3c2307bd03cd34c88a2ae780adfb2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 
2022 09:48:01 +0100 Subject: [PATCH 1013/1250] io_uring: don't inline __io_get_cqe() __io_get_cqe() is not as hot as io_get_cqe(), no need to inline it, it sheds ~500B from the binary. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c1ac829198a881b7af8710926f99a3559b9f24c0.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 35 +++++++++++++++++++++++++++++++++++ io_uring/io_uring.h | 36 +----------------------------------- 2 files changed, 36 insertions(+), 35 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8c1b0e0ce5bb8a..df6a9abdd966ad 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -166,6 +166,11 @@ static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) __io_submit_flush_completions(ctx); } +static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) +{ + return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); +} + static bool io_match_linked(struct io_kiocb *head) { struct io_kiocb *req; @@ -676,6 +681,36 @@ bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, return true; } +/* + * writes to the cq entry need to come after reading head; the + * control dependency is enough as we're using WRITE_ONCE to + * fill the cq entry + */ +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); + unsigned int shift = 0; + unsigned int free, queued, len; + + if (ctx->flags & IORING_SETUP_CQE32) + shift = 1; + + /* userspace may cheat modifying the tail, be safe and do min */ + queued = min(__io_cqring_events(ctx), ctx->cq_entries); + free = ctx->cq_entries - queued; + /* we need a contiguous range, limit based on the current array offset */ + len = min(free, ctx->cq_entries - off); + if (!len) + return NULL; + + ctx->cached_cq_tail++; + ctx->cqe_cached = &rings->cqes[off]; + ctx->cqe_sentinel = ctx->cqe_cached + 
len; + ctx->cqe_cached++; + return &rings->cqes[off << shift]; +} + static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 18754fb790255e..94bd6732f55873 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -14,44 +14,10 @@ enum { IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, }; +struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, u64 extra1, u64 extra2); -static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) -{ - return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); -} - -/* - * writes to the cq entry need to come after reading head; the - * control dependency is enough as we're using WRITE_ONCE to - * fill the cq entry - */ -static inline struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) -{ - struct io_rings *rings = ctx->rings; - unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; - unsigned int free, queued, len; - - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; - - /* userspace may cheat modifying the tail, be safe and do min */ - queued = min(__io_cqring_events(ctx), ctx->cq_entries); - free = ctx->cq_entries - queued; - /* we need a contiguous range, limit based on the current array offset */ - len = min(free, ctx->cq_entries - off); - if (!len) - return NULL; - - ctx->cached_cq_tail++; - ctx->cqe_cached = &rings->cqes[off]; - ctx->cqe_sentinel = ctx->cqe_cached + len; - ctx->cqe_cached++; - return &rings->cqes[off << shift]; -} - static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { From 16a6ac0df82ad4916e2a424101b72ae6bc81ac8d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:02 +0100 Subject: [PATCH 1014/1250] io_uring: introduce io_req_cqe_overflow() __io_fill_cqe_req() is hot and inlined, we 
want it to be as small as possible. Add io_req_cqe_overflow() accepting only a request and doing all overflow accounting, and replace with it two calls to 6 argument io_cqring_event_overflow(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/048b9fbcce56814d77a1a540409c98c3d383edcb.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 +++++++++++++-- io_uring/io_uring.h | 12 ++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index df6a9abdd966ad..7acb94c180b8e8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -643,8 +643,8 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags, u64 extra1, u64 extra2) +static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, + s32 res, u32 cflags, u64 extra1, u64 extra2) { struct io_overflow_cqe *ocqe; size_t ocq_size = sizeof(struct io_overflow_cqe); @@ -681,6 +681,17 @@ bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, return true; } +bool io_req_cqe_overflow(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_CQE32_INIT)) { + req->extra1 = 0; + req->extra2 = 0; + } + return io_cqring_event_overflow(req->ctx, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + req->extra1, req->extra2); +} + /* * writes to the cq entry need to come after reading head; the * control dependency is enough as we're using WRITE_ONCE to diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 94bd6732f55873..88c64a69beaaeb 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -15,8 +15,7 @@ enum { }; struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); -bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags, u64 extra1, u64 extra2); +bool io_req_cqe_overflow(struct io_kiocb *req); static inline struct 
io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { @@ -56,10 +55,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, memcpy(cqe, &req->cqe, sizeof(*cqe)); return true; } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - 0, 0); } else { u64 extra1 = 0, extra2 = 0; @@ -83,11 +78,8 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, WRITE_ONCE(cqe->big_cqe[1], extra2); return true; } - - return io_cqring_event_overflow(ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - extra1, extra2); } + return io_req_cqe_overflow(req); } static inline void req_set_fail(struct io_kiocb *req) From a8991d5ec8d53dcc68a6eefe5970ea2176cff02c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:03 +0100 Subject: [PATCH 1015/1250] io_uring: deduplicate __io_fill_cqe_req tracing Deduplicate two trace_io_uring_complete() calls in __io_fill_cqe_req(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/277ed85dba5189ab7d932164b314013a0f0b0fdc.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 88c64a69beaaeb..763915c665933f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -41,10 +41,12 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - if (!(ctx->flags & IORING_SETUP_CQE32)) { - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, 0, 0); + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, + (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + if (!(ctx->flags & IORING_SETUP_CQE32)) { /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). 
Increment the overflow count in @@ -63,9 +65,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, extra2 = req->extra2; } - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, extra1, extra2); - /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in From 67e30949f020644905ab49df028c789cad87f922 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:04 +0100 Subject: [PATCH 1016/1250] io_uring: deduplicate io_get_cqe() calls Deduplicate calls to io_get_cqe() from __io_fill_cqe_req(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4fa077986cc3abab7c59ff4e7c390c783885465f.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 763915c665933f..dfb490e7cf4593 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -45,19 +45,17 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, req->cqe.res, req->cqe.flags, (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqe(ctx); + if (unlikely(!cqe)) + return io_req_cqe_overflow(req); + memcpy(cqe, &req->cqe, sizeof(*cqe)); - if (!(ctx->flags & IORING_SETUP_CQE32)) { - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. 
- */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(*cqe)); - return true; - } - } else { + if (ctx->flags & IORING_SETUP_CQE32) { u64 extra1 = 0, extra2 = 0; if (req->flags & REQ_F_CQE32_INIT) { @@ -65,20 +63,10 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, extra2 = req->extra2; } - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. - */ - cqe = io_get_cqe(ctx); - if (likely(cqe)) { - memcpy(cqe, &req->cqe, sizeof(struct io_uring_cqe)); - WRITE_ONCE(cqe->big_cqe[0], extra1); - WRITE_ONCE(cqe->big_cqe[1], extra2); - return true; - } + WRITE_ONCE(cqe->big_cqe[0], extra1); + WRITE_ONCE(cqe->big_cqe[1], extra2); } - return io_req_cqe_overflow(req); + return true; } static inline void req_set_fail(struct io_kiocb *req) From 6c75953377e66b59361410774cadbd45c849586c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 17 Jun 2022 09:48:05 +0100 Subject: [PATCH 1017/1250] io_uring: change ->cqe_cached invariant for CQE32 With IORING_SETUP_CQE32 ->cqe_cached doesn't store a real address but rather an implicit offset into cqes. Store the real cqe pointer and increment it accordingly if CQE32. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1ee1838cba16bed96381a006950b36ba640d998c.1655455613.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 ++++++++++----- io_uring/io_uring.h | 8 ++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7acb94c180b8e8..0dbf6a74f9f3a1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -701,11 +701,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) { struct io_rings *rings = ctx->rings; unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); - unsigned int shift = 0; unsigned int free, queued, len; - if (ctx->flags & IORING_SETUP_CQE32) - shift = 1; /* userspace may cheat modifying the tail, be safe and do min */ queued = min(__io_cqring_events(ctx), ctx->cq_entries); @@ -715,11 +712,19 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) if (!len) return NULL; - ctx->cached_cq_tail++; + if (ctx->flags & IORING_SETUP_CQE32) { + off <<= 1; + len <<= 1; + } + ctx->cqe_cached = &rings->cqes[off]; ctx->cqe_sentinel = ctx->cqe_cached + len; + + ctx->cached_cq_tail++; ctx->cqe_cached++; - return &rings->cqes[off << shift]; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; + return &rings->cqes[off]; } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dfb490e7cf4593..558a860a93fcd6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -22,14 +22,10 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { struct io_uring_cqe *cqe = ctx->cqe_cached; - if (ctx->flags & IORING_SETUP_CQE32) { - unsigned int off = ctx->cqe_cached - ctx->rings->cqes; - - cqe += off; - } - ctx->cached_cq_tail++; ctx->cqe_cached++; + if (ctx->flags & IORING_SETUP_CQE32) + ctx->cqe_cached++; return cqe; } From 6186928b6fbe50022fe6c53e0777ac90a2900aac Mon Sep 17 
00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 13:57:18 +0100 Subject: [PATCH 1018/1250] io_uring: kill extra io_uring_types.h includes io_uring/io_uring.h already includes io_uring_types.h, no need to include it every time. Kill it in a bunch of places, it prepares us for following patches. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/94d8c943fbe0ef949981c508ddcee7fc1c18850f.1655384063.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/advise.c | 1 - io_uring/cancel.c | 1 - io_uring/epoll.c | 1 - io_uring/fdinfo.c | 1 - io_uring/filetable.c | 1 - io_uring/fs.c | 1 - io_uring/io_uring.c | 1 - io_uring/kbuf.c | 1 - io_uring/msg_ring.c | 1 - io_uring/net.c | 1 - io_uring/nop.c | 1 - io_uring/opdef.c | 1 - io_uring/openclose.c | 1 - io_uring/poll.c | 1 - io_uring/rsrc.c | 1 - io_uring/rw.c | 1 - io_uring/splice.c | 1 - io_uring/sqpoll.c | 1 - io_uring/statx.c | 1 - io_uring/sync.c | 1 - io_uring/tctx.c | 1 - io_uring/timeout.c | 1 - io_uring/uring_cmd.c | 1 - io_uring/xattr.c | 1 - 24 files changed, 24 deletions(-) diff --git a/io_uring/advise.c b/io_uring/advise.c index 8870fdf66ffbbd..581956934c0bf0 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -11,7 +11,6 @@ #include #include -#include "io_uring_types.h" #include "io_uring.h" #include "advise.h" diff --git a/io_uring/cancel.c b/io_uring/cancel.c index f07bfd27c98ac2..d1e7f5a955ab20 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -10,7 +10,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "tctx.h" #include "poll.h" diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 10853e8ed07887..a8b794471d6b83 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -9,7 +9,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "epoll.h" diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 344e7d90d55756..61c35707a6cfc2 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -9,7 +9,6 @@ #include -#include 
"io_uring_types.h" #include "io_uring.h" #include "sqpoll.h" #include "fdinfo.h" diff --git a/io_uring/filetable.c b/io_uring/filetable.c index e449ceb9a848e6..534e1a3c625d9f 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -9,7 +9,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "rsrc.h" #include "filetable.h" diff --git a/io_uring/fs.c b/io_uring/fs.c index aac1bc5255b07f..0de4f549bb7df6 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -12,7 +12,6 @@ #include "../fs/internal.h" -#include "io_uring_types.h" #include "io_uring.h" #include "fs.h" diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0dbf6a74f9f3a1..0a1f83a936b753 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -80,7 +80,6 @@ #include "io-wq.h" -#include "io_uring_types.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 59e4fafeb28ce4..62de0dda24bf6e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -11,7 +11,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "opdef.h" #include "kbuf.h" diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7c3c5f3ab06b57..b02be23496521c 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -7,7 +7,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "msg_ring.h" diff --git a/io_uring/net.c b/io_uring/net.c index 35d0183fe75817..b77bfbfb081673 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -10,7 +10,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "kbuf.h" #include "net.h" diff --git a/io_uring/nop.c b/io_uring/nop.c index d363d8ce70a3b7..d956599a3c1b8f 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -7,7 +7,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "nop.h" diff --git a/io_uring/opdef.c b/io_uring/opdef.c index d687d33f9c0c03..a7b84b43e6c235 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -8,7 +8,6 @@ #include 
#include -#include "io_uring_types.h" #include "io_uring.h" #include "opdef.h" #include "refs.h" diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 1cbf3903097053..099a5ec84dfdb7 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -12,7 +12,6 @@ #include "../fs/internal.h" -#include "io_uring_types.h" #include "io_uring.h" #include "rsrc.h" #include "openclose.h" diff --git a/io_uring/poll.c b/io_uring/poll.c index e0c181fe6264c0..63aca920543b31 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -13,7 +13,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "refs.h" #include "opdef.h" diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 7fed3105152a95..68629eba413265 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -12,7 +12,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "openclose.h" #include "rsrc.h" diff --git a/io_uring/rw.c b/io_uring/rw.c index e5ca23d0783e2c..f8b42f2265df98 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -14,7 +14,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "opdef.h" #include "kbuf.h" diff --git a/io_uring/splice.c b/io_uring/splice.c index 0e19d63303452a..b013ba34bffa58 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -11,7 +11,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "splice.h" diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 149d5c976f1467..76d4d70c733a99 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -14,7 +14,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "sqpoll.h" diff --git a/io_uring/statx.c b/io_uring/statx.c index 83b15687e9c5ee..6056cd7f48761c 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -8,7 +8,6 @@ #include "../fs/internal.h" -#include "io_uring_types.h" #include "io_uring.h" #include "statx.h" diff --git a/io_uring/sync.c b/io_uring/sync.c index 9ee8ff865521f3..f2102afa79ca63 100644 --- a/io_uring/sync.c +++ 
b/io_uring/sync.c @@ -11,7 +11,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "sync.h" diff --git a/io_uring/tctx.c b/io_uring/tctx.c index a819da8fc85cd2..9b30fb0d360307 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -9,7 +9,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "tctx.h" diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 526fc8b2e3b65d..f9df359813c93e 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -8,7 +8,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "refs.h" #include "cancel.h" diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index abf78918a0995f..0a421ed51e7e16 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -6,7 +6,6 @@ #include -#include "io_uring_types.h" #include "io_uring.h" #include "uring_cmd.h" diff --git a/io_uring/xattr.c b/io_uring/xattr.c index 79adf4efba0184..b179f9acd5acc5 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -13,7 +13,6 @@ #include "../fs/internal.h" -#include "io_uring_types.h" #include "io_uring.h" #include "xattr.h" From 73b6da82944c35ac042831a0be91be759e242309 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 13:57:19 +0100 Subject: [PATCH 1019/1250] io_uring: make io_uring_types.h public Move io_uring types to linux/include, need them public so tracing can see the definitions and we can clean trace/events/io_uring.h Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a15f12e8cb7289b2de0deaddcc7518d98a132d17.1655384063.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- {io_uring => include/linux}/io_uring_types.h | 28 ++++++++++++++++++-- io_uring/filetable.h | 11 -------- io_uring/io-wq.h | 17 +----------- io_uring/io_uring.h | 4 ++- io_uring/refs.h | 2 +- 5 files changed, 31 insertions(+), 31 deletions(-) rename {io_uring => include/linux}/io_uring_types.h (96%) diff --git a/io_uring/io_uring_types.h b/include/linux/io_uring_types.h similarity 
index 96% rename from io_uring/io_uring_types.h rename to include/linux/io_uring_types.h index 65ac7cdaaa73f4..779c72da5b8faf 100644 --- a/io_uring/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -6,8 +6,32 @@ #include #include -#include "io-wq.h" -#include "filetable.h" +struct io_wq_work_node { + struct io_wq_work_node *next; +}; + +struct io_wq_work_list { + struct io_wq_work_node *first; + struct io_wq_work_node *last; +}; + +struct io_wq_work { + struct io_wq_work_node list; + unsigned flags; + /* place it here instead of io_kiocb as it fills padding and saves 4B */ + int cancel_seq; +}; + +struct io_fixed_file { + /* file * with additional FFS_* flags */ + unsigned long file_ptr; +}; + +struct io_file_table { + struct io_fixed_file *files; + unsigned long *bitmap; + unsigned int alloc_hint; +}; struct io_hash_bucket { spinlock_t lock; diff --git a/io_uring/filetable.h b/io_uring/filetable.h index c404360f709053..6b58aa48bc45d3 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -22,17 +22,6 @@ struct io_kiocb; #endif #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM) -struct io_fixed_file { - /* file * with additional FFS_* flags */ - unsigned long file_ptr; -}; - -struct io_file_table { - struct io_fixed_file *files; - unsigned long *bitmap; - unsigned int alloc_hint; -}; - bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 3f54ee2a8eebd3..10b80ef78bb817 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -2,6 +2,7 @@ #define INTERNAL_IO_WQ_H #include +#include struct io_wq; @@ -20,15 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -struct io_wq_work_node { - struct io_wq_work_node *next; -}; - -struct io_wq_work_list { - struct io_wq_work_node *first; - struct io_wq_work_node *last; -}; - #define wq_list_for_each(pos, prv, head) \ for (pos = (head)->first, prv = NULL; pos; 
prv = pos, pos = (pos)->next) @@ -152,13 +144,6 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) return node; } -struct io_wq_work { - struct io_wq_work_node list; - unsigned flags; - /* place it here instead of io_kiocb as it fills padding and saves 4B */ - int cancel_seq; -}; - static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) { if (!work->list.next) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 558a860a93fcd6..5eaa01c4697c37 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -3,7 +3,9 @@ #include #include -#include "io_uring_types.h" +#include +#include "io-wq.h" +#include "filetable.h" #ifndef CREATE_TRACE_POINTS #include diff --git a/io_uring/refs.h b/io_uring/refs.h index 334c5ead4c43d9..1336de3f2a30aa 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -2,7 +2,7 @@ #define IOU_REQ_REF_H #include -#include "io_uring_types.h" +#include /* * Shamelessly stolen from the mm implementation of page reference checking, From c316a1536661b60edae1b40a8fe2ec37f4fd9cf0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 16 Jun 2022 13:57:20 +0100 Subject: [PATCH 1020/1250] io_uring: clean up tracing events We have lots of trace events accepting an io_uring request and wanting to print some of its fields like user_data, opcode, flags and so on. However, as trace points were unaware of io_uring structures, we had to pass all the fields as arguments. Teach trace/events/io_uring.h about struct io_kiocb and stop the misery of passing a horde of arguments to trace helpers. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/40ff72f92798114e56d400f2b003beb6cde6ef53.1655384063.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 142 +++++++++++++------------------- io_uring/io_uring.c | 16 ++-- io_uring/poll.c | 5 +- io_uring/timeout.c | 3 +- 4 files changed, 66 insertions(+), 100 deletions(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index aa2f951b07cdff..3bc8dec9acaacf 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -7,6 +7,7 @@ #include #include +#include #include struct io_wq_work; @@ -97,9 +98,7 @@ TRACE_EVENT(io_uring_register, /** * io_uring_file_get - called before getting references to an SQE file * - * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @user_data: user data associated with the request * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can @@ -108,9 +107,9 @@ TRACE_EVENT(io_uring_register, */ TRACE_EVENT(io_uring_file_get, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, int fd), + TP_PROTO(struct io_kiocb *req, int fd), - TP_ARGS(ctx, req, user_data, fd), + TP_ARGS(req, fd), TP_STRUCT__entry ( __field( void *, ctx ) @@ -120,9 +119,9 @@ TRACE_EVENT(io_uring_file_get, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; + __entry->user_data = req->cqe.user_data; __entry->fd = fd; ), @@ -133,22 +132,16 @@ TRACE_EVENT(io_uring_file_get, /** * io_uring_queue_async_work - called before submitting a new async work * - * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @user_data: user data associated with the request - * @opcode: opcode of request - * @flags request flags - * @work: pointer to a submitted io_wq_work * @rw: type of workqueue, hashed or normal * * Allows to trace asynchronous work 
submission. */ TRACE_EVENT(io_uring_queue_async_work, - TP_PROTO(void *ctx, void * req, unsigned long long user_data, u8 opcode, - unsigned int flags, struct io_wq_work *work, int rw), + TP_PROTO(struct io_kiocb *req, int rw), - TP_ARGS(ctx, req, user_data, opcode, flags, work, rw), + TP_ARGS(req, rw), TP_STRUCT__entry ( __field( void *, ctx ) @@ -159,19 +152,19 @@ TRACE_EVENT(io_uring_queue_async_work, __field( struct io_wq_work *, work ) __field( int, rw ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->flags = flags; - __entry->opcode = opcode; - __entry->work = work; + __entry->user_data = req->cqe.user_data; + __entry->flags = req->flags; + __entry->opcode = req->opcode; + __entry->work = &req->work; __entry->rw = rw; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p", @@ -183,19 +176,16 @@ TRACE_EVENT(io_uring_queue_async_work, /** * io_uring_defer - called when an io_uring request is deferred * - * @ctx: pointer to a ring context structure * @req: pointer to a deferred request - * @user_data: user data associated with the request - * @opcode: opcode of request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. 
*/ TRACE_EVENT(io_uring_defer, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode), + TP_PROTO(struct io_kiocb *req), - TP_ARGS(ctx, req, user_data, opcode), + TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) @@ -203,16 +193,16 @@ TRACE_EVENT(io_uring_defer, __field( unsigned long long, data ) __field( u8, opcode ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->data = user_data; - __entry->opcode = opcode; + __entry->data = req->cqe.user_data; + __entry->opcode = req->opcode; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", @@ -224,7 +214,6 @@ TRACE_EVENT(io_uring_defer, * io_uring_link - called before the io_uring request added into link_list of * another request * - * @ctx: pointer to a ring context structure * @req: pointer to a linked request * @target_req: pointer to a previous request, that would contain @req * @@ -233,9 +222,9 @@ TRACE_EVENT(io_uring_defer, */ TRACE_EVENT(io_uring_link, - TP_PROTO(void *ctx, void *req, void *target_req), + TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req), - TP_ARGS(ctx, req, target_req), + TP_ARGS(req, target_req), TP_STRUCT__entry ( __field( void *, ctx ) @@ -244,7 +233,7 @@ TRACE_EVENT(io_uring_link, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; __entry->target_req = target_req; ), @@ -285,10 +274,7 @@ TRACE_EVENT(io_uring_cqring_wait, /** * io_uring_fail_link - called before failing a linked request * - * @ctx: pointer to a ring context structure * @req: request, which links were cancelled - * @user_data: user data associated with the request - * @opcode: opcode of request * @link: cancelled link * * Allows to track linked requests cancellation, to see not only 
that some work @@ -296,9 +282,9 @@ TRACE_EVENT(io_uring_cqring_wait, */ TRACE_EVENT(io_uring_fail_link, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, void *link), + TP_PROTO(struct io_kiocb *req, struct io_kiocb *link), - TP_ARGS(ctx, req, user_data, opcode, link), + TP_ARGS(req, link), TP_STRUCT__entry ( __field( void *, ctx ) @@ -307,17 +293,17 @@ TRACE_EVENT(io_uring_fail_link, __field( u8, opcode ) __field( void *, link ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; __entry->link = link; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", @@ -376,23 +362,17 @@ TRACE_EVENT(io_uring_complete, /** * io_uring_submit_sqe - called before submitting one SQE * - * @ctx: pointer to a ring context structure * @req: pointer to a submitted request - * @user_data: user data associated with the request - * @opcode: opcode of request - * @flags request flags * @force_nonblock: whether a context blocking or not - * @sq_thread: true if sq_thread has submitted this SQE * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. 
*/ TRACE_EVENT(io_uring_submit_sqe, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, u32 flags, - bool force_nonblock, bool sq_thread), + TP_PROTO(struct io_kiocb *req, bool force_nonblock), - TP_ARGS(ctx, req, user_data, opcode, flags, force_nonblock, sq_thread), + TP_ARGS(req, force_nonblock), TP_STRUCT__entry ( __field( void *, ctx ) @@ -403,19 +383,19 @@ TRACE_EVENT(io_uring_submit_sqe, __field( bool, force_nonblock ) __field( bool, sq_thread ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; - __entry->flags = flags; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; + __entry->flags = req->flags; __entry->force_nonblock = force_nonblock; - __entry->sq_thread = sq_thread; + __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, " @@ -427,10 +407,7 @@ TRACE_EVENT(io_uring_submit_sqe, /* * io_uring_poll_arm - called after arming a poll wait if successful * - * @ctx: pointer to a ring context structure * @req: pointer to the armed request - * @user_data: user data associated with the request - * @opcode: opcode of request * @mask: request poll events mask * @events: registered events of interest * @@ -439,10 +416,9 @@ TRACE_EVENT(io_uring_submit_sqe, */ TRACE_EVENT(io_uring_poll_arm, - TP_PROTO(void *ctx, void *req, u64 user_data, u8 opcode, - int mask, int events), + TP_PROTO(struct io_kiocb *req, int mask, int events), - TP_ARGS(ctx, req, user_data, opcode, mask, events), + TP_ARGS(req, mask, events), TP_STRUCT__entry ( __field( void *, ctx ) @@ -452,18 +428,18 @@ TRACE_EVENT(io_uring_poll_arm, __field( int, mask ) __field( 
int, events ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; __entry->mask = mask; __entry->events = events; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", @@ -475,18 +451,15 @@ TRACE_EVENT(io_uring_poll_arm, /* * io_uring_task_add - called after adding a task * - * @ctx: pointer to a ring context structure * @req: pointer to request - * @user_data: user data associated with the request - * @opcode: opcode of request * @mask: request poll events mask * */ TRACE_EVENT(io_uring_task_add, - TP_PROTO(void *ctx, void *req, unsigned long long user_data, u8 opcode, int mask), + TP_PROTO(struct io_kiocb *req, int mask), - TP_ARGS(ctx, req, user_data, opcode, mask), + TP_ARGS(req, mask), TP_STRUCT__entry ( __field( void *, ctx ) @@ -495,17 +468,17 @@ TRACE_EVENT(io_uring_task_add, __field( u8, opcode ) __field( int, mask ) - __string( op_str, io_uring_get_opcode(opcode) ) + __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; - __entry->user_data = user_data; - __entry->opcode = opcode; + __entry->user_data = req->cqe.user_data; + __entry->opcode = req->opcode; __entry->mask = mask; - __assign_str(op_str, io_uring_get_opcode(opcode)); + __assign_str(op_str, io_uring_get_opcode(req->opcode)); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", @@ -518,7 +491,6 @@ TRACE_EVENT(io_uring_task_add, * io_uring_req_failed - called when an sqe is errored dring submission * * @sqe: pointer to the io_uring_sqe that failed - * @ctx: pointer to a ring 
context structure * @req: pointer to request * @error: error it failed with * @@ -526,9 +498,9 @@ TRACE_EVENT(io_uring_task_add, */ TRACE_EVENT(io_uring_req_failed, - TP_PROTO(const struct io_uring_sqe *sqe, void *ctx, void *req, int error), + TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error), - TP_ARGS(sqe, ctx, req, error), + TP_ARGS(sqe, req, error), TP_STRUCT__entry ( __field( void *, ctx ) @@ -552,7 +524,7 @@ TRACE_EVENT(io_uring_req_failed, ), TP_fast_assign( - __entry->ctx = ctx; + __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = sqe->user_data; __entry->opcode = sqe->opcode; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0a1f83a936b753..ef4371790aaa5d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -452,9 +452,7 @@ void io_queue_iowq(struct io_kiocb *req, bool *dont_use) if (WARN_ON_ONCE(!same_thread_group(req->task, current))) req->work.flags |= IO_WQ_WORK_CANCEL; - trace_io_uring_queue_async_work(req->ctx, req, req->cqe.user_data, - req->opcode, req->flags, &req->work, - io_wq_is_hashed(&req->work)); + trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); if (link) io_queue_linked_timeout(link); @@ -1583,7 +1581,7 @@ static __cold void io_drain_req(struct io_kiocb *req) goto queue; } - trace_io_uring_defer(ctx, req, req->cqe.user_data, req->opcode); + trace_io_uring_defer(req); de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); @@ -1783,7 +1781,7 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) { struct file *file = fget(fd); - trace_io_uring_file_get(req->ctx, req, req->cqe.user_data, fd); + trace_io_uring_file_get(req, fd); /* we don't allow fixed io_uring files */ if (file && io_is_uring_fops(file)) @@ -2006,7 +2004,7 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe, struct io_submit_link *link = &ctx->submit_state.link; struct io_kiocb *head = link->head; - 
trace_io_uring_req_failed(sqe, ctx, req, ret); + trace_io_uring_req_failed(sqe, req, ret); /* * Avoid breaking links in the middle as it renders links with SQPOLL @@ -2048,9 +2046,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return io_submit_fail_init(sqe, req, ret); /* don't need @sqe from now on */ - trace_io_uring_submit_sqe(ctx, req, req->cqe.user_data, req->opcode, - req->flags, true, - ctx->flags & IORING_SETUP_SQPOLL); + trace_io_uring_submit_sqe(req, true); /* * If we already have a head request, queue this one for async @@ -2064,7 +2060,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, if (unlikely(ret)) return io_submit_fail_init(sqe, req, ret); - trace_io_uring_link(ctx, req, link->head); + trace_io_uring_link(req, link->head); link->last->link = req; link->last = req; diff --git a/io_uring/poll.c b/io_uring/poll.c index 63aca920543b31..b2659b56c702f3 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -288,7 +288,7 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, else req->io_task_work.func = io_apoll_task_func; - trace_io_uring_task_add(req->ctx, req, req->cqe.user_data, req->opcode, mask); + trace_io_uring_task_add(req, mask); io_req_task_work_add(req); } @@ -558,8 +558,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) if (ret || ipt.error) return ret ? 
IO_APOLL_READY : IO_APOLL_ABORTED; - trace_io_uring_poll_arm(ctx, req, req->cqe.user_data, req->opcode, - mask, apoll->poll.events); + trace_io_uring_poll_arm(req, mask, apoll->poll.events); return IO_APOLL_OK; } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index f9df359813c93e..557c637af158be 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -115,8 +115,7 @@ static void io_fail_links(struct io_kiocb *req) nxt = link->link; link->link = NULL; - trace_io_uring_fail_link(req->ctx, req, req->cqe.user_data, - req->opcode, link); + trace_io_uring_fail_link(req, link); if (ignore_cqes) link->flags |= REQ_F_CQE_SKIP; From a6a703c8a35593f81ca1f0fa6290f8ac61990e72 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 19:44:33 -0600 Subject: [PATCH 1021/1250] io_uring: move a few private types to local headers Commit 3a3d47fa9cfd ("io_uring: make io_uring_types.h public") moved a bunch of io_uring types to a kernel wide header, so we could make tracing a bit saner rather than pass in a ton of arguments. However, there are a few types in there that are not really needed to be system wide. Move the cancel data and mapped buffers back to the appropriate io_uring local headers. 
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 18 ------------------ io_uring/cancel.h | 13 +++++++++++++ io_uring/fdinfo.c | 1 + io_uring/poll.h | 1 + io_uring/rsrc.h | 8 ++++++++ io_uring/timeout.h | 1 + 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 779c72da5b8faf..2015f3ea7cb76b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -528,27 +528,9 @@ struct io_kiocb { struct io_wq_work work; }; -struct io_cancel_data { - struct io_ring_ctx *ctx; - union { - u64 data; - struct file *file; - }; - u32 flags; - int seq; -}; - struct io_overflow_cqe { struct list_head list; struct io_uring_cqe cqe; }; -struct io_mapped_ubuf { - u64 ubuf; - u64 ubuf_end; - unsigned int nr_bvecs; - unsigned long acct_pages; - struct bio_vec bvec[]; -}; - #endif diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 8dd259dc383e2c..2338012a5b06fb 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -1,5 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include + +struct io_cancel_data { + struct io_ring_ctx *ctx; + union { + u64 data; + struct file *file; + }; + u32 flags; + int seq; +}; + + int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 61c35707a6cfc2..b29e2d02216f22 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -13,6 +13,7 @@ #include "sqpoll.h" #include "fdinfo.h" #include "cancel.h" +#include "rsrc.h" #ifdef CONFIG_PROC_FS static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, diff --git a/io_uring/poll.h b/io_uring/poll.h index fa3e19790281b2..c40673d7da0199 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -24,6 +24,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags); int io_poll_remove_prep(struct io_kiocb *req, const struct io_uring_sqe 
*sqe); int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); +struct io_cancel_data; int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 872c86312cbc2d..03f26516e99463 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -45,6 +45,14 @@ struct io_rsrc_node { bool done; }; +struct io_mapped_ubuf { + u64 ubuf; + u64 ubuf_end; + unsigned int nr_bvecs; + unsigned long acct_pages; + struct bio_vec bvec[]; +}; + void io_rsrc_put_work(struct work_struct *work); void io_rsrc_refs_refill(struct io_ring_ctx *ctx); void io_wait_rsrc_data(struct io_rsrc_data *data); diff --git a/io_uring/timeout.h b/io_uring/timeout.h index dd7cfb0d936671..858c62644897a5 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -22,6 +22,7 @@ static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) } __cold void io_flush_timeouts(struct io_ring_ctx *ctx); +struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); From 32597fd6cebe69f8c370e0ffef519215b88853e0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 19 Jun 2022 12:26:04 +0100 Subject: [PATCH 1022/1250] io_uring: remove extra io_commit_cqring() We don't post events in __io_commit_cqring_flush() anymore but send all requests to tw, so no need to do io_commit_cqring() there. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f2481e32375e749be89c42e4804268b608722cef.1655637157.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ef4371790aaa5d..efad2d9b7b42c5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -480,7 +480,6 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_flush_timeouts(ctx); if (ctx->drain_active) io_queue_deferred(ctx); - io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); } if (ctx->has_evfd) From b12b72c46a2d119a5cd3a780c648c609ca086deb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 19 Jun 2022 12:26:05 +0100 Subject: [PATCH 1023/1250] io_uring: reshuffle io_uring/io_uring.h It's a good idea to first do forward declarations and then inline helpers, otherwise we will keep stumbling on dependencies between them. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1d7fa6672ed43f20ccc0c54ae201369ebc3ebfab.1655637157.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 95 ++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 48 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 5eaa01c4697c37..7b2055b342dfab 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -18,6 +18,53 @@ enum { struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); bool io_req_cqe_overflow(struct io_kiocb *req); +int io_run_task_work_sig(void); +void io_req_complete_failed(struct io_kiocb *req, s32 res); +void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); +void io_req_complete_post(struct io_kiocb *req); +void __io_req_complete_post(struct io_kiocb *req); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); +void io_cqring_ev_posted(struct io_ring_ctx *ctx); +void __io_commit_cqring_flush(struct io_ring_ctx *ctx); + +struct page
**io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); + +struct file *io_file_get_normal(struct io_kiocb *req, int fd); +struct file *io_file_get_fixed(struct io_kiocb *req, int fd, + unsigned issue_flags); + +bool io_is_uring_fops(struct file *file); +bool io_alloc_async_data(struct io_kiocb *req); +void io_req_task_work_add(struct io_kiocb *req); +void io_req_task_prio_work_add(struct io_kiocb *req); +void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); +void io_req_task_queue(struct io_kiocb *req); +void io_queue_iowq(struct io_kiocb *req, bool *dont_use); +void io_req_task_complete(struct io_kiocb *req, bool *locked); +void io_req_task_queue_fail(struct io_kiocb *req, int ret); +void io_req_task_submit(struct io_kiocb *req, bool *locked); +void tctx_task_work(struct callback_head *cb); +__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); +int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx); + +int io_poll_issue(struct io_kiocb *req, bool *locked); +int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); +int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); +void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); +int io_req_prep_async(struct io_kiocb *req); + +struct io_wq_work *io_wq_free_work(struct io_wq_work *work); +void io_wq_submit_work(struct io_wq_work *work); + +void io_free_req(struct io_kiocb *req); +void io_queue_next(struct io_kiocb *req); + +bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, + bool cancel_all); + +#define io_for_each_link(pos, head) \ + for (pos = (head); pos; pos = pos->link) static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { @@ -177,52 +224,4 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } -int io_run_task_work_sig(void); -void io_req_complete_failed(struct io_kiocb *req, s32 res); 
-void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); -void io_req_complete_post(struct io_kiocb *req); -void __io_req_complete_post(struct io_kiocb *req); -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -void io_cqring_ev_posted(struct io_ring_ctx *ctx); -void __io_commit_cqring_flush(struct io_ring_ctx *ctx); - -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); - -struct file *io_file_get_normal(struct io_kiocb *req, int fd); -struct file *io_file_get_fixed(struct io_kiocb *req, int fd, - unsigned issue_flags); - -bool io_is_uring_fops(struct file *file); -bool io_alloc_async_data(struct io_kiocb *req); -void io_req_task_work_add(struct io_kiocb *req); -void io_req_task_prio_work_add(struct io_kiocb *req); -void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); -void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, bool *dont_use); -void io_req_task_complete(struct io_kiocb *req, bool *locked); -void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, bool *locked); -void tctx_task_work(struct callback_head *cb); -__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); - -int io_poll_issue(struct io_kiocb *req, bool *locked); -int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); -int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); -void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); -int io_req_prep_async(struct io_kiocb *req); - -struct io_wq_work *io_wq_free_work(struct io_wq_work *work); -void io_wq_submit_work(struct io_wq_work *work); - -void io_free_req(struct io_kiocb *req); -void io_queue_next(struct io_kiocb *req); - -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, - bool cancel_all); - -#define 
io_for_each_link(pos, head) \ - for (pos = (head); pos; pos = pos->link) - #endif From f0b15b51bc01774411ab37352fa733017657bf89 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 19 Jun 2022 12:26:06 +0100 Subject: [PATCH 1024/1250] io_uring: move io_eventfd_signal() Move io_eventfd_signal() in the sources without any changes and kill its forward declaration. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9ebebb3f6f56f5a5448a621e0b6a537720c43334.1655637157.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index efad2d9b7b42c5..61d4e6d0731a20 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -142,8 +142,6 @@ static void io_queue_sqe(struct io_kiocb *req); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); -static void io_eventfd_signal(struct io_ring_ctx *ctx); - static struct kmem_cache *req_cachep; struct sock *io_uring_get_socket(struct file *file) @@ -472,20 +470,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) } } -void __io_commit_cqring_flush(struct io_ring_ctx *ctx) -{ - if (ctx->off_timeout_used || ctx->drain_active) { - spin_lock(&ctx->completion_lock); - if (ctx->off_timeout_used) - io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); - spin_unlock(&ctx->completion_lock); - } - if (ctx->has_evfd) - io_eventfd_signal(ctx); -} - static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; @@ -513,6 +497,20 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx) rcu_read_unlock(); } +void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +{ + if (ctx->off_timeout_used || ctx->drain_active) { + spin_lock(&ctx->completion_lock); + if (ctx->off_timeout_used) + io_flush_timeouts(ctx); + if (ctx->drain_active) + io_queue_deferred(ctx); + spin_unlock(&ctx->completion_lock); + } + if 
(ctx->has_evfd) + io_eventfd_signal(ctx); +} + /* * This should only get called when at least one event has been posted. * Some applications rely on the eventfd notification count only changing From 0e38e3122212ae4bc6646bb111d010b1232dabd6 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 19 Jun 2022 12:26:08 +0100 Subject: [PATCH 1025/1250] io_uring: remove ->flush_cqes optimisation It's not clear how widely used IOSQE_CQE_SKIP_SUCCESS is, and how often the ->flush_cqes flag prevents completions from being flushed. Sometimes it's a high level of concurrency that enables it for at least one CQE, but sometimes it doesn't save much because nobody is waiting on the CQ. Remove the ->flush_cqes flag and the optimisation; it should benefit the normal use case. Note that there is no spurious eventfd problem with that, as checks for spuriousness were incorporated into io_eventfd_signal(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/692e81eeddccc096f449a7960365fa7b4a18f8e6.1655637157.git.asml.silence@gmail.com [axboe: remove now dead state->flush_cqes variable] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - io_uring/io_uring.c | 23 ++++++++++------------- io_uring/io_uring.h | 2 -- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 2015f3ea7cb76b..6bcd7bff6479be 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -148,7 +148,6 @@ struct io_submit_state { bool plug_started; bool need_plug; - bool flush_cqes; unsigned short submit_nr; struct blk_plug plug; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 61d4e6d0731a20..16a625e854ec33 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1250,22 +1250,19 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; - if (state->flush_cqes) { -
spin_lock(&ctx->completion_lock); - wq_list_for_each(node, prev, &state->compl_reqs) { - struct io_kiocb *req = container_of(node, struct io_kiocb, - comp_list); - - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(ctx, req); - } + spin_lock(&ctx->completion_lock); + wq_list_for_each(node, prev, &state->compl_reqs) { + struct io_kiocb *req = container_of(node, struct io_kiocb, + comp_list); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - state->flush_cqes = false; + if (!(req->flags & REQ_F_CQE_SKIP)) + __io_fill_cqe_req(ctx, req); } + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); + io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 7b2055b342dfab..bdc62727638be8 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -219,8 +219,6 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) { struct io_submit_state *state = &req->ctx->submit_state; - if (!(req->flags & REQ_F_CQE_SKIP)) - state->flush_cqes = true; wq_list_add_tail(&req->comp_list, &state->compl_reqs); } From 31f9a92ac61bb05dc53fb704014b4c8bcd8cf982 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:52 +0100 Subject: [PATCH 1026/1250] io_uring: fix multi ctx cancellation io_uring_try_cancel_requests() loops until there is nothing left to do with the ring, however there might be several rings and they might have dependencies between them, e.g. via poll requests. Instead of cancelling rings one by one, try to cancel them all and only then loop over if we still potentially have some work to do.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8d491fe02d8ac4c77ff38061cf86b9a827e8845c.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 87 ++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 16a625e854ec33..707b599b9224a4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -132,7 +132,7 @@ struct io_defer_entry { #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) -static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, +static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); @@ -2648,7 +2648,9 @@ static __cold void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { - io_uring_try_cancel_requests(ctx, NULL, true); + while (io_uring_try_cancel_requests(ctx, NULL, true)) + cond_resched(); + if (ctx->sq_data) { struct io_sq_data *sqd = ctx->sq_data; struct task_struct *tsk; @@ -2806,53 +2808,48 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, +static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; + enum io_wq_cancel cret; + bool ret = false; /* failed during ring init, it couldn't have issued any requests */ if (!ctx->rings) - return; - - while (1) { - enum io_wq_cancel cret; - bool ret = false; + return false; - if (!task) { - ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { - /* - * Cancels requests of all rings, not only @ctx, but - * it's fine as the task is in exit/exec. 
- */ - cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, - &cancel, true); - ret |= (cret != IO_WQ_CANCEL_NOTFOUND); - } + if (!task) { + ret |= io_uring_try_cancel_iowq(ctx); + } else if (tctx && tctx->io_wq) { + /* + * Cancels requests of all rings, not only @ctx, but + * it's fine as the task is in exit/exec. + */ + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, + &cancel, true); + ret |= (cret != IO_WQ_CANCEL_NOTFOUND); + } - /* SQPOLL thread does its own polling */ - if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { - while (!wq_list_empty(&ctx->iopoll_list)) { - io_iopoll_try_reap_events(ctx); - ret = true; - } + /* SQPOLL thread does its own polling */ + if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || + (ctx->sq_data && ctx->sq_data->thread == current)) { + while (!wq_list_empty(&ctx->iopoll_list)) { + io_iopoll_try_reap_events(ctx); + ret = true; } - - ret |= io_cancel_defer_files(ctx, task, cancel_all); - mutex_lock(&ctx->uring_lock); - ret |= io_poll_remove_all(ctx, task, cancel_all); - mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) - ret |= io_run_task_work(); - if (!ret) - break; - cond_resched(); } + + ret |= io_cancel_defer_files(ctx, task, cancel_all); + mutex_lock(&ctx->uring_lock); + ret |= io_poll_remove_all(ctx, task, cancel_all); + mutex_unlock(&ctx->uring_lock); + ret |= io_kill_timeouts(ctx, task, cancel_all); + if (task) + ret |= io_run_task_work(); + return ret; } static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) @@ -2882,6 +2879,8 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) atomic_inc(&tctx->in_idle); do { + bool loop = false; + io_uring_drop_tctx_refs(current); /* read completions before cancelations */ inflight = tctx_inflight(tctx, !cancel_all); @@ -2896,13 +2895,19 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) /* 
sqpoll task will cancel all its requests */ if (node->ctx->sq_data) continue; - io_uring_try_cancel_requests(node->ctx, current, - cancel_all); + loop |= io_uring_try_cancel_requests(node->ctx, + current, cancel_all); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) - io_uring_try_cancel_requests(ctx, current, - cancel_all); + loop |= io_uring_try_cancel_requests(ctx, + current, + cancel_all); + } + + if (loop) { + cond_resched(); + continue; } prepare_to_wait(&tctx->wait, &wait, TASK_INTERRUPTIBLE); From 8703e6df74da0c0672ad7c4a5f7a725b0d47f8d0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:53 +0100 Subject: [PATCH 1027/1250] io_uring: improve task exit timeout cancellations Don't spin trying to cancel timeouts that are reachable but not cancellable, e.g. already executing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ab8a7440a60bbdf69ae514f672ad050e43dd1b03.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 557c637af158be..a79a7d6ef1b37a 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -49,7 +49,7 @@ static inline void io_put_req(struct io_kiocb *req) } } -static void io_kill_timeout(struct io_kiocb *req, int status) +static bool io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -64,7 +64,9 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&timeout->list); io_req_tw_post_queue(req, status, 0); + return true; } + return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) @@ -620,10 +622,9 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = 
cmd_to_io_kiocb(timeout); - if (io_match_task(req, tsk, cancel_all)) { - io_kill_timeout(req, -ECANCELED); + if (io_match_task(req, tsk, cancel_all) && + io_kill_timeout(req, -ECANCELED)) canceled++; - } } spin_unlock_irq(&ctx->timeout_lock); io_commit_cqring(ctx); From 61beb1f58edbd794056bad58dc46a224948c9933 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:54 +0100 Subject: [PATCH 1028/1250] io_uring: fix io_poll_remove_all clang warnings clang complains on bitwise operations with bools, add a bit more verbosity to better show that we want to call io_poll_remove_all_table() twice but with different arguments. Reported-by: Nathan Chancellor Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f11d21dcdf9233e0eeb15fa13b858a05a78eb310.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index b2659b56c702f3..cbf44c38efd9ee 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -595,8 +595,11 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all) __must_hold(&ctx->uring_lock) { - return io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all) | - io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); + bool ret; + + ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all); + ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); + return ret; } static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, From 93996d971371443fe2d69b3ce84fe10b5e9052bd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:55 +0100 Subject: [PATCH 1029/1250] io_uring: hide eventfd assumptions in eventfd paths Some io_uring-eventfd users assume that there won't be spurious wakeups. 
That assumption has to be honoured by all io_cqring_ev_posted() callers, which is inconvenient and from time to time leads to problems but should be maintained to not break the userspace. Instead of making the callers track whether a CQE was posted or not, hide it inside io_eventfd_signal(). It saves ->cached_cq_tail it saw last time and triggers the eventfd only when ->cached_cq_tail changed since then. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ffc66bae37a2513080b601e4370e147faaa72c5.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ io_uring/io_uring.c | 44 ++++++++++++++++++++-------------- io_uring/timeout.c | 3 +-- 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 6bcd7bff6479be..5987f8acca3838 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -314,6 +314,8 @@ struct io_ring_ctx { struct list_head defer_list; unsigned sq_thread_idle; + /* protected by ->completion_lock */ + unsigned evfd_last_cq_tail; }; enum { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 707b599b9224a4..84f92362521670 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -473,6 +473,22 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx) static void io_eventfd_signal(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; + bool skip; + + spin_lock(&ctx->completion_lock); + /* + * Eventfd should only get triggered when at least one event has been + * posted. Some applications rely on the eventfd notification count only + * changing IFF a new CQE has been added to the CQ ring. There's no + * depedency on 1:1 relationship between how many times this function is + * called (and hence the eventfd count) and number of CQEs posted to the + * CQ ring. 
+ */ + skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + if (skip) + return; rcu_read_lock(); /* @@ -511,13 +527,6 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -/* - * This should only get called when at least one event has been posted. - * Some applications rely on the eventfd notification count only changing - * IFF a new CQE has been added to the CQ ring. There's no depedency on - * 1:1 relationship between how many times this function is called (and - * hence the eventfd count) and number of CQEs posted to the CQ ring. - */ void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || @@ -530,7 +539,7 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx) /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { - bool all_flushed, posted; + bool all_flushed; size_t cqe_size = sizeof(struct io_uring_cqe); if (!force && __io_cqring_events(ctx) == ctx->cq_entries) @@ -539,7 +548,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (ctx->flags & IORING_SETUP_CQE32) cqe_size <<= 1; - posted = false; spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); @@ -554,7 +562,6 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) else io_account_cq_overflow(ctx); - posted = true; list_del(&ocqe->list); kfree(ocqe); } @@ -567,8 +574,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); return all_flushed; } @@ -758,8 +764,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, filled = io_fill_cqe_aux(ctx, user_data, res, cflags); 
io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (filled) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); return filled; } @@ -940,14 +945,12 @@ __cold void io_free_req(struct io_kiocb *req) static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - bool posted; spin_lock(&ctx->completion_lock); - posted = io_disarm_next(req); + io_disarm_next(req); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) @@ -2428,6 +2431,11 @@ static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, kfree(ev_fd); return ret; } + + spin_lock(&ctx->completion_lock); + ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); + ev_fd->eventfd_async = eventfd_async; ctx->has_evfd = true; rcu_assign_pointer(ctx->io_ev_fd, ev_fd); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index a79a7d6ef1b37a..424b2fc858b8b4 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -629,7 +629,6 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, spin_unlock_irq(&ctx->timeout_lock); io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - if (canceled != 0) - io_cqring_ev_posted(ctx); + io_cqring_ev_posted(ctx); return canceled != 0; } From 3805a7e49eea4d24fb3ff11756dc09767fa441af Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:56 +0100 Subject: [PATCH 1030/1250] io_uring: introduce locking helpers for CQE posting spin_lock(&ctx->completion_lock); /* post CQEs */ io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); We have many places repeating this sequence, and the three-function unlock section is not perfect from the maintenance perspective and also makes it harder to add new locking/sync tricks. Introduce two helpers.
io_cq_lock(), which is simple and only grabs ->completion_lock, and io_cq_unlock_post() encapsulating the three call section. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/fe0c682bf7f7b55d9be55b0d034be9c1949277dc.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 57 +++++++++++++++++++++------------------------ io_uring/io_uring.h | 9 ++++++- io_uring/timeout.c | 6 ++--- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 84f92362521670..0db73d01455d10 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -527,7 +527,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_signal(ctx); } -void io_cqring_ev_posted(struct io_ring_ctx *ctx) +static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) { if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) @@ -536,6 +536,19 @@ void io_cqring_ev_posted(struct io_ring_ctx *ctx) io_cqring_wake(ctx); } +static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) + __releases(ctx->completion_lock) +{ + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_cqring_ev_posted(ctx); +} + +void io_cq_unlock_post(struct io_ring_ctx *ctx) +{ + __io_cq_unlock_post(ctx); +} + /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { @@ -548,7 +561,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) if (ctx->flags & IORING_SETUP_CQE32) cqe_size <<= 1; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; @@ -572,9 +585,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - 
io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); return all_flushed; } @@ -760,11 +771,9 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, { bool filled; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); return filled; } @@ -810,11 +819,9 @@ void io_req_complete_post(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); __io_req_complete_post(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); } inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags) @@ -946,11 +953,9 @@ static void __io_req_find_next_prep(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); io_disarm_next(req); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); } static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) @@ -984,13 +989,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static inline void ctx_commit_and_unlock(struct io_ring_ctx *ctx) -{ - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); -} - static void handle_prev_tw_list(struct io_wq_work_node *node, struct io_ring_ctx **ctx, bool *uring_locked) { @@ -1006,7 +1004,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, if (req->ctx != *ctx) { if (unlikely(!*uring_locked && *ctx)) - ctx_commit_and_unlock(*ctx); + io_cq_unlock_post(*ctx); ctx_flush_and_put(*ctx, uring_locked); *ctx = req->ctx; @@ -1014,7 +1012,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, *uring_locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); if 
(unlikely(!*uring_locked)) - spin_lock(&(*ctx)->completion_lock); + io_cq_lock(*ctx); } if (likely(*uring_locked)) { req->io_task_work.func(req, uring_locked); @@ -1026,7 +1024,7 @@ static void handle_prev_tw_list(struct io_wq_work_node *node, } while (node); if (unlikely(!*uring_locked)) - ctx_commit_and_unlock(*ctx); + io_cq_unlock_post(*ctx); } static void handle_tw_list(struct io_wq_work_node *node, @@ -1261,10 +1259,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) if (!(req->flags & REQ_F_CQE_SKIP)) __io_fill_cqe_req(ctx, req); } - - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + __io_cq_unlock_post(ctx); io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index bdc62727638be8..738fb96575ab7c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -24,7 +24,6 @@ void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -void io_cqring_ev_posted(struct io_ring_ctx *ctx); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); @@ -66,6 +65,14 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) +static inline void io_cq_lock(struct io_ring_ctx *ctx) + __acquires(ctx->completion_lock) +{ + spin_lock(&ctx->completion_lock); +} + +void io_cq_unlock_post(struct io_ring_ctx *ctx); + static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) { if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 424b2fc858b8b4..7e2c341f976258 100644 --- a/io_uring/timeout.c +++ 
b/io_uring/timeout.c @@ -617,7 +617,7 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_timeout *timeout, *tmp; int canceled = 0; - spin_lock(&ctx->completion_lock); + io_cq_lock(ctx); spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); @@ -627,8 +627,6 @@ __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, canceled++; } spin_unlock_irq(&ctx->timeout_lock); - io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - io_cqring_ev_posted(ctx); + io_cq_unlock_post(ctx); return canceled != 0; } From 0dc54bfa4883cbf291e7ac868a1972abf61e6cad Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:57 +0100 Subject: [PATCH 1031/1250] io_uring: add io_commit_cqring_flush() Since __io_commit_cqring_flush users moved to different files, introduce io_commit_cqring_flush() helper and encapsulate all flags testing details inside. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0da03887435dd9869ffe46dcd3962bf104afcca3.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +---- io_uring/io_uring.h | 6 ++++++ io_uring/rw.c | 5 +---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0db73d01455d10..3e65e04915a782 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -529,10 +529,7 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) static inline void io_cqring_ev_posted(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - + io_commit_cqring_flush(ctx); io_cqring_wake(ctx); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 738fb96575ab7c..afca7ff8956c32 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -229,4 +229,10 @@ static inline void io_req_add_compl_list(struct io_kiocb *req) wq_list_add_tail(&req->comp_list, &state->compl_reqs); } +static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) +{ + if (unlikely(ctx->off_timeout_used || ctx->drain_active || ctx->has_evfd)) + __io_commit_cqring_flush(ctx); +} + #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index f8b42f2265df98..0028e95e6633af 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -1016,10 +1016,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || - ctx->has_evfd)) - __io_commit_cqring_flush(ctx); - + io_commit_cqring_flush(ctx); if (ctx->flags & IORING_SETUP_SQPOLL) io_cqring_wake(ctx); } From 77ae66d7f7ed4d9a68647623582e9c361a59db79 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:58 +0100 Subject: [PATCH 1032/1250] io_uring: opcode independent fixed buf import Fixed buffers are generic infrastructure, make 
io_import_fixed() opcode agnostic. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b1e765c8a1c2c913a05a28d2399fc53e1d3cf37a.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 0028e95e6633af..ded8ef01165cdd 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -273,14 +273,15 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, return IOU_ISSUE_SKIP_COMPLETE; } -static int __io_import_fixed(struct io_kiocb *req, int ddir, - struct iov_iter *iter, struct io_mapped_ubuf *imu) +static int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) { - struct io_rw *rw = io_kiocb_to_cmd(req); - size_t len = rw->len; - u64 buf_end, buf_addr = rw->addr; + u64 buf_end; size_t offset; + if (WARN_ON_ONCE(!imu)) + return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ @@ -332,14 +333,6 @@ static int __io_import_fixed(struct io_kiocb *req, int ddir, return 0; } -static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter, - unsigned int issue_flags) -{ - if (WARN_ON_ONCE(!req->imu)) - return -EFAULT; - return __io_import_fixed(req, rw, iter, req->imu); -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) @@ -426,7 +419,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, ssize_t ret; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(req, ddir, iter, issue_flags); + ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len); if (ret) return ERR_PTR(ret); return NULL; From ccf48f131d16cf6d3334286b7e20a3aad8676fc5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:25:59 +0100 Subject: [PATCH 1033/1250] 
io_uring: move io_import_fixed() Move io_import_fixed() into rsrc.c where it belongs. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4d5becb21f332b4fef6a7cedd6a50e65e2371630.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ io_uring/rsrc.h | 3 +++ io_uring/rw.c | 60 ------------------------------------------------- 3 files changed, 63 insertions(+), 60 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 68629eba413265..1106089551595e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1310,3 +1310,63 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, io_rsrc_node_switch(ctx, NULL); return ret; } + +int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) +{ + u64 buf_end; + size_t offset; + + if (WARN_ON_ONCE(!imu)) + return -EFAULT; + if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) + return -EFAULT; + /* not inside the mapped region */ + if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) + return -EFAULT; + + /* + * May not be a start of buffer, set size appropriately + * and advance us to the beginning. + */ + offset = buf_addr - imu->ubuf; + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); + + if (offset) { + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. We can cheat a bit here, because + * we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are PAGE_SIZE in size, except potentially the + * first and last bvec + * + * So just find our index, and adjust the iterator afterwards. + * If the offset is within the first bvec (or the whole first + * bvec, just use iov_iter_advance(). This makes it easier + * since we can just skip the first segment, which may not + * be PAGE_SIZE aligned. 
+ */ + const struct bio_vec *bvec = imu->bvec; + + if (offset <= bvec->bv_len) { + iov_iter_advance(iter, offset); + } else { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> PAGE_SHIFT); + + iter->bvec = bvec + seg_skip; + iter->nr_segs -= seg_skip; + iter->count -= bvec->bv_len + offset; + iter->iov_offset = offset & ~PAGE_MASK; + } + } + + return 0; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 03f26516e99463..87f58315b247bc 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -64,6 +64,9 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill); +int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len); void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); diff --git a/io_uring/rw.c b/io_uring/rw.c index ded8ef01165cdd..e07f2670dfa815 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -273,66 +273,6 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, return IOU_ISSUE_SKIP_COMPLETE; } -static int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) -{ - u64 buf_end; - size_t offset; - - if (WARN_ON_ONCE(!imu)) - return -EFAULT; - if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) - return -EFAULT; - /* not inside the mapped region */ - if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end)) - return -EFAULT; - - /* - * May not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. 
We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are PAGE_SIZE in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be PAGE_SIZE aligned. - */ - const struct bio_vec *bvec = imu->bvec; - - if (offset <= bvec->bv_len) { - iov_iter_advance(iter, offset); - } else { - unsigned long seg_skip; - - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> PAGE_SHIFT); - - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ~PAGE_MASK; - } - } - - return 0; -} - #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, unsigned int issue_flags) From 8312cc0e08683d71bc077e676008bd1e73ccf276 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:26:00 +0100 Subject: [PATCH 1034/1250] io_uring: consistent naming for inline completion Improve naming of the inline/deferred completion helper so it's consistent with its *_post counterpart. Add some comments and extra lockdeps to ensure the locking is done right.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/797c619943dac06529e9d3fcb16e4c3cde6ad1a3.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 ++-- io_uring/io_uring.h | 10 +++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3e65e04915a782..8bc63413fc547e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1380,7 +1380,7 @@ void io_req_task_complete(struct io_kiocb *req, bool *locked) } if (*locked) - io_req_add_compl_list(req); + io_req_complete_defer(req); else io_req_complete_post(req); } @@ -1648,7 +1648,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (ret == IOU_OK) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) - io_req_add_compl_list(req); + io_req_complete_defer(req); else io_req_complete_post(req); } else if (ret != IOU_ISSUE_SKIP_COMPLETE) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index afca7ff8956c32..7a00bbe85d35dd 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -222,10 +222,18 @@ static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) } } -static inline void io_req_add_compl_list(struct io_kiocb *req) +/* + * Don't complete immediately but use deferred completion infrastructure. + * Protected by ->uring_lock and can only be used either with + * IO_URING_F_COMPLETE_DEFER or inside a tw handler holding the mutex. 
+ */ +static inline void io_req_complete_defer(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) { struct io_submit_state *state = &req->ctx->submit_state; + lockdep_assert_held(&req->ctx->uring_lock); + wq_list_add_tail(&req->comp_list, &state->compl_reqs); } From 4916f55c4a594887d32ee28d109f56cf433d16cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 01:26:01 +0100 Subject: [PATCH 1035/1250] io_uring: add a warn_once for poll_find io_poll_remove() expects poll_find() to search only for poll requests and passes a flag for this. Just be a little bit extra cautious considering lots of recent poll/cancellation changes and add a WARN_ON_ONCE checking that we don't get an apoll'ed request. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ec9a66f1e22f99dcd02288d4e42f3cc6bb357804.1655684496.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/io_uring/poll.c b/io_uring/poll.c index cbf44c38efd9ee..bd3110750cfada 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -833,6 +833,11 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) } found: + if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) { + ret = -EFAULT; + goto out; + } + if (poll_update->update_events || poll_update->update_user_data) { /* only mask one event flags, keep behavior flags */ if (poll_update->update_events) { From 1a502b544790b060866f5b3b21f83f87468ca696 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 20 Jun 2022 15:27:35 +0100 Subject: [PATCH 1036/1250] io_uring: optimize io_uring_task layout task_work bits of io_uring_task are split into two cache lines causing extra cache bouncing, place them into a separate cache line. Also move the most used submission path fields closer together, so they are hot.
Cc: stable@vger.kernel.org # 5.15+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/tctx.h | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/io_uring/tctx.h b/io_uring/tctx.h index dde82ce4d8e29b..dead0ed0042984 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -7,22 +7,24 @@ struct io_uring_task { /* submission side */ - int cached_refs; - struct xarray xa; - struct wait_queue_head wait; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - struct percpu_counter inflight; - atomic_t inflight_tracked; - atomic_t in_idle; - - spinlock_t task_lock; - struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; - struct callback_head task_work; - bool task_running; - - struct file *registered_rings[IO_RINGFD_REG_MAX]; + int cached_refs; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct file *registered_rings[IO_RINGFD_REG_MAX]; + + struct xarray xa; + struct wait_queue_head wait; + atomic_t in_idle; + atomic_t inflight_tracked; + struct percpu_counter inflight; + + struct { /* task_work */ + spinlock_t task_lock; + bool task_running; + struct io_wq_work_list task_list; + struct io_wq_work_list prio_task_list; + struct callback_head task_work; + } ____cacheline_aligned_in_smp; }; struct io_tctx_node { From 80ff20d177b416ee3e634ab6f93cac97354ebe99 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Jun 2022 10:09:00 +0100 Subject: [PATCH 1037/1250] io_uring: improve io_run_task_work() Since SQPOLL now uses TWA_SIGNAL_NO_IPI, there won't be task work items without TIF_NOTIFY_SIGNAL. Simplify io_run_task_work() by removing task->task_works check. Even though it looks like it doesn't cause extra cache bouncing, it's still nice to not touch it an extra time when it might not be cached.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/75d4f34b0c671075892821a409e28da6cb1d64fe.1655802465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 7a00bbe85d35dd..4c4d38ffc5ec5e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -203,7 +203,7 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) static inline bool io_run_task_work(void) { - if (test_thread_flag(TIF_NOTIFY_SIGNAL) || task_work_pending(current)) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) { __set_current_state(TASK_RUNNING); clear_notify_signal(); if (task_work_pending(current)) From 2ae163969f83b61436af01723b46cd2648738fea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Jun 2022 10:09:01 +0100 Subject: [PATCH 1038/1250] io_uring: move list helpers to a separate file It's annoying to have io-wq.h as a dependency every time we want some of struct io_wq_work_list helpers, move them into a separate file. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c1d891ce12b30767d1d2a3b7db2ca3abc1ecc4a2.1655802465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 1 + io_uring/io-wq.h | 131 ----------------------------------------- io_uring/io_uring.h | 1 + io_uring/slist.h | 138 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 131 deletions(-) create mode 100644 io_uring/slist.h diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 824623bcf1a53f..3e34dfbdf9466a 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -18,6 +18,7 @@ #include #include "io-wq.h" +#include "slist.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index 10b80ef78bb817..31228426d19241 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -21,137 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -#define wq_list_for_each(pos, prv, head) \ - for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) - -#define wq_list_for_each_resume(pos, prv) \ - for (; pos; prv = pos, pos = (pos)->next) - -#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) -#define INIT_WQ_LIST(list) do { \ - (list)->first = NULL; \ -} while (0) - -static inline void wq_list_add_after(struct io_wq_work_node *node, - struct io_wq_work_node *pos, - struct io_wq_work_list *list) -{ - struct io_wq_work_node *next = pos->next; - - pos->next = node; - node->next = next; - if (!next) - list->last = node; -} - -/** - * wq_list_merge - merge the second list to the first one. - * @list0: the first list - * @list1: the second list - * Return the first node after mergence. 
- */ -static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, - struct io_wq_work_list *list1) -{ - struct io_wq_work_node *ret; - - if (!list0->first) { - ret = list1->first; - } else { - ret = list0->first; - list0->last->next = list1->first; - } - INIT_WQ_LIST(list0); - INIT_WQ_LIST(list1); - return ret; -} - -static inline void wq_list_add_tail(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = NULL; - if (!list->first) { - list->last = node; - WRITE_ONCE(list->first, node); - } else { - list->last->next = node; - list->last = node; - } -} - -static inline void wq_list_add_head(struct io_wq_work_node *node, - struct io_wq_work_list *list) -{ - node->next = list->first; - if (!node->next) - list->last = node; - WRITE_ONCE(list->first, node); -} - -static inline void wq_list_cut(struct io_wq_work_list *list, - struct io_wq_work_node *last, - struct io_wq_work_node *prev) -{ - /* first in the list, if prev==NULL */ - if (!prev) - WRITE_ONCE(list->first, last->next); - else - prev->next = last->next; - - if (last == list->last) - list->last = prev; - last->next = NULL; -} - -static inline void __wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - list->last->next = to->next; - to->next = list->first; - INIT_WQ_LIST(list); -} - -static inline bool wq_list_splice(struct io_wq_work_list *list, - struct io_wq_work_node *to) -{ - if (!wq_list_empty(list)) { - __wq_list_splice(list, to); - return true; - } - return false; -} - -static inline void wq_stack_add_head(struct io_wq_work_node *node, - struct io_wq_work_node *stack) -{ - node->next = stack->next; - stack->next = node; -} - -static inline void wq_list_del(struct io_wq_work_list *list, - struct io_wq_work_node *node, - struct io_wq_work_node *prev) -{ - wq_list_cut(list, node, prev); -} - -static inline -struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) -{ - struct io_wq_work_node *node = stack->next; - - 
stack->next = node->next; - return node; -} - -static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) -{ - if (!work->list.next) - return NULL; - - return container_of(work->list.next, struct io_wq_work, list); -} - typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); typedef void (io_wq_work_fn)(struct io_wq_work *); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 4c4d38ffc5ec5e..f026d2670959e6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,6 +5,7 @@ #include #include #include "io-wq.h" +#include "slist.h" #include "filetable.h" #ifndef CREATE_TRACE_POINTS diff --git a/io_uring/slist.h b/io_uring/slist.h new file mode 100644 index 00000000000000..f27601fa46607b --- /dev/null +++ b/io_uring/slist.h @@ -0,0 +1,138 @@ +#ifndef INTERNAL_IO_SLIST_H +#define INTERNAL_IO_SLIST_H + +#include + +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_for_each_resume(pos, prv) \ + for (; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) + +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ +} while (0) + +static inline void wq_list_add_after(struct io_wq_work_node *node, + struct io_wq_work_node *pos, + struct io_wq_work_list *list) +{ + struct io_wq_work_node *next = pos->next; + + pos->next = node; + node->next = next; + if (!next) + list->last = node; +} + +/** + * wq_list_merge - merge the second list to the first one. + * @list0: the first list + * @list1: the second list + * Return the first node after mergence. 
+ */ +static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0, + struct io_wq_work_list *list1) +{ + struct io_wq_work_node *ret; + + if (!list0->first) { + ret = list1->first; + } else { + ret = list0->first; + list0->last->next = list1->first; + } + INIT_WQ_LIST(list0); + INIT_WQ_LIST(list1); + return ret; +} + +static inline void wq_list_add_tail(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = NULL; + if (!list->first) { + list->last = node; + WRITE_ONCE(list->first, node); + } else { + list->last->next = node; + list->last = node; + } +} + +static inline void wq_list_add_head(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = list->first; + if (!node->next) + list->last = node; + WRITE_ONCE(list->first, node); +} + +static inline void wq_list_cut(struct io_wq_work_list *list, + struct io_wq_work_node *last, + struct io_wq_work_node *prev) +{ + /* first in the list, if prev==NULL */ + if (!prev) + WRITE_ONCE(list->first, last->next); + else + prev->next = last->next; + + if (last == list->last) + list->last = prev; + last->next = NULL; +} + +static inline void __wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + list->last->next = to->next; + to->next = list->first; + INIT_WQ_LIST(list); +} + +static inline bool wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + if (!wq_list_empty(list)) { + __wq_list_splice(list, to); + return true; + } + return false; +} + +static inline void wq_stack_add_head(struct io_wq_work_node *node, + struct io_wq_work_node *stack) +{ + node->next = stack->next; + stack->next = node; +} + +static inline void wq_list_del(struct io_wq_work_list *list, + struct io_wq_work_node *node, + struct io_wq_work_node *prev) +{ + wq_list_cut(list, node, prev); +} + +static inline +struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) +{ + struct io_wq_work_node *node = stack->next; + + 
stack->next = node->next; + return node; +} + +static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) +{ + if (!work->list.next) + return NULL; + + return container_of(work->list.next, struct io_wq_work, list); +} + +#endif // INTERNAL_IO_SLIST_H \ No newline at end of file From c582ded59d1050a34583d3ce530c74ca29779d4f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 21 Jun 2022 10:09:02 +0100 Subject: [PATCH 1039/1250] io_uring: dedup io_run_task_work We have an identical copy of io_run_task_work() for io-wq called io_flush_signals(), deduplicate them. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a157a4df5fa217b8bd03c73494f2fd0e24e44fbc.1655802465.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/filetable.h | 2 ++ io_uring/io-wq.c | 17 +++-------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 6b58aa48bc45d3..fb5a274c08ffcc 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -2,6 +2,8 @@ #ifndef IOU_FILE_TABLE_H #define IOU_FILE_TABLE_H +#include + struct io_ring_ctx; struct io_kiocb; diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 3e34dfbdf9466a..77df5b43bf5239 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -19,6 +19,7 @@ #include "io-wq.h" #include "slist.h" +#include "io_uring.h" #define WORKER_IDLE_TIMEOUT (5 * HZ) @@ -519,23 +520,11 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, return NULL; } -static bool io_flush_signals(void) -{ - if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { - __set_current_state(TASK_RUNNING); - clear_notify_signal(); - if (task_work_pending(current)) - task_work_run(); - return true; - } - return false; -} - static void io_assign_current_work(struct io_worker *worker, struct io_wq_work *work) { if (work) { - io_flush_signals(); + io_run_task_work(); cond_resched(); } @@ -655,7 +644,7 @@ static int io_wqe_worker(void *data) last_timeout = false; 
__io_worker_idle(wqe, worker); raw_spin_unlock(&wqe->lock); - if (io_flush_signals()) + if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); if (signal_pending(current)) { From 8b05078de0dcf25f9a9109e37d878b643e6bd06e Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:21 -0700 Subject: [PATCH 1040/1250] io_uring: remove priority tw list optimisation This optimisation has some built in assumptions that make it easy to introduce bugs. It also does not have clear wins that make it worth keeping. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-2-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 77 +++++++-------------------------------------- io_uring/io_uring.h | 1 - io_uring/rw.c | 2 +- io_uring/tctx.c | 1 - io_uring/tctx.h | 1 - 5 files changed, 12 insertions(+), 70 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8bc63413fc547e..d21d0fc3645b12 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,44 +986,6 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static void handle_prev_tw_list(struct io_wq_work_node *node, - struct io_ring_ctx **ctx, bool *uring_locked) -{ - if (*ctx && !*uring_locked) - spin_lock(&(*ctx)->completion_lock); - - do { - struct io_wq_work_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - - prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - - if (req->ctx != *ctx) { - if (unlikely(!*uring_locked && *ctx)) - io_cq_unlock_post(*ctx); - - ctx_flush_and_put(*ctx, uring_locked); - *ctx = req->ctx; - /* if not contended, grab and improve batching */ - *uring_locked = mutex_trylock(&(*ctx)->uring_lock); - percpu_ref_get(&(*ctx)->refs); - if (unlikely(!*uring_locked)) - io_cq_lock(*ctx); - } - if (likely(*uring_locked)) { - req->io_task_work.func(req, uring_locked); - } else { - 
req->cqe.flags = io_put_kbuf_comp(req); - __io_req_complete_post(req); - } - node = next; - } while (node); - - if (unlikely(!*uring_locked)) - io_cq_unlock_post(*ctx); -} - static void handle_tw_list(struct io_wq_work_node *node, struct io_ring_ctx **ctx, bool *locked) { @@ -1054,27 +1016,20 @@ void tctx_task_work(struct callback_head *cb) task_work); while (1) { - struct io_wq_work_node *node1, *node2; + struct io_wq_work_node *node; spin_lock_irq(&tctx->task_lock); - node1 = tctx->prio_task_list.first; - node2 = tctx->task_list.first; + node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); - if (!node2 && !node1) + if (!node) tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); - if (!node2 && !node1) + if (!node) break; - - if (node1) - handle_prev_tw_list(node1, &ctx, &uring_locked); - if (node2) - handle_tw_list(node2, &ctx, &uring_locked); + handle_tw_list(node, &ctx, &uring_locked); cond_resched(); - if (data_race(!tctx->task_list.first) && - data_race(!tctx->prio_task_list.first) && uring_locked) + if (data_race(!tctx->task_list.first) && uring_locked) io_submit_flush_completions(ctx); } @@ -1086,8 +1041,7 @@ void tctx_task_work(struct callback_head *cb) } static void __io_req_task_work_add(struct io_kiocb *req, - struct io_uring_task *tctx, - struct io_wq_work_list *list) + struct io_uring_task *tctx) { struct io_ring_ctx *ctx = req->ctx; struct io_wq_work_node *node; @@ -1095,7 +1049,7 @@ static void __io_req_task_work_add(struct io_kiocb *req, bool running; spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, list); + wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); running = tctx->task_running; if (!running) tctx->task_running = true; @@ -1113,7 +1067,8 @@ static void __io_req_task_work_add(struct io_kiocb *req, spin_lock_irqsave(&tctx->task_lock, flags); tctx->task_running = false; - node = wq_list_merge(&tctx->prio_task_list, &tctx->task_list); + 
node = tctx->task_list.first; + INIT_WQ_LIST(&tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); while (node) { @@ -1129,17 +1084,7 @@ void io_req_task_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; - __io_req_task_work_add(req, tctx, &tctx->task_list); -} - -void io_req_task_prio_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - - if (req->ctx->flags & IORING_SETUP_SQPOLL) - __io_req_task_work_add(req, tctx, &tctx->prio_task_list); - else - __io_req_task_work_add(req, tctx, &tctx->task_list); + __io_req_task_work_add(req, tctx); } static void io_req_tw_post(struct io_kiocb *req, bool *locked) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index f026d2670959e6..f77e4a5403e4e5 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -36,7 +36,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_work_add(struct io_kiocb *req); -void io_req_task_prio_work_add(struct io_kiocb *req); void io_req_tw_post_queue(struct io_kiocb *req, s32 res, u32 cflags); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, bool *dont_use); diff --git a/io_uring/rw.c b/io_uring/rw.c index e07f2670dfa815..ade3e235f2770c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -215,7 +215,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; io_req_set_res(req, res, 0); req->io_task_work.func = io_req_task_complete; - io_req_task_prio_work_add(req); + io_req_task_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 9b30fb0d360307..7a68ba9beec3ea 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -88,7 +88,6 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, task->io_uring = tctx; spin_lock_init(&tctx->task_lock); 
INIT_WQ_LIST(&tctx->task_list); - INIT_WQ_LIST(&tctx->prio_task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } diff --git a/io_uring/tctx.h b/io_uring/tctx.h index dead0ed0042984..c8566ea5dca477 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -22,7 +22,6 @@ struct io_uring_task { spinlock_t task_lock; bool task_running; struct io_wq_work_list task_list; - struct io_wq_work_list prio_task_list; struct callback_head task_work; } ____cacheline_aligned_in_smp; }; From 46b0878d891b75b12e943b02a4c87bc144a53991 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:22 -0700 Subject: [PATCH 1041/1250] io_uring: remove __io_req_task_work_add this is no longer needed as there is only one caller Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-3-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d21d0fc3645b12..bf7ca2b279d3de 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1040,9 +1040,9 @@ void tctx_task_work(struct callback_head *cb) io_uring_drop_tctx_refs(current); } -static void __io_req_task_work_add(struct io_kiocb *req, - struct io_uring_task *tctx) +void io_req_task_work_add(struct io_kiocb *req) { + struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; struct io_wq_work_node *node; unsigned long flags; @@ -1080,13 +1080,6 @@ static void __io_req_task_work_add(struct io_kiocb *req, } } -void io_req_task_work_add(struct io_kiocb *req) -{ - struct io_uring_task *tctx = req->task->io_uring; - - __io_req_task_work_add(req, tctx); -} - static void io_req_tw_post(struct io_kiocb *req, bool *locked) { io_req_complete_post(req); From dca6cd4188f036fef9dd134fba7bb2ef954193b2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:23 -0700 Subject: [PATCH 1042/1250] io_uring: lockless task list 
With networking use cases we see contention on the spinlock used to protect the task_list when multiple threads try and add completions at once. Instead we can use a lockless list, and assume that the first caller to add to the list is responsible for kicking off task work. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-4-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 38 ++++++++-------------------------- io_uring/tctx.c | 3 +-- io_uring/tctx.h | 6 +++--- 4 files changed, 14 insertions(+), 35 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 5987f8acca3838..918165a200533f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); struct io_task_work { union { - struct io_wq_work_node node; + struct llist_node node; struct llist_node fallback_node; }; io_req_tw_func_t func; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bf7ca2b279d3de..0124335c6d092a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static void handle_tw_list(struct io_wq_work_node *node, + +static void handle_tw_list(struct llist_node *node, struct io_ring_ctx **ctx, bool *locked) { do { - struct io_wq_work_node *next = node->next; + struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb) struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); + struct llist_node *node = llist_del_all(&tctx->task_list); - while (1) { - struct io_wq_work_node *node; - - spin_lock_irq(&tctx->task_lock); - node = tctx->task_list.first; - 
INIT_WQ_LIST(&tctx->task_list); - if (!node) - tctx->task_running = false; - spin_unlock_irq(&tctx->task_lock); - if (!node) - break; + if (node) { handle_tw_list(node, &ctx, &uring_locked); cond_resched(); - - if (data_race(!tctx->task_list.first) && uring_locked) - io_submit_flush_completions(ctx); } ctx_flush_and_put(ctx, &uring_locked); @@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - struct io_wq_work_node *node; - unsigned long flags; + struct llist_node *node; bool running; - spin_lock_irqsave(&tctx->task_lock, flags); - wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); - running = tctx->task_running; - if (!running) - tctx->task_running = true; - spin_unlock_irqrestore(&tctx->task_lock, flags); + running = !llist_add(&req->io_task_work.node, &tctx->task_list); /* task_work already pending, we're done */ if (running) @@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req) if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - spin_lock_irqsave(&tctx->task_lock, flags); - tctx->task_running = false; - node = tctx->task_list.first; - INIT_WQ_LIST(&tctx->task_list); - spin_unlock_irqrestore(&tctx->task_lock, flags); + + node = llist_del_all(&tctx->task_list); while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 7a68ba9beec3ea..7f97d97fef0a96 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, atomic_set(&tctx->in_idle, 0); atomic_set(&tctx->inflight_tracked, 0); task->io_uring = tctx; - spin_lock_init(&tctx->task_lock); - INIT_WQ_LIST(&tctx->task_list); + init_llist_head(&tctx->task_list); init_task_work(&tctx->task_work, tctx_task_work); return 0; } diff --git a/io_uring/tctx.h b/io_uring/tctx.h index c8566ea5dca477..8a33ff6e5d9138 
100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include + /* * Arbitrary limit, can be raised if need be */ @@ -19,9 +21,7 @@ struct io_uring_task { struct percpu_counter inflight; struct { /* task_work */ - spinlock_t task_lock; - bool task_running; - struct io_wq_work_list task_list; + struct llist_head task_list; struct callback_head task_work; } ____cacheline_aligned_in_smp; }; From 977595c31bca7ab69efa6b4884c657bd73e825a1 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:24 -0700 Subject: [PATCH 1043/1250] io_uring: introduce llist helpers Introduce helpers to atomically switch llist. Will later move this into common code Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-5-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0124335c6d092a..356000255211fb 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1009,6 +1009,36 @@ static void handle_tw_list(struct llist_node *node, } while (node); } +/** + * io_llist_xchg - swap all entries in a lock-less list + * @head: the head of lock-less list to delete all entries + * @new: new entry as the head of the list + * + * If list is empty, return NULL, otherwise, return the pointer to the first entry. + * The order of entries returned is from the newest to the oldest added one. + */ +static inline struct llist_node *io_llist_xchg(struct llist_head *head, + struct llist_node *new) +{ + return xchg(&head->first, new); +} + +/** + * io_llist_cmpxchg - possibly swap all entries in a lock-less list + * @head: the head of lock-less list to delete all entries + * @old: expected old value of the first entry of the list + * @new: new entry as the head of the list + * + * perform a cmpxchg on the first entry of the list. 
+ */ + +static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, + struct llist_node *old, + struct llist_node *new) +{ + return cmpxchg(&head->first, old, new); +} + void tctx_task_work(struct callback_head *cb) { bool uring_locked = false; From 47583fc525b4066539f530cc38c846fef30587dd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:25 -0700 Subject: [PATCH 1044/1250] io_uring: batch task_work Batching task work up is an important performance optimisation, as task_work_add is expensive. In order to keep the semantics replace the task_list with a fake node while processing the old list, and then do a cmpxchg at the end to see if there is more work. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-6-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 356000255211fb..9d523fafacb72d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,11 +986,11 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } - static void handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked) + struct io_ring_ctx **ctx, bool *locked, + struct llist_node *last) { - do { + while (node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); @@ -1006,7 +1006,7 @@ static void handle_tw_list(struct llist_node *node, } req->io_task_work.func(req, locked); node = next; - } while (node); + } } /** @@ -1045,11 +1045,15 @@ void tctx_task_work(struct callback_head *cb) struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); - struct llist_node *node = llist_del_all(&tctx->task_list); - - if (node) { - handle_tw_list(node, &ctx, &uring_locked); - cond_resched(); + 
struct llist_node fake = {}; + struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake); + + handle_tw_list(node, &ctx, &uring_locked, NULL); + node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); + while (node != &fake) { + node = io_llist_xchg(&tctx->task_list, &fake); + handle_tw_list(node, &ctx, &uring_locked, &fake); + node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } ctx_flush_and_put(ctx, &uring_locked); From 5541b530fdae2a4b68662d0ef2dd4869cb86e310 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:27 -0700 Subject: [PATCH 1045/1250] io_uring: add trace event for running task work This is useful for investigating if task_work is batching Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-8-dylany@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 3bc8dec9acaacf..918e3a43e4b280 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -600,6 +600,36 @@ TRACE_EVENT(io_uring_cqe_overflow, __entry->cflags, __entry->ocqe) ); +/* + * io_uring_task_work_run - ran task work + * + * @tctx: pointer to a io_uring_task + * @count: how many functions it ran + * @loops: how many loops it ran + * + */ +TRACE_EVENT(io_uring_task_work_run, + + TP_PROTO(void *tctx, unsigned int count, unsigned int loops), + + TP_ARGS(tctx, count, loops), + + TP_STRUCT__entry ( + __field( void *, tctx ) + __field( unsigned int, count ) + __field( unsigned int, loops ) + ), + + TP_fast_assign( + __entry->tctx = tctx; + __entry->count = count; + __entry->loops = loops; + ), + + TP_printk("tctx %p, count %u, loops %u", + __entry->tctx, __entry->count, __entry->loops) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ From 0b01a50034c5d9649ab83afaf61ebfe16d119786 Mon Sep 17 00:00:00 2001 From: 
Dylan Yudaken Date: Wed, 22 Jun 2022 06:40:28 -0700 Subject: [PATCH 1046/1250] io_uring: trace task_work_run trace task_work_run to help provide stats on how often task work is run and what batch sizes are coming through. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220622134028.2013417-9-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9d523fafacb72d..997b915a1ff78a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -986,10 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) percpu_ref_put(&ctx->refs); } -static void handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked, - struct llist_node *last) +static unsigned int handle_tw_list(struct llist_node *node, + struct io_ring_ctx **ctx, bool *locked, + struct llist_node *last) { + unsigned int count = 0; + while (node != last) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, @@ -1006,7 +1008,10 @@ static void handle_tw_list(struct llist_node *node, } req->io_task_work.func(req, locked); node = next; + count++; } + + return count; } /** @@ -1047,12 +1052,14 @@ void tctx_task_work(struct callback_head *cb) task_work); struct llist_node fake = {}; struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake); + unsigned int loops = 1; + unsigned int count = handle_tw_list(node, &ctx, &uring_locked, NULL); - handle_tw_list(node, &ctx, &uring_locked, NULL); node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); while (node != &fake) { + loops++; node = io_llist_xchg(&tctx->task_list, &fake); - handle_tw_list(node, &ctx, &uring_locked, &fake); + count += handle_tw_list(node, &ctx, &uring_locked, &fake); node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } @@ -1061,6 +1068,8 @@ void tctx_task_work(struct callback_head *cb) /* relaxed 
read is enough as only the task itself sets ->in_idle */ if (unlikely(atomic_read(&tctx->in_idle))) io_uring_drop_tctx_refs(current); + + trace_io_uring_task_work_run(tctx, count, loops); } void io_req_task_work_add(struct io_kiocb *req) From 19c39fcab4159c69ed1624afd8eb52eb19815e9e Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Wed, 22 Jun 2022 13:55:51 +0800 Subject: [PATCH 1047/1250] io_uring: kbuf: kill __io_kbuf_recycle() __io_kbuf_recycle() is only called in io_kbuf_recycle(). Kill it and tweak the code so that the legacy pbuf and ring pbuf code become clear Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220622055551.642370-1-hao.xu@linux.dev Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 71 +++++++++++++++++++++++++++++-------------------- io_uring/kbuf.h | 21 +++++---------- 2 files changed, 49 insertions(+), 43 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 62de0dda24bf6e..8bf47e49ea5bc7 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -37,36 +37,30 @@ static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, return xa_load(&ctx->io_bl_xa, bgid); } -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +static int io_buffer_add_list(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, unsigned int bgid) +{ + bl->bgid = bgid; + if (bgid < BGID_ARRAY) + return 0; + + return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); +} + +void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; struct io_buffer *buf; /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. + * For legacy provided buffer mode, don't recycle if we already did + * IO to this buffer. 
For ring-mapped provided buffer mode, we should + * increment ring->head to explicitly monopolize the buffer to avoid + * multiple use. */ - if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - /* - * If we end up here, then the io_uring_lock has - * been kept held since we retrieved the buffer. - * For the io-wq case, we already cleared - * req->buf_list when the buffer was retrieved, - * hence it cannot be set here for that case. - */ - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } + if (req->flags & REQ_F_PARTIAL_IO) return; - } io_ring_submit_lock(ctx, issue_flags); @@ -77,16 +71,35 @@ void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); + return; } -static int io_buffer_add_list(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned int bgid) +void io_kbuf_recycle_ring(struct io_kiocb *req) { - bl->bgid = bgid; - if (bgid < BGID_ARRAY) - return 0; - - return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + * The exception is partial io, that case we should increment bl->head + * to monopolize the buffer. + */ + if (req->buf_list) { + if (req->flags & REQ_F_PARTIAL_IO) { + /* + * If we end up here, then the io_uring_lock has + * been kept held since we retrieved the buffer. + * For the io-wq case, we already cleared + * req->buf_list when the buffer was retrieved, + * hence it cannot be set here for that case. 
+ */ + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + } + return; } unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 304e7139d83562..721465c5d809fc 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -35,7 +35,6 @@ struct io_buffer { void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags); -void __io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); @@ -49,6 +48,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_recycle_ring(struct io_kiocb *req); + static inline bool io_do_buffer_select(struct io_kiocb *req) { if (!(req->flags & REQ_F_BUFFER_SELECT)) @@ -58,18 +60,6 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - /* - * For legacy provided buffer mode, don't recycle if we already did - * IO to this buffer. For ring-mapped provided buffer mode, we should - * increment ring->head to explicitly monopolize the buffer to avoid - * multiple use. - */ - if ((req->flags & REQ_F_BUFFER_SELECTED) && - (req->flags & REQ_F_PARTIAL_IO)) - return; - /* * READV uses fields in `struct io_rw` (len/addr) to stash the selected * buffer data. 
However if that buffer is recycled the original request @@ -78,7 +68,10 @@ static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) if (req->opcode == IORING_OP_READV) return; - __io_kbuf_recycle(req, issue_flags); + if (req->flags & REQ_F_BUFFER_SELECTED) + io_kbuf_recycle_legacy(req, issue_flags); + if (req->flags & REQ_F_BUFFER_RING) + io_kbuf_recycle_ring(req); } static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, From 03538a047e927b2fe17cf467e91273251478f772 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 09:23:54 -0600 Subject: [PATCH 1048/1250] io_uring: have cancelation API accept io_uring_task directly We just use the io_kiocb passed in to find the io_uring_task, and we already pass in the ctx via cd->ctx anyway. Signed-off-by: Jens Axboe --- io_uring/cancel.c | 17 +++++++++-------- io_uring/cancel.h | 2 +- io_uring/timeout.c | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index d1e7f5a955ab20..500ee5f5fd23ca 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -77,15 +77,15 @@ static int io_async_cancel_one(struct io_uring_task *tctx, return ret; } -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, +int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned issue_flags) { - struct io_ring_ctx *ctx = req->ctx; + struct io_ring_ctx *ctx = cd->ctx; int ret; - WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); + WARN_ON_ONCE(!io_wq_current_is_worker() && tctx != current->io_uring); - ret = io_async_cancel_one(req->task->io_uring, cd); + ret = io_async_cancel_one(tctx, cd); /* * Fall-through even for -EALREADY, as we may have poll armed * that need unarming. 
@@ -104,7 +104,6 @@ int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, return ret; } - int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_cancel *cancel = io_kiocb_to_cmd(req); @@ -127,7 +126,8 @@ int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, +static int __io_async_cancel(struct io_cancel_data *cd, + struct io_uring_task *tctx, unsigned int issue_flags) { bool all = cd->flags & (IORING_ASYNC_CANCEL_ALL|IORING_ASYNC_CANCEL_ANY); @@ -136,7 +136,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, struct io_kiocb *req, int ret, nr = 0; do { - ret = io_try_cancel(req, cd, issue_flags); + ret = io_try_cancel(tctx, cd, issue_flags); if (ret == -ENOENT) break; if (!all) @@ -170,6 +170,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) .flags = cancel->flags, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; + struct io_uring_task *tctx = req->task->io_uring; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { @@ -185,7 +186,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) cd.file = req->file; } - ret = __io_async_cancel(&cd, req, issue_flags); + ret = __io_async_cancel(&cd, tctx, issue_flags); done: if (ret < 0) req_set_fail(req); diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 2338012a5b06fb..1bc7e917ce94e6 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -16,6 +16,6 @@ struct io_cancel_data { int io_async_cancel_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); -int io_try_cancel(struct io_kiocb *req, struct io_cancel_data *cd, +int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); diff --git a/io_uring/timeout.c b/io_uring/timeout.c 
index 7e2c341f976258..4af074b8f6b7d7 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -274,7 +274,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) .data = prev->cqe.user_data, }; - ret = io_try_cancel(req, &cd, issue_flags); + ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_complete_post(req); From a2530ece66e0461bf4d10c4d3404a4e4dbedc45b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 09:47:04 -0600 Subject: [PATCH 1049/1250] io_uring: add IORING_ASYNC_CANCEL_FD_FIXED cancel flag In preparation for not having a request to pass in that carries this state, add a separate cancelation flag that allows the caller to ask for a fixed file for cancelation. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 ++ io_uring/cancel.c | 9 ++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d7ae81b10893e9..a09a78bd75566e 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -247,10 +247,12 @@ enum io_uring_op { * IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the * request 'user_data' * IORING_ASYNC_CANCEL_ANY Match any request + * IORING_ASYNC_CANCEL_FD_FIXED 'fd' passed in is a fixed descriptor */ #define IORING_ASYNC_CANCEL_ALL (1U << 0) #define IORING_ASYNC_CANCEL_FD (1U << 1) #define IORING_ASYNC_CANCEL_ANY (1U << 2) +#define IORING_ASYNC_CANCEL_FD_FIXED (1U << 3) /* * send/sendmsg and recv/recvmsg flags (sqe->ioprio) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 500ee5f5fd23ca..da486de07029a5 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -24,7 +24,7 @@ struct io_cancel { }; #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ - IORING_ASYNC_CANCEL_ANY) + IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) static bool io_cancel_cb(struct io_wq_work *work, void *data) { 
@@ -174,11 +174,14 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { - if (req->flags & REQ_F_FIXED_FILE) + if (req->flags & REQ_F_FIXED_FILE || + cd.flags & IORING_ASYNC_CANCEL_FD_FIXED) { + req->flags |= REQ_F_FIXED_FILE; req->file = io_file_get_fixed(req, cancel->fd, issue_flags); - else + } else { req->file = io_file_get_normal(req, cancel->fd); + } if (!req->file) { ret = -EBADF; goto done; From cd976e60ec51ba25c33fba7b1d026f6b6502fb44 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 18 Jun 2022 10:00:50 -0600 Subject: [PATCH 1050/1250] io_uring: add sync cancelation API through io_uring_register() The io_uring cancelation API is async, like any other API that we expose there. For the case of finding a request to cancel, or not finding one, it is fully sync in that when submission returns, the CQE for both the cancelation request and the targeted request have been posted to the CQ ring. However, if the targeted work is being executed by io-wq, the API can only start the act of canceling it. This makes it difficult to use in some circumstances, as the caller then has to wait for the CQEs to come in and match on the same cancelation data there. Provide an IORING_REGISTER_SYNC_CANCEL command for io_uring_register() that does sync cancelations, always. For the io-wq case, it'll wait for the cancelation to come in before returning. The only expected returns from this API are: 0 Request found and canceled fine. > 0 Requests found and canceled. Only happens if asked to cancel multiple requests, and if the work wasn't in progress. -ENOENT Request not found. -ETIME A timeout on the operation was requested, but the timeout expired before we could cancel. and we won't get -EALREADY via this API. If the timeout value passed in is -1 (tv_sec and tv_nsec), then that means that no timeout is requested.
Otherwise, the timespec passed in is the amount of time the sync cancel will wait for a successful cancelation. Link: https://github.com/axboe/liburing/discussions/608 Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 15 +++++ io_uring/cancel.c | 107 ++++++++++++++++++++++++++++++++++ io_uring/cancel.h | 2 + io_uring/io_uring.c | 6 ++ 4 files changed, 130 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a09a78bd75566e..094f706c93e0b9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -10,6 +10,7 @@ #include #include +#include /* * IO submission data structure (Submission Queue Entry) @@ -428,6 +429,9 @@ enum { IORING_REGISTER_PBUF_RING = 22, IORING_UNREGISTER_PBUF_RING = 23, + /* sync cancelation API */ + IORING_REGISTER_SYNC_CANCEL = 24, + /* this goes last */ IORING_REGISTER_LAST }; @@ -563,4 +567,15 @@ struct io_uring_getevents_arg { __u64 ts; }; +/* + * Argument for IORING_REGISTER_SYNC_CANCEL + */ +struct io_uring_sync_cancel_reg { + __u64 addr; + __s32 fd; + __u32 flags; + struct __kernel_timespec timeout; + __u64 pad[4]; +}; + #endif diff --git a/io_uring/cancel.c b/io_uring/cancel.c index da486de07029a5..8435a1eba59acc 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -206,3 +207,109 @@ void init_hash_table(struct io_hash_table *table, unsigned size) INIT_HLIST_HEAD(&table->hbs[i].list); } } + +static int __io_sync_cancel(struct io_uring_task *tctx, + struct io_cancel_data *cd, int fd) +{ + struct io_ring_ctx *ctx = cd->ctx; + + /* fixed must be grabbed every time since we drop the uring_lock */ + if ((cd->flags & IORING_ASYNC_CANCEL_FD) && + (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { + unsigned long file_ptr; + + if (unlikely(fd > ctx->nr_user_files)) + return -EBADF; + fd = array_index_nospec(fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; + 
cd->file = (struct file *) (file_ptr & FFS_MASK); + if (!cd->file) + return -EBADF; + } + + return __io_async_cancel(cd, tctx, 0); +} + +int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) + __must_hold(&ctx->uring_lock) +{ + struct io_cancel_data cd = { + .ctx = ctx, + .seq = atomic_inc_return(&ctx->cancel_seq), + }; + ktime_t timeout = KTIME_MAX; + struct io_uring_sync_cancel_reg sc; + struct fd f = { }; + DEFINE_WAIT(wait); + int ret; + + if (copy_from_user(&sc, arg, sizeof(sc))) + return -EFAULT; + if (sc.flags & ~CANCEL_FLAGS) + return -EINVAL; + if (sc.pad[0] || sc.pad[1] || sc.pad[2] || sc.pad[3]) + return -EINVAL; + + cd.data = sc.addr; + cd.flags = sc.flags; + + /* we can grab a normal file descriptor upfront */ + if ((cd.flags & IORING_ASYNC_CANCEL_FD) && + !(cd.flags & IORING_ASYNC_CANCEL_FD_FIXED)) { + f = fdget(sc.fd); + if (!f.file) + return -EBADF; + cd.file = f.file; + } + + ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); + + /* found something, done! */ + if (ret != -EALREADY) + goto out; + + if (sc.timeout.tv_sec != -1UL || sc.timeout.tv_nsec != -1UL) { + struct timespec64 ts = { + .tv_sec = sc.timeout.tv_sec, + .tv_nsec = sc.timeout.tv_nsec + }; + + timeout = ktime_add_ns(timespec64_to_ktime(ts), ktime_get_ns()); + } + + /* + * Keep looking until we get -ENOENT. we'll get woken everytime + * every time a request completes and will retry the cancelation. 
+ */ + do { + cd.seq = atomic_inc_return(&ctx->cancel_seq); + + prepare_to_wait(&ctx->cq_wait, &wait, TASK_INTERRUPTIBLE); + + ret = __io_sync_cancel(current->io_uring, &cd, sc.fd); + + if (ret != -EALREADY) + break; + + mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(); + if (ret < 0) { + mutex_lock(&ctx->uring_lock); + break; + } + ret = schedule_hrtimeout(&timeout, HRTIMER_MODE_ABS); + mutex_lock(&ctx->uring_lock); + if (!ret) { + ret = -ETIME; + break; + } + } while (1); + + finish_wait(&ctx->cq_wait, &wait); + + if (ret == -ENOENT || ret > 0) + ret = 0; +out: + fdput(f); + return ret; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index 1bc7e917ce94e6..6a59ee484d0cca 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -19,3 +19,5 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); void init_hash_table(struct io_hash_table *table, unsigned size); + +int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 997b915a1ff78a..45538b3c3a764e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3871,6 +3871,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_pbuf_ring(ctx, arg); break; + case IORING_REGISTER_SYNC_CANCEL: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_sync_cancel(ctx, arg); + break; default: ret = -EINVAL; break; From 5a382dda9bd7cfc8c1299466756dd77507735df1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:44 +0100 Subject: [PATCH 1051/1250] io_uring: clean poll ->private flagging We store a req pointer in wqe->private but also take one bit to mark double poll entries. Replace macro helpers with inline functions for better type checking and also name the double flag. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9a61240555c64ac0b7a9b0eb59a9efeb638a35a4.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index bd3110750cfada..210b174b155b61 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -39,6 +39,22 @@ struct io_poll_table { #define IO_POLL_CANCEL_FLAG BIT(31) #define IO_POLL_REF_MASK GENMASK(30, 0) +#define IO_WQE_F_DOUBLE 1 + +static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe) +{ + unsigned long priv = (unsigned long)wqe->private; + + return (struct io_kiocb *)(priv & ~IO_WQE_F_DOUBLE); +} + +static inline bool wqe_is_double(struct wait_queue_entry *wqe) +{ + unsigned long priv = (unsigned long)wqe->private; + + return priv & IO_WQE_F_DOUBLE; +} + /* * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can * bump it and acquire ownership. 
It's disallowed to modify requests while not @@ -306,8 +322,6 @@ static void io_poll_cancel_req(struct io_kiocb *req) io_poll_execute(req, 0, 0); } -#define wqe_to_req(wait) ((void *)((unsigned long) (wait)->private & ~1)) -#define wqe_is_double(wait) ((unsigned long) (wait)->private & 1) #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, @@ -392,7 +406,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, return; } /* mark as double wq entry */ - wqe_private |= 1; + wqe_private |= IO_WQE_F_DOUBLE; req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; From 6a7ece5a4e4c2f608a6c1dab480aa7eeb0911ca3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:45 +0100 Subject: [PATCH 1052/1250] io_uring: remove events caching atavisms Remove events argument from *io_poll_execute(), it's not needed and not used. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/12efd4e15c6a90cf9e5b59807cfcb57852b51dc7.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 210b174b155b61..7de8c52793cd3b 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -289,8 +289,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_req_complete_failed(req, ret); } -static void __io_poll_execute(struct io_kiocb *req, int mask, - __poll_t __maybe_unused events) +static void __io_poll_execute(struct io_kiocb *req, int mask) { io_req_set_res(req, mask, 0); /* @@ -308,18 +307,17 @@ static void __io_poll_execute(struct io_kiocb *req, int mask, io_req_task_work_add(req); } -static inline void io_poll_execute(struct io_kiocb *req, int res, - __poll_t events) +static inline void io_poll_execute(struct io_kiocb *req, int res) { if 
(io_poll_get_ownership(req)) - __io_poll_execute(req, res, events); + __io_poll_execute(req, res); } static void io_poll_cancel_req(struct io_kiocb *req) { io_poll_mark_cancelled(req); /* kick tw, which should complete the request */ - io_poll_execute(req, 0, 0); + io_poll_execute(req, 0); } #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) @@ -334,7 +332,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (unlikely(mask & POLLFREE)) { io_poll_mark_cancelled(req); /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0, poll->events); + io_poll_execute(req, 0); /* * If the waitqueue is being freed early but someone is already @@ -369,7 +367,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, else req->flags &= ~REQ_F_SINGLE_POLL; } - __io_poll_execute(req, mask, poll->events); + __io_poll_execute(req, mask); } return 1; } @@ -487,7 +485,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, req->apoll_events |= EPOLLONESHOT; ipt->error = 0; } - __io_poll_execute(req, mask, poll->events); + __io_poll_execute(req, mask); return 0; } @@ -497,7 +495,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, */ v = atomic_dec_return(&req->poll_refs); if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0, poll->events); + __io_poll_execute(req, 0); return 0; } From aea92968fd8ab9e9198afce31bed83851fc188cc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:46 +0100 Subject: [PATCH 1053/1250] io_uring: add a helper for apoll alloc Extract a helper function for apoll allocation, makes the code easier to read. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2f93282b47dd678e805dd0d7097f66968ced495c.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 7de8c52793cd3b..aef77f2a8a9a76 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -508,10 +508,33 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } +static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, + unsigned issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct async_poll *apoll; + + if (req->flags & REQ_F_POLLED) { + apoll = req->apoll; + kfree(apoll->double_poll); + } else if (!(issue_flags & IO_URING_F_UNLOCKED) && + !list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del_init(&apoll->poll.wait.entry); + } else { + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return NULL; + } + apoll->double_poll = NULL; + req->apoll = apoll; + return apoll; +} + int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) { const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_ring_ctx *ctx = req->ctx; struct async_poll *apoll; struct io_poll_table ipt; __poll_t mask = POLLPRI | POLLERR | EPOLLET; @@ -546,21 +569,10 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) } if (def->poll_exclusive) mask |= EPOLLEXCLUSIVE; - if (req->flags & REQ_F_POLLED) { - apoll = req->apoll; - kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del_init(&apoll->poll.wait.entry); - } else { - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if 
(unlikely(!apoll)) - return IO_APOLL_ABORTED; - } - apoll->double_poll = NULL; - req->apoll = apoll; + + apoll = io_req_alloc_apoll(req, issue_flags); + if (!apoll) + return IO_APOLL_ABORTED; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; From b98248fbb312c6ebc540f1bbf7c809afa07503ed Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:47 +0100 Subject: [PATCH 1054/1250] io_uring: change arm poll return values The rules for __io_arm_poll_handler()'s result parsing are complicated. As a first step, don't return a mask but pass back a positive return code and fill ipt->result_mask. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/529e29e9f97f2e6e383ccd44234d8b576a83a921.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index aef77f2a8a9a76..80113b036c88d8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -34,6 +34,8 @@ struct io_poll_table { struct io_kiocb *req; int nr_entries; int error; + /* output value, set only if arm poll returns >0 */ + __poll_t result_mask; }; #define IO_POLL_CANCEL_FLAG BIT(31) @@ -462,8 +464,9 @@ static int __io_arm_poll_handler(struct io_kiocb *req, if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { io_poll_remove_entries(req); + ipt->result_mask = mask; /* no one else has access to the req, forget about the ref */ - return mask; + return 1; } if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { @@ -813,7 +816,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); if (ret) { - io_req_set_res(req, ret, 0); + io_req_set_res(req, ipt.result_mask, 0); return IOU_OK; } if (ipt.error) { From 4a615dbc8db60b8350c22550ef706066c7708a3f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:48 +0100 Subject: [PATCH
1055/1250] io_uring: refactor poll arm error handling __io_arm_poll_handler() errors parsing is a horror, in case it failed it returns 0 and the caller is expected to look at ipt.error, which already led us to a number of problems before. When it returns a valid mask, leave it as it is now, i.e. return 1 and store the mask in ipt.result_mask. In case of a failure that can be handled inline return an error code (negative value), and return 0 if __io_arm_poll_handler() took ownership of the request and will complete it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/018cacdaef5fe95d7dc56b32e85d752cab7607f6.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 80113b036c88d8..3f3ae3b1505fa4 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -435,6 +435,12 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, (struct io_poll **) &pt->req->async_data); } +/* + * Returns 0 when it's handed over for polling. The caller owns the requests if + * it returns non-zero, but otherwise should not touch it. Negative values + * contain an error code. When the result is >0, the polling has completed + * inline and ipt.result_mask is set to the mask.
+ */ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll *poll, struct io_poll_table *ipt, __poll_t mask) @@ -461,6 +467,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req, atomic_set(&req->poll_refs, 1); mask = vfs_poll(req->file, &ipt->pt) & poll->events; + if (unlikely(ipt->error || !ipt->nr_entries)) { + io_poll_remove_entries(req); + + if (mask && (poll->events & EPOLLET)) { + ipt->result_mask = mask; + return 1; + } else { + return ipt->error ?: -EINVAL; + } + } + if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { io_poll_remove_entries(req); @@ -469,25 +486,12 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 1; } - if (!mask && unlikely(ipt->error || !ipt->nr_entries)) { - io_poll_remove_entries(req); - if (!ipt->error) - ipt->error = -EINVAL; - return 0; - } - if (req->flags & REQ_F_HASH_LOCKED) io_poll_req_insert_locked(req); else io_poll_req_insert(req); if (mask && (poll->events & EPOLLET)) { - /* can't multishot if failed, just queue the event we've got */ - if (unlikely(ipt->error || !ipt->nr_entries)) { - poll->events |= EPOLLONESHOT; - req->apoll_events |= EPOLLONESHOT; - ipt->error = 0; - } __io_poll_execute(req, mask); return 0; } @@ -582,9 +586,8 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) io_kbuf_recycle(req, issue_flags); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); - if (ret || ipt.error) - return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; - + if (ret) + return ret > 0 ? 
IO_APOLL_READY : IO_APOLL_ABORTED; trace_io_uring_poll_arm(req, mask, apoll->poll.events); return IO_APOLL_OK; } @@ -815,16 +818,11 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); - if (ret) { + if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); return IOU_OK; } - if (ipt.error) { - req_set_fail(req); - return ipt.error; - } - - return IOU_ISSUE_SKIP_COMPLETE; + return ret ?: IOU_ISSUE_SKIP_COMPLETE; } int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) From 617b1092abc7c2eaf5e5f6ce77366a0dc2157ace Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 23 Jun 2022 14:24:49 +0100 Subject: [PATCH 1056/1250] io_uring: optimise submission side poll_refs The final poll_refs put in __io_arm_poll_handler() takes quite some cycles. When we're arming from the original task context task_work won't be run, so in this case we can assume that we won't race with task_works and so not take the initial ownership ref. One caveat is that after arming a poll we may race with it, so we have to add a bunch of io_poll_get_ownership() hidden inside of io_poll_can_finish_inline() whenever we want to complete arming inline. For the same reason we can't just set REQ_F_DOUBLE_POLL in __io_queue_proc() and so need to sync with the first poll entry by taking its wq head lock. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8825315d7f5e182ac1578a031e546f79b1c97d01.1655990418.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 88 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 21 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 3f3ae3b1505fa4..eba767594deee5 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -34,6 +34,7 @@ struct io_poll_table { struct io_kiocb *req; int nr_entries; int error; + bool owning; /* output value, set only if arm poll returns >0 */ __poll_t result_mask; }; @@ -374,6 +375,27 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, return 1; } +static void io_poll_double_prepare(struct io_kiocb *req) +{ + struct wait_queue_head *head; + struct io_poll *poll = io_poll_get_single(req); + + /* head is RCU protected, see io_poll_remove_entries() comments */ + rcu_read_lock(); + head = smp_load_acquire(&poll->head); + if (head) { + /* + * poll arm may not hold ownership and so race with + * io_poll_wake() by modifying req->flags. There is only one + * poll entry queued, serialise with it by taking its head lock. 
+ */ + spin_lock_irq(&head->lock); + req->flags |= REQ_F_DOUBLE_POLL; + spin_unlock_irq(&head->lock); + } + rcu_read_unlock(); +} + static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, struct wait_queue_head *head, struct io_poll **poll_ptr) @@ -405,16 +427,19 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, pt->error = -ENOMEM; return; } + + io_poll_double_prepare(req); /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; - req->flags |= REQ_F_DOUBLE_POLL; io_init_poll_iocb(poll, first->events, first->wait.func); *poll_ptr = poll; if (req->opcode == IORING_OP_POLL_ADD) req->flags |= REQ_F_ASYNC_DATA; + } else { + /* fine to modify, there is no poll queued to race with us */ + req->flags |= REQ_F_SINGLE_POLL; } - req->flags |= REQ_F_SINGLE_POLL; pt->nr_entries++; poll->head = head; poll->wait.private = (void *) wqe_private; @@ -435,6 +460,12 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, (struct io_poll **) &pt->req->async_data); } +static bool io_poll_can_finish_inline(struct io_kiocb *req, + struct io_poll_table *pt) +{ + return pt->owning || io_poll_get_ownership(req); +} + /* * Returns 0 when it's handed over for polling. The caller owns the requests if * it returns non-zero, but otherwise should not touch it. 
Negative values @@ -443,7 +474,8 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, */ static int __io_arm_poll_handler(struct io_kiocb *req, struct io_poll *poll, - struct io_poll_table *ipt, __poll_t mask) + struct io_poll_table *ipt, __poll_t mask, + unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; int v; @@ -452,34 +484,45 @@ static int __io_arm_poll_handler(struct io_kiocb *req, req->work.cancel_seq = atomic_read(&ctx->cancel_seq); io_init_poll_iocb(poll, mask, io_poll_wake); poll->file = req->file; - req->apoll_events = poll->events; ipt->pt._key = mask; ipt->req = req; ipt->error = 0; ipt->nr_entries = 0; - /* - * Take the ownership to delay any tw execution up until we're done - * with poll arming. see io_poll_get_ownership(). + * Polling is either completed here or via task_work, so if we're in the + * task context we're naturally serialised with tw by merit of running + * the same task. When it's io-wq, take the ownership to prevent tw + * from running. However, when we're in the task context, skip taking + * it as an optimisation. + * + * Note: even though the request won't be completed/freed, without + * ownership we still can race with io_poll_wake(). + * io_poll_can_finish_inline() tries to deal with that. 
*/ - atomic_set(&req->poll_refs, 1); + ipt->owning = issue_flags & IO_URING_F_UNLOCKED; + + atomic_set(&req->poll_refs, (int)ipt->owning); mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { io_poll_remove_entries(req); - if (mask && (poll->events & EPOLLET)) { + if (!io_poll_can_finish_inline(req, ipt)) { + io_poll_mark_cancelled(req); + return 0; + } else if (mask && (poll->events & EPOLLET)) { ipt->result_mask = mask; return 1; - } else { - return ipt->error ?: -EINVAL; } + return ipt->error ?: -EINVAL; } if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { + if (!io_poll_can_finish_inline(req, ipt)) + return 0; io_poll_remove_entries(req); ipt->result_mask = mask; /* no one else has access to the req, forget about the ref */ @@ -491,18 +534,21 @@ static int __io_arm_poll_handler(struct io_kiocb *req, else io_poll_req_insert(req); - if (mask && (poll->events & EPOLLET)) { + if (mask && (poll->events & EPOLLET) && + io_poll_can_finish_inline(req, ipt)) { __io_poll_execute(req, mask); return 0; } - /* - * Release ownership. If someone tried to queue a tw while it was - * locked, kick it off for them. - */ - v = atomic_dec_return(&req->poll_refs); - if (unlikely(v & IO_POLL_REF_MASK)) - __io_poll_execute(req, 0); + if (ipt->owning) { + /* + * Release ownership. If someone tried to queue a tw while it was + * locked, kick it off for them. + */ + v = atomic_dec_return(&req->poll_refs); + if (unlikely(v & IO_POLL_REF_MASK)) + __io_poll_execute(req, 0); + } return 0; } @@ -585,7 +631,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) io_kbuf_recycle(req, issue_flags); - ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask); + ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, issue_flags); if (ret) return ret > 0 ? 
IO_APOLL_READY : IO_APOLL_ABORTED; trace_io_uring_poll_arm(req, mask, apoll->poll.events); @@ -817,7 +863,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) else req->flags &= ~REQ_F_HASH_LOCKED; - ret = __io_arm_poll_handler(req, poll, &ipt, poll->events); + ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); return IOU_OK; From b50873fe62448563519829c927364c0d3929a76b Mon Sep 17 00:00:00 2001 From: Hao Xu Date: Thu, 23 Jun 2022 21:01:26 +0800 Subject: [PATCH 1057/1250] io_uring: kbuf: inline io_kbuf_recycle_ring() Make io_kbuf_recycle_ring() inline since it is the fast path of provided buffer. Signed-off-by: Hao Xu Link: https://lore.kernel.org/r/20220623130126.179232-1-hao.xu@linux.dev Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 28 ---------------------------- io_uring/kbuf.h | 28 +++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 29 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 8bf47e49ea5bc7..5e00f16e89b866 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -74,34 +74,6 @@ void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) return; } -void io_kbuf_recycle_ring(struct io_kiocb *req) -{ - /* - * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear - * the flag and hence ensure that bl->head doesn't get incremented. - * If the tail has already been incremented, hang on to it. - * The exception is partial io, that case we should increment bl->head - * to monopolize the buffer. - */ - if (req->buf_list) { - if (req->flags & REQ_F_PARTIAL_IO) { - /* - * If we end up here, then the io_uring_lock has - * been kept held since we retrieved the buffer. - * For the io-wq case, we already cleared - * req->buf_list when the buffer was retrieved, - * hence it cannot be set here for that case. 
- */ - req->buf_list->head++; - req->buf_list = NULL; - } else { - req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; - } - } - return; -} - unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) { unsigned int cflags; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 721465c5d809fc..b3e8c6c5fee143 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -49,7 +49,33 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); -void io_kbuf_recycle_ring(struct io_kiocb *req); + +static inline void io_kbuf_recycle_ring(struct io_kiocb *req) +{ + /* + * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear + * the flag and hence ensure that bl->head doesn't get incremented. + * If the tail has already been incremented, hang on to it. + * The exception is partial io, that case we should increment bl->head + * to monopolize the buffer. + */ + if (req->buf_list) { + if (req->flags & REQ_F_PARTIAL_IO) { + /* + * If we end up here, then the io_uring_lock has + * been kept held since we retrieved the buffer. + * For the io-wq case, we already cleared + * req->buf_list when the buffer was retrieved, + * hence it cannot be set here for that case. + */ + req->buf_list->head++; + req->buf_list = NULL; + } else { + req->buf_index = req->buf_list->bgid; + req->flags &= ~REQ_F_BUFFER_RING; + } + } +} static inline bool io_do_buffer_select(struct io_kiocb *req) { From eb25e7f4e4601b4489877ce5e8e9b7b96ac9d65d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 21 Jun 2022 14:34:15 -0600 Subject: [PATCH 1058/1250] io_uring: move POLLFREE handling to separate function We really don't care about this at all in terms of performance. Outside of having it already be marked unlikely(), shove it into a separate __cold function. 
Signed-off-by: Jens Axboe --- io_uring/poll.c | 50 ++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index eba767594deee5..fa25b88a7b9336 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -325,6 +325,31 @@ static void io_poll_cancel_req(struct io_kiocb *req) #define IO_ASYNC_POLL_COMMON (EPOLLONESHOT | EPOLLPRI) +static __cold int io_pollfree_wake(struct io_kiocb *req, struct io_poll *poll) +{ + io_poll_mark_cancelled(req); + /* we have to kick tw in case it's not already */ + io_poll_execute(req, 0); + + /* + * If the waitqueue is being freed early but someone is already + * holds ownership over it, we have to tear down the request as + * best we can. That means immediately removing the request from + * its waitqueue and preventing all further accesses to the + * waitqueue via the request. + */ + list_del_init(&poll->wait.entry); + + /* + * Careful: this *must* be the last step, since as soon + * as req->head is NULL'ed out, the request can be + * completed and freed, since aio_poll_complete_work() + * will no longer need to take the waitqueue lock. + */ + smp_store_release(&poll->head, NULL); + return 1; +} + static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { @@ -332,29 +357,8 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, struct io_poll *poll = container_of(wait, struct io_poll, wait); __poll_t mask = key_to_poll(key); - if (unlikely(mask & POLLFREE)) { - io_poll_mark_cancelled(req); - /* we have to kick tw in case it's not already */ - io_poll_execute(req, 0); - - /* - * If the waitqueue is being freed early but someone is already - * holds ownership over it, we have to tear down the request as - * best we can. That means immediately removing the request from - * its waitqueue and preventing all further accesses to the - * waitqueue via the request. 
- */ - list_del_init(&poll->wait.entry); - - /* - * Careful: this *must* be the last step, since as soon - * as req->head is NULL'ed out, the request can be - * completed and freed, since aio_poll_complete_work() - * will no longer need to take the waitqueue lock. - */ - smp_store_release(&poll->head, NULL); - return 1; - } + if (unlikely(mask & POLLFREE)) + return io_pollfree_wake(req, poll); /* for instances that support it check for an event match first */ if (mask && !(mask & (poll->events & ~IO_ASYNC_POLL_COMMON))) From 4a099bf6a5fdb3b8a3ca91236e8f09c4b8051f1a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:52:58 +0100 Subject: [PATCH 1059/1250] io_uring: improve io_fail_links() io_fail_links() is called with ->completion_lock held and for that reason we'd want to keep it as small as we can. Instead of doing __io_req_complete_post() for each linked request under the lock, fail them in a task_work handler under ->uring_lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a2f68708b970a21f4e84ddfa7b3abd67a8fffb27.1656153285.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 4af074b8f6b7d7..2f9e5693547931 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -101,32 +101,44 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void io_fail_links(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) +static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) { - struct io_kiocb *nxt, *link = req->link; - bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; - - req->link = NULL; + io_tw_lock(link->ctx, locked); while (link) { + struct io_kiocb *nxt = link->link; long res = -ECANCELED; if (link->flags & REQ_F_FAIL) res = link->cqe.res; - - nxt = link->link; link->link = NULL; + 
io_req_set_res(link, res, 0); + io_req_task_complete(link, locked); + link = nxt; + } +} - trace_io_uring_fail_link(req, link); +static void io_fail_links(struct io_kiocb *req) + __must_hold(&req->ctx->completion_lock) +{ + struct io_kiocb *link = req->link; + bool ignore_cqes = req->flags & REQ_F_SKIP_LINK_CQES; + + if (!link) + return; + while (link) { if (ignore_cqes) link->flags |= REQ_F_CQE_SKIP; else link->flags &= ~REQ_F_CQE_SKIP; - io_req_set_res(link, res, 0); - __io_req_complete_post(link); - link = nxt; + trace_io_uring_fail_link(req, link); + link = link->link; } + + link = req->link; + link->io_task_work.func = io_req_tw_fail_links; + io_req_task_work_add(link); + req->link = NULL; } static inline void io_remove_next_linked(struct io_kiocb *req) From 004376248485f25d8b6155d78e5a9ca83c417c30 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:52:59 +0100 Subject: [PATCH 1060/1250] io_uring: fuse fallback_node and normal tw node Now as both normal and fallback paths use llist, just keep one node head in struct io_task_work and kill off ->fallback_node. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d04ebde409f7b162fe247b361b4486b193293e46.1656153285.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +---- io_uring/io_uring.c | 5 ++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 918165a200533f..3ca8f363f5046c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -427,10 +427,7 @@ enum { typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); struct io_task_work { - union { - struct llist_node node; - struct llist_node fallback_node; - }; + struct llist_node node; io_req_tw_func_t func; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 45538b3c3a764e..86a0b0c6f5bf77 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -233,7 +233,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) bool locked = false; percpu_ref_get(&ctx->refs); - llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) + llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &locked); if (locked) { @@ -1091,13 +1091,12 @@ void io_req_task_work_add(struct io_kiocb *req) if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - node = llist_del_all(&tctx->task_list); while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; - if (llist_add(&req->io_task_work.fallback_node, + if (llist_add(&req->io_task_work.node, &req->ctx->fallback_llist)) schedule_delayed_work(&req->ctx->fallback_work, 1); } From 6465662ec252cc77486f0e55a2da7f3a25f5c538 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:53:00 +0100 Subject: [PATCH 1061/1250] io_uring: remove extra TIF_NOTIFY_SIGNAL check io_run_task_work() accounts for TIF_NOTIFY_SIGNAL, so no need to have a second check in io_run_task_work_sig(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/52ce41a592ad904511697f432141e5690fd4b968.1656153285.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 86a0b0c6f5bf77..f40526426db813 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2205,8 +2205,6 @@ int io_run_task_work_sig(void) { if (io_run_task_work()) return 1; - if (test_thread_flag(TIF_NOTIFY_SIGNAL)) - return -ERESTARTSYS; if (task_sigpending(current)) return -EINTR; return 0; From a69a2e6dcd308f0319cb3676cb978f5b68871989 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:53:01 +0100 Subject: [PATCH 1062/1250] io_uring: don't check file ops of registered rings Registered rings are by definition io_uring files, so we don't need to additionally verify them. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/425cd64fd885b8e329a46c205ee811987691baaf.1656153286.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f40526426db813..e1e8dcd17df354 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3036,22 +3036,22 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (flags & IORING_ENTER_REGISTERED_RING) { struct io_uring_task *tctx = current->io_uring; - if (!tctx || fd >= IO_RINGFD_REG_MAX) + if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) return -EINVAL; fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); f.file = tctx->registered_rings[fd]; f.flags = 0; + if (unlikely(!f.file)) + return -EBADF; } else { f = fdget(fd); + if (unlikely(!f.file)) + return -EBADF; + ret = -EOPNOTSUPP; + if (unlikely(!io_is_uring_fops(f.file))) + goto out_fput; } - if (unlikely(!f.file)) - return -EBADF; - - ret = -EOPNOTSUPP; - if (unlikely(!io_is_uring_fops(f.file))) - 
goto out_fput; - ret = -ENXIO; ctx = f.file->private_data; if (unlikely(!percpu_ref_tryget(&ctx->refs))) From 3d8aa1e9f2370b2ba8d04dde91f334f3e0fa1f12 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:53:02 +0100 Subject: [PATCH 1063/1250] io_uring: remove ctx->refs pinning on enter io_uring_enter() takes ctx->refs, which was previously preventing racing with register quiesce. However, as register now doesn't touch the refs, we can freely kill extra ctx pinning and rely on the fact that we're holding a file reference preventing the ring from being destroyed. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a11c57ad33a1be53541fce90669c1b79cf4d8940.1656153286.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e1e8dcd17df354..070ee9ec9ee721 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3049,14 +3049,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, return -EBADF; ret = -EOPNOTSUPP; if (unlikely(!io_is_uring_fops(f.file))) - goto out_fput; + goto out; } - ret = -ENXIO; ctx = f.file->private_data; - if (unlikely(!percpu_ref_tryget(&ctx->refs))) - goto out_fput; - ret = -EBADFD; if (unlikely(ctx->flags & IORING_SETUP_R_DISABLED)) goto out; @@ -3141,10 +3137,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, &ctx->check_cq); } } - out: - percpu_ref_put(&ctx->refs); -out_fput: fdput(f); return ret; } @@ -3730,11 +3723,10 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, int ret; /* - * We're inside the ring mutex, if the ref is already dying, then - * someone else killed the ctx or is already going through - * io_uring_register(). + * We don't quiesce the refs for register anymore and so it can't be + * dying as we're holding a file ref here. 
*/ - if (percpu_ref_is_dying(&ctx->refs)) + if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs))) return -ENXIO; if (ctx->restricted) { From 5bc28b5590a8ce6abd66a0d5729eb67bd549b3ce Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 28 Jun 2022 21:33:20 +0200 Subject: [PATCH 1064/1250] io_uring: replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.16/process/deprecated.html#zero-length-and-one-element-arrays Link: https://github.com/KSPP/linux/issues/78 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 094f706c93e0b9..8fe0275cdaf311 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -495,7 +495,7 @@ struct io_uring_probe { __u8 ops_len; /* length of ops[] array below */ __u16 resv; __u32 resv2[3]; - struct io_uring_probe_op ops[0]; + struct io_uring_probe_op ops[]; }; struct io_uring_restriction { From d75c24f662f1feb350d84afe23d7536397319d09 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 04:42:56 -0600 Subject: [PATCH 1065/1250] io_uring: split out fixed file installation and removal Put it with the filetable code, which is where it belongs. While doing so, have the helpers take a ctx rather than an io_kiocb. It doesn't make sense to use a request, as it's not an operation on the request itself. It applies to the ring itself. 
Signed-off-by: Jens Axboe --- io_uring/filetable.c | 66 ++++++++++++++++++++++++++++++++++---------- io_uring/filetable.h | 3 ++ io_uring/openclose.c | 35 +++-------------------- io_uring/openclose.h | 2 +- io_uring/rsrc.c | 2 +- 5 files changed, 60 insertions(+), 48 deletions(-) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 534e1a3c625d9f..abaa5ba7f6552a 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -58,11 +58,10 @@ void io_free_file_tables(struct io_file_table *table) table->bitmap = NULL; } -static int io_install_fixed_file(struct io_kiocb *req, struct file *file, - unsigned int issue_flags, u32 slot_index) +static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, + u32 slot_index) __must_hold(&req->ctx->uring_lock) { - struct io_ring_ctx *ctx = req->ctx; bool needs_switch = false; struct io_fixed_file *file_slot; int ret; @@ -108,34 +107,71 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, return ret; } -/* - * Note when io_fixed_fd_install() returns error value, it will ensure - * fput() is called correspondingly. - */ -int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, - struct file *file, unsigned int file_slot) +int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, + unsigned int file_slot) { bool alloc_slot = file_slot == IORING_FILE_INDEX_ALLOC; - struct io_ring_ctx *ctx = req->ctx; int ret; - io_ring_submit_lock(ctx, issue_flags); - if (alloc_slot) { ret = io_file_bitmap_get(ctx); if (unlikely(ret < 0)) - goto err; + return ret; file_slot = ret; } else { file_slot--; } - ret = io_install_fixed_file(req, file, issue_flags, file_slot); + ret = io_install_fixed_file(ctx, file, file_slot); if (!ret && alloc_slot) ret = file_slot; -err: + return ret; +} +/* + * Note when io_fixed_fd_install() returns error value, it will ensure + * fput() is called correspondingly. 
+ */ +int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, + struct file *file, unsigned int file_slot) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + io_ring_submit_lock(ctx, issue_flags); + ret = __io_fixed_fd_install(ctx, file, file_slot); io_ring_submit_unlock(ctx, issue_flags); + if (unlikely(ret < 0)) fput(file); return ret; } + +int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) +{ + struct io_fixed_file *file_slot; + struct file *file; + int ret; + + if (unlikely(!ctx->file_data)) + return -ENXIO; + if (offset >= ctx->nr_user_files) + return -EINVAL; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + + offset = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, offset); + if (!file_slot->file_ptr) + return -EBADF; + + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + if (ret) + return ret; + + file_slot->file_ptr = 0; + io_file_bitmap_clear(&ctx->file_table, offset); + io_rsrc_node_switch(ctx, ctx->file_data); + return 0; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index fb5a274c08ffcc..79eb50c1980e84 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -29,6 +29,9 @@ void io_free_file_tables(struct io_file_table *table); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); +int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, + unsigned int file_slot); +int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); unsigned int io_file_get_flags(struct file *file); diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 099a5ec84dfdb7..d1818ec9169ba2 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -173,42 +173,15 @@ void io_open_cleanup(struct io_kiocb *req) putname(open->filename); } -int __io_close_fixed(struct io_kiocb *req, 
unsigned int issue_flags, +int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset) { - struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *file_slot; - struct file *file; int ret; io_ring_submit_lock(ctx, issue_flags); - ret = -ENXIO; - if (unlikely(!ctx->file_data)) - goto out; - ret = -EINVAL; - if (offset >= ctx->nr_user_files) - goto out; - ret = io_rsrc_node_switch_start(ctx); - if (ret) - goto out; - - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - ret = -EBADF; - if (!file_slot->file_ptr) - goto out; - - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); - if (ret) - goto out; - - file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, offset); - io_rsrc_node_switch(ctx, ctx->file_data); - ret = 0; -out: + ret = io_fixed_fd_remove(ctx, offset); io_ring_submit_unlock(ctx, issue_flags); + return ret; } @@ -216,7 +189,7 @@ static inline int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { struct io_close *close = io_kiocb_to_cmd(req); - return __io_close_fixed(req, issue_flags, close->file_slot - 1); + return __io_close_fixed(req->ctx, issue_flags, close->file_slot - 1); } int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 9f578f3fad870d..4b1c28d3a66c3f 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -int __io_close_fixed(struct io_kiocb *req, unsigned int issue_flags, +int __io_close_fixed(struct io_ring_ctx *ctx, unsigned int issue_flags, unsigned int offset); int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1106089551595e..706fa020505b1a 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -703,7 +703,7 @@ static 
int io_files_update_with_index_alloc(struct io_kiocb *req, if (ret < 0) break; if (copy_to_user(&fds[done], &ret, sizeof(ret))) { - __io_close_fixed(req, issue_flags, ret); + __io_close_fixed(req->ctx, issue_flags, ret); ret = -EFAULT; break; } From 9bf2944565b3ec20c9b27ccaad7e729d3588d46b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jun 2022 04:47:02 -0600 Subject: [PATCH 1066/1250] io_uring: add support for passing fixed file descriptors With IORING_OP_MSG_RING, one ring can send a message to another ring. Extend that support to also allow sending a fixed file descriptor to that ring, enabling one ring to pass a registered descriptor to another one. Arguments are extended to pass in: sqe->addr3 fixed file slot in source ring sqe->file_index fixed file slot in destination ring IORING_OP_MSG_RING is extended to take a command argument in sqe->addr. If set to zero (or IORING_MSG_DATA), it sends just a message like before. If set to IORING_MSG_SEND_FD, a fixed file descriptor is sent according to the above arguments. Two common use cases for this are: 1) Server needs to be shutdown or restarted, pass file descriptors to another one 2) Backend is split, and one accepts connections, while others then get the fd passed and handle the actual connection. Both of those are classic SCM_RIGHTS use cases, and it's not possible to support them with direct descriptors today. By default, this will post a CQE to the target ring, similarly to how IORING_MSG_DATA does it. If IORING_MSG_RING_CQE_SKIP is set, no message is posted to the target ring. The issuer is expected to notify the receiver side separately.
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 17 +++++ io_uring/msg_ring.c | 130 ++++++++++++++++++++++++++++++++-- 2 files changed, 140 insertions(+), 7 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 8fe0275cdaf311..f378eabbff21b6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -51,6 +51,7 @@ struct io_uring_sqe { __u32 unlink_flags; __u32 hardlink_flags; __u32 xattr_flags; + __u32 msg_ring_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -270,6 +271,22 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) +/* + * IORING_OP_MSG_RING command types, stored in sqe->addr + */ +enum { + IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */ + IORING_MSG_SEND_FD, /* send a registered fd to another ring */ +}; + +/* + * IORING_OP_MSG_RING flags (sqe->msg_ring_flags) + * + * IORING_MSG_RING_CQE_SKIP Don't post a CQE to the target ring. Not + * applicable for IORING_MSG_DATA, obviously. 
+ */ +#define IORING_MSG_RING_CQE_SKIP (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index b02be23496521c..939205b30c8b62 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -3,46 +3,162 @@ #include #include #include +#include #include #include #include "io_uring.h" +#include "rsrc.h" +#include "filetable.h" #include "msg_ring.h" struct io_msg { struct file *file; u64 user_data; u32 len; + u32 cmd; + u32 src_fd; + u32 dst_fd; + u32 flags; }; +static int io_msg_ring_data(struct io_kiocb *req) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req); + + if (msg->src_fd || msg->dst_fd || msg->flags) + return -EINVAL; + + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + return 0; + + return -EOVERFLOW; +} + +static void io_double_unlock_ctx(struct io_ring_ctx *ctx, + struct io_ring_ctx *octx, + unsigned int issue_flags) +{ + if (issue_flags & IO_URING_F_UNLOCKED) + mutex_unlock(&ctx->uring_lock); + mutex_unlock(&octx->uring_lock); +} + +static int io_double_lock_ctx(struct io_ring_ctx *ctx, + struct io_ring_ctx *octx, + unsigned int issue_flags) +{ + /* + * To ensure proper ordering between the two ctxs, we can only + * attempt a trylock on the target. If that fails and we already have + * the source ctx lock, punt to io-wq. + */ + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + if (!mutex_trylock(&octx->uring_lock)) + return -EAGAIN; + return 0; + } + + /* Always grab smallest value ctx first. We know ctx != octx. 
*/ + if (ctx < octx) { + mutex_lock(&ctx->uring_lock); + mutex_lock(&octx->uring_lock); + } else { + mutex_lock(&octx->uring_lock); + mutex_lock(&ctx->uring_lock); + } + + return 0; +} + +static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned long file_ptr; + struct file *src_file; + int ret; + + if (target_ctx == ctx) + return -EINVAL; + + ret = io_double_lock_ctx(ctx, target_ctx, issue_flags); + if (unlikely(ret)) + return ret; + + ret = -EBADF; + if (unlikely(msg->src_fd >= ctx->nr_user_files)) + goto out_unlock; + + msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr; + src_file = (struct file *) (file_ptr & FFS_MASK); + get_file(src_file); + + ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); + if (ret < 0) { + fput(src_file); + goto out_unlock; + } + + if (msg->flags & IORING_MSG_RING_CQE_SKIP) + goto out_unlock; + + /* + * If this fails, the target still received the file descriptor but + * wasn't notified of the fact. This means that if this request + * completes with -EOVERFLOW, then the sender must ensure that a + * later IORING_OP_MSG_RING delivers the message. 
+ */ + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + ret = -EOVERFLOW; +out_unlock: + io_double_unlock_ctx(ctx, target_ctx, issue_flags); + return ret; +} + int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_msg *msg = io_kiocb_to_cmd(req); - if (unlikely(sqe->addr || sqe->rw_flags || sqe->splice_fd_in || - sqe->buf_index || sqe->personality)) + if (unlikely(sqe->buf_index || sqe->personality)) return -EINVAL; msg->user_data = READ_ONCE(sqe->off); msg->len = READ_ONCE(sqe->len); + msg->cmd = READ_ONCE(sqe->addr); + msg->src_fd = READ_ONCE(sqe->addr3); + msg->dst_fd = READ_ONCE(sqe->file_index); + msg->flags = READ_ONCE(sqe->msg_ring_flags); + if (msg->flags & ~IORING_MSG_RING_CQE_SKIP) + return -EINVAL; + return 0; } int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req); - struct io_ring_ctx *target_ctx; int ret; ret = -EBADFD; if (!io_is_uring_fops(req->file)) goto done; - ret = -EOVERFLOW; - target_ctx = req->file->private_data; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) - ret = 0; + switch (msg->cmd) { + case IORING_MSG_DATA: + ret = io_msg_ring_data(req); + break; + case IORING_MSG_SEND_FD: + ret = io_msg_send_fd(req, issue_flags); + break; + default: + ret = -EINVAL; + break; + } done: if (ret < 0) From 9b6b055e054ad8c8e1e64f7422059640dacf7c4d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 25 Jun 2022 11:55:38 +0100 Subject: [PATCH 1067/1250] io_uring: let to set a range for file slot allocation From recently io_uring provides an option to allocate a file index for operation registering fixed files. However, it's utterly unusable with mixed approaches when for a part of files the userspace knows better where to place it, as it may race and users don't have any sane way to pick a slot and hoping it will not be taken. Let the userspace to register a range of fixed file slots in which the auto-allocation happens. 
The use case is splitting the fixed table in two parts, where one of them is used for auto-allocation and the other for slot-specified operations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/66ab0394e436f38437cf7c44676e1920d09687ad.1656154403.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ include/uapi/linux/io_uring.h | 13 +++++++++++++ io_uring/filetable.c | 24 ++++++++++++++++++++---- io_uring/filetable.h | 20 +++++++++++++++++--- io_uring/io_uring.c | 6 ++++++ io_uring/rsrc.c | 2 ++ 6 files changed, 61 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3ca8f363f5046c..26ef11e978d403 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -233,6 +233,9 @@ struct io_ring_ctx { unsigned long check_cq; + unsigned int file_alloc_start; + unsigned int file_alloc_end; + struct { /* * We cache a range of free CQEs we can use, once exhausted it diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f378eabbff21b6..cf95354198a393 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -449,6 +449,9 @@ enum { /* sync cancelation API */ IORING_REGISTER_SYNC_CANCEL = 24, + /* register a range of fixed file slots for automatic slot allocation */ + IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* this goes last */ IORING_REGISTER_LAST }; @@ -595,4 +598,14 @@ struct io_uring_sync_cancel_reg { __u64 pad[4]; }; +/* + * Argument for IORING_REGISTER_FILE_ALLOC_RANGE + * The range is specified as [off, off + len) + */ +struct io_uring_file_index_range { + __u32 off; + __u32 len; + __u64 resv; +}; + #endif diff --git a/io_uring/filetable.c b/io_uring/filetable.c index abaa5ba7f6552a..7b473259f3f45a 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -16,7 +16,7 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) { struct io_file_table *table = &ctx->file_table; -
unsigned long nr = ctx->nr_user_files; + unsigned long nr = ctx->file_alloc_end; int ret; do { @@ -24,11 +24,10 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) if (ret != nr) return ret; - if (!table->alloc_hint) + if (table->alloc_hint == ctx->file_alloc_start) break; - nr = table->alloc_hint; - table->alloc_hint = 0; + table->alloc_hint = ctx->file_alloc_start; } while (1); return -ENFILE; @@ -175,3 +174,20 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) io_rsrc_node_switch(ctx, ctx->file_data); return 0; } + +int io_register_file_alloc_range(struct io_ring_ctx *ctx, + struct io_uring_file_index_range __user *arg) +{ + struct io_uring_file_index_range range; + u32 end; + + if (copy_from_user(&range, arg, sizeof(range))) + return -EFAULT; + if (check_add_overflow(range.off, range.len, &end)) + return -EOVERFLOW; + if (range.resv || end > ctx->nr_user_files) + return -EINVAL; + + io_file_table_set_alloc_range(ctx, range.off, range.len); + return 0; +} diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 79eb50c1980e84..ff3a712e11bf33 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -3,9 +3,7 @@ #define IOU_FILE_TABLE_H #include - -struct io_ring_ctx; -struct io_kiocb; +#include /* * FFS_SCM is only available on 64-bit archs, for 32-bit we just define it as 0 @@ -33,6 +31,9 @@ int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, unsigned int file_slot); int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset); +int io_register_file_alloc_range(struct io_ring_ctx *ctx, + struct io_uring_file_index_range __user *arg); + unsigned int io_file_get_flags(struct file *file); static inline void io_file_bitmap_clear(struct io_file_table *table, int bit) @@ -71,4 +72,17 @@ static inline void io_fixed_file_set(struct io_fixed_file *file_slot, file_slot->file_ptr = file_ptr; } +static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) +{ + ctx->file_table.alloc_hint = 
ctx->file_alloc_start; +} + +static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx, + unsigned off, unsigned len) +{ + ctx->file_alloc_start = off; + ctx->file_alloc_end = off + len; + io_reset_alloc_hint(ctx); +} + #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 070ee9ec9ee721..745264938a489f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3866,6 +3866,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sync_cancel(ctx, arg); break; + case IORING_REGISTER_FILE_ALLOC_RANGE: + ret = -EINVAL; + if (!arg || nr_args) + break; + ret = io_register_file_alloc_range(ctx, arg); + break; default: ret = -EINVAL; break; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 706fa020505b1a..d2e589c703d063 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -1012,6 +1012,8 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, io_file_bitmap_set(&ctx->file_table, i); } + /* default it to the whole table */ + io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); io_rsrc_node_switch(ctx, NULL); return 0; fail: From 527adb7ef32c5dcb89a99a1cea3f63010bcde8df Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:20 -0700 Subject: [PATCH 1068/1250] io_uring: allow 0 length for buffer select If user gives 0 for length, we can set it from the available buffer size. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-2-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 5e00f16e89b866..e538fa7cb727cf 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -115,7 +115,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); - if (*len > kbuf->len) + if (*len == 0 || *len > kbuf->len) *len = kbuf->len; req->flags |= REQ_F_BUFFER_SELECTED; req->kbuf = kbuf; @@ -145,7 +145,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, buf = page_address(bl->buf_pages[index]); buf += off; } - if (*len > buf->len) + if (*len == 0 || *len > buf->len) *len = buf->len; req->flags |= REQ_F_BUFFER_RING; req->buf_list = bl; From 678575394c97a3c962b9d4521e9acf4598e094c3 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:21 -0700 Subject: [PATCH 1069/1250] io_uring: restore bgid in io_put_kbuf Attempt to restore bgid. This is needed when recycling unused buffers as the next time around it will want the correct bgid. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-3-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/kbuf.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index b3e8c6c5fee143..d6af208d109ffe 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -103,16 +103,21 @@ static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, struct list_head *list) { + unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + if (req->flags & REQ_F_BUFFER_RING) { - if (req->buf_list) + if (req->buf_list) { + req->buf_index = req->buf_list->bgid; req->buf_list->head++; + } req->flags &= ~REQ_F_BUFFER_RING; } else { + req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); req->flags &= ~REQ_F_BUFFER_SELECTED; } - return IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + return ret; } static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) From 14810ccc6051e13d987e1c2796bf27f95652f06d Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:22 -0700 Subject: [PATCH 1070/1250] io_uring: allow iov_len = 0 for recvmsg and buffer select When using BUFFER_SELECT there is no technical requirement that the user actually provides iov, and this removes one copy_from_user call. So allow iov_len to be 0. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-4-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index b77bfbfb081673..06eaef9f97bec5 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -300,12 +300,18 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, return ret; if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len > 1) + if (iov_len == 0) { + sr->len = iomsg->fast_iov[0].iov_len = 0; + iomsg->fast_iov[0].iov_base = NULL; + iomsg->free_iov = NULL; + } else if (iov_len > 1) { return -EINVAL; - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) - return -EFAULT; - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; + } else { + if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) + return -EFAULT; + sr->len = iomsg->fast_iov[0].iov_len; + iomsg->free_iov = NULL; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, From f6b4094b8948c64a1dbc38bfef9f154132aef21b Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:23 -0700 Subject: [PATCH 1071/1250] io_uring: recycle buffers on error Rather than passing an error back to the user with a buffer attached, recycle the buffer immediately. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-5-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 06eaef9f97bec5..e4422dff07048a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -481,10 +481,13 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; - if (ret >= 0) + if (ret > 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; + else + io_kbuf_recycle(req, issue_flags); + cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; @@ -557,10 +560,13 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); } - if (ret >= 0) + if (ret > 0) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; + else + io_kbuf_recycle(req, issue_flags); + cflags = io_put_kbuf(req, issue_flags); if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; From ca814cd6207440d99bb022e51430ffd7c94359b7 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:24 -0700 Subject: [PATCH 1072/1250] io_uring: clean up io_poll_check_events return values The values returned are a bit confusing, where 0 and 1 have implied meaning, so add some definitions for them. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-6-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index fa25b88a7b9336..922a3d1b2e31d1 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -192,13 +192,18 @@ static void io_poll_remove_entries(struct io_kiocb *req) rcu_read_unlock(); } +enum { + IOU_POLL_DONE = 0, + IOU_POLL_NO_ACTION = 1, +}; + /* * All poll tw should go through this. 
Checks for poll events, manages * references, does rewait, etc. * - * Returns a negative error on failure. >0 when no action require, which is - * either spurious wakeup or multishot CQE is served. 0 when it's done with - * the request, then the mask is stored in req->cqe.res. + * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action require, + * which is either spurious wakeup or multishot CQE is served. + * IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res. */ static int io_poll_check_events(struct io_kiocb *req, bool *locked) { @@ -214,10 +219,11 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) /* tw handler should be the owner, and so have some references */ if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) - return 0; + return IOU_POLL_DONE; if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; + /* the mask was stashed in __io_poll_execute */ if (!req->cqe.res) { struct poll_table_struct pt = { ._key = req->apoll_events }; req->cqe.res = vfs_poll(req->file, &pt) & req->apoll_events; @@ -226,7 +232,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) if ((unlikely(!req->cqe.res))) continue; if (req->apoll_events & EPOLLONESHOT) - return 0; + return IOU_POLL_DONE; /* multishot, just fill a CQE and proceed */ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { @@ -238,7 +244,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return -ECANCELED; } else { ret = io_poll_issue(req, locked); - if (ret) + if (ret < 0) return ret; } @@ -248,7 +254,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) */ } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs)); - return 1; + return IOU_POLL_NO_ACTION; } static void io_poll_task_func(struct io_kiocb *req, bool *locked) @@ -256,12 +262,11 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) int ret; ret = io_poll_check_events(req, locked); - if (ret > 0) + if (ret == 
IOU_POLL_NO_ACTION) return; - if (!ret) { + if (ret == IOU_POLL_DONE) { struct io_poll *poll = io_kiocb_to_cmd(req); - req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else { req->cqe.res = ret; @@ -280,7 +285,7 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) int ret; ret = io_poll_check_events(req, locked); - if (ret > 0) + if (ret == IOU_POLL_NO_ACTION) return; io_poll_remove_entries(req); From b3390b4b92b6a25084873dcb6781f65b8f867cb6 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:25 -0700 Subject: [PATCH 1073/1250] io_uring: add IOU_STOP_MULTISHOT return code For multishot we want a way to signal the caller that multishot has ended but also this might not be an error return. For example sockets return 0 when closed, which should end a multishot recv, but still have a CQE with result 0 Introduce IOU_STOP_MULTISHOT which does this and indicates that the return code is stored inside req->cqe Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-7-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 7 +++++++ io_uring/poll.c | 11 +++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index f77e4a5403e4e5..e8da70781fa34b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -15,6 +15,13 @@ enum { IOU_OK = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, + + /* + * Intended only when both REQ_F_POLLED and REQ_F_APOLL_MULTISHOT + * are set to indicate to the poll runner that multishot should be + * removed and the result is set on req->cqe.res. 
+ */ + IOU_STOP_MULTISHOT = -ECANCELED, }; struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx); diff --git a/io_uring/poll.c b/io_uring/poll.c index 922a3d1b2e31d1..64d426d696abc8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -195,6 +195,7 @@ static void io_poll_remove_entries(struct io_kiocb *req) enum { IOU_POLL_DONE = 0, IOU_POLL_NO_ACTION = 1, + IOU_POLL_REMOVE_POLL_USE_RES = 2, }; /* @@ -204,6 +205,8 @@ enum { * Returns a negative error on failure. IOU_POLL_NO_ACTION when no action require, * which is either spurious wakeup or multishot CQE is served. * IOU_POLL_DONE when it's done with the request, then the mask is stored in req->cqe.res. + * IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot poll and that the result + * is stored in req->cqe. */ static int io_poll_check_events(struct io_kiocb *req, bool *locked) { @@ -244,6 +247,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return -ECANCELED; } else { ret = io_poll_issue(req, locked); + if (ret == IOU_STOP_MULTISHOT) + return IOU_POLL_REMOVE_POLL_USE_RES; if (ret < 0) return ret; } @@ -268,7 +273,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) if (ret == IOU_POLL_DONE) { struct io_poll *poll = io_kiocb_to_cmd(req); req->cqe.res = mangle_poll(req->cqe.res & poll->events); - } else { + } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; req_set_fail(req); } @@ -291,7 +296,9 @@ static void io_apoll_task_func(struct io_kiocb *req, bool *locked) io_poll_remove_entries(req); io_poll_tw_hash_eject(req, locked); - if (!ret) + if (ret == IOU_POLL_REMOVE_POLL_USE_RES) + io_req_complete_post(req); + else if (ret == IOU_POLL_DONE) io_req_task_submit(req, locked); else io_req_complete_failed(req, ret); From 61af7ee3f52d80e1def5260d9a6744bf96a704cc Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:26 -0700 Subject: [PATCH 1074/1250] io_uring: add allow_overflow to io_post_aux_cqe Some use cases of 
io_post_aux_cqe would not want to overflow as is, but might want to change the flags/result. For example multishot receive requires in order CQE, and so if there is an overflow it would need to stop receiving until the overflow is taken care of. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-8-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 14 ++++++++++---- io_uring/io_uring.h | 3 ++- io_uring/msg_ring.c | 4 ++-- io_uring/net.c | 2 +- io_uring/poll.c | 2 +- io_uring/rsrc.c | 4 ++-- 6 files changed, 18 insertions(+), 11 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 745264938a489f..523b6ebad15a30 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -736,7 +736,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) } static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags) + u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { struct io_uring_cqe *cqe; @@ -760,16 +761,21 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, } return true; } - return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + + if (allow_overflow) + return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + + return false; } bool io_post_aux_cqe(struct io_ring_ctx *ctx, - u64 user_data, s32 res, u32 cflags) + u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { bool filled; io_cq_lock(ctx); - filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags, allow_overflow); io_cq_unlock_post(ctx); return filled; } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e8da70781fa34b..e022d71c177a46 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -31,7 +31,8 @@ void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); void io_req_complete_post(struct io_kiocb *req); void 
__io_req_complete_post(struct io_kiocb *req); -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 939205b30c8b62..753d16734319a4 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -31,7 +31,7 @@ static int io_msg_ring_data(struct io_kiocb *req) if (msg->src_fd || msg->dst_fd || msg->flags) return -EINVAL; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) return 0; return -EOVERFLOW; @@ -113,7 +113,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) * completes with -EOVERFLOW, then the sender must ensure that a * later IORING_OP_MSG_RING delivers the message. 
*/ - if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0, true)) ret = -EOVERFLOW; out_unlock: io_double_unlock_ctx(ctx, target_ctx, issue_flags); diff --git a/io_uring/net.c b/io_uring/net.c index e4422dff07048a..601955fdb124f7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -658,7 +658,7 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) return ret; - if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE)) + if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) goto retry; return -ECANCELED; } diff --git a/io_uring/poll.c b/io_uring/poll.c index 64d426d696abc8..e8f922a4f6d754 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -243,7 +243,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) req->apoll_events); if (!io_post_aux_cqe(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE)) + mask, IORING_CQE_F_MORE, true)) return -ECANCELED; } else { ret = io_poll_issue(req, locked); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d2e589c703d063..0250c13ae1cdd0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -175,10 +175,10 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) if (prsrc->tag) { if (ctx->flags & IORING_SETUP_IOPOLL) { mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); mutex_unlock(&ctx->uring_lock); } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); + io_post_aux_cqe(ctx, prsrc->tag, 0, 0, true); } } From 8c71a9efa1e23223ba2d1336398b23742537d86d Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:27 -0700 Subject: [PATCH 1075/1250] io_uring: fix multishot poll on overflow On overflow, multishot poll can still complete with the IORING_CQE_F_MORE flag set. 
If in the meantime the user clears a CQE and the poll was cancelled then the poll will post a CQE without the IORING_CQE_F_MORE (and likely result -ECANCELED). However when processing the application will encounter the non-overflow CQE which indicates that there will be no more events posted. Typical userspace applications would free memory associated with the poll in this case. It will then subsequently receive the earlier CQE which has overflowed, which breaks the contract given by the IORING_CQE_F_MORE flag. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-9-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index e8f922a4f6d754..57747d92bba427 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -243,8 +243,10 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) req->apoll_events); if (!io_post_aux_cqe(ctx, req->cqe.user_data, - mask, IORING_CQE_F_MORE, true)) - return -ECANCELED; + mask, IORING_CQE_F_MORE, false)) { + io_req_set_res(req, mask, 0); + return IOU_POLL_REMOVE_POLL_USE_RES; + } } else { ret = io_poll_issue(req, locked); if (ret == IOU_STOP_MULTISHOT) From d0e8dc8b36ead9afd905f26cca7e1a7b566b3cba Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:28 -0700 Subject: [PATCH 1076/1250] io_uring: fix multishot accept ordering Similar to multishot poll, drop multishot accept when CQE overflow occurs.
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-10-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 601955fdb124f7..e1eaf902f3b201 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -656,11 +656,14 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } - if (ret < 0) - return ret; - if (io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) + if (ret >= 0 && + io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, false)) goto retry; - return -ECANCELED; + + io_req_set_res(req, ret, 0); + if (req->flags & REQ_F_POLLED) + return IOU_STOP_MULTISHOT; + return IOU_OK; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) From 07daee61a56297091c81ca6781b6f55da41b94dd Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:29 -0700 Subject: [PATCH 1077/1250] io_uring: multishot recv Support multishot receive for io_uring. Typical server applications will run a loop where for each recv CQE it requeues another recv/recvmsg. This can be simplified by using the existing multishot functionality combined with io_uring's provided buffers. The API is to add the IORING_RECV_MULTISHOT flag to the SQE. CQEs will then be posted (with IORING_CQE_F_MORE flag set) when data is available and is read. Once an error occurs or the socket ends, the multishot will be removed and a completion without IORING_CQE_F_MORE will be posted. The benefit to this is that the recv is much more performant. * Subsequent receives are queued up straight away without requiring the application to finish a processing loop. * If there is more data in the socket (say the provided buffer size is smaller than the socket buffer) then the data is immediately returned, improving batching.
* Poll is only armed once and reused, saving CPU cycles Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-11-dylany@fb.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 ++ io_uring/net.c | 102 +++++++++++++++++++++++++++++----- 2 files changed, 94 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cf95354198a393..499679134961b2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -263,8 +263,13 @@ enum io_uring_op { * or receive and arm poll if that yields an * -EAGAIN result, arm poll upfront and skip * the initial transfer attempt. + * + * IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if + * the handler will continue to report + * CQEs on behalf of the same SQE. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) +#define IORING_RECV_MULTISHOT (1U << 1) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/net.c b/io_uring/net.c index e1eaf902f3b201..cb08a4b6284096 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -389,6 +389,8 @@ int io_recvmsg_prep_async(struct io_kiocb *req) return ret; } +#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) + int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -399,13 +401,22 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + if (sr->flags & ~(RECVMSG_FLAGS)) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; + if (sr->flags & IORING_RECV_MULTISHOT) { + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sr->msg_flags & 
MSG_WAITALL) + return -EINVAL; + if (req->opcode == IORING_OP_RECV && sr->len) + return -EINVAL; + req->flags |= REQ_F_APOLL_MULTISHOT; + } #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -415,6 +426,48 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static inline void io_recv_prep_retry(struct io_kiocb *req) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req); + + sr->done_io = 0; + sr->len = 0; /* get from the provided buffer */ +} + +/* + * Finishes io_recv and io_recvmsg. + * + * Returns true if it is actually finished, or false if it should run + * again (for multishot). + */ +static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags) +{ + if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + io_req_set_res(req, *ret, cflags); + *ret = IOU_OK; + return true; + } + + if (*ret > 0) { + if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, + cflags | IORING_CQE_F_MORE, false)) { + io_recv_prep_retry(req); + return false; + } + /* + * Otherwise stop multishot but use the current result. 
+ * Probably will end up going into overflow, but this means + * we cannot trust the ordering anymore + */ + } + + io_req_set_res(req, *ret, cflags); + + if (req->flags & REQ_F_POLLED) + *ret = IOU_STOP_MULTISHOT; + return true; +} + int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -424,6 +477,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + size_t len = sr->len; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -442,16 +496,17 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg); +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; + kmsg->fast_iov[0].iov_len = len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); + len); } flags = sr->msg_flags; @@ -463,8 +518,15 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_get_inq = 1; ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); + if (ret == -EAGAIN && force_nonblock) { + ret = io_setup_async_msg(req, kmsg); + if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -491,8 +553,11 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - 
io_req_set_res(req, ret, cflags); - return IOU_OK; + + if (!io_recv_finish(req, &ret, cflags)) + goto retry_multishot; + + return ret; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -505,6 +570,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + size_t len = sr->len; if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) @@ -514,16 +580,17 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; } - ret = import_single_range(READ, sr->buf, sr->len, &iov, &msg.msg_iter); + ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter); if (unlikely(ret)) goto out_free; @@ -543,8 +610,14 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) ret = sock_recvmsg(sock, &msg, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) + if (ret == -EAGAIN && force_nonblock) { + if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return -EAGAIN; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -570,8 +643,11 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + + if (!io_recv_finish(req, &ret, cflags)) + goto retry_multishot; + + return ret; } int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) From 104bcfd7ecce224e55ab9b7c2fc0947611fa0888 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:30 -0700 Subject: [PATCH 
1078/1250] io_uring: fix io_uring_cqe_overflow trace format Make the trace format consistent with io_uring_complete for cflags Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-12-dylany@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 918e3a43e4b280..95a8cfaad15a03 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -594,7 +594,7 @@ TRACE_EVENT(io_uring_cqe_overflow, __entry->ocqe = ocqe; ), - TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, " + TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, " "overflow_cqe %p", __entry->ctx, __entry->user_data, __entry->res, __entry->cflags, __entry->ocqe) From af94cebd273ebf3e2ca550d4fef8b5ef7c7a1e80 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 30 Jun 2022 02:12:31 -0700 Subject: [PATCH 1079/1250] io_uring: only trace one of complete or overflow In overflow we see a duplicate line in the trace, and in some cases 3 lines (if initial io_post_aux_cqe fails). 
Instead just trace once for each CQE Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220630091231.1456789-13-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- io_uring/io_uring.h | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 523b6ebad15a30..caf979cd432729 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -742,7 +742,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, struct io_uring_cqe *cqe; ctx->cq_extra++; - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); /* * If we can't get a cq entry, userspace overflowed the @@ -751,6 +750,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, */ cqe = io_get_cqe(ctx); if (likely(cqe)) { + trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); + WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e022d71c177a46..868f45d55543bc 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -101,10 +101,6 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, { struct io_uring_cqe *cqe; - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, - (req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0); /* * If we can't get a cq entry, userspace overflowed the * submission (by quite a lot). Increment the overflow count in @@ -113,6 +109,12 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, cqe = io_get_cqe(ctx); if (unlikely(!cqe)) return io_req_cqe_overflow(req); + + trace_io_uring_complete(req->ctx, req, req->cqe.user_data, + req->cqe.res, req->cqe.flags, + (req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0, + (req->flags & REQ_F_CQE32_INIT) ? 
req->extra2 : 0); + memcpy(cqe, &req->cqe, sizeof(*cqe)); if (ctx->flags & IORING_SETUP_CQE32) { From 67a9eea23f1cba8a6c5e041967a423bf04611ba2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 4 Jul 2022 07:01:06 -0700 Subject: [PATCH 1080/1250] io_uring: disable multishot recvmsg recvmsg has semantics that do not make it trivial to extend to multishot. Specifically it has user pointers and returns data in the original parameter. In order to make this API useful these will need to be somehow included with the provided buffers. For now remove multishot for recvmsg as it is not useful. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220704140106.200167-1-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index cb08a4b6284096..6679069eeef162 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -409,6 +409,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; if (sr->flags & IORING_RECV_MULTISHOT) { + if (req->opcode == IORING_OP_RECVMSG) + return -EINVAL; if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) @@ -435,7 +437,7 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) } /* - * Finishes io_recv and io_recvmsg. + * Finishes io_recv * * Returns true if it is actually finished, or false if it should run * again (for multishot). 
@@ -477,7 +479,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - size_t len = sr->len; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -496,17 +497,16 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg); -retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; - buf = io_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &sr->len, issue_flags); if (!buf) return -ENOBUFS; kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = len; + kmsg->fast_iov[0].iov_len = sr->len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - len); + sr->len); } flags = sr->msg_flags; @@ -518,15 +518,8 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_get_inq = 1; ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) { - ret = io_setup_async_msg(req, kmsg); - if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == - IO_APOLL_MULTI_POLLED) { - io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return ret; - } + if (ret == -EAGAIN && force_nonblock) + return io_setup_async_msg(req, kmsg); if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -554,10 +547,8 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!io_recv_finish(req, &ret, cflags)) - goto retry_multishot; - - return ret; + io_req_set_res(req, ret, cflags); + return IOU_OK; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) From 69a7c434f4333aaa65aaf0305d5e12026ad43e99 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:14 +0100 Subject: [PATCH 1081/1250] io_uring: don't miss setting 
REQ_F_DOUBLE_POLL When adding a second poll entry we should set REQ_F_DOUBLE_POLL unconditionally. We might race with the first entry removal but that doesn't change the rule. Fixes: a18427bb2d9b ("io_uring: optimise submission side poll_refs") Reported-and-tested-by: syzbot+49950ba66096b1f0209b@syzkaller.appspotmail.com Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8b680d83ded07424db83e8745585e7a6d72826ef.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 57747d92bba427..3710a0a46a8778 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -401,16 +401,18 @@ static void io_poll_double_prepare(struct io_kiocb *req) /* head is RCU protected, see io_poll_remove_entries() comments */ rcu_read_lock(); head = smp_load_acquire(&poll->head); - if (head) { - /* - * poll arm may not hold ownership and so race with - * io_poll_wake() by modifying req->flags. There is only one - * poll entry queued, serialise with it by taking its head lock. - */ + /* + * poll arm may not hold ownership and so race with + * io_poll_wake() by modifying req->flags. There is only one + * poll entry queued, serialise with it by taking its head lock. + */ + if (head) spin_lock_irq(&head->lock); - req->flags |= REQ_F_DOUBLE_POLL; + + req->flags |= REQ_F_DOUBLE_POLL; + + if (head) spin_unlock_irq(&head->lock); - } rcu_read_unlock(); } From 214bf9753e56ebb5f5bd2a76e4b242adbbf7453d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:15 +0100 Subject: [PATCH 1082/1250] io_uring: don't race double poll setting REQ_F_ASYNC_DATA Just as with io_poll_double_prepare() setting REQ_F_DOUBLE_POLL, we can race with the first poll entry when setting REQ_F_ASYNC_DATA. Move it under io_poll_double_prepare(). 
Fixes: a18427bb2d9b ("io_uring: optimise submission side poll_refs") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/df6920f509c11115aa2bce8b34dc5fdb0eb98920.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 3710a0a46a8778..c1359d45a396f2 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -410,6 +410,8 @@ static void io_poll_double_prepare(struct io_kiocb *req) spin_lock_irq(&head->lock); req->flags |= REQ_F_DOUBLE_POLL; + if (req->opcode == IORING_OP_POLL_ADD) + req->flags |= REQ_F_ASYNC_DATA; if (head) spin_unlock_irq(&head->lock); @@ -448,13 +450,11 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt, return; } - io_poll_double_prepare(req); /* mark as double wq entry */ wqe_private |= IO_WQE_F_DOUBLE; io_init_poll_iocb(poll, first->events, first->wait.func); + io_poll_double_prepare(req); *poll_ptr = poll; - if (req->opcode == IORING_OP_POLL_ADD) - req->flags |= REQ_F_ASYNC_DATA; } else { /* fine to modify, there is no poll queued to race with us */ req->flags |= REQ_F_SINGLE_POLL; From dc64630f725bf2eccd7cae72ff1d1a7c0709a761 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:16 +0100 Subject: [PATCH 1083/1250] io_uring: clear REQ_F_HASH_LOCKED on hash removal Instead of clearing REQ_F_HASH_LOCKED while arming a poll, unset the bit when we're removing the entry from the table in io_poll_tw_hash_eject(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/02e48bb88d6f1480c94ac2924c43ad1fbd48e92a.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index c1359d45a396f2..77b669b0604609 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -132,6 +132,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) */ io_tw_lock(ctx, locked); hash_del(&req->hash_node); + req->flags &= ~REQ_F_HASH_LOCKED; } else { io_poll_req_delete(req, ctx); } @@ -617,9 +618,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) * apoll requests already grab the mutex to complete in the tw handler, * so removal from the mutex-backed hash is free, use it by default. */ - if (issue_flags & IO_URING_F_UNLOCKED) - req->flags &= ~REQ_F_HASH_LOCKED; - else + if (!(issue_flags & IO_URING_F_UNLOCKED)) req->flags |= REQ_F_HASH_LOCKED; if (!def->pollin && !def->pollout) @@ -880,8 +879,6 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) if (!(issue_flags & IO_URING_F_UNLOCKED) && (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER))) req->flags |= REQ_F_HASH_LOCKED; - else - req->flags &= ~REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { From 93e9cc1411ac2f3d30be757200ac9f8bb766af7e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 7 Jul 2022 15:13:17 +0100 Subject: [PATCH 1084/1250] io_uring: consolidate hash_locked io-wq handling Don't duplicate code disabling REQ_F_HASH_LOCKED for IO_URING_F_UNLOCKED (i.e. io-wq), move the handling into __io_arm_poll_handler(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0ff0ffdfaa65b3d536131535c3dad3c63d9b7bb0.1657203020.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/io_uring/poll.c b/io_uring/poll.c index 77b669b0604609..76592063abe7ba 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -523,8 +523,12 @@ static int __io_arm_poll_handler(struct io_kiocb *req, * io_poll_can_finish_inline() tries to deal with that. */ ipt->owning = issue_flags & IO_URING_F_UNLOCKED; - atomic_set(&req->poll_refs, (int)ipt->owning); + + /* io-wq doesn't hold uring_lock */ + if (issue_flags & IO_URING_F_UNLOCKED) + req->flags &= ~REQ_F_HASH_LOCKED; + mask = vfs_poll(req->file, &ipt->pt) & poll->events; if (unlikely(ipt->error || !ipt->nr_entries)) { @@ -618,8 +622,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) * apoll requests already grab the mutex to complete in the tw handler, * so removal from the mutex-backed hash is free, use it by default. */ - if (!(issue_flags & IO_URING_F_UNLOCKED)) - req->flags |= REQ_F_HASH_LOCKED; + req->flags |= REQ_F_HASH_LOCKED; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; @@ -876,8 +879,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) * If sqpoll or single issuer, there is no contention for ->uring_lock * and we'll end up holding it in tw handlers anyway. 
*/ - if (!(issue_flags & IO_URING_F_UNLOCKED) && - (req->ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_SINGLE_ISSUER))) + if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER)) req->flags |= REQ_F_HASH_LOCKED; ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); From 14d6c4539caf1e039df556df18e780da8314e168 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 11:18:33 -0600 Subject: [PATCH 1085/1250] io_uring: move apoll cache to poll.c This is where it's used, move the flush handler in there. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ------------ io_uring/poll.c | 12 ++++++++++++ io_uring/poll.h | 2 ++ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index caf979cd432729..4d1ce58b015e19 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2445,18 +2445,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_flush_apoll_cache(struct io_ring_ctx *ctx) -{ - struct async_poll *apoll; - - while (!list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del(&apoll->poll.wait.entry); - kfree(apoll); - } -} - static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); diff --git a/io_uring/poll.c b/io_uring/poll.c index 76592063abe7ba..052fcb6472083a 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -959,3 +959,15 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +void io_flush_apoll_cache(struct io_ring_ctx *ctx) +{ + struct async_poll *apoll; + + while (!list_empty(&ctx->apoll_cache)) { + apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, + poll.wait.entry); + list_del(&apoll->poll.wait.entry); + kfree(apoll); + } +} diff --git a/io_uring/poll.h b/io_uring/poll.h index c40673d7da0199..95f192c7babbd5 100644 --- 
a/io_uring/poll.h +++ b/io_uring/poll.h @@ -30,3 +30,5 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); + +void io_flush_apoll_cache(struct io_ring_ctx *ctx); From 5bab264c2dce715486518f83f370dd0342377f4a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 14:16:20 -0600 Subject: [PATCH 1086/1250] io_uring: add abstraction around apoll cache In preparation for adding limits, and one more user, abstract out the core bits of the allocation+free cache. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ++++- io_uring/alloc_cache.h | 41 ++++++++++++++++++++++++++++++++++ io_uring/io_uring.c | 8 +++---- io_uring/poll.c | 18 +++++---------- io_uring/poll.h | 9 ++++++-- 5 files changed, 62 insertions(+), 20 deletions(-) create mode 100644 io_uring/alloc_cache.h diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 26ef11e978d403..b548da03b563cb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -158,6 +158,10 @@ struct io_ev_fd { struct rcu_head rcu; }; +struct io_alloc_cache { + struct hlist_head list; +}; + struct io_ring_ctx { /* const or read-mostly hot data */ struct { @@ -216,7 +220,7 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; - struct list_head apoll_cache; + struct io_alloc_cache apoll_cache; struct xarray personalities; u32 pers_next; } ____cacheline_aligned_in_smp; diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h new file mode 100644 index 00000000000000..98f2374c37c7bb --- /dev/null +++ b/io_uring/alloc_cache.h @@ -0,0 +1,41 @@ +#ifndef IOU_ALLOC_CACHE_H +#define IOU_ALLOC_CACHE_H + +struct io_cache_entry { + struct hlist_node node; +}; + +static inline void io_alloc_cache_put(struct io_alloc_cache *cache, + struct 
io_cache_entry *entry) +{ + hlist_add_head(&entry->node, &cache->list); +} + +static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) +{ + if (!hlist_empty(&cache->list)) { + struct hlist_node *node = cache->list.first; + + hlist_del(node); + return container_of(node, struct io_cache_entry, node); + } + + return NULL; +} + +static inline void io_alloc_cache_init(struct io_alloc_cache *cache) +{ + INIT_HLIST_HEAD(&cache->list); +} + +static inline void io_alloc_cache_free(struct io_alloc_cache *cache, + void (*free)(struct io_cache_entry *)) +{ + while (!hlist_empty(&cache->list)) { + struct hlist_node *node = cache->list.first; + + hlist_del(node); + free(container_of(node, struct io_cache_entry, node)); + } +} +#endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4d1ce58b015e19..a360a3d390c6ba 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -92,6 +92,7 @@ #include "timeout.h" #include "poll.h" +#include "alloc_cache.h" #define IORING_MAX_ENTRIES 32768 #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) @@ -295,7 +296,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_LIST_HEAD(&ctx->apoll_cache); + io_alloc_cache_init(&ctx->apoll_cache); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -1180,8 +1181,7 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (apoll->double_poll) kfree(apoll->double_poll); - list_add(&apoll->poll.wait.entry, - &ctx->apoll_cache); + io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) @@ -2467,7 +2467,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->rings) __io_cqring_overflow_flush(ctx, true); io_eventfd_unregister(ctx); - 
io_flush_apoll_cache(ctx); + io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); mutex_unlock(&ctx->uring_lock); io_destroy_buffers(ctx); if (ctx->sq_creds) diff --git a/io_uring/poll.c b/io_uring/poll.c index 052fcb6472083a..dadd293749b07b 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -590,16 +590,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_cache_entry *entry; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED) && - !list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del_init(&apoll->poll.wait.entry); + (entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) { + apoll = container_of(entry, struct async_poll, cache); } else { apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) @@ -960,14 +959,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -void io_flush_apoll_cache(struct io_ring_ctx *ctx) +void io_apoll_cache_free(struct io_cache_entry *entry) { - struct async_poll *apoll; - - while (!list_empty(&ctx->apoll_cache)) { - apoll = list_first_entry(&ctx->apoll_cache, struct async_poll, - poll.wait.entry); - list_del(&apoll->poll.wait.entry); - kfree(apoll); - } + kfree(container_of(entry, struct async_poll, cache)); } diff --git a/io_uring/poll.h b/io_uring/poll.h index 95f192c7babbd5..5f3bae50fc81a0 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include "alloc_cache.h" + enum { IO_APOLL_OK, IO_APOLL_ABORTED, @@ -14,7 +16,10 @@ struct io_poll { }; struct async_poll { - struct io_poll poll; + union { + struct io_poll poll; + struct io_cache_entry cache; + }; struct io_poll *double_poll; }; @@ -31,4 +36,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned 
issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); -void io_flush_apoll_cache(struct io_ring_ctx *ctx); +void io_apoll_cache_free(struct io_cache_entry *entry); From 975f7992ec48e3bf3b46842abda951565ee3f971 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 14:20:54 -0600 Subject: [PATCH 1087/1250] io_uring: impose max limit on apoll cache Caches like this tend to grow to the peak size, and then never get any smaller. Impose a max limit on the size, to prevent it from growing too big. A somewhat randomly chosen 512 is the max size we'll allow the cache to get. If a batch of frees come in and would bring it over that, we simply start kfree'ing the surplus. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/alloc_cache.h | 16 ++++++++++++++-- io_uring/io_uring.c | 3 ++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b548da03b563cb..bf8f95332edae0 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -160,6 +160,7 @@ struct io_ev_fd { struct io_alloc_cache { struct hlist_head list; + unsigned int nr_cached; }; struct io_ring_ctx { diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 98f2374c37c7bb..729793ae97127a 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -1,14 +1,24 @@ #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H +/* + * Don't allow the cache to grow beyond this size. 
+ */ +#define IO_ALLOC_CACHE_MAX 512 + struct io_cache_entry { struct hlist_node node; }; -static inline void io_alloc_cache_put(struct io_alloc_cache *cache, +static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, struct io_cache_entry *entry) { - hlist_add_head(&entry->node, &cache->list); + if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { + cache->nr_cached++; + hlist_add_head(&entry->node, &cache->list); + return true; + } + return false; } static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) @@ -26,6 +36,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c static inline void io_alloc_cache_init(struct io_alloc_cache *cache) { INIT_HLIST_HEAD(&cache->list); + cache->nr_cached = 0; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, @@ -37,5 +48,6 @@ static inline void io_alloc_cache_free(struct io_alloc_cache *cache, hlist_del(node); free(container_of(node, struct io_cache_entry, node)); } + cache->nr_cached = 0; } #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a360a3d390c6ba..c9c23e45976635 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1181,7 +1181,8 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) if (apoll->double_poll) kfree(apoll->double_poll); - io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache); + if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) + kfree(apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) From b787bc318d0f501c1d643528a83fb4ef3b82f810 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Jul 2022 14:30:09 -0600 Subject: [PATCH 1088/1250] io_uring: add netmsg cache For recvmsg/sendmsg, if they don't complete inline, we currently need to allocate a struct io_async_msghdr for each request. This is a somewhat large struct. Hook up sendmsg/recvmsg to use the io_alloc_cache. 
This reduces the alloc + free overhead considerably, yielding 4-5% of extra performance running netbench. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 ++-- io_uring/io_uring.c | 3 ++ io_uring/net.c | 63 +++++++++++++++++++++++++++++----- io_uring/net.h | 13 ++++++- 4 files changed, 73 insertions(+), 12 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index bf8f95332edae0..d54b8b7e074629 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -222,8 +222,7 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; struct io_alloc_cache apoll_cache; - struct xarray personalities; - u32 pers_next; + struct io_alloc_cache netmsg_cache; } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ @@ -241,6 +240,9 @@ struct io_ring_ctx { unsigned int file_alloc_start; unsigned int file_alloc_end; + struct xarray personalities; + u32 pers_next; + struct { /* * We cache a range of free CQEs we can use, once exhausted it diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c9c23e45976635..f697ca4e8f558c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -89,6 +89,7 @@ #include "kbuf.h" #include "rsrc.h" #include "cancel.h" +#include "net.h" #include "timeout.h" #include "poll.h" @@ -297,6 +298,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); io_alloc_cache_init(&ctx->apoll_cache); + io_alloc_cache_init(&ctx->netmsg_cache); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -2469,6 +2471,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_cqring_overflow_flush(ctx, true); io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); + io_alloc_cache_free(&ctx->netmsg_cache, 
io_netmsg_cache_free); mutex_unlock(&ctx->uring_lock); io_destroy_buffers(ctx); if (ctx->sq_creds) diff --git a/io_uring/net.c b/io_uring/net.c index 6679069eeef162..185553174437f3 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -12,6 +12,7 @@ #include "io_uring.h" #include "kbuf.h" +#include "alloc_cache.h" #include "net.h" #if defined(CONFIG_NET) @@ -97,18 +98,55 @@ static bool io_net_retry(struct socket *sock, int flags) return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; } +static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_async_msghdr *hdr = req->async_data; + + if (!hdr || issue_flags & IO_URING_F_UNLOCKED) + return; + + /* Let normal cleanup path reap it if we fail adding to the cache */ + if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +static struct io_async_msghdr *io_recvmsg_alloc_async(struct io_kiocb *req, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_cache_entry *entry; + + if (!(issue_flags & IO_URING_F_UNLOCKED) && + (entry = io_alloc_cache_get(&ctx->netmsg_cache)) != NULL) { + struct io_async_msghdr *hdr; + + hdr = container_of(entry, struct io_async_msghdr, cache); + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = hdr; + return hdr; + } + + if (!io_alloc_async_data(req)) + return req->async_data; + + return NULL; +} + static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg) + struct io_async_msghdr *kmsg, + unsigned int issue_flags) { struct io_async_msghdr *async_msg = req->async_data; if (async_msg) return -EAGAIN; - if (io_alloc_async_data(req)) { + async_msg = io_recvmsg_alloc_async(req, issue_flags); + if (!async_msg) { kfree(kmsg->free_iov); return -ENOMEM; } - async_msg = req->async_data; req->flags |= REQ_F_NEED_CLEANUP; memcpy(async_msg, kmsg, sizeof(*kmsg)); async_msg->msg.msg_name = &async_msg->addr; @@ -195,7 +233,7 @@ int 
io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -207,13 +245,13 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); } req_set_fail(req); } @@ -221,6 +259,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->free_iov) kfree(kmsg->free_iov); req->flags &= ~REQ_F_NEED_CLEANUP; + io_netmsg_recycle(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -495,7 +534,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); if (io_do_buffer_select(req)) { void __user *buf; @@ -519,13 +558,13 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_PARTIAL_IO; - return io_setup_async_msg(req, kmsg); + return io_setup_async_msg(req, kmsg, issue_flags); } req_set_fail(req); } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | 
MSG_CTRUNC))) { @@ -535,6 +574,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) /* fast path, check for non-NULL to avoid function call */ if (kmsg->free_iov) kfree(kmsg->free_iov); + io_netmsg_recycle(req, issue_flags); req->flags &= ~REQ_F_NEED_CLEANUP; if (ret > 0) ret += sr->done_io; @@ -848,4 +888,9 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) io_req_set_res(req, ret, 0); return IOU_OK; } + +void io_netmsg_cache_free(struct io_cache_entry *entry) +{ + kfree(container_of(entry, struct io_async_msghdr, cache)); +} #endif diff --git a/io_uring/net.h b/io_uring/net.h index 81d71d1647704d..178a6d8b76e0a8 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -3,9 +3,14 @@ #include #include +#include "alloc_cache.h" + #if defined(CONFIG_NET) struct io_async_msghdr { - struct iovec fast_iov[UIO_FASTIOV]; + union { + struct iovec fast_iov[UIO_FASTIOV]; + struct io_cache_entry cache; + }; /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; struct sockaddr __user *uaddr; @@ -40,4 +45,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags); int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); + +void io_netmsg_cache_free(struct io_cache_entry *entry); +#else +static inline void io_netmsg_cache_free(struct io_cache_entry *entry) +{ +} #endif From 08c3be1466ae1e53609c27e3191331ac54d6e2d5 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 8 Jul 2022 11:18:35 -0700 Subject: [PATCH 1089/1250] io_uring: fix multishot ending when not polled If multishot is not actually polling then return IOU_OK rather than the result. If the result was > 0 this will confuse things further up the callstack which expect a return <= 0. 
Fixes: 1300ebb20286 ("io_uring: multishot recv") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220708181838.1495428-2-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 185553174437f3..eb939899e9c5a0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -506,6 +506,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c if (req->flags & REQ_F_POLLED) *ret = IOU_STOP_MULTISHOT; + else + *ret = IOU_OK; return true; } From 8928cb8a1161b0e08619eed98a80891bbb8383ca Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 8 Jul 2022 11:18:36 -0700 Subject: [PATCH 1090/1250] io_uring: support 0 length iov in buffer select in compat Match up work done in "io_uring: allow iov_len = 0 for recvmsg and buffer select", but for compat code path. Fixes: a68caad69ce5 ("io_uring: allow iov_len = 0 for recvmsg and buffer select") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220708181838.1495428-3-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index eb939899e9c5a0..dc9190eafbe703 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -382,16 +382,21 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; - if (len > 1) - return -EINVAL; - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) + if (len == 0) { + sr->len = 0; + iomsg->free_iov = NULL; + } else if (len > 1) { return -EINVAL; - sr->len = clen; - iomsg->free_iov = NULL; + } else { + if (!access_ok(uiov, sizeof(*uiov))) + return -EFAULT; + if (__get_user(clen, &uiov->iov_len)) + return -EFAULT; + if (clen < 0) + return -EINVAL; + sr->len = clen; + iomsg->free_iov = NULL; + } } else { 
iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, (struct iovec __user *)uiov, len, From ff0c5d2a70a210bab0d245719c07938c2bbd2b9b Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:56 -0700 Subject: [PATCH 1091/1250] net: copy from user before calling __copy_msghdr this is in preparation for multishot receive from io_uring, where it needs to have access to the original struct user_msghdr. functionally this should be a no-op. Acked-by: Paolo Abeni Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-2-dylany@fb.com Signed-off-by: Jens Axboe --- include/linux/socket.h | 7 +++---- io_uring/net.c | 17 +++++++++-------- net/socket.c | 37 ++++++++++++++++--------------------- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/include/linux/socket.h b/include/linux/socket.h index 17311ad9f9af24..be24f1c8568a38 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -416,10 +416,9 @@ extern int recvmsg_copy_msghdr(struct msghdr *msg, struct user_msghdr __user *umsg, unsigned flags, struct sockaddr __user **uaddr, struct iovec **iov); -extern int __copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec __user **uiov, size_t *nsegs); +extern int __copy_msghdr(struct msghdr *kmsg, + struct user_msghdr *umsg, + struct sockaddr __user **save_addr); /* helpers which do the actual work for syscalls */ extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size, diff --git a/io_uring/net.c b/io_uring/net.c index dc9190eafbe703..da7667ed36106d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -329,31 +329,32 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); - struct iovec __user *uiov; - size_t iov_len; + struct user_msghdr msg; int ret; - ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg, - &iomsg->uaddr, &uiov, &iov_len); 
+ if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg))) + return -EFAULT; + + ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); if (ret) return ret; if (req->flags & REQ_F_BUFFER_SELECT) { - if (iov_len == 0) { + if (msg.msg_iovlen == 0) { sr->len = iomsg->fast_iov[0].iov_len = 0; iomsg->fast_iov[0].iov_base = NULL; iomsg->free_iov = NULL; - } else if (iov_len > 1) { + } else if (msg.msg_iovlen > 1) { return -EINVAL; } else { - if (copy_from_user(iomsg->fast_iov, uiov, sizeof(*uiov))) + if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov))) return -EFAULT; sr->len = iomsg->fast_iov[0].iov_len; iomsg->free_iov = NULL; } } else { iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, uiov, iov_len, UIO_FASTIOV, + ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, &iomsg->free_iov, &iomsg->msg.msg_iter, false); if (ret > 0) diff --git a/net/socket.c b/net/socket.c index 96300cdc06251f..843545c21ec235 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2358,25 +2358,20 @@ struct used_address { unsigned int name_len; }; -int __copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec __user **uiov, size_t *nsegs) +int __copy_msghdr(struct msghdr *kmsg, + struct user_msghdr *msg, + struct sockaddr __user **save_addr) { - struct user_msghdr msg; ssize_t err; - if (copy_from_user(&msg, umsg, sizeof(*umsg))) - return -EFAULT; - kmsg->msg_control_is_user = true; kmsg->msg_get_inq = 0; - kmsg->msg_control_user = msg.msg_control; - kmsg->msg_controllen = msg.msg_controllen; - kmsg->msg_flags = msg.msg_flags; + kmsg->msg_control_user = msg->msg_control; + kmsg->msg_controllen = msg->msg_controllen; + kmsg->msg_flags = msg->msg_flags; - kmsg->msg_namelen = msg.msg_namelen; - if (!msg.msg_name) + kmsg->msg_namelen = msg->msg_namelen; + if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) @@ -2386,11 +2381,11 @@ int __copy_msghdr_from_user(struct 
msghdr *kmsg, kmsg->msg_namelen = sizeof(struct sockaddr_storage); if (save_addr) - *save_addr = msg.msg_name; + *save_addr = msg->msg_name; - if (msg.msg_name && kmsg->msg_namelen) { + if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { - err = move_addr_to_kernel(msg.msg_name, + err = move_addr_to_kernel(msg->msg_name, kmsg->msg_namelen, kmsg->msg_name); if (err < 0) @@ -2401,12 +2396,10 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (msg.msg_iovlen > UIO_MAXIOV) + if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; kmsg->msg_iocb = NULL; - *uiov = msg.msg_iov; - *nsegs = msg.msg_iovlen; return 0; } @@ -2418,8 +2411,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg, struct user_msghdr msg; ssize_t err; - err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov, - &msg.msg_iovlen); + if (copy_from_user(&msg, umsg, sizeof(*umsg))) + return -EFAULT; + + err = __copy_msghdr(kmsg, &msg, save_addr); if (err) return err; From b787fa827a0ec445471ef9e31fbe63e7bba6023d Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:57 -0700 Subject: [PATCH 1092/1250] net: copy from user before calling __get_compat_msghdr this is in preparation for multishot receive from io_uring, where it needs to have access to the original struct user_msghdr. functionally this should be a no-op. 
Acked-by: Paolo Abeni Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-3-dylany@fb.com Signed-off-by: Jens Axboe --- include/net/compat.h | 5 ++--- io_uring/net.c | 17 +++++++++-------- net/compat.c | 39 +++++++++++++++++---------------------- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/include/net/compat.h b/include/net/compat.h index 595fee069b8250..84c163f40f38a5 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -46,9 +46,8 @@ struct compat_rtentry { unsigned short rt_irtt; /* Initial RTT */ }; -int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, compat_uptr_t *ptr, - compat_size_t *len); +int __get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr *msg, + struct sockaddr __user **save_addr); int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, struct sockaddr __user **, struct iovec **); int put_cmsg_compat(struct msghdr*, int, int, int, void *); diff --git a/io_uring/net.c b/io_uring/net.c index da7667ed36106d..5bc3440a829012 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -369,24 +369,25 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); + struct compat_msghdr msg; struct compat_iovec __user *uiov; - compat_uptr_t ptr; - compat_size_t len; int ret; - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr, - &ptr, &len); + if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) + return -EFAULT; + + ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr); if (ret) return ret; - uiov = compat_ptr(ptr); + uiov = compat_ptr(msg.msg_iov); if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; - if (len == 0) { + if (msg.msg_iovlen == 0) { sr->len = 0; iomsg->free_iov = NULL; - } else if (len > 1) { + } else if (msg.msg_iovlen > 1) { return -EINVAL; } else { if (!access_ok(uiov, 
sizeof(*uiov))) @@ -400,7 +401,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, } } else { iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(READ, (struct iovec __user *)uiov, len, + ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen, UIO_FASTIOV, &iomsg->free_iov, &iomsg->msg.msg_iter, true); if (ret < 0) diff --git a/net/compat.c b/net/compat.c index 210fc3b4d0d833..513aa9a3fc6466 100644 --- a/net/compat.c +++ b/net/compat.c @@ -34,20 +34,15 @@ #include int __get_compat_msghdr(struct msghdr *kmsg, - struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, - compat_uptr_t *ptr, compat_size_t *len) + struct compat_msghdr *msg, + struct sockaddr __user **save_addr) { - struct compat_msghdr msg; ssize_t err; - if (copy_from_user(&msg, umsg, sizeof(*umsg))) - return -EFAULT; - - kmsg->msg_flags = msg.msg_flags; - kmsg->msg_namelen = msg.msg_namelen; + kmsg->msg_flags = msg->msg_flags; + kmsg->msg_namelen = msg->msg_namelen; - if (!msg.msg_name) + if (!msg->msg_name) kmsg->msg_namelen = 0; if (kmsg->msg_namelen < 0) @@ -57,15 +52,15 @@ int __get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = sizeof(struct sockaddr_storage); kmsg->msg_control_is_user = true; - kmsg->msg_control_user = compat_ptr(msg.msg_control); - kmsg->msg_controllen = msg.msg_controllen; + kmsg->msg_control_user = compat_ptr(msg->msg_control); + kmsg->msg_controllen = msg->msg_controllen; if (save_addr) - *save_addr = compat_ptr(msg.msg_name); + *save_addr = compat_ptr(msg->msg_name); - if (msg.msg_name && kmsg->msg_namelen) { + if (msg->msg_name && kmsg->msg_namelen) { if (!save_addr) { - err = move_addr_to_kernel(compat_ptr(msg.msg_name), + err = move_addr_to_kernel(compat_ptr(msg->msg_name), kmsg->msg_namelen, kmsg->msg_name); if (err < 0) @@ -76,12 +71,10 @@ int __get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = 0; } - if (msg.msg_iovlen > UIO_MAXIOV) + if (msg->msg_iovlen > UIO_MAXIOV) return -EMSGSIZE; 
kmsg->msg_iocb = NULL; - *ptr = msg.msg_iov; - *len = msg.msg_iovlen; return 0; } @@ -90,15 +83,17 @@ int get_compat_msghdr(struct msghdr *kmsg, struct sockaddr __user **save_addr, struct iovec **iov) { - compat_uptr_t ptr; - compat_size_t len; + struct compat_msghdr msg; ssize_t err; - err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len); + if (copy_from_user(&msg, umsg, sizeof(*umsg))) + return -EFAULT; + + err = __get_compat_msghdr(kmsg, umsg, save_addr); if (err) return err; - err = import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr), len, + err = import_iovec(save_addr ? READ : WRITE, compat_ptr(msg.msg_iov), msg.msg_iovlen, UIO_FASTIOV, iov, &kmsg->msg_iter); return err < 0 ? err : 0; } From c56eab65de36c6750a872c4d3750fce70bbe248e Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Thu, 14 Jul 2022 04:02:58 -0700 Subject: [PATCH 1093/1250] io_uring: support multishot in recvmsg Similar to multishot recv, this will require provided buffers to be used. However recvmsg is much more complex than recv as it has multiple outputs. Specifically flags, name, and control messages. Support this by introducing a new struct io_uring_recvmsg_out with 4 fields. namelen, controllen and flags match the similar out fields in msghdr from standard recvmsg(2), payloadlen is the length of the payload following the header. This struct is placed at the start of the returned buffer. Based on what the user specifies in struct msghdr, the next bytes of the buffer will be name (the next msg_namelen bytes), and then control (the next msg_controllen bytes). The payload will come at the end. The return value in the CQE is the total used size of the provided buffer. 
Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220714110258.1336200-4-dylany@fb.com [axboe: style fixups, see link] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 ++ io_uring/net.c | 180 ++++++++++++++++++++++++++++++---- io_uring/net.h | 6 ++ 3 files changed, 174 insertions(+), 19 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 499679134961b2..4c9b11e2e99158 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -613,4 +613,11 @@ struct io_uring_file_index_range { __u64 resv; }; +struct io_uring_recvmsg_out { + __u32 namelen; + __u32 controllen; + __u32 payloadlen; + __u32 flags; +}; + #endif diff --git a/io_uring/net.c b/io_uring/net.c index 5bc3440a829012..616d5f04cc7435 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -325,6 +325,21 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) +{ + unsigned long hdr; + + if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), + (unsigned long)iomsg->namelen, &hdr)) + return true; + if (check_add_overflow(hdr, iomsg->controllen, &hdr)) + return true; + if (hdr > INT_MAX) + return true; + + return false; +} + static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg) { @@ -352,6 +367,13 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = iomsg->fast_iov[0].iov_len; iomsg->free_iov = NULL; } + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + iomsg->namelen = msg.msg_namelen; + iomsg->controllen = msg.msg_controllen; + if (io_recvmsg_multishot_overflow(iomsg)) + return -EOVERFLOW; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV, @@ -399,6 +421,13 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, sr->len = clen; iomsg->free_iov = NULL; } + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + 
iomsg->namelen = msg.msg_namelen; + iomsg->controllen = msg.msg_controllen; + if (io_recvmsg_multishot_overflow(iomsg)) + return -EOVERFLOW; + } } else { iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen, @@ -455,8 +484,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; if (sr->flags & IORING_RECV_MULTISHOT) { - if (req->opcode == IORING_OP_RECVMSG) - return -EINVAL; if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) @@ -483,12 +510,13 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) } /* - * Finishes io_recv + * Finishes io_recv and io_recvmsg. * * Returns true if it is actually finished, or false if it should run * again (for multishot). */ -static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int cflags) +static inline bool io_recv_finish(struct io_kiocb *req, int *ret, + unsigned int cflags, bool mshot_finished) { if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { io_req_set_res(req, *ret, cflags); @@ -496,7 +524,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c return true; } - if (*ret > 0) { + if (!mshot_finished) { if (io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, false)) { io_recv_prep_retry(req); @@ -518,6 +546,90 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, unsigned int c return true; } +static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg, + struct io_sr_msg *sr, void __user **buf, + size_t *len) +{ + unsigned long ubuf = (unsigned long) *buf; + unsigned long hdr; + + hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + + kmsg->controllen; + if (*len < hdr) + return -EFAULT; + + if (kmsg->controllen) { + unsigned long control = ubuf + hdr - kmsg->controllen; + + kmsg->msg.msg_control_user = (void *) control; + 
kmsg->msg.msg_controllen = kmsg->controllen; + } + + sr->buf = *buf; /* stash for later copy */ + *buf = (void *) (ubuf + hdr); + kmsg->payloadlen = *len = *len - hdr; + return 0; +} + +struct io_recvmsg_multishot_hdr { + struct io_uring_recvmsg_out msg; + struct sockaddr_storage addr; +}; + +static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, + struct io_async_msghdr *kmsg, + unsigned int flags, bool *finished) +{ + int err; + int copy_len; + struct io_recvmsg_multishot_hdr hdr; + + if (kmsg->namelen) + kmsg->msg.msg_name = &hdr.addr; + kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); + kmsg->msg.msg_namelen = 0; + + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + + err = sock_recvmsg(sock, &kmsg->msg, flags); + *finished = err <= 0; + if (err < 0) + return err; + + hdr.msg = (struct io_uring_recvmsg_out) { + .controllen = kmsg->controllen - kmsg->msg.msg_controllen, + .flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT + }; + + hdr.msg.payloadlen = err; + if (err > kmsg->payloadlen) + err = kmsg->payloadlen; + + copy_len = sizeof(struct io_uring_recvmsg_out); + if (kmsg->msg.msg_namelen > kmsg->namelen) + copy_len += kmsg->namelen; + else + copy_len += kmsg->msg.msg_namelen; + + /* + * "fromlen shall refer to the value before truncation.." 
+ * 1003.1g + */ + hdr.msg.namelen = kmsg->msg.msg_namelen; + + /* ensure that there is no gap between hdr and sockaddr_storage */ + BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) != + sizeof(struct io_uring_recvmsg_out)); + if (copy_to_user(io->buf, &hdr, copy_len)) { + *finished = true; + return -EFAULT; + } + + return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen + + kmsg->controllen + err; +} + int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req); @@ -527,6 +639,7 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool mshot_finished = true; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -545,16 +658,27 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg, issue_flags); +retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; + size_t len = sr->len; - buf = io_buffer_select(req, &sr->len, issue_flags); + buf = io_buffer_select(req, &len, issue_flags); if (!buf) return -ENOBUFS; + + if (req->flags & REQ_F_APOLL_MULTISHOT) { + ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len); + if (ret) { + io_kbuf_recycle(req, issue_flags); + return ret; + } + } + kmsg->fast_iov[0].iov_base = buf; - kmsg->fast_iov[0].iov_len = sr->len; + kmsg->fast_iov[0].iov_len = len; iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1, - sr->len); + len); } flags = sr->msg_flags; @@ -564,10 +688,23 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) min_ret = iov_iter_count(&kmsg->msg.msg_iter); kmsg->msg.msg_get_inq = 1; - ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, kmsg->uaddr, flags); + if (req->flags & REQ_F_APOLL_MULTISHOT) + ret = io_recvmsg_multishot(sock, sr, kmsg, flags, + &mshot_finished); + else + ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg, + 
kmsg->uaddr, flags); + if (ret < min_ret) { - if (ret == -EAGAIN && force_nonblock) - return io_setup_async_msg(req, kmsg, issue_flags); + if (ret == -EAGAIN && force_nonblock) { + ret = io_setup_async_msg(req, kmsg, issue_flags); + if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) == + IO_APOLL_MULTI_POLLED) { + io_kbuf_recycle(req, issue_flags); + return IOU_ISSUE_SKIP_COMPLETE; + } + return ret; + } if (ret == -ERESTARTSYS) ret = -EINTR; if (ret > 0 && io_net_retry(sock, flags)) { @@ -580,11 +717,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); - io_netmsg_recycle(req, issue_flags); - req->flags &= ~REQ_F_NEED_CLEANUP; if (ret > 0) ret += sr->done_io; else if (sr->done_io) @@ -596,8 +728,18 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (kmsg->msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - io_req_set_res(req, ret, cflags); - return IOU_OK; + if (!io_recv_finish(req, &ret, cflags, mshot_finished)) + goto retry_multishot; + + if (mshot_finished) { + io_netmsg_recycle(req, issue_flags); + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; + } + + return ret; } int io_recv(struct io_kiocb *req, unsigned int issue_flags) @@ -684,7 +826,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (msg.msg_inq) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!io_recv_finish(req, &ret, cflags)) + if (!io_recv_finish(req, &ret, cflags, ret <= 0)) goto retry_multishot; return ret; diff --git a/io_uring/net.h b/io_uring/net.h index 178a6d8b76e0a8..db20ce9d6546d4 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -9,6 +9,12 @@ struct io_async_msghdr { union { struct iovec fast_iov[UIO_FASTIOV]; + struct { + struct iovec fast_iov_one; + __kernel_size_t controllen; + int namelen; + __kernel_size_t 
payloadlen; + }; struct io_cache_entry cache; }; /* points to an allocated iov, if NULL we use fast_iov instead */ From 60875e71909321fb60aee7731fde33a120629279 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 14 Jul 2022 18:33:01 +0200 Subject: [PATCH 1094/1250] io_uring: Use atomic_long_try_cmpxchg in __io_account_mem Use atomic_long_try_cmpxchg instead of atomic_long_cmpxchg (*ptr, old, new) == old in __io_account_mem. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, atomic_long_try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails, enabling further code simplifications. No functional change intended. Signed-off-by: Uros Bizjak Cc: Jens Axboe Cc: Pavel Begunkov Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0250c13ae1cdd0..7f66b0e2567432 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -56,14 +56,13 @@ static inline int __io_account_mem(struct user_struct *user, /* Don't allow more pages than we can safely lock */ page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur_pages = atomic_long_read(&user->locked_vm); do { - cur_pages = atomic_long_read(&user->locked_vm); new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); - + } while (!atomic_long_try_cmpxchg(&user->locked_vm, + &cur_pages, new_pages)); return 0; } From ad01b3fe46204a8280f55c9a9c6d35d225a408c2 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Fri, 15 Jul 2022 06:02:52 -0700 Subject: [PATCH 1095/1250] io_uring: fix types in io_recvmsg_multishot_overflow io_recvmsg_multishot_overflow had incorrect types on non x64 system. 
But also it had an unnecessary INT_MAX check, which could just be done by changing the type of the accumulator to int (also simplifying the casts). Reported-by: Stephen Rothwell Fixes: a8b38c4ce724 ("io_uring: support multishot in recvmsg") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20220715130252.610639-1-dylany@fb.com Signed-off-by: Jens Axboe --- io_uring/net.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 616d5f04cc7435..6b7d5f33e642a0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -327,14 +327,14 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg) { - unsigned long hdr; + int hdr; - if (check_add_overflow(sizeof(struct io_uring_recvmsg_out), - (unsigned long)iomsg->namelen, &hdr)) + if (iomsg->namelen < 0) return true; - if (check_add_overflow(hdr, iomsg->controllen, &hdr)) + if (check_add_overflow((int)sizeof(struct io_uring_recvmsg_out), + iomsg->namelen, &hdr)) return true; - if (hdr > INT_MAX) + if (check_add_overflow(hdr, (int)iomsg->controllen, &hdr)) return true; return false; From b8fcaacbf65840d411ec58bee5eabd6e0b514874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Fri, 15 Jul 2022 19:45:01 +0200 Subject: [PATCH 1096/1250] io_uring: Don't require reinitable percpu_ref MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 8bb649ee1da3 ("io_uring: remove ring quiesce for io_uring_register") removed the workflow relying on reinit/resurrection of the percpu_ref, hence, initialization with that requested is a relic. This is based on code review, this causes no real bug (and theoretically can't). Technically it's a revert of commit 214828962dea ("io_uring: initialize percpu refcounters using PERCU_REF_ALLOW_REINIT") but since the flag omission is now justified, I'm not making this a revert.
Fixes: 8bb649ee1da3 ("io_uring: remove ring quiesce for io_uring_register") Signed-off-by: Michal Koutný Acked-by: Roman Gushchin Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f697ca4e8f558c..624535c625652f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -289,7 +289,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->dummy_ubuf->ubuf = -1UL; if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, - PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) + 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; From 7f91d3066ae1b75eb96ed1eef2b5d7d648762da4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Jul 2022 15:54:47 -0600 Subject: [PATCH 1097/1250] net: fix compat pointer in get_compat_msghdr() A previous change enabled external users to copy the data before calling __get_compat_msghdr(), but didn't modify get_compat_msghdr() or __io_compat_recvmsg_copy_hdr() to take that into account. They are both still passing in the __user pointer rather than the copied version. Ensure we pass in the kernel struct, not the pointer to the user data.
Link: https://lore.kernel.org/all/46439555-644d-08a1-7d66-16f8f9a320f0@samsung.com/ Fixes: 1a3e4e94a1b9 ("net: copy from user before calling __get_compat_msghdr") Reported-by: Marek Szyprowski Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- net/compat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 6b7d5f33e642a0..e61efa31c729c6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -398,7 +398,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req, if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg))) return -EFAULT; - ret = __get_compat_msghdr(&iomsg->msg, sr->umsg_compat, &iomsg->uaddr); + ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); if (ret) return ret; diff --git a/net/compat.c b/net/compat.c index 513aa9a3fc6466..ed880729d159bd 100644 --- a/net/compat.c +++ b/net/compat.c @@ -89,7 +89,7 @@ int get_compat_msghdr(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - err = __get_compat_msghdr(kmsg, umsg, save_addr); + err = __get_compat_msghdr(kmsg, &msg, save_addr); if (err) return err; From 6deeac5fc61b1821d23f86a59023179449d281c5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 10:51:44 -0700 Subject: [PATCH 1098/1250] mm: Move starting of background writeback into the main balancing loop We start background writeback if we are over background threshold after exiting the main loop in balance_dirty_pages(). This may result in basing the decision on already stale values (we may have slept for significant amount of time) and it is also inconvenient for refactoring needed for async dirty throttling. Move the check into the main waiting loop. 
Signed-off-by: Jan Kara Signed-off-by: Stefan Roesch Link: https://lore.kernel.org/r/20220623175157.1715274-2-shr@fb.com Signed-off-by: Jens Axboe --- mm/page-writeback.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 55c2776ae6999d..e59c523aed1a2f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1627,6 +1627,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb, } } + /* + * In laptop mode, we wait until hitting the higher threshold + * before starting background writeout, and then write out all + * the way down to the lower threshold. So slow writers cause + * minimal disk activity. + * + * In normal mode, we start background writeout at the lower + * background_thresh, to keep the amount of dirty memory low. + */ + if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh && + !writeback_in_progress(wb)) + wb_start_background_writeback(wb); + /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts @@ -1657,6 +1670,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, break; } + /* Start writeback even when in laptop mode */ if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); @@ -1823,23 +1837,6 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (!dirty_exceeded && wb->dirty_exceeded) wb->dirty_exceeded = 0; - - if (writeback_in_progress(wb)) - return; - - /* - * In laptop mode, we wait until hitting the higher threshold before - * starting background writeout, and then write out all the way down - * to the lower threshold. So slow writers cause minimal disk activity. - * - * In normal mode, we start background writeout at the lower - * background_thresh, to keep the amount of dirty memory low. 
- */ - if (laptop_mode) - return; - - if (nr_reclaimable > gdtc->bg_thresh) - wb_start_background_writeback(wb); } static DEFINE_PER_CPU(int, bdp_ratelimits); From 0762dec683b2d349f326818f3439b81f297cbc64 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 10:51:45 -0700 Subject: [PATCH 1099/1250] mm: Move updates of dirty_exceeded into one place Transition of wb->dirty_exceeded from 0 to 1 happens before we go to sleep in balance_dirty_pages() while transition from 1 to 0 happens when exiting from balance_dirty_pages(), possibly based on old values. This does not make a lot of sense since wb->dirty_exceeded should simply reflect whether wb is over dirty limit and so we should ratelimit entering to balance_dirty_pages() less. Move the two updates together. Signed-off-by: Jan Kara Signed-off-by: Stefan Roesch Link: https://lore.kernel.org/r/20220623175157.1715274-3-shr@fb.com Signed-off-by: Jens Axboe --- mm/page-writeback.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e59c523aed1a2f..90b1998c16a144 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1729,8 +1729,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb, sdtc = mdtc; } - if (dirty_exceeded && !wb->dirty_exceeded) - wb->dirty_exceeded = 1; + if (dirty_exceeded != wb->dirty_exceeded) + wb->dirty_exceeded = dirty_exceeded; if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) @@ -1834,9 +1834,6 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (fatal_signal_pending(current)) break; } - - if (!dirty_exceeded && wb->dirty_exceeded) - wb->dirty_exceeded = 0; } static DEFINE_PER_CPU(int, bdp_ratelimits); From 47f94042b4f6bb9bb154e7fe44fe6c05b172567c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 10:51:46 -0700 Subject: [PATCH 1100/1250] mm: Add balance_dirty_pages_ratelimited_flags() function This adds the helper function 
balance_dirty_pages_ratelimited_flags(). It adds the parameter flags to balance_dirty_pages_ratelimited(). The flags parameter is passed to balance_dirty_pages(). For async buffered writes the flag value will be BDP_ASYNC. If balance_dirty_pages() gets called for async buffered write, we don't want to wait. Instead we need to indicate to the caller that throttling is needed so that it can stop writing and offload the rest of the write to a context that can block. The new helper function is also used by balance_dirty_pages_ratelimited(). Signed-off-by: Jan Kara Signed-off-by: Stefan Roesch Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220623175157.1715274-4-shr@fb.com [axboe: fix kerneltest bot 'ret' issue] Signed-off-by: Jens Axboe --- include/linux/writeback.h | 7 ++++++ mm/page-writeback.c | 51 +++++++++++++++++++++++++++++++-------- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index da21d63f70e285..b8c9610c2313cd 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -364,7 +364,14 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); void wb_update_bandwidth(struct bdi_writeback *wb); + +/* Invoke balance dirty pages in async mode. 
*/ +#define BDP_ASYNC 0x0001 + void balance_dirty_pages_ratelimited(struct address_space *mapping); +int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, + unsigned int flags); + bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 90b1998c16a144..d0d466a5c804ca 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ -static void balance_dirty_pages(struct bdi_writeback *wb, - unsigned long pages_dirtied) +static int balance_dirty_pages(struct bdi_writeback *wb, + unsigned long pages_dirtied, unsigned int flags) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; @@ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, struct backing_dev_info *bdi = wb->bdi; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; + int ret = 0; for (;;) { unsigned long now = jiffies; @@ -1803,6 +1804,10 @@ static void balance_dirty_pages(struct bdi_writeback *wb, period, pause, start_time); + if (flags & BDP_ASYNC) { + ret = -EAGAIN; + break; + } __set_current_state(TASK_KILLABLE); wb->dirty_sleep = now; io_schedule_timeout(pause); @@ -1834,6 +1839,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (fatal_signal_pending(current)) break; } + return ret; } static DEFINE_PER_CPU(int, bdp_ratelimits); @@ -1855,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** - * balance_dirty_pages_ratelimited - balance dirty memory state - * @mapping: address_space which was dirtied + * balance_dirty_pages_ratelimited_flags - Balance 
dirty memory state. + * @mapping: address_space which was dirtied. + * @flags: BDP flags. * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's * dirty state and will initiate writeback if needed. * - * Once we're over the dirty memory limit we decrease the ratelimiting - * by a lot, to prevent individual processes from overshooting the limit - * by (ratelimit_pages) each. + * See balance_dirty_pages_ratelimited() for details. + * + * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to + * indicate that memory is out of balance and the caller must wait + * for I/O to complete. Otherwise, it will return 0 to indicate + * that either memory was already in balance, or it was able to sleep + * until the amount of dirty memory returned to balance. */ -void balance_dirty_pages_ratelimited(struct address_space *mapping) +int balance_dirty_pages_ratelimited_flags(struct address_space *mapping, + unsigned int flags) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; int ratelimit; + int ret = 0; int *p; if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) - return; + return ret; if (inode_cgwb_enabled(inode)) wb = wb_get_create_current(bdi, GFP_KERNEL); @@ -1915,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(wb, current->nr_dirtied); + ret = balance_dirty_pages(wb, current->nr_dirtied, flags); wb_put(wb); + return ret; +} + +/** + * balance_dirty_pages_ratelimited - balance dirty memory state. + * @mapping: address_space which was dirtied. + * + * Processes which are dirtying memory should call in here once for each page + * which was newly dirtied. The function will periodically check the system's + * dirty state and will initiate writeback if needed. 
+ * + * Once we're over the dirty memory limit we decrease the ratelimiting + * by a lot, to prevent individual processes from overshooting the limit + * by (ratelimit_pages) each. + */ +void balance_dirty_pages_ratelimited(struct address_space *mapping) +{ + balance_dirty_pages_ratelimited_flags(mapping, 0); } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); From 8be70c859444ffa601cc3fd3b2ede9a66241b6f9 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:47 -0700 Subject: [PATCH 1101/1250] iomap: Add flags parameter to iomap_page_create() Add the kiocb flags parameter to the function iomap_page_create(). Depending on the value of the flags parameter it enables different gfp flags. No intended functional changes in this patch. Signed-off-by: Stefan Roesch Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220623175157.1715274-5-shr@fb.com Signed-off-by: Jens Axboe --- fs/iomap/buffered-io.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d2a9f699e17ed3..3c97b713f831db 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio) static struct bio_set iomap_ioend_bioset; static struct iomap_page * -iomap_page_create(struct inode *inode, struct folio *folio) +iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) { struct iomap_page *iop = to_iomap_page(folio); unsigned int nr_blocks = i_blocks_per_folio(inode, folio); + gfp_t gfp; if (iop || nr_blocks <= 1) return iop; + if (flags & IOMAP_NOWAIT) + gfp = GFP_NOWAIT; + else + gfp = GFP_NOFS | __GFP_NOFAIL; + iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), - GFP_NOFS | __GFP_NOFAIL); - spin_lock_init(&iop->uptodate_lock); - if (folio_test_uptodate(folio)) - bitmap_fill(iop->uptodate, nr_blocks); - 
folio_attach_private(folio, iop); + gfp); + if (iop) { + spin_lock_init(&iop->uptodate_lock); + if (folio_test_uptodate(folio)) + bitmap_fill(iop->uptodate, nr_blocks); + folio_attach_private(folio, iop); + } return iop; } @@ -226,7 +234,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) - iop = iomap_page_create(iter->inode, folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); else iop = to_iomap_page(folio); @@ -264,7 +272,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(iter->inode, folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; @@ -547,7 +555,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap_page *iop = iomap_page_create(iter->inode, folio); + struct iomap_page *iop; loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); @@ -558,6 +566,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return 0; folio_clear_error(folio); + iop = iomap_page_create(iter->inode, folio, iter->flags); + do { iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); @@ -1329,7 +1339,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, struct folio *folio, u64 end_pos) { - struct iomap_page *iop = iomap_page_create(inode, folio); + struct iomap_page *iop = iomap_page_create(inode, folio, 0); struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = 
i_blocks_per_folio(inode, folio); From 3ba193ca3609db7ef8dc19632c1c306c235521ba Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:48 -0700 Subject: [PATCH 1102/1250] iomap: Add async buffered write support This adds async buffered write support to iomap. This replaces the call to balance_dirty_pages_ratelimited() with the call to balance_dirty_pages_ratelimited_flags. This allows to specify if the write request is async or not. In addition this also moves the above function call to the beginning of the function. If the function call is at the end of the function and the decision is made to throttle writes, then there is no request that io-uring can wait on. By moving it to the beginning of the function, the write request is not issued, but returns -EAGAIN instead. io-uring will punt the request and process it in the io-worker. By moving the function call to the beginning of the function, the write throttling will happen one page later. Signed-off-by: Stefan Roesch Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20220623175157.1715274-6-shr@fb.com Signed-off-by: Jens Axboe --- fs/iomap/buffered-io.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 3c97b713f831db..83cf093fcb9251 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -559,6 +559,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); + unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio); size_t from = offset_in_folio(folio, pos), to = from + len; size_t poff, plen; @@ -567,6 +568,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, folio_clear_error(folio); iop = iomap_page_create(iter->inode, folio, iter->flags); + if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) + return -EAGAIN; do { iomap_adjust_read_range(iter->inode, folio, &block_start, @@ -584,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return -EIO; folio_zero_segments(folio, poff, from, to, poff + plen); } else { - int status = iomap_read_folio_sync(block_start, folio, + int status; + + if (iter->flags & IOMAP_NOWAIT) + return -EAGAIN; + + status = iomap_read_folio_sync(block_start, folio, poff, plen, srcmap); if (status) return status; @@ -613,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; int status = 0; + if (iter->flags & IOMAP_NOWAIT) + fgp |= FGP_NOWAIT; + BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); @@ -632,7 +643,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, folio = 
__filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); if (!folio) { - status = -ENOMEM; + status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; goto out_no_page; } if (pos + len > folio_pos(folio) + folio_size(folio)) @@ -750,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) loff_t pos = iter->pos; ssize_t written = 0; long status = 0; + struct address_space *mapping = iter->inode->i_mapping; + unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; do { struct folio *folio; @@ -762,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) bytes = min_t(unsigned long, PAGE_SIZE - offset, iov_iter_count(i)); again: + status = balance_dirty_pages_ratelimited_flags(mapping, + bdp_flags); + if (unlikely(status)) + break; + if (bytes > length) bytes = length; @@ -770,6 +788,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. + * + * For async buffered writes the assumption is that the user + * page has already been faulted in. This can be optimized by + * faulting the user page. */ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { status = -EFAULT; @@ -781,7 +803,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) break; page = folio_file_page(folio, pos >> PAGE_SHIFT); - if (mapping_writably_mapped(iter->inode->i_mapping)) + if (mapping_writably_mapped(mapping)) flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, bytes, i); @@ -806,8 +828,6 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) pos += status; written += status; length -= status; - - balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (iov_iter_count(i) && length); return written ? 
written : status; @@ -825,6 +845,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, }; int ret; + if (iocb->ki_flags & IOCB_NOWAIT) + iter.flags |= IOMAP_NOWAIT; + while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); if (iter.pos == iocb->ki_pos) From bdd5a24bfc8943f2b9f79774b83b267aef7addb9 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:49 -0700 Subject: [PATCH 1103/1250] iomap: Return -EAGAIN from iomap_write_iter() If iomap_write_iter() encounters -EAGAIN, return -EAGAIN to the caller. Signed-off-by: Stefan Roesch Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220623175157.1715274-7-shr@fb.com Reviewed-by: Christoph Hellwig [axboe: make the suggested ternary edit] Signed-off-by: Jens Axboe --- fs/iomap/buffered-io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 83cf093fcb9251..c681eacc389b64 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -830,6 +830,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) length -= status; } while (iov_iter_count(i) && length); + if (status == -EAGAIN) { + iov_iter_revert(i, written); + return -EAGAIN; + } return written ? written : status; } From 0835057b58e5725ec3ea0be6b525490fa48b9b7d Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:50 -0700 Subject: [PATCH 1104/1250] fs: add a FMODE_BUF_WASYNC flags for f_mode This introduces the flag FMODE_BUF_WASYNC. If devices support async buffered writes, this flag can be set. It also modifies the check in generic_write_checks to take async buffered writes into consideration. Signed-off-by: Stefan Roesch Reviewed-by: Christoph Hellwig Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20220623175157.1715274-8-shr@fb.com Signed-off-by: Jens Axboe --- fs/read_write.c | 4 +++- include/linux/fs.h | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/read_write.c b/fs/read_write.c index e0777eefd84650..319d88825d1cbd 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1660,7 +1660,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) if (iocb->ki_flags & IOCB_APPEND) iocb->ki_pos = i_size_read(inode); - if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_NOWAIT) && + !((iocb->ki_flags & IOCB_DIRECT) || + (file->f_mode & FMODE_BUF_WASYNC))) return -EINVAL; return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae57..bc84847c201ea6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -180,6 +180,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File supports async buffered reads */ #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) +/* File supports async nowait buffered writes */ +#define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000) + /* * Attribute flags. These should be or-ed together to figure out what * has been changed! From b8f85795d96a73c4bc8394e4b09e49db2b790336 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:51 -0700 Subject: [PATCH 1105/1250] fs: add __remove_file_privs() with flags parameter This adds the function __remove_file_privs, which allows the caller to pass the kiocb flags parameter. No intended functional changes in this patch. Signed-off-by: Stefan Roesch Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20220623175157.1715274-9-shr@fb.com Signed-off-by: Jens Axboe --- fs/inode.c | 57 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index bd4da9c5207eab..a2e18379c8a620 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2010,36 +2010,43 @@ static int __remove_privs(struct user_namespace *mnt_userns, return notify_change(mnt_userns, dentry, &newattrs, NULL); } -/* - * Remove special file priviledges (suid, capabilities) when file is written - * to or truncated. - */ -int file_remove_privs(struct file *file) +static int __file_remove_privs(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); + int error; int kill; - int error = 0; - /* - * Fast path for nothing security related. - * As well for non-regular files, e.g. blkdev inodes. - * For example, blkdev_write_iter() might get here - * trying to remove privs which it is not allowed to. - */ if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(dentry); - if (kill < 0) + if (kill <= 0) return kill; - if (kill) - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); + + if (flags & IOCB_NOWAIT) + return -EAGAIN; + + error = __remove_privs(file_mnt_user_ns(file), dentry, kill); if (!error) inode_has_no_xattr(inode); return error; } + +/** + * file_remove_privs - remove special file privileges (suid, capabilities) + * @file: file to remove privileges from + * + * When file is modified by a write or truncation ensure that special + * file privileges are removed. + * + * Return: 0 on success, negative errno on failure. 
+ */ +int file_remove_privs(struct file *file) +{ + return __file_remove_privs(file, 0); +} EXPORT_SYMBOL(file_remove_privs); /** @@ -2090,18 +2097,28 @@ int file_update_time(struct file *file) } EXPORT_SYMBOL(file_update_time); -/* Caller must hold the file's inode lock */ +/** + * file_modified - handle mandated vfs changes when modifying a file + * @file: file that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ int file_modified(struct file *file) { - int err; + int ret; /* * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ - err = file_remove_privs(file); - if (err) - return err; + ret = __file_remove_privs(file, 0); + if (ret) + return ret; if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; From 4efa35cd1dffdfdb807af4efe6bd36a5a9eafcdb Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:52 -0700 Subject: [PATCH 1106/1250] fs: Split off inode_needs_update_time and __file_update_time This splits off the functions inode_needs_update_time() and __file_update_time() from the function file_update_time(). This is required to support async buffered writes. No intended functional changes in this patch. Signed-off-by: Stefan Roesch Reviewed-by: Jan Kara Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20220623175157.1715274-10-shr@fb.com Signed-off-by: Jens Axboe --- fs/inode.c | 76 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index a2e18379c8a620..ff726d99ecc796 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2049,35 +2049,18 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); -/** - * file_update_time - update mtime and ctime time - * @file: file accessed - * - * Update the mtime and ctime members of an inode and mark the inode - * for writeback. Note that this function is meant exclusively for - * usage in the file write path of filesystems, and filesystems may - * choose to explicitly ignore update via this function with the - * S_NOCMTIME inode flag, e.g. for network filesystem where these - * timestamps are handled by the server. This can return an error for - * file systems who need to allocate space in order to update an inode. - */ - -int file_update_time(struct file *file) +static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) { - struct inode *inode = file_inode(file); - struct timespec64 now; int sync_it = 0; - int ret; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; - now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) + if (!timespec64_equal(&inode->i_mtime, now)) sync_it = S_MTIME; - if (!timespec64_equal(&inode->i_ctime, &now)) + if (!timespec64_equal(&inode->i_ctime, now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) @@ -2086,15 +2069,50 @@ int file_update_time(struct file *file) if (!sync_it) return 0; - /* Finally allowed to write? Takes lock. 
*/ - if (__mnt_want_write_file(file)) - return 0; + return sync_it; +} + +static int __file_update_time(struct file *file, struct timespec64 *now, + int sync_mode) +{ + int ret = 0; + struct inode *inode = file_inode(file); - ret = inode_update_time(inode, &now, sync_it); - __mnt_drop_write_file(file); + /* try to update time settings */ + if (!__mnt_want_write_file(file)) { + ret = inode_update_time(inode, now, sync_mode); + __mnt_drop_write_file(file); + } return ret; } + +/** + * file_update_time - update mtime and ctime time + * @file: file accessed + * + * Update the mtime and ctime members of an inode and mark the inode for + * writeback. Note that this function is meant exclusively for usage in + * the file write path of filesystems, and filesystems may choose to + * explicitly ignore updates via this function with the _NOCMTIME inode + * flag, e.g. for network filesystem where these imestamps are handled + * by the server. This can return an error for file systems who need to + * allocate space in order to update an inode. + * + * Return: 0 on success, negative errno on failure. + */ +int file_update_time(struct file *file) +{ + int ret; + struct inode *inode = file_inode(file); + struct timespec64 now = current_time(inode); + + ret = inode_needs_update_time(inode, &now); + if (ret <= 0) + return ret; + + return __file_update_time(file, &now, ret); +} EXPORT_SYMBOL(file_update_time); /** @@ -2111,6 +2129,8 @@ EXPORT_SYMBOL(file_update_time); int file_modified(struct file *file) { int ret; + struct inode *inode = file_inode(file); + struct timespec64 now = current_time(inode); /* * Clear the security bits if the process is not being run by root. 
@@ -2123,7 +2143,11 @@ int file_modified(struct file *file) if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; - return file_update_time(file); + ret = inode_needs_update_time(inode, &now); + if (ret <= 0) + return ret; + + return __file_update_time(file, &now, ret); } EXPORT_SYMBOL(file_modified); From 76c6a3e1db304aeb6a53c93232b878c9efb3d552 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:53 -0700 Subject: [PATCH 1107/1250] fs: Add async write file modification handling. This adds a file_modified_async() function to return -EAGAIN if the request either requires to remove privileges or needs to update the file modification time. This is required for async buffered writes, so the request gets handled in the io worker of io-uring. Signed-off-by: Stefan Roesch Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Christian Brauner (Microsoft) Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220623175157.1715274-11-shr@fb.com Signed-off-by: Jens Axboe --- fs/inode.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- include/linux/fs.h | 1 + 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index ff726d99ecc796..259ebf43889399 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2116,17 +2116,21 @@ int file_update_time(struct file *file) EXPORT_SYMBOL(file_update_time); /** - * file_modified - handle mandated vfs changes when modifying a file + * file_modified_flags - handle mandated vfs changes when modifying a file * @file: file that was modified + * @flags: kiocb flags * * When file has been modified ensure that special * file privileges are removed and time settings are updated. * + * If IOCB_NOWAIT is set, special file privileges will not be removed and + * time settings will not be updated. It will return -EAGAIN. + * * Context: Caller must hold the file's inode lock. * * Return: 0 on success, negative errno on failure. 
*/ -int file_modified(struct file *file) +static int file_modified_flags(struct file *file, int flags) { int ret; struct inode *inode = file_inode(file); @@ -2136,7 +2140,7 @@ int file_modified(struct file *file) * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. */ - ret = __file_remove_privs(file, 0); + ret = __file_remove_privs(file, flags); if (ret) return ret; @@ -2146,11 +2150,46 @@ int file_modified(struct file *file) ret = inode_needs_update_time(inode, &now); if (ret <= 0) return ret; + if (flags & IOCB_NOWAIT) + return -EAGAIN; return __file_update_time(file, &now, ret); } + +/** + * file_modified - handle mandated vfs changes when modifying a file + * @file: file that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. + */ +int file_modified(struct file *file) +{ + return file_modified_flags(file, 0); +} EXPORT_SYMBOL(file_modified); +/** + * kiocb_modified - handle mandated vfs changes when modifying a file + * @iocb: iocb that was modified + * + * When file has been modified ensure that special + * file privileges are removed and time settings are updated. + * + * Context: Caller must hold the file's inode lock. + * + * Return: 0 on success, negative errno on failure. 
+ */ +int kiocb_modified(struct kiocb *iocb) +{ + return file_modified_flags(iocb->ki_filp, iocb->ki_flags); +} +EXPORT_SYMBOL_GPL(kiocb_modified); + int inode_needs_sync(struct inode *inode) { if (IS_SYNC(inode)) diff --git a/include/linux/fs.h b/include/linux/fs.h index bc84847c201ea6..c0d99b5a166bd7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2390,6 +2390,7 @@ static inline void file_accessed(struct file *file) } extern int file_modified(struct file *file); +int kiocb_modified(struct kiocb *iocb); int sync_inode_metadata(struct inode *inode, int wait); From b1db7826bd45ea877add80f600bf8494dc81da8f Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 16 Jun 2022 14:22:18 -0700 Subject: [PATCH 1108/1250] io_uring: Add support for async buffered writes This enables the async buffered writes for the filesystems that support async buffered writes in io-uring. Buffered writes are enabled for blocks that are already in the page cache or can be acquired with noio. Signed-off-by: Stefan Roesch Link: https://lore.kernel.org/r/20220616212221.2024518-12-shr@fb.com [axboe: adapt to 5.20 branch] Signed-off-by: Jens Axboe --- io_uring/rw.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index ade3e235f2770c..d6202a02c67cdf 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -641,7 +641,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) return -EINVAL; } -static bool need_read_all(struct io_kiocb *req) +static bool need_complete_io(struct io_kiocb *req) { return req->flags & REQ_F_ISREG || S_ISBLK(file_inode(req->file)->i_mode); @@ -780,7 +780,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) kfree(iovec); return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { + (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { /* read all, failed, 
already did sync or don't want to retry */ goto done; } @@ -875,9 +875,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!io_file_supports_nowait(req))) goto copy_iov; - /* file path doesn't support NOWAIT for non-direct_IO */ - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && - (req->flags & REQ_F_ISREG)) + /* File path supports NOWAIT for non-direct_IO only for block devices. */ + if (!(kiocb->ki_flags & IOCB_DIRECT) && + !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) && + (req->flags & REQ_F_ISREG)) goto copy_iov; kiocb->ki_flags |= IOCB_NOWAIT; @@ -933,6 +934,24 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) /* IOPOLL retry should happen for io-wq threads */ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; + + if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { + struct io_async_rw *rw; + + /* This is a partial write. The file pos has already been + * updated, setup the async struct to complete the request + * in the worker. Also update bytes_done to account for + * the bytes already written. + */ + iov_iter_save_state(&s->iter, &s->iter_state); + ret = io_setup_async_rw(req, iovec, s, true); + + rw = req->async_data; + if (rw) + rw->bytes_done += ret2; + + return ret ? ret : -EAGAIN; + } done: ret = kiocb_done(req, ret2, issue_flags); } else { From b51da82f9a51130ecd7fc395afa950e068796c7a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Jun 2022 10:24:45 -0600 Subject: [PATCH 1109/1250] io_uring: fix issue with io_write() not always undoing sb_start_write() This is actually an older issue, but we never used to hit the -EAGAIN path before having done sb_start_write(). Make sure that we always call kiocb_end_write() if we need to retry the write, so that we keep the calls to sb_start_write() etc balanced. 
Signed-off-by: Jens Axboe --- io_uring/rw.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index d6202a02c67cdf..a4c7c74a449696 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -950,6 +950,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (rw) rw->bytes_done += ret2; + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); return ret ? ret : -EAGAIN; } done: @@ -958,7 +960,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) copy_iov: iov_iter_restore(&s->iter, &s->iter_state); ret = io_setup_async_rw(req, iovec, s, false); - return ret ?: -EAGAIN; + if (!ret) { + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); + return -EAGAIN; + } + return ret; } /* it's reportedly faster than delegating the null check to kfree() */ if (iovec) From 9d45302553f0700528793ccc8aa4a1e81699a275 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 16 Jun 2022 14:22:19 -0700 Subject: [PATCH 1110/1250] io_uring: Add tracepoint for short writes This adds the io_uring_short_write tracepoint to io_uring. A short write is issued if not all pages that are required for a write are in the page cache and the async buffered writes have to return EAGAIN. 
Signed-off-by: Stefan Roesch Link: https://lore.kernel.org/r/20220616212221.2024518-13-shr@fb.com Signed-off-by: Jens Axboe --- include/trace/events/io_uring.h | 25 +++++++++++++++++++++++++ io_uring/rw.c | 3 +++ 2 files changed, 28 insertions(+) diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h index 95a8cfaad15a03..c5b21ff0ac8595 100644 --- a/include/trace/events/io_uring.h +++ b/include/trace/events/io_uring.h @@ -630,6 +630,31 @@ TRACE_EVENT(io_uring_task_work_run, __entry->tctx, __entry->count, __entry->loops) ); +TRACE_EVENT(io_uring_short_write, + + TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got), + + TP_ARGS(ctx, fpos, wanted, got), + + TP_STRUCT__entry( + __field(void *, ctx) + __field(u64, fpos) + __field(u64, wanted) + __field(u64, got) + ), + + TP_fast_assign( + __entry->ctx = ctx; + __entry->fpos = fpos; + __entry->wanted = wanted; + __entry->got = got; + ), + + TP_printk("ring %p, fpos %lld, wanted %lld, got %lld", + __entry->ctx, __entry->fpos, + __entry->wanted, __entry->got) +); + #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ diff --git a/io_uring/rw.c b/io_uring/rw.c index a4c7c74a449696..ca0d3a72364a4f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -938,6 +938,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { struct io_async_rw *rw; + trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, + req->cqe.res, ret2); + /* This is a partial write. The file pos has already been * updated, setup the async struct to complete the request * in the worker. Also update bytes_done to account for From 572d1a4fefd91cd2ab7efea65bb7ff85dc52925a Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:56 -0700 Subject: [PATCH 1111/1250] xfs: Specify lockmode when calling xfs_ilock_for_iomap() This patch changes the helper function xfs_ilock_for_iomap such that the lock mode must be passed in. 
Signed-off-by: Stefan Roesch Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Link: https://lore.kernel.org/r/20220623175157.1715274-14-shr@fb.com Signed-off-by: Jens Axboe --- fs/xfs/xfs_iomap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 5a393259a3a38b..bcf7c3694290d8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -664,7 +664,7 @@ xfs_ilock_for_iomap( unsigned flags, unsigned *lockmode) { - unsigned mode = XFS_ILOCK_SHARED; + unsigned int mode = *lockmode; bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); /* @@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin( int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); @@ -1172,7 +1172,7 @@ xfs_read_iomap_begin( xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); int nimaps = 1, error = 0; bool shared = false; - unsigned lockmode; + unsigned int lockmode = XFS_ILOCK_SHARED; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); From 5df44218471cca5878a372cd325e84bf87d3620b Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Thu, 23 Jun 2022 10:51:57 -0700 Subject: [PATCH 1112/1250] xfs: Add async buffered write support This adds the async buffered write support to XFS. For async buffered write requests, the request will return -EAGAIN if the ilock cannot be obtained immediately. Signed-off-by: Stefan Roesch Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. 
Wong Link: https://lore.kernel.org/r/20220623175157.1715274-15-shr@fb.com Signed-off-by: Jens Axboe --- fs/xfs/xfs_file.c | 11 +++++------ fs/xfs/xfs_iomap.c | 5 ++++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5a171c0b244b7c..8d9b14d2b912ba 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -410,7 +410,7 @@ xfs_file_write_checks( spin_unlock(&ip->i_flags_lock); out: - return file_modified(file); + return kiocb_modified(iocb); } static int @@ -700,12 +700,11 @@ xfs_file_buffered_write( bool cleared_space = false; unsigned int iolock; - if (iocb->ki_flags & IOCB_NOWAIT) - return -EOPNOTSUPP; - write_retry: iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + return ret; ret = xfs_file_write_checks(iocb, from, &iolock); if (ret) @@ -1165,7 +1164,7 @@ xfs_file_open( { if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; - file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC; return generic_file_open(inode, file); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index bcf7c3694290d8..5d50fed291b45b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin( bool eof = false, cow_eof = false, shared = false; int allocfork = XFS_DATA_FORK; int error = 0; + unsigned int lockmode = XFS_ILOCK_EXCL; if (xfs_is_shutdown(mp)) return -EIO; @@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin( ASSERT(!XFS_IS_REALTIME_INODE(ip)); - xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { From 7864cd41a830f0d8524006c59a4cbdd28544de08 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 1 Jul 2022 14:04:43 -0600 Subject: [PATCH 1113/1250] mm: honor FGP_NOWAIT for page cache page 
allocation If we're creating a page cache page with FGP_CREAT but FGP_NOWAIT is set, we should dial back the gfp flags to avoid frivolous blocking which is trivial to hit in low memory conditions: [ 10.117661] __schedule+0x8c/0x550 [ 10.118305] schedule+0x58/0xa0 [ 10.118897] schedule_timeout+0x30/0xdc [ 10.119610] __wait_for_common+0x88/0x114 [ 10.120348] wait_for_completion+0x1c/0x24 [ 10.121103] __flush_work.isra.0+0x16c/0x19c [ 10.121896] flush_work+0xc/0x14 [ 10.122496] __drain_all_pages+0x144/0x218 [ 10.123267] drain_all_pages+0x10/0x18 [ 10.123941] __alloc_pages+0x464/0x9e4 [ 10.124633] __folio_alloc+0x18/0x3c [ 10.125294] __filemap_get_folio+0x17c/0x204 [ 10.126084] iomap_write_begin+0xf8/0x428 [ 10.126829] iomap_file_buffered_write+0x144/0x24c [ 10.127710] xfs_file_buffered_write+0xe8/0x248 [ 10.128553] xfs_file_write_iter+0xa8/0x120 [ 10.129324] io_write+0x16c/0x38c [ 10.129940] io_issue_sqe+0x70/0x1cc [ 10.130617] io_queue_sqe+0x18/0xfc [ 10.131277] io_submit_sqes+0x5d4/0x600 [ 10.131946] __arm64_sys_io_uring_enter+0x224/0x600 [ 10.132752] invoke_syscall.constprop.0+0x70/0xc0 [ 10.133616] do_el0_svc+0xd0/0x118 [ 10.134238] el0_svc+0x78/0xa0 Clear IO, FS, and reclaim flags and mark the allocation as GFP_NOWAIT and add __GFP_NOWARN to avoid polluting dmesg with pointless allocations failures. A caller with FGP_NOWAIT must be expected to handle the resulting -EAGAIN return and retry from a suitable context without NOWAIT set. 
Reviewed-by: Shakeel Butt Signed-off-by: Jens Axboe --- mm/filemap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index ffdfbc8b0e3cab..254931a6e3eda5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1988,6 +1988,10 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, gfp |= __GFP_WRITE; if (fgp_flags & FGP_NOFS) gfp &= ~__GFP_FS; + if (fgp_flags & FGP_NOWAIT) { + gfp &= ~GFP_KERNEL; + gfp |= GFP_NOWAIT | __GFP_NOWARN; + } folio = filemap_alloc_folio(gfp, 0); if (!folio) From 7ebff2d792fad329bd629b3833aa08603ed62321 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:36 +0100 Subject: [PATCH 1114/1250] io_uring: initialise msghdr::msg_ubuf Initialise newly added ->msg_ubuf in io_recv() and io_send(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b8f9f263875a4a36e7f26cc5d55ebe315308f57d.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index e61efa31c729c6..bbc9c603641a5f 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -294,6 +294,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; + msg.msg_ubuf = NULL; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -783,6 +784,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) msg.msg_flags = 0; msg.msg_controllen = 0; msg.msg_iocb = NULL; + msg.msg_ubuf = NULL; flags = sr->msg_flags; if (force_nonblock) From 52139e4cef096dc69c5ae0cc19cf548b5c22714b Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:37 +0100 Subject: [PATCH 1115/1250] io_uring: export io_put_task() Make io_put_task() available to non-core parts of io_uring, we'll need it for notification infrastructure. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3686807d4c03b72e389947b0e8692d4d44334ef0.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 25 +++++++++++++++++++++++++ io_uring/io_uring.c | 11 +---------- io_uring/io_uring.h | 10 ++++++++++ io_uring/tctx.h | 26 -------------------------- 4 files changed, 36 insertions(+), 36 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d54b8b7e074629..368c34d14b1364 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -4,6 +4,7 @@ #include #include #include +#include #include struct io_wq_work_node { @@ -43,6 +44,30 @@ struct io_hash_table { unsigned hash_bits; }; +/* + * Arbitrary limit, can be raised if need be + */ +#define IO_RINGFD_REG_MAX 16 + +struct io_uring_task { + /* submission side */ + int cached_refs; + const struct io_ring_ctx *last; + struct io_wq *io_wq; + struct file *registered_rings[IO_RINGFD_REG_MAX]; + + struct xarray xa; + struct wait_queue_head wait; + atomic_t in_idle; + atomic_t inflight_tracked; + struct percpu_counter inflight; + + struct { /* task_work */ + struct llist_head task_list; + struct callback_head task_work; + } ____cacheline_aligned_in_smp; +}; + struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 624535c625652f..ba93f280b66b3b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -605,7 +605,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) return ret; } -static void __io_put_task(struct task_struct *task, int nr) +void __io_put_task(struct task_struct *task, int nr) { struct io_uring_task *tctx = task->io_uring; @@ -615,15 +615,6 @@ static void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } -/* must to be called somewhat shortly after putting a request */ -static inline void 
io_put_task(struct task_struct *task, int nr) -{ - if (likely(task == current)) - task->io_uring->cached_refs += nr; - else - __io_put_task(task, nr); -} - static void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 868f45d55543bc..2379d9e70c101e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -66,6 +66,7 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); +void __io_put_task(struct task_struct *task, int nr); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); @@ -253,4 +254,13 @@ static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) __io_commit_cqring_flush(ctx); } +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) +{ + if (likely(task == current)) + task->io_uring->cached_refs += nr; + else + __io_put_task(task, nr); +} + #endif diff --git a/io_uring/tctx.h b/io_uring/tctx.h index 8a33ff6e5d9138..25974beed4d6be 100644 --- a/io_uring/tctx.h +++ b/io_uring/tctx.h @@ -1,31 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include - -/* - * Arbitrary limit, can be raised if need be - */ -#define IO_RINGFD_REG_MAX 16 - -struct io_uring_task { - /* submission side */ - int cached_refs; - const struct io_ring_ctx *last; - struct io_wq *io_wq; - struct file *registered_rings[IO_RINGFD_REG_MAX]; - - struct xarray xa; - struct wait_queue_head wait; - atomic_t in_idle; - atomic_t inflight_tracked; - struct percpu_counter inflight; - - struct { /* task_work */ - struct llist_head task_list; - struct callback_head task_work; - } ____cacheline_aligned_in_smp; -}; - struct io_tctx_node { struct list_head ctx_node; struct task_struct *task; From 8cdee2a5b90ae5fff6799306f1df5afd5a2d7342 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov 
Date: Tue, 12 Jul 2022 21:52:38 +0100 Subject: [PATCH 1116/1250] io_uring: add zc notification infrastructure Add internal part of send zerocopy notifications. There are two main structures, the first one is struct io_notif, which carries a struct ubuf_info inside and maps 1:1 to it. io_uring will be binding a number of zerocopy send requests to it and ask to complete (aka flush) it. When flushed and all attached requests and skbs complete, it'll generate one and only one CQE. Notifiers are intended to be passed into the network layer as struct msghdr::msg_ubuf. The second concept is notification slots. The userspace will be able to register an array of slots and subsequently address them by the index in the array. Slots are independent of each other. Each slot can have only one notifier at a time (called active notifier) but many notifiers during the lifetime. When active, a notifier is not going to post any completion but the userspace can attach requests to it by specifying the corresponding slot while issuing send zc requests. Eventually, the userspace will want to "flush" the notifier losing any way to attach new requests to it, however it can use the next automatically added notifier of this slot or of any other slot. When the network layer is done with all enqueued skbs attached to a notifier and no longer needs the user data they reference, the flushed notifier will post a CQE.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3ecf54c31a85762bf679b0a432c9f43ecf7e61cc.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ++ io_uring/Makefile | 2 +- io_uring/io_uring.c | 8 ++- io_uring/io_uring.h | 2 + io_uring/notif.c | 102 +++++++++++++++++++++++++++++++++ io_uring/notif.h | 64 +++++++++++++++++++++ 6 files changed, 179 insertions(+), 4 deletions(-) create mode 100644 io_uring/notif.c create mode 100644 io_uring/notif.h diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 368c34d14b1364..f7fab3758cb9bb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -34,6 +34,9 @@ struct io_file_table { unsigned int alloc_hint; }; +struct io_notif; +struct io_notif_slot; + struct io_hash_bucket { spinlock_t lock; struct hlist_head list; @@ -237,6 +240,8 @@ struct io_ring_ctx { unsigned nr_user_files; unsigned nr_user_bufs; struct io_mapped_ubuf **user_bufs; + struct io_notif_slot *notif_slots; + unsigned nr_notif_slots; struct io_submit_state submit_state; diff --git a/io_uring/Makefile b/io_uring/Makefile index 466639c289be7f..8cc8e5387a75e5 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ openclose.o uring_cmd.o epoll.o \ statx.o net.o msg_ring.o timeout.o \ sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o opdef.o + cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o obj-$(CONFIG_IO_WQ) += io-wq.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ba93f280b66b3b..33d6a7b2fcd924 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -90,6 +90,7 @@ #include "rsrc.h" #include "cancel.h" #include "net.h" +#include "notif.h" #include "timeout.h" #include "poll.h" @@ -729,9 +730,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx) return &rings->cqes[off]; } -static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, - 
u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow) { struct io_uring_cqe *cqe; @@ -2488,6 +2488,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) } #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); + WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); @@ -2664,6 +2665,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_unregister_personality(ctx, index); if (ctx->rings) io_poll_remove_all(ctx, NULL, true); + io_notif_unregister(ctx); mutex_unlock(&ctx->uring_lock); /* failed during ring init, it couldn't have issued any requests */ diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2379d9e70c101e..b8c858727dc8e6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -33,6 +33,8 @@ void io_req_complete_post(struct io_kiocb *req); void __io_req_complete_post(struct io_kiocb *req); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, bool allow_overflow); +bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, + bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/notif.c b/io_uring/notif.c new file mode 100644 index 00000000000000..6ee948af6a4967 --- /dev/null +++ b/io_uring/notif.c @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include +#include + +#include "io_uring.h" +#include "notif.h" + +static void __io_notif_complete_tw(struct callback_head *cb) +{ + struct io_notif *notif = container_of(cb, struct io_notif, task_work); + struct io_ring_ctx *ctx = notif->ctx; + + io_cq_lock(ctx); + io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true); + io_cq_unlock_post(ctx); + + percpu_ref_put(&ctx->refs); + kfree(notif); +} + +static inline void 
io_notif_complete(struct io_notif *notif) +{ + __io_notif_complete_tw(&notif->task_work); +} + +static void io_notif_complete_wq(struct work_struct *work) +{ + struct io_notif *notif = container_of(work, struct io_notif, commit_work); + + io_notif_complete(notif); +} + +static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, + struct ubuf_info *uarg, + bool success) +{ + struct io_notif *notif = container_of(uarg, struct io_notif, uarg); + + if (!refcount_dec_and_test(&uarg->refcnt)) + return; + INIT_WORK(&notif->commit_work, io_notif_complete_wq); + queue_work(system_unbound_wq, &notif->commit_work); +} + +struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, + struct io_notif_slot *slot) + __must_hold(&ctx->uring_lock) +{ + struct io_notif *notif; + + notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT); + if (!notif) + return NULL; + + notif->seq = slot->seq++; + notif->tag = slot->tag; + notif->ctx = ctx; + notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + notif->uarg.callback = io_uring_tx_zerocopy_callback; + /* master ref owned by io_notif_slot, will be dropped on flush */ + refcount_set(&notif->uarg.refcnt, 1); + percpu_ref_get(&ctx->refs); + return notif; +} + +static void io_notif_slot_flush(struct io_notif_slot *slot) + __must_hold(&ctx->uring_lock) +{ + struct io_notif *notif = slot->notif; + + slot->notif = NULL; + + if (WARN_ON_ONCE(in_interrupt())) + return; + /* drop slot's master ref */ + if (refcount_dec_and_test(&notif->uarg.refcnt)) + io_notif_complete(notif); +} + +__cold int io_notif_unregister(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + int i; + + if (!ctx->notif_slots) + return -ENXIO; + + for (i = 0; i < ctx->nr_notif_slots; i++) { + struct io_notif_slot *slot = &ctx->notif_slots[i]; + + if (slot->notif) + io_notif_slot_flush(slot); + } + + kvfree(ctx->notif_slots); + ctx->notif_slots = NULL; + ctx->nr_notif_slots = 0; + return 0; +} \ No newline at end of file diff --git a/io_uring/notif.h
b/io_uring/notif.h new file mode 100644 index 00000000000000..3d7a1d242e1708 --- /dev/null +++ b/io_uring/notif.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +struct io_notif { + struct ubuf_info uarg; + struct io_ring_ctx *ctx; + + /* cqe->user_data, io_notif_slot::tag if not overridden */ + u64 tag; + /* see struct io_notif_slot::seq */ + u32 seq; + + union { + struct callback_head task_work; + struct work_struct commit_work; + }; +}; + +struct io_notif_slot { + /* + * Current/active notifier. A slot holds only one active notifier at a + * time and keeps one reference to it. Flush releases the reference and + * lazily replaces it with a new notifier. + */ + struct io_notif *notif; + + /* + * Default ->user_data for this slot notifiers CQEs + */ + u64 tag; + /* + * Notifiers of a slot live in generations, we create a new notifier + * only after flushing the previous one. Track the sequential number + * for all notifiers and copy it into notifiers's cqe->cflags + */ + u32 seq; +}; + +int io_notif_unregister(struct io_ring_ctx *ctx); + +struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, + struct io_notif_slot *slot); + +static inline struct io_notif *io_get_notif(struct io_ring_ctx *ctx, + struct io_notif_slot *slot) +{ + if (!slot->notif) + slot->notif = io_alloc_notif(ctx, slot); + return slot->notif; +} + +static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, + int idx) + __must_hold(&ctx->uring_lock) +{ + if (idx >= ctx->nr_notif_slots) + return NULL; + idx = array_index_nospec(idx, ctx->nr_notif_slots); + return &ctx->notif_slots[idx]; +} From d51710c316ac9ded13b4ecb2a40ff2831c2e146c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:39 +0100 Subject: [PATCH 1117/1250] io_uring: cache struct io_notif kmalloc'ing struct io_notif is too expensive when done frequently, cache them as many other resources in io_uring. 
Keep two list, the first one is from where we're getting notifiers, it's protected by ->uring_lock. The second is protected by ->completion_lock, to which we queue released notifiers. Then we splice one list into another when needed. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9dec18f7fcbab9f4bd40b96e5ae158b119945230.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 +++++ io_uring/io_uring.c | 3 ++ io_uring/notif.c | 57 +++++++++++++++++++++++++++++----- io_uring/notif.h | 5 +++ 4 files changed, 65 insertions(+), 7 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index f7fab3758cb9bb..144493cbadb56f 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -249,6 +249,9 @@ struct io_ring_ctx { struct xarray io_bl_xa; struct list_head io_buffers_cache; + /* struct io_notif cache, protected by uring_lock */ + struct list_head notif_list; + struct io_hash_table cancel_table_locked; struct list_head cq_overflow_list; struct io_alloc_cache apoll_cache; @@ -259,6 +262,10 @@ struct io_ring_ctx { struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; + /* struct io_notif cache protected by completion_lock */ + struct list_head notif_list_locked; + unsigned int notif_locked_nr; + const struct cred *sq_creds; /* cred used for __io_sq_thread() */ struct io_sq_data *sq_data; /* if using sq thread polling */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 33d6a7b2fcd924..43d77dd43e969c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -321,6 +321,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); INIT_WQ_LIST(&ctx->submit_state.compl_reqs); + INIT_LIST_HEAD(&ctx->notif_list); + INIT_LIST_HEAD(&ctx->notif_list_locked); return ctx; err: kfree(ctx->dummy_ubuf); @@ -2490,6 
+2492,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots); + io_notif_cache_purge(ctx); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); diff --git a/io_uring/notif.c b/io_uring/notif.c index 6ee948af6a4967..b257db2120b4f1 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -15,10 +15,12 @@ static void __io_notif_complete_tw(struct callback_head *cb) io_cq_lock(ctx); io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true); + + list_add(&notif->cache_node, &ctx->notif_list_locked); + ctx->notif_locked_nr++; io_cq_unlock_post(ctx); percpu_ref_put(&ctx->refs); - kfree(notif); } static inline void io_notif_complete(struct io_notif *notif) @@ -45,21 +47,62 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, queue_work(system_unbound_wq, &notif->commit_work); } +static void io_notif_splice_cached(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + spin_lock(&ctx->completion_lock); + list_splice_init(&ctx->notif_list_locked, &ctx->notif_list); + ctx->notif_locked_nr = 0; + spin_unlock(&ctx->completion_lock); +} + +void io_notif_cache_purge(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + io_notif_splice_cached(ctx); + + while (!list_empty(&ctx->notif_list)) { + struct io_notif *notif = list_first_entry(&ctx->notif_list, + struct io_notif, cache_node); + + list_del(&notif->cache_node); + kfree(notif); + } +} + +static inline bool io_notif_has_cached(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + if (likely(!list_empty(&ctx->notif_list))) + return true; + if (data_race(READ_ONCE(ctx->notif_locked_nr) <= IO_NOTIF_SPLICE_BATCH)) + return false; + io_notif_splice_cached(ctx); + return !list_empty(&ctx->notif_list); +} + struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot) __must_hold(&ctx->uring_lock) { struct io_notif *notif; - notif = kzalloc(sizeof(*notif), GFP_ATOMIC |
__GFP_ACCOUNT); - if (!notif) - return NULL; + if (likely(io_notif_has_cached(ctx))) { + notif = list_first_entry(&ctx->notif_list, + struct io_notif, cache_node); + list_del(&notif->cache_node); + } else { + notif = kzalloc(sizeof(*notif), GFP_ATOMIC | __GFP_ACCOUNT); + if (!notif) + return NULL; + /* pre-initialise some fields */ + notif->ctx = ctx; + notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + notif->uarg.callback = io_uring_tx_zerocopy_callback; + } notif->seq = slot->seq++; notif->tag = slot->tag; - notif->ctx = ctx; - notif->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; - notif->uarg.callback = io_uring_tx_zerocopy_callback; /* master ref owned by io_notif_slot, will be dropped on flush */ refcount_set(&notif->uarg.refcnt, 1); percpu_ref_get(&ctx->refs); diff --git a/io_uring/notif.h b/io_uring/notif.h index 3d7a1d242e1708..b23c9c0515bb2d 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -5,6 +5,8 @@ #include #include +#define IO_NOTIF_SPLICE_BATCH 32 + struct io_notif { struct ubuf_info uarg; struct io_ring_ctx *ctx; @@ -13,6 +15,8 @@ struct io_notif { u64 tag; /* see struct io_notif_slot::seq */ u32 seq; + /* hook into ctx->notif_list and ctx->notif_list_locked */ + struct list_head cache_node; union { struct callback_head task_work; @@ -41,6 +45,7 @@ struct io_notif_slot { }; int io_notif_unregister(struct io_ring_ctx *ctx); +void io_notif_cache_purge(struct io_ring_ctx *ctx); struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot); From 5207a91439bedc35d563541ddc9069428527fc12 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:40 +0100 Subject: [PATCH 1118/1250] io_uring: complete notifiers in tw We need a task context to post CQEs but using wq is too expensive. Try to complete notifiers using task_work and fall back to wq if fails.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/089799ab665b10b78fdc614ae6d59fa7ef0d5f91.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 22 +++++++++++++++++++--- io_uring/notif.h | 3 +++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index b257db2120b4f1..aec74f88fc33b5 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -13,6 +13,11 @@ static void __io_notif_complete_tw(struct callback_head *cb) struct io_notif *notif = container_of(cb, struct io_notif, task_work); struct io_ring_ctx *ctx = notif->ctx; + if (likely(notif->task)) { + io_put_task(notif->task, 1); + notif->task = NULL; + } + io_cq_lock(ctx); io_fill_cqe_aux(ctx, notif->tag, 0, notif->seq, true); @@ -43,6 +48,14 @@ static void io_uring_tx_zerocopy_callback(struct sk_buff *skb, if (!refcount_dec_and_test(&uarg->refcnt)) return; + + if (likely(notif->task)) { + init_task_work(&notif->task_work, __io_notif_complete_tw); + if (likely(!task_work_add(notif->task, &notif->task_work, + TWA_SIGNAL))) + return; + } + INIT_WORK(&notif->commit_work, io_notif_complete_wq); queue_work(system_unbound_wq, &notif->commit_work); } @@ -134,12 +147,15 @@ __cold int io_notif_unregister(struct io_ring_ctx *ctx) for (i = 0; i < ctx->nr_notif_slots; i++) { struct io_notif_slot *slot = &ctx->notif_slots[i]; - if (slot->notif) - io_notif_slot_flush(slot); + if (!slot->notif) + continue; + if (WARN_ON_ONCE(slot->notif->task)) + slot->notif->task = NULL; + io_notif_slot_flush(slot); } kvfree(ctx->notif_slots); ctx->notif_slots = NULL; ctx->nr_notif_slots = 0; return 0; -} \ No newline at end of file +} diff --git a/io_uring/notif.h b/io_uring/notif.h index b23c9c0515bb2d..23ca7620fff9a3 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -11,6 +11,9 @@ struct io_notif { struct ubuf_info uarg; struct io_ring_ctx *ctx; + /* complete via tw if ->task is non-NULL, fallback to wq otherwise */ + struct task_struct *task; + /*
cqe->user_data, io_notif_slot::tag if not overridden */ u64 tag; /* see struct io_notif_slot::seq */ From 4e96038be44c25fa26fcc3ae9d5ccf7558bbf104 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:41 +0100 Subject: [PATCH 1119/1250] io_uring: add rsrc referencing for notifiers In preparation to zerocopy sends with fixed buffers make notifiers to reference the rsrc node to protect the used fixed buffers. We can't just grab it for a send request as notifiers can likely outlive requests that used it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3cd7a01d26837945b6982fa9cf15a63230f2ed4f.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 5 +++++ io_uring/notif.h | 1 + io_uring/rsrc.h | 12 +++++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index aec74f88fc33b5..0a2e98bd74f692 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -7,10 +7,12 @@ #include "io_uring.h" #include "notif.h" +#include "rsrc.h" static void __io_notif_complete_tw(struct callback_head *cb) { struct io_notif *notif = container_of(cb, struct io_notif, task_work); + struct io_rsrc_node *rsrc_node = notif->rsrc_node; struct io_ring_ctx *ctx = notif->ctx; if (likely(notif->task)) { @@ -25,6 +27,7 @@ static void __io_notif_complete_tw(struct callback_head *cb) ctx->notif_locked_nr++; io_cq_unlock_post(ctx); + io_rsrc_put_node(rsrc_node, 1); percpu_ref_put(&ctx->refs); } @@ -119,6 +122,8 @@ struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, /* master ref owned by io_notif_slot, will be dropped on flush */ refcount_set(&notif->uarg.refcnt, 1); percpu_ref_get(&ctx->refs); + notif->rsrc_node = ctx->rsrc_node; + io_charge_rsrc_node(ctx); return notif; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 23ca7620fff9a3..1dd48efb774460 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -10,6 +10,7 @@ struct io_notif { struct ubuf_info uarg; struct io_ring_ctx
*ctx; + struct io_rsrc_node *rsrc_node; /* complete via tw if ->task is non-NULL, fallback to wq otherwise */ struct task_struct *task; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 87f58315b247bc..af342fd239d09b 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -135,6 +135,13 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req, } } +static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx) +{ + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned int issue_flags) @@ -144,9 +151,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, if (!(issue_flags & IO_URING_F_UNLOCKED)) { lockdep_assert_held(&ctx->uring_lock); - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); + + io_charge_rsrc_node(ctx); } else { percpu_ref_get(&req->rsrc_node->refs); } From 722112bc549f9577e814a844992c553e02f6fb99 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:42 +0100 Subject: [PATCH 1120/1250] io_uring: add notification slot registration Let the userspace to register and unregister notification slots. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a0aa8161fe3ebb2a4cc6e5dbd0cffb96e6881cf5.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 17 ++++++++++++++ io_uring/io_uring.c | 9 ++++++++ io_uring/notif.c | 43 +++++++++++++++++++++++++++++++++++ io_uring/notif.h | 3 +++ 4 files changed, 72 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4c9b11e2e99158..dcfc7a0bda0cae 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -457,6 +457,10 @@ enum { /* register a range of fixed file slots for automatic slot allocation */ IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* zerocopy notification API */ + IORING_REGISTER_NOTIFIERS = 26, + IORING_UNREGISTER_NOTIFIERS = 27, + /* this goes last */ IORING_REGISTER_LAST }; @@ -503,6 +507,19 @@ struct io_uring_rsrc_update2 { __u32 resv2; }; +struct io_uring_notification_slot { + __u64 tag; + __u64 resv[3]; +}; + +struct io_uring_notification_register { + __u32 nr_slots; + __u32 resv; + __u64 resv2; + __u64 data; + __u64 resv3; +}; + /* Skip updating fd indexes set to this value in the fd table */ #define IORING_REGISTER_FILES_SKIP (-2) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43d77dd43e969c..3949b9bfe87559 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3867,6 +3867,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_file_alloc_range(ctx, arg); break; + case IORING_REGISTER_NOTIFIERS: + ret = io_notif_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_NOTIFIERS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_notif_unregister(ctx); + break; default: ret = -EINVAL; break; diff --git a/io_uring/notif.c b/io_uring/notif.c index 0a2e98bd74f692..e6d98dc208c77b 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -162,5 +162,48 @@ __cold int io_notif_unregister(struct io_ring_ctx *ctx) 
kvfree(ctx->notif_slots); ctx->notif_slots = NULL; ctx->nr_notif_slots = 0; + io_notif_cache_purge(ctx); + return 0; +} + +__cold int io_notif_register(struct io_ring_ctx *ctx, + void __user *arg, unsigned int size) + __must_hold(&ctx->uring_lock) +{ + struct io_uring_notification_slot __user *slots; + struct io_uring_notification_slot slot; + struct io_uring_notification_register reg; + unsigned i; + + if (ctx->nr_notif_slots) + return -EBUSY; + if (size != sizeof(reg)) + return -EINVAL; + if (copy_from_user(&reg, arg, sizeof(reg))) + return -EFAULT; + if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS) + return -EINVAL; + if (reg.resv || reg.resv2 || reg.resv3) + return -EINVAL; + + slots = u64_to_user_ptr(reg.data); + ctx->notif_slots = kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]), + GFP_KERNEL_ACCOUNT); + if (!ctx->notif_slots) + return -ENOMEM; + + for (i = 0; i < reg.nr_slots; i++, ctx->nr_notif_slots++) { + struct io_notif_slot *notif_slot = &ctx->notif_slots[i]; + + if (copy_from_user(&slot, &slots[i], sizeof(slot))) { + io_notif_unregister(ctx); + return -EFAULT; + } + if (slot.resv[0] | slot.resv[1] | slot.resv[2]) { + io_notif_unregister(ctx); + return -EINVAL; + } + notif_slot->tag = slot.tag; + } return 0; } diff --git a/io_uring/notif.h b/io_uring/notif.h index 1dd48efb774460..00efe164bdc4b8 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -6,6 +6,7 @@ #include #define IO_NOTIF_SPLICE_BATCH 32 +#define IORING_MAX_NOTIF_SLOTS (1U << 10) struct io_notif { struct ubuf_info uarg; @@ -48,6 +49,8 @@ struct io_notif_slot { u32 seq; }; +int io_notif_register(struct io_ring_ctx *ctx, + void __user *arg, unsigned int size); int io_notif_unregister(struct io_ring_ctx *ctx); void io_notif_cache_purge(struct io_ring_ctx *ctx); From 121ebf7dfae073dbd190f46a1d40cef2a6938a35 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:43 +0100 Subject: [PATCH 1121/1250] io_uring: wire send zc request type Add a new io_uring opcode
IORING_OP_SENDZC. The main distinction from IORING_OP_SEND is that the user should specify a notification slot index in sqe::notification_idx and the buffers are safe to reuse only when the used notification is flushed and completes. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a80387c6a68ce9cf99b3b6ef6f71068468761fb7.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 5 ++ io_uring/net.c | 94 +++++++++++++++++++++++++++++++++++ io_uring/net.h | 3 ++ io_uring/opdef.c | 15 ++++++ 4 files changed, 117 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index dcfc7a0bda0cae..82bf2991e9bd42 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -66,6 +66,10 @@ struct io_uring_sqe { union { __s32 splice_fd_in; __u32 file_index; + struct { + __u16 notification_idx; + __u16 __pad; + }; }; union { struct { @@ -197,6 +201,7 @@ enum io_uring_op { IORING_OP_GETXATTR, IORING_OP_SOCKET, IORING_OP_URING_CMD, + IORING_OP_SENDZC_NOTIF, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/net.c b/io_uring/net.c index bbc9c603641a5f..89a8678ce69bde 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -14,6 +14,7 @@ #include "kbuf.h" #include "alloc_cache.h" #include "net.h" +#include "notif.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -59,6 +60,15 @@ struct io_sr_msg { unsigned int flags; }; +struct io_sendzc { + struct file *file; + void __user *buf; + size_t len; + u16 slot_idx; + unsigned msg_flags; + unsigned flags; +}; + #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -834,6 +844,90 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) return ret; } +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sendzc *zc = io_kiocb_to_cmd(req); + + if (READ_ONCE(sqe->addr2) || 
READ_ONCE(sqe->__pad2[0]) || + READ_ONCE(sqe->addr3)) + return -EINVAL; + + zc->flags = READ_ONCE(sqe->ioprio); + if (zc->flags & ~IORING_RECVSEND_POLL_FIRST) + return -EINVAL; + + zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + zc->len = READ_ONCE(sqe->len); + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + zc->slot_idx = READ_ONCE(sqe->notification_idx); + if (zc->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; +#ifdef CONFIG_COMPAT + if (req->ctx->compat) + zc->msg_flags |= MSG_CMSG_COMPAT; +#endif + return 0; +} + +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_sendzc *zc = io_kiocb_to_cmd(req); + struct io_notif_slot *notif_slot; + struct io_notif *notif; + struct msghdr msg; + struct iovec iov; + struct socket *sock; + unsigned msg_flags; + int ret, min_ret = 0; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + if (issue_flags & IO_URING_F_UNLOCKED) + return -EAGAIN; + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + notif_slot = io_get_notif_slot(ctx, zc->slot_idx); + if (!notif_slot) + return -EINVAL; + notif = io_get_notif(ctx, notif_slot); + if (!notif) + return -ENOMEM; + + msg.msg_name = NULL; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_namelen = 0; + + ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); + if (unlikely(ret)) + return ret; + + msg_flags = zc->msg_flags | MSG_ZEROCOPY; + if (issue_flags & IO_URING_F_NONBLOCK) + msg_flags |= MSG_DONTWAIT; + if (msg_flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + + msg.msg_flags = msg_flags; + msg.msg_ubuf = &notif->uarg; + msg.sg_from_iter = NULL; + ret = sock_sendmsg(sock, &msg); + + if (unlikely(ret < min_ret)) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return -EAGAIN; + return ret == -ERESTARTSYS ?
-EINTR : ret; + } + + io_req_set_res(req, ret, 0); + return IOU_OK; +} + int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req); diff --git a/io_uring/net.h b/io_uring/net.h index db20ce9d6546d4..7c438d39c0899a 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -52,6 +52,9 @@ int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); +int io_sendzc(struct io_kiocb *req, unsigned int issue_flags); +int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + void io_netmsg_cache_free(struct io_cache_entry *entry); #else static inline void io_netmsg_cache_free(struct io_cache_entry *entry) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index a7b84b43e6c235..7ab19bbf312637 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -470,6 +470,21 @@ const struct io_op_def io_op_defs[] = { .issue = io_uring_cmd, .prep_async = io_uring_cmd_prep_async, }, + [IORING_OP_SENDZC_NOTIF] = { + .name = "SENDZC_NOTIF", + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_sendzc_prep, + .issue = io_sendzc, +#else + .prep = io_eopnotsupp_prep, +#endif + + }, }; const char *io_uring_get_opcode(u8 opcode) From d3b8269075f67c77488f0dca112a6c6a4c35e22c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:44 +0100 Subject: [PATCH 1122/1250] io_uring: account locked pages for non-fixed zc Fixed buffers are RLIMIT_MEMLOCK accounted, however it doesn't cover iovec based zerocopy sends. Do the accounting on the io_uring side. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/19b6e3975440f59f1f6199c7ee7acf977b4eecdc.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + io_uring/notif.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/io_uring/net.c b/io_uring/net.c index 89a8678ce69bde..2d04a70b063213 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -906,6 +906,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); if (unlikely(ret)) return ret; + mm_account_pinned_pages(&notif->uarg.mmp, zc->len); msg_flags = zc->msg_flags | MSG_ZEROCOPY; if (issue_flags & IO_URING_F_NONBLOCK) diff --git a/io_uring/notif.c b/io_uring/notif.c index e6d98dc208c77b..c5179e5c1cd67c 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -14,7 +14,13 @@ static void __io_notif_complete_tw(struct callback_head *cb) struct io_notif *notif = container_of(cb, struct io_notif, task_work); struct io_rsrc_node *rsrc_node = notif->rsrc_node; struct io_ring_ctx *ctx = notif->ctx; + struct mmpin *mmp = &notif->uarg.mmp; + if (mmp->user) { + atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm); + free_uid(mmp->user); + mmp->user = NULL; + } if (likely(notif->task)) { io_put_task(notif->task, 1); notif->task = NULL; From 82f5d38937ddce9c630e16e96b4b14ea0d2856ea Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:45 +0100 Subject: [PATCH 1123/1250] io_uring: allow to pass addr into sendzc Allow to specify an address to zerocopy sends making it more like sendto(2).
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/70417a8f7c5b51ab454690bae08adc0c187f89e8.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 2 +- io_uring/net.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 82bf2991e9bd42..0736e2773a5d27 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -68,7 +68,7 @@ struct io_uring_sqe { __u32 file_index; struct { __u16 notification_idx; - __u16 __pad; + __u16 addr_len; }; }; union { diff --git a/io_uring/net.c b/io_uring/net.c index 2d04a70b063213..61414d865cd705 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -67,6 +67,8 @@ struct io_sendzc { u16 slot_idx; unsigned msg_flags; unsigned flags; + unsigned addr_len; + void __user *addr; }; #define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED) @@ -848,8 +850,7 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sendzc *zc = io_kiocb_to_cmd(req); - if (READ_ONCE(sqe->addr2) || READ_ONCE(sqe->__pad2[0]) || - READ_ONCE(sqe->addr3)) + if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); @@ -862,6 +863,10 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->slot_idx = READ_ONCE(sqe->notification_idx); if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + + zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + zc->addr_len = READ_ONCE(sqe->addr_len); + #ifdef CONFIG_COMPAT if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; @@ -871,6 +876,7 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) { + struct sockaddr_storage address; struct io_ring_ctx *ctx = req->ctx; struct io_sendzc *zc = io_kiocb_to_cmd(req); struct io_notif_slot *notif_slot; @@ -908,6 +914,14 
@@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) return ret; mm_account_pinned_pages(&notif->uarg.mmp, zc->len); + if (zc->addr) { + ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address); + if (unlikely(ret < 0)) + return ret; + msg.msg_name = (struct sockaddr *)&address; + msg.msg_namelen = zc->addr_len; + } + msg_flags = zc->msg_flags | MSG_ZEROCOPY; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; From 41e69affe597d6d5dc7a44807aefd11ba6f621e1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:46 +0100 Subject: [PATCH 1124/1250] io_uring: sendzc with fixed buffers Allow zerocopy sends to use fixed buffers. There is an optimisation for this case, the network layer don't need to reference the pages, see SKBFL_MANAGED_FRAG_REFS, so io_uring have to ensure validity of fixed buffers until the notifier is released. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e1d8bd1b5934e541d90c1824eb4020ae3f5f43f3.1657643355.git.asml.silence@gmail.com [axboe: fold in 32-bit pointer cast warning fix] Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 6 +++++- io_uring/net.c | 29 ++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0736e2773a5d27..f1a9ff9b9ea7f4 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -272,9 +272,13 @@ enum io_uring_op { * IORING_RECV_MULTISHOT Multishot recv. Sets IORING_CQE_F_MORE if * the handler will continue to report * CQEs on behalf of the same SQE. + * + * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in + * the buf_index field.
*/ #define IORING_RECVSEND_POLL_FIRST (1U << 0) -#define IORING_RECV_MULTISHOT (1U << 1) +#define IORING_RECV_MULTISHOT (1U << 1) +#define IORING_RECVSEND_FIXED_BUF (1U << 2) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/net.c b/io_uring/net.c index 61414d865cd705..ab443c52dcfd34 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -15,6 +15,7 @@ #include "alloc_cache.h" #include "net.h" #include "notif.h" +#include "rsrc.h" #if defined(CONFIG_NET) struct io_shutdown { @@ -849,13 +850,23 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sendzc *zc = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); - if (zc->flags & ~IORING_RECVSEND_POLL_FIRST) + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)) return -EINVAL; + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { + unsigned idx = READ_ONCE(sqe->buf_index); + + if (unlikely(idx >= ctx->nr_user_bufs)) + return -EFAULT; + idx = array_index_nospec(idx, ctx->nr_user_bufs); + req->imu = READ_ONCE(ctx->user_bufs[idx]); + io_req_set_rsrc_node(req, ctx, 0); + } zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); @@ -909,10 +920,18 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) msg.msg_controllen = 0; msg.msg_namelen = 0; - ret = import_single_range(WRITE, zc->buf, zc->len, &iov, &msg.msg_iter); - if (unlikely(ret)) - return ret; - mm_account_pinned_pages(&notif->uarg.mmp, zc->len); + if (zc->flags & IORING_RECVSEND_FIXED_BUF) { + ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu, + (u64)(uintptr_t)zc->buf, zc->len); + if (unlikely(ret)) + return ret; + } else { + ret = import_single_range(WRITE, zc->buf, zc->len, &iov, + &msg.msg_iter); + if (unlikely(ret)) + return ret; + mm_account_pinned_pages(&notif->uarg.mmp, zc->len); + } if
(zc->addr) { ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address); From 1bb4685c9af13e2387924c16e1e28780d6ff186e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:47 +0100 Subject: [PATCH 1125/1250] io_uring: flush notifiers after sendzc Allow to flush notifiers as a part of sendzc request by setting IORING_SENDZC_FLUSH flag. When the sendzc request succeedes it will flush the used [active] notifier. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e0b4d9a6797e2fd6092824fe42953db7a519bbc8.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 4 ++++ io_uring/io_uring.c | 11 +---------- io_uring/io_uring.h | 10 ++++++++++ io_uring/net.c | 5 ++++- io_uring/notif.c | 2 +- io_uring/notif.h | 11 +++++++++++ 6 files changed, 31 insertions(+), 12 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f1a9ff9b9ea7f4..45272eb37d1092 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -275,10 +275,14 @@ enum io_uring_op { * * IORING_RECVSEND_FIXED_BUF Use registered buffers, the index is stored in * the buf_index field. + * + * IORING_RECVSEND_NOTIF_FLUSH Flush a notification after a successful + * successful. Only for zerocopy sends. 
*/ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) +#define IORING_RECVSEND_NOTIF_FLUSH (1U << 3) /* * accept flags stored in sqe->ioprio diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3949b9bfe87559..68a11b4706bf57 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -618,7 +618,7 @@ void __io_put_task(struct task_struct *task, int nr) put_task_struct_many(task, nr); } -static void io_task_refs_refill(struct io_uring_task *tctx) +void io_task_refs_refill(struct io_uring_task *tctx) { unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; @@ -627,15 +627,6 @@ static void io_task_refs_refill(struct io_uring_task *tctx) tctx->cached_refs += refill; } -static inline void io_get_task_refs(int nr) -{ - struct io_uring_task *tctx = current->io_uring; - - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) - io_task_refs_refill(tctx); -} - static __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b8c858727dc8e6..d9f2f5c71481cf 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -69,6 +69,7 @@ void io_wq_submit_work(struct io_wq_work *work); void io_free_req(struct io_kiocb *req); void io_queue_next(struct io_kiocb *req); void __io_put_task(struct task_struct *task, int nr); +void io_task_refs_refill(struct io_uring_task *tctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); @@ -265,4 +266,13 @@ static inline void io_put_task(struct task_struct *task, int nr) __io_put_task(task, nr); } +static inline void io_get_task_refs(int nr) +{ + struct io_uring_task *tctx = current->io_uring; + + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); +} + #endif diff --git a/io_uring/net.c b/io_uring/net.c index ab443c52dcfd34..9ac2ce37c52253 100644 
--- a/io_uring/net.c +++ b/io_uring/net.c @@ -856,7 +856,8 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; zc->flags = READ_ONCE(sqe->ioprio); - if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF)) + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | + IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH)) return -EINVAL; if (zc->flags & IORING_RECVSEND_FIXED_BUF) { unsigned idx = READ_ONCE(sqe->buf_index); @@ -958,6 +959,8 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) return ret == -ERESTARTSYS ? -EINTR : ret; } + if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH) + io_notif_slot_flush_submit(notif_slot, 0); io_req_set_res(req, ret, 0); return IOU_OK; } diff --git a/io_uring/notif.c b/io_uring/notif.c index c5179e5c1cd67c..a93887451bbb2c 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -133,7 +133,7 @@ struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, return notif; } -static void io_notif_slot_flush(struct io_notif_slot *slot) +void io_notif_slot_flush(struct io_notif_slot *slot) __must_hold(&ctx->uring_lock) { struct io_notif *notif = slot->notif; diff --git a/io_uring/notif.h b/io_uring/notif.h index 00efe164bdc4b8..6cd73d7b965b28 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -54,6 +54,7 @@ int io_notif_register(struct io_ring_ctx *ctx, int io_notif_unregister(struct io_ring_ctx *ctx); void io_notif_cache_purge(struct io_ring_ctx *ctx); +void io_notif_slot_flush(struct io_notif_slot *slot); struct io_notif *io_alloc_notif(struct io_ring_ctx *ctx, struct io_notif_slot *slot); @@ -74,3 +75,13 @@ static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx, idx = array_index_nospec(idx, ctx->nr_notif_slots); return &ctx->notif_slots[idx]; } + +static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot, + unsigned int issue_flags) +{ + if (!(issue_flags & IO_URING_F_UNLOCKED)) { + slot->notif->task = current; + io_get_task_refs(1); + } 
+ io_notif_slot_flush(slot); +} From f0e1261b6bdfc59ca52f8ee74a2d2df7fb4cca78 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:48 +0100 Subject: [PATCH 1126/1250] io_uring: rename IORING_OP_FILES_UPDATE IORING_OP_FILES_UPDATE will be a more generic opcode serving different resource types, rename it into IORING_OP_RSRC_UPDATE and add subtype handling. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0a907133907d9af3415a8a7aa1802c6aa97c03c6.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 12 +++++++++++- io_uring/opdef.c | 9 +++++---- io_uring/rsrc.c | 17 +++++++++++++++-- io_uring/rsrc.h | 4 ++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 45272eb37d1092..210a00ab6301e4 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -174,7 +174,8 @@ enum io_uring_op { IORING_OP_FALLOCATE, IORING_OP_OPENAT, IORING_OP_CLOSE, - IORING_OP_FILES_UPDATE, + IORING_OP_RSRC_UPDATE, + IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE, IORING_OP_STATX, IORING_OP_READ, IORING_OP_WRITE, @@ -223,6 +224,7 @@ enum io_uring_op { #define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5) #define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME) #define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE) + /* * sqe->splice_flags * extends splice(2) flags @@ -289,6 +291,14 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) + +/* + * IORING_OP_RSRC_UPDATE flags + */ +enum { + IORING_RSRC_UPDATE_FILES, +}; + /* * IORING_OP_MSG_RING command types, stored in sqe->addr */ diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 7ab19bbf312637..72dd2b2d8a9df1 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -246,12 +246,13 @@ const struct io_op_def io_op_defs[] = { .prep = io_close_prep, .issue = io_close, }, - [IORING_OP_FILES_UPDATE] = { 
+ [IORING_OP_RSRC_UPDATE] = { .audit_skip = 1, .iopoll = 1, - .name = "FILES_UPDATE", - .prep = io_files_update_prep, - .issue = io_files_update, + .name = "RSRC_UPDATE", + .prep = io_rsrc_update_prep, + .issue = io_rsrc_update, + .ioprio = 1, }, [IORING_OP_STATX] = { .audit_skip = 1, diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 7f66b0e2567432..fc2b337e6c25a9 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -21,6 +21,7 @@ struct io_rsrc_update { u64 arg; u32 nr_args; u32 offset; + int type; }; static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, @@ -657,7 +658,7 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } -int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_rsrc_update *up = io_kiocb_to_cmd(req); @@ -671,6 +672,7 @@ int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!up->nr_args) return -EINVAL; up->arg = READ_ONCE(sqe->addr); + up->type = READ_ONCE(sqe->ioprio); return 0; } @@ -713,7 +715,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, return ret; } -int io_files_update(struct io_kiocb *req, unsigned int issue_flags) +static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req); struct io_ring_ctx *ctx = req->ctx; @@ -742,6 +744,17 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + + switch (up->type) { + case IORING_RSRC_UPDATE_FILES: + return io_files_update(req, issue_flags); + } + return -EINVAL; +} + int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 
af342fd239d09b..21813a23215fac 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -167,6 +167,6 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) return &data->tags[table_idx][off]; } -int io_files_update(struct io_kiocb *req, unsigned int issue_flags); -int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags); +int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); #endif From 6597402517afa36ffeb24ed425482d6e5e6c1dd5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:49 +0100 Subject: [PATCH 1127/1250] io_uring: add zc notification flush requests Overlay notification control onto IORING_OP_RSRC_UPDATE (formerly IORING_OP_FILES_UPDATE). It allows flushing a range of zc notifications from slots with indexes [sqe->off, sqe->off+sqe->len). If sqe->addr is not zero, it is also copied as the new tag for all flushed notifications. Note that a slot's notification is not flushed if no requests have been attached to it (since the last flush or registration). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/df13e2363400682a73dd9e71c3b990b8d1ff0333.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/rsrc.c | 38 +++++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 210a00ab6301e4..1463cfecb56b03 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -297,6 +297,7 @@ enum io_uring_op { */ enum { IORING_RSRC_UPDATE_FILES, + IORING_RSRC_UPDATE_NOTIF, }; /* diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index fc2b337e6c25a9..9165fdf6426946 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -15,6 +15,7 @@ #include "io_uring.h" #include "openclose.h" #include "rsrc.h" +#include "notif.h" struct io_rsrc_update { struct file *file; @@ -744,6 +745,41 @@ static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_rsrc_update *up = io_kiocb_to_cmd(req); + struct io_ring_ctx *ctx = req->ctx; + unsigned len = up->nr_args; + unsigned idx_end, idx = up->offset; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (unlikely(check_add_overflow(idx, len, &idx_end))) { + ret = -EOVERFLOW; + goto out; + } + if (unlikely(idx_end > ctx->nr_notif_slots)) { + ret = -EINVAL; + goto out; + } + + for (; idx < idx_end; idx++) { + struct io_notif_slot *slot = &ctx->notif_slots[idx]; + + if (!slot->notif) + continue; + if (up->arg) + slot->tag = up->arg; + io_notif_slot_flush_submit(slot, issue_flags); + } +out: + io_ring_submit_unlock(ctx, issue_flags); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} + int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_rsrc_update *up = io_kiocb_to_cmd(req); @@ -751,6 +787,8 @@ int io_rsrc_update(struct io_kiocb *req, 
unsigned int issue_flags) switch (up->type) { case IORING_RSRC_UPDATE_FILES: return io_files_update(req, issue_flags); + case IORING_RSRC_UPDATE_NOTIF: + return io_notif_update(req, issue_flags); } return -EINVAL; } From d918cce57bc0f09e8cd6234d27513b9164175810 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:50 +0100 Subject: [PATCH 1128/1250] io_uring: enable managed frags with register buffers io_uring's registered buffers infra has an efficient way of pinning pages, so let's use SKBFL_MANAGED_FRAG_REFS when our requests are backed purely by registered buffers. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/278731d3f20caf346cfc025fbee0b4c9ee4ed751.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 9ac2ce37c52253..62be89837d82b9 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -886,6 +886,60 @@ int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + int frag = shinfo->nr_frags; + int ret = 0; + struct bvec_iter bi; + ssize_t copied = 0; + unsigned long truesize = 0; + + if (!shinfo->nr_frags) + shinfo->flags |= SKBFL_MANAGED_FRAG_REFS; + + if (!skb_zcopy_managed(skb) || !iov_iter_is_bvec(from)) { + skb_zcopy_downgrade_managed(skb); + return __zerocopy_sg_from_iter(NULL, sk, skb, from, length); + } + + bi.bi_size = min(from->count, length); + bi.bi_bvec_done = from->iov_offset; + bi.bi_idx = 0; + + while (bi.bi_size && frag < MAX_SKB_FRAGS) { + struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi); + + copied += v.bv_len; + truesize += PAGE_ALIGN(v.bv_len + v.bv_offset); + __skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page, + v.bv_offset, v.bv_len); 
+ bvec_iter_advance_single(from->bvec, &bi, v.bv_len); + } + if (bi.bi_size) + ret = -EMSGSIZE; + + shinfo->nr_frags = frag; + from->bvec += bi.bi_idx; + from->nr_segs -= bi.bi_idx; + from->count = bi.bi_size; + from->iov_offset = bi.bi_bvec_done; + + skb->data_len += copied; + skb->len += copied; + skb->truesize += truesize; + + if (sk && sk->sk_type == SOCK_STREAM) { + sk_wmem_queued_add(sk, truesize); + if (!skb_zcopy_pure(skb)) + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } + return ret; +} + int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) { struct sockaddr_storage address; @@ -950,7 +1004,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags) msg.msg_flags = msg_flags; msg.msg_ubuf = &notif->uarg; - msg.sg_from_iter = NULL; + msg.sg_from_iter = io_sg_from_iter; ret = sock_sendmsg(sock, &msg); if (unlikely(ret < min_ret)) { From ce7d721c3c6ed2a77507e35527f213b1438c86b8 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 12 Jul 2022 21:52:51 +0100 Subject: [PATCH 1129/1250] selftests/io_uring: test zerocopy send Add selftests for io_uring zerocopy sends and io_uring's notification infrastructure. It's largely influenced by msg_zerocopy and uses it on the receive side. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/03d5ec78061cf52db420f88ed0b48eb8f47ce9f7.1657643355.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/io_uring_zerocopy_tx.c | 605 ++++++++++++++++++ .../selftests/net/io_uring_zerocopy_tx.sh | 131 ++++ 3 files changed, 737 insertions(+) create mode 100644 tools/testing/selftests/net/io_uring_zerocopy_tx.c create mode 100755 tools/testing/selftests/net/io_uring_zerocopy_tx.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index db05b3764b7716..9a4b30bd3a9ee7 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -59,6 +59,7 @@ TEST_GEN_FILES += toeplitz TEST_GEN_FILES += cmsg_sender TEST_GEN_FILES += stress_reuseport_listen TEST_PROGS += test_vxlan_vnifiltering.sh +TEST_GEN_FILES += io_uring_zerocopy_tx TEST_FILES := settings diff --git a/tools/testing/selftests/net/io_uring_zerocopy_tx.c b/tools/testing/selftests/net/io_uring_zerocopy_tx.c new file mode 100644 index 00000000000000..9d64c560a2d61b --- /dev/null +++ b/tools/testing/selftests/net/io_uring_zerocopy_tx.c @@ -0,0 +1,605 @@ +/* SPDX-License-Identifier: MIT */ +/* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NOTIF_TAG 0xfffffffULL +#define NONZC_TAG 0 +#define ZC_TAG 1 + +enum { + MODE_NONZC = 0, + MODE_ZC = 1, + MODE_ZC_FIXED = 2, + MODE_MIXED = 3, +}; + +static bool cfg_flush = false; +static bool cfg_cork = false; +static int cfg_mode = MODE_ZC_FIXED; +static int cfg_nr_reqs = 8; +static int cfg_family = PF_UNSPEC; +static 
int cfg_payload_len; +static int cfg_port = 8000; +static int cfg_runtime_ms = 4200; + +static socklen_t cfg_alen; +static struct sockaddr_storage cfg_dst_addr; + +static char payload[IP_MAXPACKET] __attribute__((aligned(4096))); + +struct io_sq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + unsigned *flags; + unsigned *array; +}; + +struct io_cq_ring { + unsigned *head; + unsigned *tail; + unsigned *ring_mask; + unsigned *ring_entries; + struct io_uring_cqe *cqes; +}; + +struct io_uring_sq { + unsigned *khead; + unsigned *ktail; + unsigned *kring_mask; + unsigned *kring_entries; + unsigned *kflags; + unsigned *kdropped; + unsigned *array; + struct io_uring_sqe *sqes; + + unsigned sqe_head; + unsigned sqe_tail; + + size_t ring_sz; +}; + +struct io_uring_cq { + unsigned *khead; + unsigned *ktail; + unsigned *kring_mask; + unsigned *kring_entries; + unsigned *koverflow; + struct io_uring_cqe *cqes; + + size_t ring_sz; +}; + +struct io_uring { + struct io_uring_sq sq; + struct io_uring_cq cq; + int ring_fd; +}; + +#ifdef __alpha__ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 535 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 536 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 537 +# endif +#else /* !__alpha__ */ +# ifndef __NR_io_uring_setup +# define __NR_io_uring_setup 425 +# endif +# ifndef __NR_io_uring_enter +# define __NR_io_uring_enter 426 +# endif +# ifndef __NR_io_uring_register +# define __NR_io_uring_register 427 +# endif +#endif + +#if defined(__x86_64) || defined(__i386__) +#define read_barrier() __asm__ __volatile__("":::"memory") +#define write_barrier() __asm__ __volatile__("":::"memory") +#else + +#define read_barrier() __sync_synchronize() +#define write_barrier() __sync_synchronize() +#endif + +static int io_uring_setup(unsigned int entries, struct io_uring_params *p) +{ + return syscall(__NR_io_uring_setup, entries, p); +} + +static int 
io_uring_enter(int fd, unsigned int to_submit, + unsigned int min_complete, + unsigned int flags, sigset_t *sig) +{ + return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, + flags, sig, _NSIG / 8); +} + +static int io_uring_register_buffers(struct io_uring *ring, + const struct iovec *iovecs, + unsigned nr_iovecs) +{ + int ret; + + ret = syscall(__NR_io_uring_register, ring->ring_fd, + IORING_REGISTER_BUFFERS, iovecs, nr_iovecs); + return (ret < 0) ? -errno : ret; +} + +static int io_uring_register_notifications(struct io_uring *ring, + unsigned nr, + struct io_uring_notification_slot *slots) +{ + int ret; + struct io_uring_notification_register r = { + .nr_slots = nr, + .data = (unsigned long)slots, + }; + + ret = syscall(__NR_io_uring_register, ring->ring_fd, + IORING_REGISTER_NOTIFIERS, &r, sizeof(r)); + return (ret < 0) ? -errno : ret; +} + +static int io_uring_mmap(int fd, struct io_uring_params *p, + struct io_uring_sq *sq, struct io_uring_cq *cq) +{ + size_t size; + void *ptr; + int ret; + + sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned); + ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); + if (ptr == MAP_FAILED) + return -errno; + sq->khead = ptr + p->sq_off.head; + sq->ktail = ptr + p->sq_off.tail; + sq->kring_mask = ptr + p->sq_off.ring_mask; + sq->kring_entries = ptr + p->sq_off.ring_entries; + sq->kflags = ptr + p->sq_off.flags; + sq->kdropped = ptr + p->sq_off.dropped; + sq->array = ptr + p->sq_off.array; + + size = p->sq_entries * sizeof(struct io_uring_sqe); + sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); + if (sq->sqes == MAP_FAILED) { + ret = -errno; +err: + munmap(sq->khead, sq->ring_sz); + return ret; + } + + cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); + ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); + if (ptr == 
MAP_FAILED) { + ret = -errno; + munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe)); + goto err; + } + cq->khead = ptr + p->cq_off.head; + cq->ktail = ptr + p->cq_off.tail; + cq->kring_mask = ptr + p->cq_off.ring_mask; + cq->kring_entries = ptr + p->cq_off.ring_entries; + cq->koverflow = ptr + p->cq_off.overflow; + cq->cqes = ptr + p->cq_off.cqes; + return 0; +} + +static int io_uring_queue_init(unsigned entries, struct io_uring *ring, + unsigned flags) +{ + struct io_uring_params p; + int fd, ret; + + memset(ring, 0, sizeof(*ring)); + memset(&p, 0, sizeof(p)); + p.flags = flags; + + fd = io_uring_setup(entries, &p); + if (fd < 0) + return fd; + ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq); + if (!ret) + ring->ring_fd = fd; + else + close(fd); + return ret; +} + +static int io_uring_submit(struct io_uring *ring) +{ + struct io_uring_sq *sq = &ring->sq; + const unsigned mask = *sq->kring_mask; + unsigned ktail, submitted, to_submit; + int ret; + + read_barrier(); + if (*sq->khead != *sq->ktail) { + submitted = *sq->kring_entries; + goto submit; + } + if (sq->sqe_head == sq->sqe_tail) + return 0; + + ktail = *sq->ktail; + to_submit = sq->sqe_tail - sq->sqe_head; + for (submitted = 0; submitted < to_submit; submitted++) { + read_barrier(); + sq->array[ktail++ & mask] = sq->sqe_head++ & mask; + } + if (!submitted) + return 0; + + if (*sq->ktail != ktail) { + write_barrier(); + *sq->ktail = ktail; + write_barrier(); + } +submit: + ret = io_uring_enter(ring->ring_fd, submitted, 0, + IORING_ENTER_GETEVENTS, NULL); + return ret < 0 ? 
-errno : ret; +} + +static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd, + const void *buf, size_t len, int flags) +{ + memset(sqe, 0, sizeof(*sqe)); + sqe->opcode = (__u8) IORING_OP_SEND; + sqe->fd = sockfd; + sqe->addr = (unsigned long) buf; + sqe->len = len; + sqe->msg_flags = (__u32) flags; +} + +static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd, + const void *buf, size_t len, int flags, + unsigned slot_idx, unsigned zc_flags) +{ + io_uring_prep_send(sqe, sockfd, buf, len, flags); + sqe->opcode = (__u8) IORING_OP_SENDZC_NOTIF; + sqe->notification_idx = slot_idx; + sqe->ioprio = zc_flags; +} + +static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring) +{ + struct io_uring_sq *sq = &ring->sq; + + if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries) + return NULL; + return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask]; +} + +static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) +{ + struct io_uring_cq *cq = &ring->cq; + const unsigned mask = *cq->kring_mask; + unsigned head = *cq->khead; + int ret; + + *cqe_ptr = NULL; + do { + read_barrier(); + if (head != *cq->ktail) { + *cqe_ptr = &cq->cqes[head & mask]; + break; + } + ret = io_uring_enter(ring->ring_fd, 0, 1, + IORING_ENTER_GETEVENTS, NULL); + if (ret < 0) + return -errno; + } while (1); + + return 0; +} + +static inline void io_uring_cqe_seen(struct io_uring *ring) +{ + *(&ring->cq)->khead += 1; + write_barrier(); +} + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static void do_setsockopt(int fd, int level, int optname, int val) +{ + if (setsockopt(fd, level, optname, &val, sizeof(val))) + error(1, errno, "setsockopt %d.%d: %d", level, optname, val); +} + +static int do_setup_tx(int domain, int type, int protocol) +{ + int fd; + + fd = socket(domain, type, protocol); + if (fd == -1) + error(1, errno, "socket 
t"); + + do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21); + + if (connect(fd, (void *) &cfg_dst_addr, cfg_alen)) + error(1, errno, "connect"); + return fd; +} + +static void do_tx(int domain, int type, int protocol) +{ + struct io_uring_notification_slot b[1] = {{.tag = NOTIF_TAG}}; + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + unsigned long packets = 0, bytes = 0; + struct io_uring ring; + struct iovec iov; + uint64_t tstop; + int i, fd, ret; + int compl_cqes = 0; + + fd = do_setup_tx(domain, type, protocol); + + ret = io_uring_queue_init(512, &ring, 0); + if (ret) + error(1, ret, "io_uring: queue init"); + + ret = io_uring_register_notifications(&ring, 1, b); + if (ret) + error(1, ret, "io_uring: tx ctx registration"); + + iov.iov_base = payload; + iov.iov_len = cfg_payload_len; + + ret = io_uring_register_buffers(&ring, &iov, 1); + if (ret) + error(1, ret, "io_uring: buffer registration"); + + tstop = gettimeofday_ms() + cfg_runtime_ms; + do { + if (cfg_cork) + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1); + + for (i = 0; i < cfg_nr_reqs; i++) { + unsigned zc_flags = 0; + unsigned buf_idx = 0; + unsigned slot_idx = 0; + unsigned mode = cfg_mode; + unsigned msg_flags = 0; + + if (cfg_mode == MODE_MIXED) + mode = rand() % 3; + + sqe = io_uring_get_sqe(&ring); + + if (mode == MODE_NONZC) { + io_uring_prep_send(sqe, fd, payload, + cfg_payload_len, msg_flags); + sqe->user_data = NONZC_TAG; + } else { + if (cfg_flush) { + zc_flags |= IORING_RECVSEND_NOTIF_FLUSH; + compl_cqes++; + } + io_uring_prep_sendzc(sqe, fd, payload, + cfg_payload_len, + msg_flags, slot_idx, zc_flags); + if (mode == MODE_ZC_FIXED) { + sqe->ioprio |= IORING_RECVSEND_FIXED_BUF; + sqe->buf_index = buf_idx; + } + sqe->user_data = ZC_TAG; + } + } + + ret = io_uring_submit(&ring); + if (ret != cfg_nr_reqs) + error(1, ret, "submit"); + + for (i = 0; i < cfg_nr_reqs; i++) { + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret) + error(1, ret, "wait cqe"); + + if (cqe->user_data == NOTIF_TAG) { 
+ compl_cqes--; + i--; + } else if (cqe->user_data != NONZC_TAG && + cqe->user_data != ZC_TAG) { + error(1, cqe->res, "invalid user_data"); + } else if (cqe->res <= 0 && cqe->res != -EAGAIN) { + error(1, cqe->res, "send failed"); + } else { + if (cqe->res > 0) { + packets++; + bytes += cqe->res; + } + /* failed requests don't flush */ + if (cfg_flush && + cqe->res <= 0 && + cqe->user_data == ZC_TAG) + compl_cqes--; + } + io_uring_cqe_seen(&ring); + } + if (cfg_cork) + do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0); + } while (gettimeofday_ms() < tstop); + + if (close(fd)) + error(1, errno, "close"); + + fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n", + packets, bytes >> 20, + packets / (cfg_runtime_ms / 1000), + (bytes >> 20) / (cfg_runtime_ms / 1000)); + + while (compl_cqes) { + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret) + error(1, ret, "wait cqe"); + io_uring_cqe_seen(&ring); + compl_cqes--; + } +} + +static void do_test(int domain, int type, int protocol) +{ + int i; + + for (i = 0; i < IP_MAXPACKET; i++) + payload[i] = 'a' + (i % 26); + do_tx(domain, type, protocol); +} + +static void usage(const char *filepath) +{ + error(1, 0, "Usage: %s [-f] [-n] [-z0] [-s] " + "(-4|-6) [-t